In [1]:
import pandas as pd

# Intro and Memory

In [2]:
chi = pd.read_csv("chicago.csv")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [3]:
chi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [4]:
chi["Department"].nunique()

35

In [5]:
# use .astype() to convert Department column values to category
chi["Department"] = chi["Department"].astype("category")

In [6]:
# memory usage reduced from 1002.0 KB to 784.4 KB
chi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


# Commons String Methods - .lower(), .upper(), .title, and .len()

In [7]:
chi = pd.read_csv("chicago.csv")
chi["Department"] = chi["Department"].astype("category")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


## How They Work In Python

In [8]:
# .lower() converts all characters to lowercase
"HELLO WORLD".lower()

'hello world'

In [9]:
# .upper() converts all characters to uppercase
"hello world".upper()

'HELLO WORLD'

In [10]:
# .title() converts first character of each word to uppercase
"hello world".title()

'Hello World'

In [11]:
# use len() to get length of string
len("Hello World")

11

## In Pandas

### Must Prefix String Methods with .str

In [12]:
# to use string methods in Pandas you must prefix them with .str
chi["Name"].str.lower().head(3)

0      aaron,  elvia j
1    aaron,  jeffery m
2       aaron,  karina
Name: Name, dtype: object

In [13]:
# if chaining multiple string methods must prefix EACH ONE with .str
chi["Name"].str.lower().head(3).str.upper()

0      AARON,  ELVIA J
1    AARON,  JEFFERY M
2       AARON,  KARINA
Name: Name, dtype: object

In [14]:
chi["Name"] = chi["Name"].str.title()

In [15]:
chi["Position Title"] = chi["Position Title"].str.title()

In [16]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",Police Officer,POLICE,$84450.00
2,"Aaron, Karina",Police Officer,POLICE,$84450.00


In [17]:
# unlike Pyhton .len() is a method in Pandas
chi["Department"].str.len().head(3)

0    11.0
1     6.0
2     6.0
Name: Department, dtype: float64

In [18]:
# calling len() like Python will return number of rows not length of string in the column
len(chi["Department"])

32063

# The .str.replace() Method

In [19]:
# use .dropna() to remove rows with all Null values
chi = pd.read_csv("chicago.csv").dropna(how="all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [20]:
# regular Python, first argument is what you want to replace second argument is wht you want to replace it with
"Hello World".replace("l", "7")

'He77o Wor7d'

In [21]:
chi["Department"].head(3)

0    WATER MGMNT
1         POLICE
2         POLICE
Name: Department, dtype: category
Categories (35, object): [ADMIN HEARNG, ANIMAL CONTRL, AVIATION, BOARD OF ELECTION, ..., STREETS & SAN, TRANSPORTN, TREASURER, WATER MGMNT]

In [22]:
# .str.replace() works the same way in Pandas as it does in regular Python
chi["Department"] = chi["Department"].str.replace("MGMNT", "MANAGEMENT").head(3)

In [23]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [24]:
# converting salary string to float. replace $ with nothing "" use .astype() to convert string to float
chi["Employee Annual Salary"] = chi["Employee Annual Salary"].str.replace("$", "").astype("float")

In [25]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0


In [26]:
# Employee Annual Salary are now floats, can perform math operations on them
chi["Employee Annual Salary"].sum()

2571506375.36

# Filtering with String Methods

In [27]:
chi = pd.read_csv("chicago.csv").dropna(how="all")
chi["Department"] = chi["Department"].astype("category")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [28]:
# use .lower() to mormalize data and ensure all strings are formatted the same
# .str.contains() takes argument of string you want to look for, returns boolean Series
chi["Position Title"].str.lower().str.contains("water").head()

0     True
1    False
2    False
3    False
4    False
Name: Position Title, dtype: bool

In [29]:
# because we're searching with booleans string transformations don't need to be permanent
mask = chi["Position Title"].str.lower().str.contains("water")
chi[mask].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00


In [30]:
# use .str.startswith() to only get strings that start with the specified characters, doesn't need to be full word
mask = chi["Position Title"].str.lower().str.startswith("water")
chi[mask].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00


In [31]:
# use .str.endswith() to select strings that end with specified characters, doesn't need to be full word
mask = chi["Position Title"].str.lower().str.endswith("ist")
chi[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00


In [32]:
mask1 = ~chi["Position Title"].str.lower().str.startswith("water")
mask2 = chi["Position Title"].str.lower().str.endswith("taker")
mask3 = chi["Position Title"].str.lower().str.contains("water")
chi[mask1 & mask3].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


# More String Methods - .strip(), .lstrip(), and .rstrip()

In [39]:
chi = pd.read_csv("chicago.csv").dropna(how="all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [34]:
# .lstrip() removes whitespace from beginning (left) of string
"         Hello World   ".lstrip()

'Hello World   '

In [35]:
# .rstrip() removes whitespace from end (right) of string
"         Hello World   ".rstrip()

'         Hello World'

In [36]:
# .strip() removes whitespace from both sides of string
"         Hello World   ".strip()

'Hello World'

In [40]:
# use .str.lstrip() and .str.rstrip() to remove white space around column values
chi["Name"] = chi["Name"].str.lstrip().str.rstrip()

In [41]:
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [42]:
# use .str.strip() to remove whitespace from both sides of a columns value
chi["Position Title"] = chi["Position Title"].str.strip()

In [43]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


# String Methods on Index and Columns

In [44]:
# setting Name column as row index
chi = pd.read_csv("chicago.csv", index_col="Name").dropna(how="all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [48]:
# normalizing all index values
chi.index = chi.index.str.strip().str.title()

In [49]:
chi.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [52]:
# can also use string method on column names
chi.columns = chi.columns.str.upper()

In [53]:
chi.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


# Split Strings by Characters with .str.split() Method

In [54]:
# setting Name column as row index
chi = pd.read_csv("chicago.csv").dropna(how="all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [55]:
# cut a string a specified character and return a list of separated values
"Hello this is a string!".split(" ")

['Hello', 'this', 'is', 'a', 'string!']

In [68]:
# use .str.get() after .str.split() to return specific index of split list
chi["Name"].str.split(",").str.get(0).head()

0      AARON
1      AARON
2      AARON
3      AARON
4    ABAD JR
Name: Name, dtype: object

In [72]:
# can just keep chaining methods
# find most common last name
chi["Name"].str.split(",").str.get(0).str.title().value_counts()

Williams       293
Johnson        244
Smith          241
Brown          185
Jones          183
Rodriguez      171
Jackson        136
Garcia         130
Davis          127
Hernandez      110
Martinez       108
Lopez          106
Gonzalez       104
Perez          100
Wilson          94
Rivera          90
Thomas          89
Anderson        82
Torres          81
Murphy          80
Robinson        79
Moore           78
Harris          76
Sanchez         76
Miller          75
Lewis           74
Taylor          73
Martin          72
Clark           66
White           66
              ... 
Sznura           1
Gaines Jr        1
Grieshaber       1
Trzepacz         1
Sommerville      1
Brzezicki        1
Addante          1
Huntley          1
Siwoku           1
Spallina         1
Palmer Iii       1
Wenseritt        1
Piatek           1
Suchecki         1
Balodimas        1
Del Marto        1
Rusiecka         1
Barbachen        1
Potesta          1
Uchwal           1
Tellis           1
Sencion     

In [80]:
# use negative index with .str.get() to get last word in lists of different lengths
chi["Position Title"].str.split(" ").str.get(-1).str.title().value_counts()

Officer                  9606
Driver                   1635
Laborer                  1437
Sergeant                 1209
Firefighter-Emt          1208
Ii                       1087
Detective)                896
Firefighter               671
I                         562
Guard                     560
Iii                       530
Aide                      507
Aide-Hourly               442
Engineer                  399
Lieutenant-Emt            391
Clerk                     387
Paramedic                 355
Lieutenant                332
Operator                  310
(Recruit)                 301
Iv                        274
I/C                       274
Engineer-Emt              261
Firefighter/Paramedic     259
Worker                    246
Inspector                 245
Mechanic                  240
Commissioner              222
Cba                       213
Asst                      211
                         ... 
Pilot                       1
Board                       1
Warehouse-

# More Practice with Splits

In [82]:
chi = pd.read_csv("chicago.csv").dropna(how="all")
chi["Department"] = chi["Department"].astype("category")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [93]:
# chaining and normalizing strings to get most popular first names
chi["Name"].str.split(",").str.get(-1).str.strip().str.split(" ").str.get(0).str.title().value_counts().head()

Michael    1153
John        899
James       676
Robert      622
Joseph      537
Name: Name, dtype: int64

# The expand and n Parameters of the .str.split() Method

In [94]:
chi = pd.read_csv("chicago.csv").dropna(how="all")
chi["Department"] = chi["Department"].astype("category")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [97]:
# setting .str.split() expand parameter to True will return a dataframe instead of a Series
chi["Name"].str.split(",", expand=True).head(3)

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA


In [98]:
# can use dataframe created by expand to add new columns to original dataframe
chi[["First Name", "Last Name"]] = chi["Name"].str.split(",", expand=True)

In [100]:
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [103]:
# if .split() list have different length will get a dataframe with None in index positions of shorter lists
chi["Position Title"].str.split(" ", expand=True).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,


In [106]:
# can use .str.split() n parameter to specify the maximum number of splits to perform
chi["Position Title"].str.split(" ", expand=True, n=1).head()

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV


In [107]:
chi[["First Title Word", "Remaining Words"]] = chi["Position Title"].str.split(" ", expand=True, n=1).head()

In [108]:
chi.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
