# Working with Text Data

In [29]:
import pandas as pd

In [30]:
# Read the employee table and discard rows in which every field is missing.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [31]:
# Summarise each column's dtype and non-null count, plus the frame's memory use.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1.2+ MB


In [32]:
# Count the distinct values in each column; a column with few distinct values
# relative to the number of rows is a candidate for the category dtype.
chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [33]:
# Department repeats a small set of labels across all rows (see nunique above),
# so store it as a category instead of a generic object column.
chicago["Department"] = chicago["Department"].astype("category")

In [34]:
# Re-check dtypes and memory usage now that Department is a category.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 1.0+ MB


In [35]:
# Load, drop fully-empty rows and cast Department to category in one chain.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


## Common String Methods

In [36]:
# Start this section from a fresh copy of the data: drop fully-empty rows and
# keep Department as a category.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [37]:
"borris".upper()

'BORRIS'

In [38]:
"boris".title()

'Boris'

In [39]:
# The .str accessor applies the string method to every element of the Series.
# The result is not assigned, so chicago itself is left unchanged.
chicago["Position Title"].str.lower()

0                      water rate taker
1                        police officer
2                        police officer
3              chief contract expediter
4                     civil engineer iv
                      ...              
32057    frm of machinists - automotive
32058                    police officer
32059                    police officer
32060                    police officer
32061           chief data base analyst
Name: Position Title, Length: 32062, dtype: object

In [40]:
# Number of characters in each title (spaces are counted too).
chicago["Position Title"].str.len()

0        16
1        14
2        14
3        24
4        17
         ..
32057    30
32058    14
32059    14
32060    14
32061    23
Name: Position Title, Length: 32062, dtype: int64

In [41]:
# Trim leading and trailing whitespace from every title; the result is only
# displayed, not written back to the column.
chicago["Position Title"].str.strip()

0                      WATER RATE TAKER
1                        POLICE OFFICER
2                        POLICE OFFICER
3              CHIEF CONTRACT EXPEDITER
4                     CIVIL ENGINEER IV
                      ...              
32057    FRM OF MACHINISTS - AUTOMOTIVE
32058                    POLICE OFFICER
32059                    POLICE OFFICER
32060                    POLICE OFFICER
32061           CHIEF DATA BASE ANALYST
Name: Position Title, Length: 32062, dtype: object

## Filtering with string methods

In [42]:
# Start this section from a fresh copy of the data: drop fully-empty rows and
# keep Department as a category.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [43]:
# Boolean mask: True where the position title contains "water". Lower-casing
# first makes the substring test case-insensitive.
water_workers = (
    chicago["Position Title"]
    .str.lower()
    .str.contains("water")
)
chicago.loc[water_workers]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [44]:
# Boolean mask: True where the position title begins with "civil". Lower-casing
# first makes the prefix test case-insensitive.
civil_workers = (
    chicago["Position Title"]
    .str.lower()
    .str.startswith("civil")
)
chicago.loc[civil_workers]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
25,"ABDULSATTAR, MUDHAR",CIVIL ENGINEER II,WATER MGMNT,$58536.00
34,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
55,"ABUTALEB, AHMAD H",CIVIL ENGINEER II,WATER MGMNT,$89676.00
147,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
...,...,...,...,...
31623,"YANG, LUYANG",CIVIL ENGINEER V,TRANSPORTN,$116784.00
31656,"YEPEZ, JESUS",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
31662,"YESUFU, STEPHANIE A",CIVIL ENGINEER III,TRANSPORTN,$92784.00
31797,"ZAKE, JOSHUA S",CIVIL ENGINEER IV,TRANSPORTN,$106836.00


In [45]:
# Boolean mask: True where the position title ends with "iv" (compared in
# lower case, so "IV" in the data matches).
ends_iv = (
    chicago["Position Title"]
    .str.lower()
    .str.endswith("iv")
)
chicago.loc[ends_iv]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
34,"ABRAHAM, GIRLEY T",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
145,"ADAMS, SHERYLL A",LIBRARIAN IV,PUBLIC LIBRARY,$97812.00
147,"ADAMS, TANERA C",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
166,"ADENI, MOHAMED K",ACCOUNTANT IV,FINANCE,$97812.00
...,...,...,...,...
31777,"ZAFIRIS, CHRISTOPHER",ARCHITECT IV,DISABILITIES,$106836.00
31797,"ZAKE, JOSHUA S",CIVIL ENGINEER IV,TRANSPORTN,$106836.00
31870,"ZAVALA, FERNANDO",ACCOUNTANT IV,FINANCE,$97812.00
31884,"ZAWADSKI, JAMES",CLERK IV,LAW,$68028.00


## String methods on index and columns

In [46]:
# Reload with the Name column as the index, drop fully-empty rows, sort by
# index label and keep Department as a category.
chicago = (
    pd.read_csv("chicago.csv", index_col="Name")
    .dropna(how="all")
    .sort_index()
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [47]:
# An Index exposes the same .str accessor as a Series: trim surrounding
# whitespace, title-case each name, and install the result as the new index.
chicago.index = chicago.index.str.strip().str.title()

In [48]:
# Confirm the index labels are now title-cased.
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [49]:
# The column labels are an Index too, so the .str accessor used on
# chicago.index above is available here as well.
chicago.columns

Index(['Position Title', 'Department', 'Employee Annual Salary'], dtype='object')

## The split method

In [50]:
# Start this section from a fresh copy of the data: drop fully-empty rows and
# keep Department as a category.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [51]:
"Water Rate Taker".split(" ")

['Water', 'Rate', 'Taker']

In [52]:
# Tally the leading word of every position title, most frequent first:
# split each title on spaces, keep element 0 of the resulting list, then count.
(
    chicago["Position Title"]
    .str.split(" ")
    .str[0]
    .value_counts()
)

Position Title
POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
POOL                 918
FIREFIGHTER          810
                   ...  
DENTIST                1
ASSOC                  1
TELEPHONE              1
MAYOR                  1
PREPRESS               1
Name: count, Length: 320, dtype: int64

## More practice with split method

In [53]:
# Start this section from a fresh copy of the data: drop fully-empty rows and
# keep Department as a category.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [54]:
# Find the most common first name.
#
# Names are stored as "LAST, FIRST [MIDDLE]". Splitting on an exact
# comma-plus-two-spaces separator would yield NaN (and so be left out of the
# tally) for any row whose spacing after the comma differs. Instead:
#   1. split on the first comma only and keep the part after it;
#   2. split that part on arbitrary whitespace (no pattern given), which also
#      discards the padding that follows the comma;
#   3. keep the first token and count occurrences.
(
    chicago["Name"]
    .str.title()
    .str.split(",", n=1)
    .str.get(1)
    .str.split()
    .str.get(0)
    .value_counts()
)

Name
Michael     1153
John         899
James        676
Robert       622
Joseph       537
            ... 
Deena          1
Cherrise       1
Eartha         1
Ernika         1
Mac            1
Name: count, Length: 5091, dtype: int64

## The `expand` and `n` parameters of the `split` method

In [55]:
# Start this section from a fresh copy of the data: drop fully-empty rows and
# keep Department as a category.
chicago = (
    pd.read_csv("chicago.csv")
    .dropna(how="all")
    .astype({"Department": "category"})
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [58]:
# Break "LAST, FIRST" names into two new columns.
#  - expand=True returns the pieces as DataFrame columns instead of lists.
#  - n=1 splits on the first comma only, so the result can never have more
#    columns than the two labels being assigned.
#  - Splitting on "," alone leaves the whitespace that follows the comma at
#    the front of the first-name piece, so every piece is stripped afterwards.
chicago[["Last Name", "First Name"]] = (
    chicago["Name"]
    .str.split(",", n=1, expand=True)
    .apply(lambda part: part.str.strip())
)

In [59]:
# Inspect the new "Last Name" and "First Name" columns.
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [64]:
# n=1 limits the split to the first space, so each title yields at most two
# pieces: its first word and everything after it. expand=True spreads those
# pieces across columns so they can be assigned to the two new labels.
chicago[["primary title", "secondary title"]] = (
    chicago["Position Title"]
    .str.split(" ", n=1, expand=True)
)

In [65]:
# Inspect the new "primary title" and "secondary title" columns.
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name,primary title,secondary title
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
