## Pandas String Preprocessing

In [1]:
# Built-in str case conversions on plain Python strings:
# lower(), upper() and title(), evaluated together as one tuple.
(
    "HELLO WORLD".lower(),
    "hello world".upper(),
    "hello world".title(),
)

('hello world', 'HELLO WORLD', 'Hello World')

In [2]:
import pandas as pd

In [3]:
# Load the Chicago employee records from CSV and preview the first rows.
# NOTE(review): the rendered preview header shows "Unnamed: 0" — if that is a
# real column rather than the index label, the CSV was written with its index
# and passing index_col=0 would drop it. Confirm against the file.
chicago = pd.read_csv("./data/chicago.csv")
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
# Reduce memory usage by converting the Department column to the "category" dtype.
chicago["Department"] = chicago["Department"].astype("category")
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [5]:
# The .str accessor applies a string method to every element of a Series;
# here each employee name is converted to title case.
(
    chicago["Name"]
    .str.title()
)

0            Aaron,  Elvia J
1          Aaron,  Jeffery M
2             Aaron,  Karina
3        Aaron,  Kimberlei R
4        Abad Jr,  Vicente M
                ...         
32057    Zygadlo,  Michael J
32058     Zygowicz,  Peter J
32059      Zymantas,  Mark E
32060    Zyrkowski,  Carlo E
32061    Zyskowski,  Dariusz
Name: Name, Length: 32062, dtype: object

In [6]:
# lower(): show the first three names in all-lowercase.
# Slicing before the elementwise conversion yields the same three rows.
chicago["Name"].head(3).str.lower()

0      aaron,  elvia j
1    aaron,  jeffery m
2       aaron,  karina
Name: Name, dtype: object

In [7]:
# upper(): show the first three names in all-uppercase.
# Slicing before the elementwise conversion yields the same three rows.
chicago["Name"].head(3).str.upper()

0      AARON,  ELVIA J
1    AARON,  JEFFERY M
2       AARON,  KARINA
Name: Name, dtype: object

In [8]:
# title(): write the title-cased job titles back onto the DataFrame.
# Gotcha: title() also lowercases the tail of Roman numerals, so a title
# ending in "IV" becomes "Iv".
chicago["Position Title"] = chicago["Position Title"].str.title()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00
3,"AARON, KIMBERLEI R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",Civil Engineer Iv,WATER MGMNT,$106836.00


In [9]:
# replace(): expand the "MGMNT" abbreviation in the department names.
# - regex=False makes this a literal substring replacement, independent of the
#   default for `regex`, which differs between pandas versions.
# - .str methods on a categorical column return a plain object Series, so the
#   result is cast back to "category" to keep the memory-saving dtype the
#   Department column was given earlier in this notebook.
chicago["Department"] = (
    chicago["Department"]
    .str.replace("MGMNT", "MANAGEMENT", regex=False)
    .astype("category")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00
3,"AARON, KIMBERLEI R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",Civil Engineer Iv,WATER MANAGEMENT,$106836.00


In [10]:
# Retrieving rows by a specific prefix/suffix uses startswith() / endswith().
# Those matches are case-sensitive, so first look at the job titles
# normalized to lowercase.
(
    chicago["Position Title"]
    .str.lower()
)

0                      water rate taker
1                        police officer
2                        police officer
3              chief contract expediter
4                     civil engineer iv
                      ...              
32057    frm of machinists - automotive
32058                    police officer
32059                    police officer
32060                    police officer
32061           chief data base analyst
Name: Position Title, Length: 32062, dtype: object

In [11]:
# startswith(): select employees whose job title begins with "water".
# Lower-casing first makes the match case-insensitive. na=False treats a
# missing title as "no match", so the mask stays purely boolean and is always
# valid for row selection.
mask = chicago["Position Title"].str.lower().str.startswith("water", na=False)
chicago[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MANAGEMENT,$90744.00
671,"ANDER, PERRY A",Water Chemist Ii,WATER MANAGEMENT,$82044.00
1054,"ASHLEY, KARMA T",Water Chemist Ii,WATER MANAGEMENT,$82044.00
1079,"ATKINS, JOANNA M",Water Chemist Ii,WATER MANAGEMENT,$82044.00
1181,"AZEEM, MOHAMMED A",Water Chemist Ii,WATER MANAGEMENT,$53172.00


In [12]:
# endswith(): select employees whose job title ends with "ist".
# Lower-casing first makes the match case-insensitive. na=False treats a
# missing title as "no match", so the mask stays purely boolean and is always
# valid for row selection.
mask = chicago["Position Title"].str.lower().str.endswith("ist", na=False)
chicago[mask].head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",Psychiatrist,HEALTH,$99840.00
308,"ALARCON, LUIS J",Loan Processing Specialist,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",Senior Telecommunications Specialist,DoIT,$89880.00
472,"ALLEN, ROBERT",Machinist,WATER MANAGEMENT,$94328.00
705,"ANDERSON, EDWARD M",Sr Procurement Specialist,PROCUREMENT,$91476.00


In [13]:
# Plain Python: str.split() cuts a string into a list at every separator,
# here a single space.
"Hello my name is jiayi".split(sep=" ")

['Hello', 'my', 'name', 'is', 'jiayi']

In [14]:
# pandas: Series.str.split() does the same per element, turning each
# "SURNAME,  GIVEN NAMES" string into a two-item list. The second item keeps
# its leading whitespace.
chicago["Name"].head().str.split(pat=",")

0        [AARON,   ELVIA J]
1      [AARON,   JEFFERY M]
2         [AARON,   KARINA]
3    [AARON,   KIMBERLEI R]
4    [ABAD JR,   VICENTE M]
Name: Name, dtype: object

In [15]:
# Surname only: split each name at the comma, keep the first piece,
# then convert it to title case.
(
    chicago["Name"]
    .str.split(",")
    .str[0]
    .str.title()
)

0            Aaron
1            Aaron
2            Aaron
3            Aaron
4          Abad Jr
           ...    
32057      Zygadlo
32058     Zygowicz
32059     Zymantas
32060    Zyrkowski
32061    Zyskowski
Name: Name, Length: 32062, dtype: object