In [56]:
#import needed files
import pandas as pd

In [57]:
chicago = pd.read_csv("../data/pandas/chicago.csv").dropna(how = "all")
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [58]:
#check out what our data looks like
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1.2+ MB


1. notice that we have a leading "$" in the 'Employee Annual Salary' column.  Let's convert these to be able to work with the salaries as numbers.
2. let's make the department and position title columns a category


## convert columns to categories

In [59]:
#see how many unique values there are in our columns of interest
chicago['Department'].nunique()

35

In [60]:
#see how many total rows there are in the Department column
chicago['Department'].count()

32062

In [61]:
#convert 'Department' to a category
chicago['Department'] = chicago['Department'].astype('category')

In [62]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 1.0+ MB


That was better than a 20% savings in memory!

In [63]:
#Let's check to see if the 'Position Title' column is also a good candidate for a category type
print(chicago['Position Title'].nunique(), chicago['Position Title'].count())

1093 32062


In [64]:
#that's not going to be quite as good as the Department category, but will still give us pretty good savings.  
chicago['Position Title'] = chicago['Position Title'].astype('category')
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null category
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(2), object(2)
memory usage: 895.4+ KB


Wow!  We've already saved around 35% of the memory from our first trial.

# Common String Methods: .lower(), .upper(), .title, len()

In [65]:
#.lower() method to convert everything to lowercase.  e.g., 
'WHOA BABY!'.lower()

'whoa baby!'

In [66]:
#.upper() method
'whoa baby!'.upper()

'WHOA BABY!'

In [67]:
#change the first letter of each word to upper case
'whoa baby!'.title()

'Whoa Baby!'

In [68]:
# len to count the # of characters
len('Whoa baby')

9

### using these string methods with panda dataframes.
NOTE: when working with pandas, you must use the str. prefix

In [69]:
#convert to all lowercase
chicago['Name'].str.lower().head()

0        aaron,  elvia j
1      aaron,  jeffery m
2         aaron,  karina
3    aaron,  kimberlei r
4    abad jr,  vicente m
Name: Name, dtype: object

In [70]:
#convert 1st letters to uppercase
chicago['Name'].str.title().head()

0        Aaron,  Elvia J
1      Aaron,  Jeffery M
2         Aaron,  Karina
3    Aaron,  Kimberlei R
4    Abad Jr,  Vicente M
Name: Name, dtype: object

In [71]:
#change each of the columns and overwrite the original data
chicago.Name = chicago['Name'].str.title()
chicago.Department = chicago['Department'].str.title()
chicago['Position Title'] = chicago['Position Title'].str.title()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,$90744.00
1,"Aaron, Jeffery M",Police Officer,Police,$84450.00
2,"Aaron, Karina",Police Officer,Police,$84450.00
3,"Aaron, Kimberlei R",Chief Contract Expediter,General Services,$89880.00
4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Mgmnt,$106836.00


## .replace() method

In [72]:
text = "Hello world"

In [73]:
text.replace('l','4')

'He44o wor4d'

In [74]:
#use .replace on a series
chicago['Department'] = chicago['Department'].str.replace('Mgmnt','Management')
chicago['Department'] = chicago['Department'].str.replace("& San","& Sanitation")
chicago['Department'] = chicago['Department'].str.replace("Supvsr","Supervisor")
chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace("$","").astype('float')

In [75]:
chicago.tail()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32057,"Zygadlo, Michael J",Frm Of Machinists - Automotive,General Services,99528.0
32058,"Zygowicz, Peter J",Police Officer,Police,87384.0
32059,"Zymantas, Mark E",Police Officer,Police,84450.0
32060,"Zyrkowski, Carlo E",Police Officer,Police,87384.0
32061,"Zyskowski, Dariusz",Chief Data Base Analyst,Doit,113664.0


In [76]:
chicago['Employee Annual Salary'].describe()

count     32062.000000
mean      80204.178634
std       25098.329868
min           0.960000
25%       72862.400000
50%       84450.000000
75%       93240.000000
max      300000.000000
Name: Employee Annual Salary, dtype: float64

In [78]:
#check for the top 10 salaries
chicago['Employee Annual Salary'].nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

## cleaning unwanted space with .strip(), .rstrip() and .lstrip() methods

In [88]:
#get rid of white spaces that users may have entered accidentally
chicago.Name = chicago.Name.str.lstrip().str.rstrip()
chicago['Position Title'] = chicago['Position Title'].str.strip()
chicago.Department = chicago.Department.str.strip()

## using string methods on index and columns

In [91]:
#set index to a string column
chicago.set_index('Name', inplace=True)
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
"Aaron, Jeffery M",Police Officer,Police,84450.0
"Aaron, Karina",Police Officer,Police,84450.0
"Aaron, Kimberlei R",Chief Contract Expediter,General Services,89880.0
"Abad Jr, Vicente M",Civil Engineer Iv,Water Management,106836.0


In [93]:
#change all index values to upper case
chicago.index.str.upper()

Index(['AARON,  ELVIA J', 'AARON,  JEFFERY M', 'AARON,  KARINA',
       'AARON,  KIMBERLEI R', 'ABAD JR,  VICENTE M', 'ABARCA,  ANABEL',
       'ABARCA,  EMMANUEL', 'ABASCAL,  REECE E', 'ABBASI,  CHRISTOPHER',
       'ABBATACOLA,  ROBERT J',
       ...
       'ZWIT,  JEFFREY J', 'ZWOLFER,  MATTHEW W', 'ZYCH,  MATEUSZ',
       'ZYDEK,  BRYAN', 'ZYGADLO,  JOHN P', 'ZYGADLO,  MICHAEL J',
       'ZYGOWICZ,  PETER J', 'ZYMANTAS,  MARK E', 'ZYRKOWSKI,  CARLO E',
       'ZYSKOWSKI,  DARIUSZ'],
      dtype='object', name='Name', length=32062)

In [97]:
#to work with columns, use the .columns attribute
chicago.columns = chicago.columns.str.upper()
chicago.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
"Aaron, Jeffery M",Police Officer,Police,84450.0
"Aaron, Karina",Police Officer,Police,84450.0


In [106]:
#let's reset the index and put all the titles back to the way they were
chicago.reset_index(inplace = True)
chicago.columns = chicago.columns.str.title()

## .split() and .get() methods
.split() takes 1 argument, delimiter.  Python returns a list with each item that was separated by that delimiter

In [107]:
sentence = "Hello, my name is Peter Rich"

In [108]:
sentence.split(" ")

['Hello,', 'my', 'name', 'is', 'Peter', 'Rich']

In [111]:
#use .split() to find out what the  most common last name is in our dataframe.  This will return a list of values for each name
#use .get() to get the first item in these news lists
# use .value_counts() to count by unique name
chicago['Name'].str.split(",").str.get(0).value_counts()

Williams            293
Johnson             244
Smith               241
Brown               185
Jones               183
Rodriguez           171
Jackson             136
Garcia              130
Davis               127
Hernandez           110
Martinez            108
Lopez               106
Gonzalez            104
Perez               100
Wilson               94
Rivera               90
Thomas               89
Anderson             82
Torres               81
Murphy               80
Robinson             79
Moore                78
Sanchez              76
Harris               76
Miller               75
Lewis                74
Taylor               73
Martin               72
White                66
Clark                66
                   ... 
Lacoco                1
Zaller                1
Hancin                1
Ballew                1
Mckinney Henison      1
Ulloa Lopez           1
Staszewski            1
Mcdonell              1
Danielson             1
Peulecke              1
Awbrey          

In [112]:
#find the most common FIRST name
chicago['Name'].str.split(',').str.get(1).value_counts()

  Michael J     270
  Michael       165
  Michael A     158
  David         128
  Robert J      120
  Thomas J      120
  Robert        107
  Daniel J      102
  John          100
  Joseph         95
  John M         95
  Daniel         95
  John J         95
  Anthony        94
  Patrick J      93
  James M        85
  James          82
  John P         80
  John A         78
  Anthony J      75
  Mark A         70
  Michael P      66
  David A        65
  David J        65
  Thomas         63
  Michael R      63
  James P        62
  Brian J        62
  Joseph A       60
  Timothy J      60
               ... 
  Briget P        1
  Olga I          1
  Jun             1
  Taynicka L      1
  Darrin M        1
  Donna F         1
  Athena S        1
  Lydia C         1
  Ted B           1
  Edmundo R       1
  Terance         1
  Dora M          1
  Kip J           1
  Leisa M         1
  Dan A           1
  Patrice N       1
  Hal O           1
  Lenora          1
  Mcarthur        1


In [114]:
#notice that after splitting everything the second time, we get a lot of leading white spaces.  Let's use .strip() to clearen it up before splitting again
chicago['Name'].str.split(',').str.get(1).str.strip().str.split(" ").str.get(0).value_counts()

Michael        1153
John            899
James           676
Robert          622
Joseph          537
David           506
Thomas          490
Daniel          472
William         397
Anthony         385
Kevin           331
Brian           320
Richard         314
Mark            310
Patrick         300
Matthew         247
Timothy         243
Jose            224
Steven          220
Christopher     217
Edward          205
Paul            205
Kenneth         201
Eric            170
Charles         163
Jeffrey         152
Gregory         150
George          149
Mary            147
Ronald          147
               ... 
Jaret             1
Virjilio          1
Oneal             1
Carroll           1
Antrinius         1
Serina            1
Nickolaos         1
Kahlill           1
Dara              1
Evans             1
Annalisia         1
Christophre       1
John-Long         1
Eddy              1
Charla            1
Aleli             1
Correy            1
Chaline           1
Joylanda          1


### .str.split() parameters

In [116]:
#expand = True will return a dataframe instead of a series
chicago.Name.str.split(",", expand=True)

Unnamed: 0,0,1
0,Aaron,Elvia J
1,Aaron,Jeffery M
2,Aaron,Karina
3,Aaron,Kimberlei R
4,Abad Jr,Vicente M
5,Abarca,Anabel
6,Abarca,Emmanuel
7,Abascal,Reece E
8,Abbasi,Christopher
9,Abbatacola,Robert J


In [118]:
#let's add this new dataframe to the existing df
chicago[['Last Name','Given Names']] = chicago.Name.str.split(",", expand=True)
#clean up the leading spaces from the Given names
chicago['Given Names'] = chicago['Given Names'].str.strip()
chicago.head()

Unnamed: 0,Level_0,Index,Name,Position Title,Department,Employee Annual Salary,Last Name,Given Names
0,0,0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0,Aaron,Elvia J
1,1,1,"Aaron, Jeffery M",Police Officer,Police,84450.0,Aaron,Jeffery M
2,2,2,"Aaron, Karina",Police Officer,Police,84450.0,Aaron,Karina
3,3,3,"Aaron, Kimberlei R",Chief Contract Expediter,General Services,89880.0,Aaron,Kimberlei R
4,4,4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Management,106836.0,Abad Jr,Vicente M


In [119]:
# the "n" parameter allows me to restrict the number of times the split runs on an object.  e.g., 
chicago['Position Title'].str.split(" ",expand = True, n = 1)

Unnamed: 0,0,1
0,Water,Rate Taker
1,Police,Officer
2,Police,Officer
3,Chief,Contract Expediter
4,Civil,Engineer Iv
5,Asst,To The Alderman
6,General,Laborer - Dss
7,Traffic,Control Aide-Hourly
8,Staff,Asst To The Alderman
9,Electrical,Mechanic


# Filtering textual data with string methods

In [79]:
#extract all the rows where position has "water"
chicago['Position Title']

0                                     Water Rate Taker
1                                       Police Officer
2                                       Police Officer
3                             Chief Contract Expediter
4                                    Civil Engineer Iv
5                                 Asst To The Alderman
6                                General Laborer - Dss
7                          Traffic Control Aide-Hourly
8                           Staff Asst To The Alderman
9                                  Electrical Mechanic
10                                   Fire Engineer-Emt
11                                      Police Officer
12                                  Foster Grandparent
13                                           Clerk Iii
14                              Investigator - Ipra Ii
15                                      Police Officer
16                                      Police Officer
17       Firefighter (Per Arbitrators Award)-Paramedic
18        

In [82]:
#first, convert all the values to lower so you don't gave false negatives.  Then, use .contains() method
mask = chicago['Position Title'].str.lower().str.contains('water')
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
554,"Aluise, Vincent G",Foreman Of Water Pipe Construction,Water Management,102440.0
671,"Ander, Perry A",Water Chemist Ii,Water Management,82044.0
685,"Anderson, Andrew J",District Superintendent Of Water Distribution,Water Management,109272.0
702,"Anderson, Donald",Foreman Of Water Pipe Construction,Water Management,102440.0
1054,"Ashley, Karma T",Water Chemist Ii,Water Management,82044.0
1079,"Atkins, Joanna M",Water Chemist Ii,Water Management,82044.0
1181,"Azeem, Mohammed A",Water Chemist Ii,Water Management,53172.0
1285,"Bajic, John A",Water Meter Machinist,Water Management,82576.0
2400,"Bolton, Brian E",Water Rate Taker,Water Management,78948.0


In [83]:
#look for all positions that START with "water"
mask = chicago['Position Title'].str.lower().str.startswith('water')
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Management,90744.0
671,"Ander, Perry A",Water Chemist Ii,Water Management,82044.0
1054,"Ashley, Karma T",Water Chemist Ii,Water Management,82044.0
1079,"Atkins, Joanna M",Water Chemist Ii,Water Management,82044.0
1181,"Azeem, Mohammed A",Water Chemist Ii,Water Management,53172.0
1285,"Bajic, John A",Water Meter Machinist,Water Management,82576.0
2400,"Bolton, Brian E",Water Rate Taker,Water Management,78948.0
2586,"Boyce, Adner L",Water Chemist Ii,Water Management,82044.0
2745,"Brandys, Daniel",Water Chemist Ii,Water Management,53172.0
3143,"Brown, Sharon L",Water Rate Taker,Water Management,82728.0


In [85]:
#extract all rows where the value in Position title columns ENDS with "ist"
mask = chicago['Position Title'].str.lower().str.endswith('ist')
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"Afroz, Nayyar",Psychiatrist,Health,99840.00
308,"Alarcon, Luis J",Loan Processing Specialist,Community Development,81948.00
422,"Allain, Carolyn",Senior Telecommunications Specialist,Doit,89880.00
472,"Allen, Robert",Machinist,Water Management,94328.00
705,"Anderson, Edward M",Sr Procurement Specialist,Procurement,91476.00
1022,"Arteaga, Paul",Machinist,Transportn,94328.00
1163,"Ayala Jr, Juan",Field Sanitation Specialist,Streets & Sanitation,78948.00
1285,"Bajic, John A",Water Meter Machinist,Water Management,82576.00
1558,"Barrett, Barbara J",Technical Training Specialist,Police,94200.00
1869,"Beltran, Mauricio",Procurement Specialist,Procurement,79596.00
