In [5]:
import pandas as pd

# 1) Intro to working with Text Data and Optimization of Data Frame

In [6]:
# Read the Chicago employee CSV into a DataFrame and preview its first five rows.
chicago = pd.read_csv(filepath_or_buffer='Data/chicago.csv')
chicago.head(n=5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [9]:
# Summarise the frame: entry count, per-column non-null counts, dtypes and memory usage.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [8]:
# Count the missing values in each column.
chicago.isna().sum()

Name                      1
Position Title            1
Department                1
Employee Annual Salary    1
dtype: int64

In [10]:
# Number of distinct position titles.
chicago['Position Title'].nunique()

1093

In [11]:
# The distinct position titles themselves (the missing value shows up as nan).
chicago['Position Title'].unique()

array(['WATER RATE TAKER', 'POLICE OFFICER', 'CHIEF CONTRACT EXPEDITER',
       ..., 'PREPRESS TECHNICIAN', 'MECHANICAL ENGINEER IV', nan],
      dtype=object)

In [13]:
# Number of distinct departments; a low count makes the column a candidate for the category dtype.
chicago['Department'].nunique()

35

In [14]:
# Distinct-value count for every column at once.
chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

# What's wrong with the data at first glance?
+ Name, Position Title and Department are all in upper case. We might need to change them to lower or title case, depending on requirements.
+ Department has only 35 unique values but is currently stored as object dtype. We should convert it to the category dtype to increase speed and reduce memory usage.
+ Employee Annual Salary is stored as a string (object dtype). We need to remove the dollar sign and convert it to a numeric type, which will allow us to perform calculations if necessary.
+ There is one row with missing values in the dataset. We need to remove it.

## Converting Department into Category datatype

In [16]:
# Store Department as a categorical column (few distinct values, many rows).
chicago = chicago.astype({'Department': 'category'})

In [17]:
# Re-inspect dtypes and memory usage after the category conversion.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.4+ KB


We can also see a clear decrease in the reported memory usage, from 1002.1+ KB to 784.4+ KB (about 22% less).

## Remove missing row in dataset

In [18]:
# Discard every row that contains at least one missing value.
chicago = chicago.dropna(axis='index', how='any')

In [19]:
# Verify that no missing values remain in any column.
chicago.isna().sum()

Name                      0
Position Title            0
Department                0
Employee Annual Salary    0
dtype: int64

------

# 2) Common String Methods - `.lower()`, `.upper()`, `.title()`, `len()`

In [20]:
# Start the section from a fresh copy of the data: load the CSV, make
# Department categorical, then drop rows containing missing values.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .astype({'Department': 'category'})
    .dropna()
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [26]:
# Plain Python string operations. The notebook displays only the value of the
# cell's last expression, so only the length appears in the output.
'Hello world'.lower()  # 'hello world'
'Hello world'.upper()  # 'HELLO WORLD'
'Hello world'.title()  # 'Hello World'
len('Hello world')  # 11 -- len is a built-in function, not a string method

11

## To use a string method on a pandas Series, go through the `.str` accessor and then call the method
+ For example: `.str.lower()`

In [34]:
# Demonstrate the .str accessor on the Name column. These three results are
# not stored, and not displayed either, because the cell ends with an assignment.
name_col = chicago['Name']
name_col.str.lower()
name_col.str.title().str.upper()  # .str calls can be chained
name_col.str.title()

# Persist the title-cased version of the position titles.
chicago['Position Title'] = chicago['Position Title'].str.title()

In [35]:
# Confirm that Position Title is now title-cased.
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


### Find the length of every Department

In [37]:
# Two ways to measure the length of each department name.
# 1) Generic: apply Python's built-in len to every value. The result is not
#    displayed, because only the cell's last expression is shown.
chicago['Department'].apply(len)

# 2) Using the .str accessor's own len() method.
chicago['Department'].str.len()

0        11
1         6
2         6
3        16
4        11
         ..
32057    16
32058     6
32059     6
32060     6
32061     4
Name: Department, Length: 32062, dtype: int64

-------

# 3) The `.str.replace()` Method
+ Replaces all occurrences of one substring with another

In [57]:
# Reload the data for this section and preview the last three rows.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .dropna(how='all')  # drop rows where all values are NaN
    .astype({'Department': 'category'})
)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [58]:
# Plain Python: str.replace swaps every occurrence of the first argument for the second.
'Hello world'.replace('l', '!')

'He!!o wor!d'

## String Manipulation on Department

### Expanding MGMNT to the full word MANAGEMENT

In [59]:
# Look at the Department values (and their categorical dtype) before editing them.
chicago['Department'].head(3)

0    WATER MGMNT
1         POLICE
2         POLICE
Name: Department, dtype: category
Categories (35, object): ['ADMIN HEARNG', 'ANIMAL CONTRL', 'AVIATION', 'BOARD OF ELECTION', ..., 'STREETS & SAN', 'TRANSPORTN', 'TREASURER', 'WATER MGMNT']

In [60]:
# Expand the abbreviation MGMNT in every department name.
# regex=False states explicitly that 'MGMNT' is a literal substring, instead of
# relying on the default of `regex`, which differs between pandas versions.
# NOTE: .str.replace returns an object-dtype Series, so after this assignment
# the column is no longer categorical (compare the dtype shown before and after).
chicago['Department'] = chicago['Department'].str.replace('MGMNT', 'MANAGEMENT', regex=False)
chicago['Department'].head()

0    WATER MANAGEMENT
1              POLICE
2              POLICE
3    GENERAL SERVICES
4    WATER MANAGEMENT
Name: Department, dtype: object

## Removing Dollar Sign in Employee Annual Salary column

In [61]:
# Salaries are stored as strings with a leading dollar sign (object dtype).
chicago['Employee Annual Salary'].head()

0     $90744.00
1     $84450.00
2     $84450.00
3     $89880.00
4    $106836.00
Name: Employee Annual Salary, dtype: object

In [62]:
# Strip the dollar sign from each salary string and convert the result to float.
# '$' is a regex metacharacter (end-of-string anchor), so regex=False is passed
# to guarantee a literal replacement whatever the installed pandas version's
# default for `regex` is.
chicago['Employee Annual Salary'] = (
    chicago['Employee Annual Salary']
    .str.replace('$', '', regex=False)
    .astype(float)
)
chicago['Employee Annual Salary'].head()

0     90744.0
1     84450.0
2     84450.0
3     89880.0
4    106836.0
Name: Employee Annual Salary, dtype: float64

### Now that the column is numeric, we can use any calculation method on it

In [69]:
# With a float column, numeric methods are available. Only the last
# expression (the ten largest salaries) is displayed by the notebook.
salary = chicago['Employee Annual Salary']
salary.sum()
salary.mean()
salary.std()
salary.nsmallest(10)
salary.nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

-----