In [2]:
import pandas as pd

# 1) Intro to working with Text Data and Optimization of Data Frame

In [6]:
# Load the Chicago employee dataset from the local CSV file.
chicago = pd.read_csv('Data/chicago.csv')
# Preview the first rows to see the columns and the raw value formats.
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [9]:
# Show each column's dtype and non-null count, plus the frame's memory usage.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [8]:
# Number of missing values in each column.
chicago.isna().sum()

Name                      1
Position Title            1
Department                1
Employee Annual Salary    1
dtype: int64

In [10]:
# Count the distinct position titles.
chicago['Position Title'].nunique()

1093

In [11]:
# List the distinct position titles (the result also contains nan for the missing row).
chicago['Position Title'].unique()

array(['WATER RATE TAKER', 'POLICE OFFICER', 'CHIEF CONTRACT EXPEDITER',
       ..., 'PREPRESS TECHNICIAN', 'MECHANICAL ENGINEER IV', nan],
      dtype=object)

In [13]:
# Count the distinct departments; a small number makes the column a candidate for the category dtype.
chicago['Department'].nunique()

35

In [14]:
# Distinct-value count for every column at once.
chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

# What's wrong with the data at first glance?
+ Name, Position Title and Department are all in capital letters. We might need to change them to lower case, depending on requirements.
+ Department has only 35 unique values but is currently stored as object dtype. We should convert it to the category dtype to speed up operations and reduce memory usage.
+ Employee Annual Salary is stored as strings (object dtype). We need to remove the dollar sign and convert it to a numeric type, which will allow us to do calculations if necessary.
+ There is one row in the dataset where every value is missing. We need to remove it.

## Converting Department into Category datatype

In [16]:
# Department has few distinct values, so store it as a categorical column.
chicago['Department'] = chicago['Department'].astype('category')

In [17]:
# Re-check dtypes and memory usage after converting Department.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.4+ KB


We can also see a noticeable decrease in memory usage, from 1002.1+ KB to 784.4+ KB.

## Remove missing row in dataset

In [18]:
# Drop rows that contain missing values and rebind the name to the cleaned frame.
chicago = chicago.dropna()

In [19]:
# Confirm that no missing values remain in any column.
chicago.isna().sum()

Name                      0
Position Title            0
Department                0
Employee Annual Salary    0
dtype: int64

------

# 2) Common String Methods - `.lower()`, `.upper()`, `.title()`, `len()`

In [20]:
# Start the section from a fresh copy of the data: read the CSV, store
# Department as a categorical column, then drop rows with missing values.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .astype({'Department': 'category'})
    .dropna()
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [26]:
# Plain-Python string operations; only the value of the last line is rendered.
greeting = 'Hello world'
greeting.lower()
greeting.upper()
greeting.title()
len(greeting)

11

## Whenever we use a string method on a Series, we need to prefix it with `str` and then call the method
+ `.str.lower()`

In [34]:
# The next three expressions only demonstrate the .str accessor: their results
# are neither stored nor rendered (a cell shows only its last expression).
chicago['Name'].str.lower()
chicago['Name'].str.title().str.upper()
chicago['Name'].str.title()

# This is the only line with a lasting effect: Position Title becomes title case.
chicago['Position Title'] = chicago['Position Title'].str.title()

In [35]:
# Position Title is now title-cased; the other columns are unchanged.
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


### Find the length of every Department

In [37]:
chicago['Department'].apply(len)  # using .apply() with the built-in len (result not rendered)

# Same computation with the vectorized string accessor; this is the rendered result.
chicago['Department'].str.len()

0        11
1         6
2         6
3        16
4        11
         ..
32057    16
32058     6
32059     6
32060     6
32061     4
Name: Department, Length: 32062, dtype: int64

-------

# 3) The `.str.replace()` Method
+ replaces all occurrences of one substring with another

In [57]:
# Fresh copy of the data for this section.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .dropna(how='all')  # drop rows where all values are NaN
    .astype({'Department': 'category'})
)
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [58]:
# Plain-Python str.replace swaps every occurrence of the first argument for the second.
'Hello world'.replace('l', '!')

'He!!o wor!d'

## String Manipulation on Department

### Changing `MGMNT` to the full word `MANAGEMENT`

In [59]:
# Look at the raw department labels (categorical at this point) before editing them.
chicago['Department'].head(3)

0    WATER MGMNT
1         POLICE
2         POLICE
Name: Department, dtype: category
Categories (35, object): ['ADMIN HEARNG', 'ANIMAL CONTRL', 'AVIATION', 'BOARD OF ELECTION', ..., 'STREETS & SAN', 'TRANSPORTN', 'TREASURER', 'WATER MGMNT']

In [60]:
# Expand the abbreviation 'MGMNT' to 'MANAGEMENT' in every department label.
# regex=False makes the match an explicit literal substring match, so the result
# does not depend on the pandas version's default for `regex`.
# Note: the .str accessor returns plain strings, so the column comes back as
# object dtype rather than category (re-cast with .astype('category') if the
# categorical dtype is still wanted).
chicago['Department'] = chicago['Department'].str.replace('MGMNT', 'MANAGEMENT', regex=False)
chicago['Department'].head()

0    WATER MANAGEMENT
1              POLICE
2              POLICE
3    GENERAL SERVICES
4    WATER MANAGEMENT
Name: Department, dtype: object

## Removing Dollar Sign in Employee Annual Salary column

In [61]:
# Salaries are stored as strings with a leading dollar sign.
chicago['Employee Annual Salary'].head()

0     $90744.00
1     $84450.00
2     $84450.00
3     $89880.00
4    $106836.00
Name: Employee Annual Salary, dtype: object

In [62]:
# Strip the dollar sign and convert the salaries to floats.
# '$' is a regular-expression metacharacter (end-of-string anchor), and the
# default of `regex` differs between pandas versions, so request a literal
# match explicitly; otherwise the '$' may not be removed and astype(float) fails.
chicago['Employee Annual Salary'] = (
    chicago['Employee Annual Salary']
    .str.replace('$', '', regex=False)
    .astype(float)
)
chicago['Employee Annual Salary'].head()

0     90744.0
1     84450.0
2     84450.0
3     89880.0
4    106836.0
Name: Employee Annual Salary, dtype: float64

### Now we can use any calculation methods after converting

In [69]:
# Numeric aggregations now work on the converted column. Only the final
# expression (the ten largest salaries) is rendered as the cell output.
chicago['Employee Annual Salary'].sum()
chicago['Employee Annual Salary'].mean()
chicago['Employee Annual Salary'].std()
chicago['Employee Annual Salary'].nsmallest(10)
chicago['Employee Annual Salary'].nlargest(10)

8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

-----

# 4) Filtering Dataframe rows with String Methods

In [71]:
# Fresh copy of the data for the filtering examples.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .dropna(how='all')  # drop rows where all values are NaN
    .astype({'Department': 'category'})
)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


## We want to extract rows containing `Water`

### TIP: It is good practice to normalize the data (for example, by lower-casing it) before doing any filtering. We can't assume the data will be clean.

In [74]:
# Lower-case the titles first so the match ignores the original casing,
# then build a boolean Series marking titles that contain 'water'.
chicago['Position Title'].str.lower().str.contains('water')

0         True
1        False
2        False
3        False
4        False
         ...  
32057    False
32058    False
32059    False
32060    False
32061    False
Name: Position Title, Length: 32062, dtype: bool

## Using `.contains()` Method
+ `.str.contains()` looks for the substring anywhere within the string.

In [75]:
# Build the boolean mask on lower-cased titles, then keep the matching rows.
titles_lower = chicago['Position Title'].str.lower()
is_water_contain = titles_lower.str.contains('water')
chicago.loc[is_water_contain]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


## Using `.startswith()` and `.endswith()` Methods

In [78]:
# Rows whose (lower-cased) position title begins with 'water'.
starts_with_water = chicago['Position Title'].str.lower().str.startswith('water')
chicago.loc[starts_with_water]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [79]:
# Rows whose (lower-cased) position title ends with 'ist'.
ends_with_ist = chicago['Position Title'].str.lower().str.endswith('ist')
chicago.loc[ends_with_ist]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


------

# 5) More String Methods - `.strip()`, `.lstrip()`, and `.rstrip()`
+ remove white spaces

In [80]:
# Fresh copy of the data for the whitespace-stripping examples.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .dropna(how='all')  # drop rows where all values are NaN
    .astype({'Department': 'category'})
)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [84]:
'         Hello World          '.lstrip() # removes leading (left-hand) whitespace only

'Hello World          '

In [85]:
'         Hello World          '.rstrip() # removes trailing (right-hand) whitespace only

'         Hello World'

In [87]:
'         Hello  World          '.strip() # removes whitespace from both ends; interior spaces are kept

'Hello  World'

In [90]:
# The same stripping methods are available on a Series through .str.
# Only the last expression (both ends stripped) is rendered; nothing is stored.
chicago['Name'].str.lstrip()
chicago['Name'].str.rstrip()
chicago['Name'].str.lstrip().str.rstrip()

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

In [91]:
# Persist the cleaned names: remove whitespace from both ends in one call.
chicago['Name'] = chicago['Name'].str.strip()

In [94]:
# Remove leading and trailing whitespace from the position titles as well.
chicago['Position Title'] = chicago['Position Title'].str.strip()

In [95]:
# Preview the frame after stripping Name and Position Title.
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


--------

# 6) String Methods on Index and Columns

In [3]:
# Fresh copy of the data, this time with the Name column used as the index.
chicago = (
    pd.read_csv('Data/chicago.csv', index_col=['Name'])
    .dropna(how='all')
    .astype({'Department': 'category'})
)
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


### We can call any string method on `.index`

In [9]:
# Index objects expose the same .str accessor; this previews the cleaned labels without storing them.
chicago.index.str.strip().str.title()

Index(['Aaron,  Elvia J', 'Aaron,  Jeffery M', 'Aaron,  Karina',
       'Aaron,  Kimberlei R', 'Abad Jr,  Vicente M', 'Abarca,  Anabel',
       'Abarca,  Emmanuel', 'Abascal,  Reece E', 'Abbasi,  Christopher',
       'Abbatacola,  Robert J',
       ...
       'Zwit,  Jeffrey J', 'Zwolfer,  Matthew W', 'Zych,  Mateusz',
       'Zydek,  Bryan', 'Zygadlo,  John P', 'Zygadlo,  Michael J',
       'Zygowicz,  Peter J', 'Zymantas,  Mark E', 'Zyrkowski,  Carlo E',
       'Zyskowski,  Dariusz'],
      dtype='object', name='Name', length=32062)

In [10]:
# Replace the index with the stripped, title-cased labels.
chicago.index = chicago.index.str.strip().str.title()
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


### Manipulation on `.columns`
+ let's say we want to change to upper case

In [11]:
# The column labels are an Index too, so string methods apply to them as well.
chicago.columns

Index(['Position Title', 'Department', 'Employee Annual Salary'], dtype='object')

In [16]:
# Upper-case every column label.
chicago.columns = chicago.columns.str.upper()

In [17]:
# Column labels are now upper case; the index name ('Name') is not affected.
chicago.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


------

# 7) Split Strings by Characters with `.str.split()` Method
### The complementary method is `.str.get()`

In [18]:
# Fresh copy of the data for the split examples.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .dropna(how='all')
    .astype({'Department': 'category'})
)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [25]:
# Plain-Python split: with no argument it splits on runs of whitespace,
# with ' ' it splits on each single space. Only the last result is rendered.
sentence = 'Hello My Name is Chicago'
sentence.split()
sentence.split(' ')

['Hello', 'My', 'Name', 'is', 'Chicago']

In [26]:
# The separator may be any substring; it is removed and the surrounding spaces stay in the pieces.
'Hello My Name is Chicago'.split('is')

['Hello My Name ', ' Chicago']

## What is the most popular Last Name?

In [34]:
 # Split each name on the comma; every element becomes a [last, first] list.
 chicago['Name'].str.split(',')

0            [AARON,   ELVIA J]
1          [AARON,   JEFFERY M]
2             [AARON,   KARINA]
3        [AARON,   KIMBERLEI R]
4        [ABAD JR,   VICENTE M]
                  ...          
32057    [ZYGADLO,   MICHAEL J]
32058     [ZYGOWICZ,   PETER J]
32059      [ZYMANTAS,   MARK E]
32060    [ZYRKOWSKI,   CARLO E]
32061    [ZYSKOWSKI,   DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [40]:
 # Take the piece before the comma (the last name), title-case it and count occurrences.
 chicago['Name'].str.split(',').str.get(0).str.title().value_counts()

Williams     293
Johnson      244
Smith        241
Brown        185
Jones        183
            ... 
Richied        1
Malabarba      1
Pietz          1
Schwocher      1
Kettering      1
Name: Name, Length: 13829, dtype: int64

## What is the most common first word in Position Title?

In [48]:
# First whitespace-separated word of every position title, then its frequency.
first_words = chicago['Position Title'].str.split().str.get(0)
first_words.value_counts()

POLICE                10856
FIREFIGHTER-EMT        1509
SERGEANT               1186
POOL                    918
FIREFIGHTER             810
                      ...  
SUPERVISOR                1
TELECOMMUNICATIONS        1
OFFSET                    1
1ST                       1
DEVELOPMENT               1
Name: Position Title, Length: 320, dtype: int64

------


# 8) More Practice with Splits

In [49]:
# Fresh copy of the data for more split practice.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .dropna(how='all')
    .astype({'Department': 'category'})
)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [55]:
# value_counts() sorts by frequency, so head(3) gives the three most common last names.
chicago['Name'].str.split(',').str.get(0).value_counts().head(3) # top 3 last names

WILLIAMS    293
JOHNSON     244
SMITH       241
Name: Name, dtype: int64

### What are the most common first names?

In [75]:
# Two ways to get the first name: take the piece after the comma, then its first word.
# Variant 1 splits on any whitespace; its result is not rendered.
chicago['Name'].str.split(',').str.get(1).str.split().str.get(0).value_counts()
# Variant 2 strips the piece first because splitting on ' ' would otherwise
# yield an empty first element for the leading space; this result is rendered.
chicago['Name'].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0).value_counts().head()

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

-----

# 9) The `expand` and `n` Parameter of the `.str.split()` Method

In [76]:
# Fresh copy of the data for the expand / n examples.
chicago = (
    pd.read_csv('Data/chicago.csv')
    .dropna(how='all')
    .astype({'Department': 'category'})
)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


### The `expand=True` parameter returns a DataFrame

In [77]:
# Without expand, the result is a Series whose elements are lists.
chicago['Name'].str.split(',')

0            [AARON,   ELVIA J]
1          [AARON,   JEFFERY M]
2             [AARON,   KARINA]
3        [AARON,   KIMBERLEI R]
4        [ABAD JR,   VICENTE M]
                  ...          
32057    [ZYGADLO,   MICHAEL J]
32058     [ZYGOWICZ,   PETER J]
32059      [ZYMANTAS,   MARK E]
32060    [ZYRKOWSKI,   CARLO E]
32061    [ZYSKOWSKI,   DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [78]:
# With expand=True, each piece goes into its own column of a DataFrame.
chicago['Name'].str.split(',', expand=True)

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA
3,AARON,KIMBERLEI R
4,ABAD JR,VICENTE M
...,...,...
32057,ZYGADLO,MICHAEL J
32058,ZYGOWICZ,PETER J
32059,ZYMANTAS,MARK E
32060,ZYRKOWSKI,CARLO E


## First Name and Last Name columns from Name

In [79]:
# Split "LAST, FIRST" on the first comma only (n=1), so a name containing an
# additional comma can never produce a third column and break the assignment.
name_parts = chicago['Name'].str.split(',', n=1, expand=True)
# The raw names have whitespace after the comma, so strip both pieces;
# otherwise every 'First Name' value would start with spaces.
chicago['Last Name'] = name_parts[0].str.strip()
chicago['First Name'] = name_parts[1].str.strip()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


## Splitting and Expand Position Title

We can see that this results in a funky-looking DataFrame. The reason is that there is one extremely long position title, which is split into 9 pieces. Shorter titles still have to fill those extra columns, so the missing pieces become None in those columns.

In [84]:
# Unbounded split: the number of columns is set by the title with the most words,
# and shorter titles are padded with None.
chicago['Position Title'].str.split(' ', expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


## To avoid this, we need to control our split
We want to perform only one split. After that there should be no more splitting, and everything that remains goes into the second column.

### `n` parameter: maximum number of splits

In [86]:
# n=1 stops after the first space: column 0 is the first word, column 1 is the rest.
chicago['Position Title'].str.split(' ', expand=True, n=1) # one split only

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV
...,...,...
32057,FRM,OF MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER
32059,POLICE,OFFICER
32060,POLICE,OFFICER


In [87]:
# n=2 allows at most two splits, giving up to three columns; two-word titles get None in the last one.
chicago['Position Title'].str.split(' ', expand=True, n=2)

Unnamed: 0,0,1,2
0,WATER,RATE,TAKER
1,POLICE,OFFICER,
2,POLICE,OFFICER,
3,CHIEF,CONTRACT,EXPEDITER
4,CIVIL,ENGINEER,IV
...,...,...,...
32057,FRM,OF,MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER,
32059,POLICE,OFFICER,
32060,POLICE,OFFICER,


In [88]:
# Split each title once, at the first space, and store the two pieces as new columns.
title_parts = chicago['Position Title'].str.split(' ', n=1, expand=True)
chicago['First Title Word'] = title_parts[0]
chicago['Remaining Words'] = title_parts[1]

In [90]:
# Preview the frame with the derived name and title columns.
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R,CHIEF,CONTRACT EXPEDITER
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M,CIVIL,ENGINEER IV
