In [2]:
import pandas as pd

# Read the three CSV datasets used by the examples below into DataFrames.
ufo, denver, titanic = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/nuforc_reports.csv',
        'data/denver_crime.csv',
        'data/titanic.csv',
    )
)

## Part 1

Case-conversion methods (`capitalize()`, `upper()`/`lower()`, `title()`), selecting characters by position, and removing leading/trailing whitespace with `strip()`.

In [3]:
# str.lower() converts every character of each address to lower case.
# By contrast, str.capitalize() upper-cases only the first character of each string.

(
    denver['INCIDENT_ADDRESS']
    .str.lower()
)

0                   410 n acoma st
1               5571 n hannibal ct
2                   2101 market st
3         12300 block e albrook dr
4             2500 s colorado blvd
                    ...           
536350           3333 w regis blvd
536351           939 n galapago st
536352           3010 s madison st
536353               603 n inca st
536354          5995 w hampden ave
Name: INCIDENT_ADDRESS, Length: 536355, dtype: object

In [5]:
# str.title() upper-cases the first letter of every word in each string
# and lower-cases the remaining letters.

(
    denver['INCIDENT_ADDRESS']
    .str.title()
)

0                   410 N Acoma St
1               5571 N Hannibal Ct
2                   2101 Market St
3         12300 Block E Albrook Dr
4             2500 S Colorado Blvd
                    ...           
536350           3333 W Regis Blvd
536351           939 N Galapago St
536352           3010 S Madison St
536353               603 N Inca St
536354          5995 W Hampden Ave
Name: INCIDENT_ADDRESS, Length: 536355, dtype: object

In [6]:
# Select the character at position 0 of every string in the column.
# For these addresses that is a digit of the street number, not a letter.

denver['INCIDENT_ADDRESS'].str.get(0)

0         4
1         5
2         2
3         1
4         2
         ..
536350    3
536351    9
536352    3
536353    6
536354    5
Name: INCIDENT_ADDRESS, Length: 536355, dtype: object

In [7]:
# A range of positions can be selected the same way: this takes the characters
# at positions 6 through 10 of each string (the stop position 11 is exclusive).

denver['INCIDENT_ADDRESS'].str.slice(6, 11)

0         ACOMA
1          HANN
2         ARKET
3         BLOCK
4          COLO
          ...  
536350     REGI
536351    GALAP
536352     MADI
536353    INCA 
536354     HAMP
Name: INCIDENT_ADDRESS, Length: 536355, dtype: object

In [33]:
# Small example Series: each word is surrounded by stray whitespace,
# digits and punctuation that the strip examples below will remove.
a = pd.Series(
    data=[
        '1. One  ',
        '2.  TwO--?',
        '3:THREE   ',
        '   4_four',
    ]
)
a

0      1. One  
1    2.  TwO--?
2    3:THREE   
3        4_four
dtype: object

In [9]:
# Called without a character set, str.strip() removes whitespace from both the
# left and the right end of each string; characters in the middle are untouched.

a.str.strip(to_strip=None)

0        1. One
1    2.  TwO--?
2       3:THREE
3        4_four
dtype: object

In [10]:
# To remove specific characters, pass them through the 'to_strip' argument:
# any of the listed characters is trimmed from both ends of each string.

a.str.strip('1234.?:_- ')

0      One
1      TwO
2    THREE
3     four
dtype: object

## Part 2

Splitting strings and replacing parts of the text, plus filtering rows with the `str.contains` method.

In [25]:
# Split each 'home.dest' value on '/'. With expand=True the parts are spread over
# DataFrame columns instead of being returned as a Series of lists; only the
# first two parts (columns 0 and 1) are kept.
# The raw values separate places with ' / ' (e.g. 'Montreal, PQ / Chesterville, ON'),
# so each part is stripped afterwards; otherwise the padding spaces around the
# separator would stay attached to the values (' Chesterville, ON').

(
    titanic['home.dest']
    .str.split('/', expand=True)[[0, 1]]
    .apply(lambda part: part.str.strip())
)

Unnamed: 0,0,1
0,"St Louis, MO",
1,"Montreal, PQ","Chesterville, ON"
2,"Montreal, PQ","Chesterville, ON"
3,"Montreal, PQ","Chesterville, ON"
4,"Montreal, PQ","Chesterville, ON"
...,...,...
1304,?,
1305,?,
1306,?,
1307,?,


In [40]:
# Show the unsplit 'home.dest' column; multiple places are separated by ' / '.
titanic.loc[:, 'home.dest']

0                          St Louis, MO
1       Montreal, PQ / Chesterville, ON
2       Montreal, PQ / Chesterville, ON
3       Montreal, PQ / Chesterville, ON
4       Montreal, PQ / Chesterville, ON
                     ...               
1304                                  ?
1305                                  ?
1306                                  ?
1307                                  ?
1308                                  ?
Name: home.dest, Length: 1309, dtype: object

In [32]:
# n=1 limits the split to the first ':' only, so the rest of each string
# (which contains further ':' characters) stays together in the second column.

(
    ufo['stats']
    .str.split(pat=':', n=1, expand=True)
)

Unnamed: 0,0,1
0,Occurred,12/12/2019 18:43 (Entered as : 12/12/19 18:4...
1,Occurred,3/22/2019 18:30 (Entered as : 03/22/19 18:30...
2,Occurred,4/1/2019 15:45 (Entered as : April01.19) Rep...
3,Occurred,4/17/2019 02:00 (Entered as : 04-17-2019 2:0...
4,Occurred,3/15/2009 18:00 (Entered as : 03/15/09 18:00...
...,...,...
88120,Occurred,10/2/2019 20:00 (Entered as : 10/2/19 20:00)...
88121,Occurred,10/2/2019 20:00 (Entered as : 10/02/019 8:00...
88122,Occurred,10/2/2019 20:03 (Entered as : 10/02/19 20:03...
88123,Occurred,10/2/2019 22:00 (Entered as : 10/02/19 22:00...


In [38]:
# With regex=True the '|' alternation matches either word, so several substrings
# can be replaced in one call. Only the words themselves are removed; the space
# that preceded them remains in the result.

(
    ufo['duration']
    .str.replace("seconds|minutes", "", regex=True)
)

0          5 
1        3-5 
2         NaN
3         10 
4          2 
         ... 
88120      3 
88121     20 
88122     20 
88123      2 
88124      3 
Name: duration, Length: 88125, dtype: object

In [39]:
# Show the original 'duration' values, units included.
ufo.loc[:, 'duration']

0          5 seconds
1        3-5 seconds
2                NaN
3         10 seconds
4          2 minutes
            ...     
88120      3 minutes
88121     20 seconds
88122     20 seconds
88123      2 minutes
88124      3 minutes
Name: duration, Length: 88125, dtype: object

In [49]:
# str.contains() accepts the same '|' alternation: keep the durations that mention
# "day" or "week". Matching is on substrings, so values such as "Till daylight"
# qualify as well. na=False makes missing durations count as non-matching, which
# keeps the mask strictly boolean.
# Rows and column are selected in a single .loc call rather than by chained
# indexing, so no intermediate full-width filtered DataFrame is built.

ufo.loc[ufo['duration'].str.contains("day|week", na=False), 'duration']

116                week
1636     till day break
3388             3 days
3996       several days
4921             3 days
              ...      
84021            2 days
84983            3 days
85717     Till daylight
86210            2 days
87707            4 days
Name: duration, Length: 173, dtype: object