# WORKING WITH TEXT

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the Chicago food-inspection records from disk and preview the first rows.
inspections = pd.read_csv(filepath_or_buffer="./data/chicago_food_inspections.csv")
inspections.head(n=5)

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)


In [3]:
# Look at the raw strings: the names carry leading/trailing whitespace.
inspections["Name"].to_numpy()

array([' MARRIOT MARQUIS CHICAGO   ', ' JETS PIZZA ', '   ROOM 1520 ',
       ..., ' Cafe 608 ', "  mr.daniel's ", '   TEMPO CAFE '],
      shape=(153810,), dtype=object)

In [4]:
# The accessor object itself (a StringMethods instance) is what gets displayed here;
# the string methods used in the following cells are called through it.
inspections["Name"].str     # str is string accessor

<pandas.core.strings.accessor.StringMethods at 0x1ee5960f380>

In [5]:
# Remove leading and trailing whitespace from every name and store the result
# back into the "Name" column.
inspections["Name"] = inspections["Name"].str.strip()

In [6]:
# The column labels are iterated over in the next cell.
inspections.columns     # lists all the columns

Index(['Name', 'Risk'], dtype='object')

In [7]:
# Trim surrounding whitespace in every column of the frame.
# Iterating a DataFrame yields its column labels; each column goes through the
# .str accessor, so every column is expected to hold text.
for column_name in list(inspections):
    trimmed = inspections[column_name].str.strip()
    inspections[column_name] = trimmed

In [8]:
# lower() and upper() are shown for demonstration only: their results are
# neither assigned nor displayed (a cell displays only its last expression).
inspections["Name"].str.lower()
inspections["Name"].str.upper()
# title() capitalizes the first letter of each word and is the version that is
# stored. Note that it also capitalizes the letter after an apostrophe
# (e.g. "Wolcott'S", "Mr.Daniel'S" in the output below).
inspections["Name"] = inspections["Name"].str.title()
inspections["Name"]

0              Marriot Marquis Chicago
1                           Jets Pizza
2                            Room 1520
3              Marriot Marquis Chicago
4                           Chartwells
                      ...             
153805                       Wolcott'S
153806    Dunkin Donuts/Baskin-Robbins
153807                        Cafe 608
153808                     Mr.Daniel'S
153809                      Tempo Cafe
Name: Name, Length: 153810, dtype: object

### String Slicing

In [9]:
# Distinct values in the "Risk" column; the result also lists the missing
# value (nan) and the irregular "All" entry.
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

In [10]:
# "All" does not follow the "Risk N (Label)" shape of the other categories,
# so map it onto that shape; the mapping is {old value: new value}.
inspections["Risk"] = inspections["Risk"].replace({"All": "Risk 4 (Extreme)"})
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)', nan], dtype=object)

In [11]:
# number of rows whose Risk is missing (nan): flag them, then add up the flags
inspections["Risk"].isna().sum()

np.int64(66)

In [12]:
inspections.shape[0]    # total rows (first entry of the (rows, columns) shape)

153810

In [13]:
# Discard every row that has a missing value in any column, then recount rows.
inspections = inspections.dropna(axis="index", how="any")
len(inspections.index)

153744

In [14]:
# Re-check the distinct values: nan no longer appears after dropna().
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)

In [15]:
# Every value has the shape "Risk N (Label)": position 8 is the first character
# after "(", and the stop of -1 excludes the closing ")".
# The first two lines are equivalent ways to take that slice and are shown for
# demonstration only (their results are discarded); the third stores the result.
# NOTE(review): re-running this cell slices the already-shortened labels again.
inspections["Risk"].str.slice(8, -1)
inspections["Risk"].str[8:-1]
inspections["Risk"] = inspections["Risk"].str.slice(8, -1)
inspections.head()

Unnamed: 0,Name,Risk
0,Marriot Marquis Chicago,High
1,Jets Pizza,Medium
2,Room 1520,Low
3,Marriot Marquis Chicago,High
4,Chartwells,High


### Boolean Methods

In [16]:
# Display the cleaned frame (the rendering elides the middle rows with "...").
inspections

Unnamed: 0,Name,Risk
0,Marriot Marquis Chicago,High
1,Jets Pizza,Medium
2,Room 1520,Low
3,Marriot Marquis Chicago,High
4,Chartwells,High
...,...,...
153805,Wolcott'S,High
153806,Dunkin Donuts/Baskin-Robbins,Medium
153807,Cafe 608,High
153808,Mr.Daniel'S,High


In [19]:
# Keep the rows whose name contains "pizza".
# Names are lower-cased first so the match does not depend on capitalization.
pizzas = (
    inspections["Name"]
    .str.lower()
    .str.contains("pizza")
)
inspections.loc[pizzas]

Unnamed: 0,Name,Risk
1,Jets Pizza,Medium
19,Nancy'S Home Of Stuffed Pizza,High
27,"Nary'S Grill & Pizza ,Inc.",High
29,Narys Grill & Pizza,High
68,Colutas Pizza,High
...,...,...
153756,Angelo'S Stuffed Pizza Corp,High
153764,Cochiaros Pizza #2,High
153772,Fernando'S Mexican Grill & Pizza,High
153788,Reggio'S Pizza Express,High


In [20]:
# Keep the rows whose lower-cased name begins with "tacos".
starts_with_tacos = (
    inspections["Name"]
    .str.lower()
    .str.startswith("tacos")
)
inspections.loc[starts_with_tacos]

Unnamed: 0,Name,Risk
69,Tacos Nietos,High
556,Tacos El Tio 2 Inc.,High
675,Tacos Don Gabino,High
958,Tacos El Tio 2 Inc.,High
1036,Tacos El Tio 2 Inc.,High
...,...,...
143587,Tacos De Luna,High
144026,Tacos Garcia,High
146174,Tacos Place'S 1,High
147810,Tacos Mario'S Limited,High


In [21]:
# Keep the rows whose lower-cased name finishes with "tacos".
ends_with_tacos = (
    inspections["Name"]
    .str.lower()
    .str.endswith("tacos")
)
inspections.loc[ends_with_tacos]

Unnamed: 0,Name,Risk
382,Lazo'S Tacos,High
569,Lazo'S Tacos,High
2652,Flying Tacos,Low
3250,Jony'S Tacos,High
3812,Paco'S Tacos,High
...,...,...
151121,Reyes Tacos,High
151318,El Macho Tacos,High
151801,El Macho Tacos,High
153087,Raymond'S Tacos,High


### Splitting the Strings

In [23]:
# Read the customers data set (one "Name" and one "Address" string per row).
customers = pd.read_csv(filepath_or_buffer="./data/customers.csv")
customers

Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."
...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex..."
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ..."
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg..."
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916"


In [26]:
# applying split method
# with no pattern, split() separates on whitespace (a run of whitespace counts
# as one separator); pat=" " separates on every single space character
# for names separated by exactly one space the two statements should give the
# same lists; only the second (last) expression is displayed
customers["Name"].str.split()
customers["Name"].str.split(pat=" ")    # pat is used to specify the delimiter

0              [Frank, Manning]
1          [Elizabeth, Johnson]
2            [Donald, Stephens]
3       [Michael, Vincent, III]
4             [Jasmine, Zamora]
                 ...           
9956           [Dana, Browning]
9957         [Amanda, Anderson]
9958              [Eric, Davis]
9959        [Taylor, Hernandez]
9960        [Sherry, Nicholson]
Name: Name, Length: 9961, dtype: object

In [27]:
# specify the maximum number of splits using parameter 'n'
# n=1 splits once, at the first space, so each name yields at most two pieces
# (e.g. "Michael Vincent III" -> ["Michael", "Vincent III"])
customers["Name"].str.split(pat=" ", n=1)

0             [Frank, Manning]
1         [Elizabeth, Johnson]
2           [Donald, Stephens]
3       [Michael, Vincent III]
4            [Jasmine, Zamora]
                 ...          
9956          [Dana, Browning]
9957        [Amanda, Anderson]
9958             [Eric, Davis]
9959       [Taylor, Hernandez]
9960       [Sherry, Nicholson]
Name: Name, Length: 9961, dtype: object

In [28]:
# get(index) picks one item out of each list produced by split();
# index 0 is the first word of the name.
(
    customers["Name"]
    .str.split(pat=" ")
    .str.get(0)     # first names
)

0           Frank
1       Elizabeth
2          Donald
3         Michael
4         Jasmine
          ...    
9956         Dana
9957       Amanda
9958         Eric
9959       Taylor
9960       Sherry
Name: Name, Length: 9961, dtype: object

In [29]:
# With n=1 each list has at most two items, so the item at index 1 is also the
# last one; .get(-1) would select the same item.
(
    customers["Name"]
    .str.split(pat=" ", n=1)
    .str.get(1)
)

0           Manning
1           Johnson
2          Stephens
3       Vincent III
4            Zamora
           ...     
9956       Browning
9957       Anderson
9958          Davis
9959      Hernandez
9960      Nicholson
Name: Name, Length: 9961, dtype: object

In [31]:
# use the 'expand' parameter to expand the values into columns
# with expand=True the result is a DataFrame with one column per piece
# (labelled 0 and 1 here) instead of a Series of lists
customers["Name"].str.split(pat=" ", n=1, expand=True)

Unnamed: 0,0,1
0,Frank,Manning
1,Elizabeth,Johnson
2,Donald,Stephens
3,Michael,Vincent III
4,Jasmine,Zamora
...,...,...
9956,Dana,Browning
9957,Amanda,Anderson
9958,Eric,Davis
9959,Taylor,Hernandez


In [32]:
# Split each full name once, at the first space, and store the two pieces as
# the new "First Name" and "Last Name" columns.
name_parts = customers["Name"].str.split(pat=" ", n=1, expand=True)
customers[["First Name", "Last Name"]] = name_parts
customers

Unnamed: 0,Name,Address,First Name,Last Name
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [33]:
# "Name" is now redundant with the First Name / Last Name columns; remove it.
customers = customers.drop(columns="Name")
customers

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...
9956,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [36]:
# split the Address column into Street, City, State, Zip columns
# The parts of an address are separated by a comma followed by a space, so
# splitting on "," alone would leave a leading space on City, State and Zip.
# Splitting on a comma plus any whitespace after it removes that space while
# producing the same number of pieces. Zip stays a string, which preserves
# leading zeros (e.g. "03933").
address_parts = customers["Address"].str.split(pat=r",\s*", regex=True, expand=True)
customers[["Street", "City", "State", "Zip"]] = address_parts
customers

Unnamed: 0,Address,First Name,Last Name,Street,City,State,Zip
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252
...,...,...,...,...,...,...,...
9956,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning,762 Andrew Views Apt. 254,North Paul,New Mexico,28889
9957,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson,44188 Day Crest Apt. 901,Lake Marcia,Maine,37378
9958,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis,73015 Michelle Squares,Watsonville,West Virginia,03933
9959,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez,129 Keith Greens,Haleyfurt,Oklahoma,98916


In [37]:
# "Address" is now redundant with Street / City / State / Zip; remove it.
customers = customers.drop(columns="Address")
customers

Unnamed: 0,First Name,Last Name,Street,City,State,Zip
0,Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252
...,...,...,...,...,...,...
9956,Dana,Browning,762 Andrew Views Apt. 254,North Paul,New Mexico,28889
9957,Amanda,Anderson,44188 Day Crest Apt. 901,Lake Marcia,Maine,37378
9958,Eric,Davis,73015 Michelle Squares,Watsonville,West Virginia,03933
9959,Taylor,Hernandez,129 Keith Greens,Haleyfurt,Oklahoma,98916
