# Working with Text Data Sets

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the Chicago food-inspection records from the project's data folder.
inspections = pd.read_csv(filepath_or_buffer="data/chicago_food_inspections.csv")

In [5]:
# Display the "Name" column exactly as it was read from the CSV.
inspections["Name"]

0                 MARRIOT MARQUIS CHICAGO   
1                                JETS PIZZA 
2                                 ROOM 1520 
3                  MARRIOT MARQUIS CHICAGO  
4                              CHARTWELLS   
                         ...                
153805                           WOLCOTT'S  
153806       DUNKIN DONUTS/BASKIN-ROBBINS   
153807                             Cafe 608 
153808                          mr.daniel's 
153809                           TEMPO CAFE 
Name: Name, Length: 153810, dtype: object

In [6]:
# Display the "Risk" column exactly as it was read from the CSV.
inspections["Risk"]

0           Risk 1 (High)
1         Risk 2 (Medium)
2            Risk 3 (Low)
3           Risk 1 (High)
4           Risk 1 (High)
               ...       
153805      Risk 1 (High)
153806    Risk 2 (Medium)
153807      Risk 1 (High)
153808      Risk 1 (High)
153809      Risk 1 (High)
Name: Risk, Length: 153810, dtype: object

In [8]:
# Preview the names with leading/trailing whitespace removed. The result is
# only displayed, not assigned, so the column in `inspections` is unchanged.
name_column = inspections["Name"]
name_column.str.strip()

0              MARRIOT MARQUIS CHICAGO
1                           JETS PIZZA
2                            ROOM 1520
3              MARRIOT MARQUIS CHICAGO
4                           CHARTWELLS
                      ...             
153805                       WOLCOTT'S
153806    DUNKIN DONUTS/BASKIN-ROBBINS
153807                        Cafe 608
153808                     mr.daniel's
153809                      TEMPO CAFE
Name: Name, Length: 153810, dtype: object

In [9]:
# Trim leading/trailing whitespace in every text column of `inspections`.
# Only object-dtype columns are visited: the `.str` accessor raises
# AttributeError on numeric or datetime columns, so looping over *all*
# columns would fail as soon as the frame has a non-text column.
for column in inspections.select_dtypes(include="object").columns:
    inspections[column] = inspections[column].str.strip()

In [10]:
# Display the "Name" column again to inspect it after the whitespace cleanup.
inspections["Name"]

0              MARRIOT MARQUIS CHICAGO
1                           JETS PIZZA
2                            ROOM 1520
3              MARRIOT MARQUIS CHICAGO
4                           CHARTWELLS
                      ...             
153805                       WOLCOTT'S
153806    DUNKIN DONUTS/BASKIN-ROBBINS
153807                        Cafe 608
153808                     mr.daniel's
153809                      TEMPO CAFE
Name: Name, Length: 153810, dtype: object

In [15]:
# List the distinct values found in the "Risk" column (missing values included).
pd.unique(inspections["Risk"])

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

In [16]:
# Display the DataFrame (pandas truncates the middle rows of long frames).
inspections

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)
...,...,...
153805,WOLCOTT'S,Risk 1 (High)
153806,DUNKIN DONUTS/BASKIN-ROBBINS,Risk 2 (Medium)
153807,Cafe 608,Risk 1 (High)
153808,mr.daniel's,Risk 1 (High)


In [19]:
# Discard every inspection whose "Risk" value is missing, then report the
# remaining (rows, columns) size.
required_columns = ["Risk"]
inspections = inspections.dropna(axis=0, how="any", subset=required_columns)
inspections.shape

(153744, 2)

In [20]:
# List the distinct "Risk" values that remain after dropping missing ones.
inspections.loc[:, "Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

In [24]:
# Relabel the catch-all "All" risk category as a fourth risk level so it
# follows the same "Risk N (Label)" shape as the other categories.
# The replacement is limited to the "Risk" column: a frame-wide
# `inspections.replace(...)` would also rewrite any establishment whose
# name happens to be exactly "All".
inspections = inspections.assign(
    Risk=inspections["Risk"].replace(
        to_replace="All",
        value="Risk 4 (Extreme)",
    )
)
inspections["Risk"].unique()

array(['High', 'Medium', 'Low', 'Extreme'], dtype=object)

In [None]:
# Reduce labels such as "Risk 1 (High)" to the word inside the parentheses
# ("High"). An anchored pattern is used instead of the fixed slice [8:-1]:
# values that do not match "Risk <n> (<label>)" -- including labels that
# were already shortened -- are left unchanged, so re-running this cell is
# harmless rather than blanking the column.
inspections["Risk"] = inspections["Risk"].str.replace(
    r"^Risk \d+ \((.+)\)$", r"\1", regex=True
)

In [23]:
# Show the first three inspections.
inspections.head(n=3)

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,High
1,JETS PIZZA,Medium
2,ROOM 1520,Low


In [26]:
# Case-insensitive search for establishments with "pizza" anywhere in the
# name (names are lower-cased before matching).
# `regex=False` matches the text literally instead of as a regular
# expression; `na=False` counts a missing name as "no match" so the mask
# stays purely boolean (a NaN in the mask makes boolean indexing fail).
is_pizza_place = inspections["Name"].str.lower().str.contains(
    "pizza", regex=False, na=False
)
inspections[is_pizza_place]

Unnamed: 0,Name,Risk
1,JETS PIZZA,Medium
19,NANCY'S HOME OF STUFFED PIZZA,High
27,"NARY'S GRILL & PIZZA ,INC.",High
29,NARYS GRILL & PIZZA,High
68,COLUTAS PIZZA,High
...,...,...
153756,ANGELO'S STUFFED PIZZA CORP,High
153764,COCHIAROS PIZZA #2,High
153772,FERNANDO'S MEXICAN GRILL & PIZZA,High
153788,REGGIO'S PIZZA EXPRESS,High


In [27]:
# Case-insensitive search for establishments whose name begins with "tacos"
# (names are lower-cased before matching).
# `na=False` counts a missing name as "no match" so the mask stays purely
# boolean (a NaN in the mask makes boolean indexing fail).
starts_with_tacos = inspections["Name"].str.lower().str.startswith(
    "tacos", na=False
)
inspections[starts_with_tacos]

Unnamed: 0,Name,Risk
69,TACOS NIETOS,High
556,TACOS EL TIO 2 INC.,High
675,TACOS DON GABINO,High
958,TACOS EL TIO 2 INC.,High
1036,TACOS EL TIO 2 INC.,High
...,...,...
143587,TACOS DE LUNA,High
144026,TACOS GARCIA,High
146174,Tacos Place's 1,High
147810,TACOS MARIO'S LIMITED,High


In [28]:
# Load the customer records and preview the first five rows.
customers = pd.read_csv(filepath_or_buffer="data/customers.csv")
customers.head(n=5)

Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."


In [30]:
# Split every full name into a list of its space-separated words.
# The separator can equally be passed positionally (`.str.split(" ")`);
# only one call is kept because a non-final expression in a cell is
# computed and then discarded without being displayed.
customers["Name"].str.split(pat=" ")

0              [Frank, Manning]
1          [Elizabeth, Johnson]
2            [Donald, Stephens]
3       [Michael, Vincent, III]
4             [Jasmine, Zamora]
                 ...           
9956           [Dana, Browning]
9957         [Amanda, Anderson]
9958              [Eric, Davis]
9959        [Taylor, Hernandez]
9960        [Sherry, Nicholson]
Name: Name, Length: 9961, dtype: object

In [42]:
# Split each name at its first space only (n=1): the first word becomes
# "First Name" and everything after it (e.g. "Vincent III") stays together
# as "Last Name". expand=True returns the pieces as separate columns.
name_parts = customers["Name"].str.split(pat=" ", n=1, expand=True)
customers[["First Name", "Last Name"]] = name_parts
customers

Unnamed: 0,Name,Address,First Name,Last Name
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [44]:
# The combined "Name" column is redundant now that it has been split, so
# continue with a copy of the frame that omits it.
customers = customers.drop(columns="Name")
customers

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...
9956,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [45]:
# Break the single-string address into its four comma-separated parts.
# The source data separates parts with ", " (comma + space), so a plain
# split on "," leaves a leading space on City, State and Zip
# (" East Matthew"); each resulting column is therefore stripped before
# being stored.
address_parts = customers["Address"].str.split(pat=",", expand=True)
customers[["Street", "City", "State", "Zip"]] = address_parts.apply(
    lambda part: part.str.strip()
)

In [46]:
# Show the first five customers, including the new address columns.
customers.head(n=5)

Unnamed: 0,Address,First Name,Last Name,Street,City,State,Zip
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252


In [47]:
# The combined "Address" column is redundant now that it has been split,
# so continue with a copy of the frame that omits it.
customers = customers.drop(columns=["Address"])
customers

Unnamed: 0,First Name,Last Name,Street,City,State,Zip
0,Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252
...,...,...,...,...,...,...
9956,Dana,Browning,762 Andrew Views Apt. 254,North Paul,New Mexico,28889
9957,Amanda,Anderson,44188 Day Crest Apt. 901,Lake Marcia,Maine,37378
9958,Eric,Davis,73015 Michelle Squares,Watsonville,West Virginia,03933
9959,Taylor,Hernandez,129 Keith Greens,Haleyfurt,Oklahoma,98916


In [48]:
# Mask every run of four or more consecutive digits in the street with "*"
# (shorter numbers such as "441" or "Apt. 419" are left as they are).
# The result is only displayed; `customers` is not modified.
# The pattern is a raw string: in a plain string literal "\d" is an invalid
# escape sequence, which Python reports as a DeprecationWarning
# (SyntaxWarning from Python 3.12).
customers["Street"].str.replace(
    pat=r"\d{4,}", repl="*", regex=True
)

0                  * Quinn Groves
1         * Tracey Ports Apt. 419
2                * Fleming Manors
3                441 Olivia Creek
4         * Chelsey Ford Apt. 310
                  ...            
9956    762 Andrew Views Apt. 254
9957         * Day Crest Apt. 901
9958           * Michelle Squares
9959             129 Keith Greens
9960           355 Griffin Valley
Name: Street, Length: 9961, dtype: object