## Working With Text Data Sets:

In [30]:
import numpy as np
import pandas as pd

In [31]:
inspections = pd.read_csv("data/chicago_food_inspections.csv")

In [32]:
inspections["Name"]

0                 MARRIOT MARQUIS CHICAGO   
1                                JETS PIZZA 
2                                 ROOM 1520 
3                  MARRIOT MARQUIS CHICAGO  
4                              CHARTWELLS   
                         ...                
153805                           WOLCOTT'S  
153806       DUNKIN DONUTS/BASKIN-ROBBINS   
153807                             Cafe 608 
153808                          mr.daniel's 
153809                           TEMPO CAFE 
Name: Name, Length: 153810, dtype: object

In [33]:
inspections["Risk"]

0           Risk 1 (High)
1         Risk 2 (Medium)
2            Risk 3 (Low)
3           Risk 1 (High)
4           Risk 1 (High)
               ...       
153805      Risk 1 (High)
153806    Risk 2 (Medium)
153807      Risk 1 (High)
153808      Risk 1 (High)
153809      Risk 1 (High)
Name: Risk, Length: 153810, dtype: object

### un even spaces in name column:

In [34]:
inspections["Name"].str.strip()  ### un even spaces in name column:

0              MARRIOT MARQUIS CHICAGO
1                           JETS PIZZA
2                            ROOM 1520
3              MARRIOT MARQUIS CHICAGO
4                           CHARTWELLS
                      ...             
153805                       WOLCOTT'S
153806    DUNKIN DONUTS/BASKIN-ROBBINS
153807                        Cafe 608
153808                     mr.daniel's
153809                      TEMPO CAFE
Name: Name, Length: 153810, dtype: object

In [35]:
inspections.dtypes

Name    object
Risk    object
dtype: object

In [36]:
for column in inspections.columns:
    inspections[column] = inspections[column].str.strip()
    
 # here we are stripping all the columns in the dataframe to remove any leading or trailing spaces using a for loop, with the str.strip() method.   

In [37]:
inspections["Name"]

0              MARRIOT MARQUIS CHICAGO
1                           JETS PIZZA
2                            ROOM 1520
3              MARRIOT MARQUIS CHICAGO
4                           CHARTWELLS
                      ...             
153805                       WOLCOTT'S
153806    DUNKIN DONUTS/BASKIN-ROBBINS
153807                        Cafe 608
153808                     mr.daniel's
153809                      TEMPO CAFE
Name: Name, Length: 153810, dtype: object

In [38]:
inspections["Risk"].unique() 

# to check unique values in the risk column.

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

### here we see some hotesl have null values (NaN), no use of that hotels, so we Delete.

In [39]:
inspections = inspections.dropna(subset=["Risk"])
inspections.shape   #to check no of rows and columns after deleting null values in risk column.

(153744, 2)

In [40]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

### Now we replace the "ALL" by "Extreme".

In [41]:
inspections= inspections.replace(
    to_replace="All",
    value="Risk 4 (Extreme)"
)

In [42]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)

### Now we dont want Risk mentioned, we only need High, Medium, Low, extreme.

#### We will do it with string accessor as "RISK X" have same format and same length. we use Slice method of a string.

In [43]:
inspections["Risk"] = inspections["Risk"].str.slice(8, -1)

# here we used 8 as it is index position where "High", "Medium", "Low", "Extreme" starts in the string 
# and -1 to remove the closing parenthesis.

In [44]:
inspections.head(3)

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,High
1,JETS PIZZA,Medium
2,ROOM 1520,Low


### To check all the restaurants with pizza in their name.

In [45]:
inspections[inspections["Name"].str.lower().str.contains("pizza")]

Unnamed: 0,Name,Risk
1,JETS PIZZA,Medium
19,NANCY'S HOME OF STUFFED PIZZA,High
27,"NARY'S GRILL & PIZZA ,INC.",High
29,NARYS GRILL & PIZZA,High
68,COLUTAS PIZZA,High
...,...,...
153756,ANGELO'S STUFFED PIZZA CORP,High
153764,COCHIAROS PIZZA #2,High
153772,FERNANDO'S MEXICAN GRILL & PIZZA,High
153788,REGGIO'S PIZZA EXPRESS,High


### To check all the restaurants that starts with  Tacos in their name.

In [46]:
inspections[inspections["Name"].str.lower().str.startswith("tacos")]

Unnamed: 0,Name,Risk
69,TACOS NIETOS,High
556,TACOS EL TIO 2 INC.,High
675,TACOS DON GABINO,High
958,TACOS EL TIO 2 INC.,High
1036,TACOS EL TIO 2 INC.,High
...,...,...
143587,TACOS DE LUNA,High
144026,TACOS GARCIA,High
146174,Tacos Place's 1,High
147810,TACOS MARIO'S LIMITED,High


## Change of Data Set: Customers.csv

In [47]:
customers = pd.read_csv("data/customers.csv")
customers.head()


Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."


In [48]:
customers["Name"].str.split(pat= " ", n=1)

# here we are splitting the names into first and last names based on the space between them.
# but 4th row Vincent III has a middle name which will create an issue.
# sowe will use pattern to split the names based on the last space in the string.

0             [Frank, Manning]
1         [Elizabeth, Johnson]
2           [Donald, Stephens]
3       [Michael, Vincent III]
4            [Jasmine, Zamora]
                 ...          
9956          [Dana, Browning]
9957        [Amanda, Anderson]
9958             [Eric, Davis]
9959       [Taylor, Hernandez]
9960       [Sherry, Nicholson]
Name: Name, Length: 9961, dtype: object

### now we will split the First name and last name into two different columns.

In [49]:
customers["Name"].str.split(pat= " ", n=1, expand = True)

Unnamed: 0,0,1
0,Frank,Manning
1,Elizabeth,Johnson
2,Donald,Stephens
3,Michael,Vincent III
4,Jasmine,Zamora
...,...,...
9956,Dana,Browning
9957,Amanda,Anderson
9958,Eric,Davis
9959,Taylor,Hernandez


### accessing only specified column.

In [50]:
customers["Name"].str.split(pat= " ", n=1, expand = True).get(0)

0           Frank
1       Elizabeth
2          Donald
3         Michael
4         Jasmine
          ...    
9956         Dana
9957       Amanda
9958         Eric
9959       Taylor
9960       Sherry
Name: 0, Length: 9961, dtype: object

### assigning it to the orginal dataframe.

In [51]:
customers[["First Name", "Last Name"]] = customers["Name"].str.split(pat=" ", n=1, expand = True)
customers

Unnamed: 0,Name,Address,First Name,Last Name
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [52]:
# dropping name column

In [53]:
customers = customers.drop(columns=["Name"])
customers

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...
9956,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [54]:
customers[["Street", "City", "State", "Zip"]] = customers["Address"].str.split(pat=",", expand = True)

In [55]:
customers.head()

Unnamed: 0,Address,First Name,Last Name,Street,City,State,Zip
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252


In [56]:
customers = customers.drop(
    columns=["Address"])

In [57]:
customers.head()

Unnamed: 0,First Name,Last Name,Street,City,State,Zip
0,Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252


#### Removing the door numbers in street column
### USING REGUALR EXPRESSION:

In [58]:
customers["Street"].str.replace(
    "\d{4}", "*", regex = True
)

# here \d is metacharacter., represents any digit and {4} represents number greater than equal to 4 digits.

  "\d{4}", "*", regex = True


0                  * Quinn Groves
1         * Tracey Ports Apt. 419
2               *0 Fleming Manors
3                441 Olivia Creek
4         * Chelsey Ford Apt. 310
                  ...            
9956    762 Andrew Views Apt. 254
9957        *8 Day Crest Apt. 901
9958          *5 Michelle Squares
9959             129 Keith Greens
9960           355 Griffin Valley
Name: Street, Length: 9961, dtype: object