In [1]:
# Dependencies
import pandas as pd

In [2]:
# Import the csv file as pandas dataframe
csv_file_one = "../00_input/datafiniti-fast-food-restaurants-across-america/Datafiniti_Fast_Food_Restaurants.csv"
restaurant_df = pd.read_csv(csv_file_one)
restaurant_df.head()

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,city,country,keys,latitude,longitude,name,postalCode,province,sourceURLs,websites
0,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,American Restaurant and Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
1,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
2,AVwcopQoByjofQCxgfVa,2016-03-29T05:06:36Z,2018-06-26T02:59:52Z,206 Wears Valley Rd,Fast Food Restaurant,Pigeon Forge,US,us/tn/pigeonforge/206wearsvalleyrd/-864103396,35.803788,-83.580553,Taco Bell,37863,TN,https://www.yellowpages.com/pigeon-forge-tn/mi...,"http://www.tacobell.com,https://locations.taco..."
3,AVweXN5RByjofQCxxilK,2017-01-03T07:46:11Z,2018-06-26T02:59:51Z,3652 Parkway,Fast Food,Pigeon Forge,US,us/tn/pigeonforge/3652parkway/93075755,35.782339,-83.551408,Arby's,37863,TN,http://www.yellowbook.com/profile/arbys_163389...,"http://www.arbys.com,https://locations.arbys.c..."
4,AWQ6MUvo3-Khe5l_j3SG,2018-06-26T02:59:43Z,2018-06-26T02:59:43Z,2118 Mt Zion Parkway,Fast Food Restaurant,Morrow,US,us/ga/morrow/2118mtzionparkway/1305117222,33.562738,-84.321143,Steak 'n Shake,30260,GA,https://foursquare.com/v/steak-n-shake/4bcf77a...,http://www.steaknshake.com/locations/23851-ste...


In [3]:
# Overview of the dataframe before transformation --> There are 10,000 records in the dataframe
restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10000 non-null  object 
 1   dateAdded    10000 non-null  object 
 2   dateUpdated  10000 non-null  object 
 3   address      10000 non-null  object 
 4   categories   10000 non-null  object 
 5   city         10000 non-null  object 
 6   country      10000 non-null  object 
 7   keys         10000 non-null  object 
 8   latitude     10000 non-null  float64
 9   longitude    10000 non-null  float64
 10  name         10000 non-null  object 
 11  postalCode   10000 non-null  object 
 12  province     10000 non-null  object 
 13  sourceURLs   10000 non-null  object 
 14  websites     10000 non-null  object 
dtypes: float64(2), object(13)
memory usage: 1.1+ MB


## Restaurant address cleaning to ensure its uniqueness

* Drop duplicates in "keys" column so that restaurant addresses will be unique

In [4]:
# There are only 9343 unique restaurant addresses out of 10,000 rows in the dataframe.
restaurant_df["keys"].nunique()

9343

In [5]:
# Drop duplicates in restaurant addresses
restaurant_df.drop_duplicates(subset="keys", keep = 'first', inplace = True) 

In [6]:
# The total count has been reduced from 10,000 to 9,343. 
restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9343 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           9343 non-null   object 
 1   dateAdded    9343 non-null   object 
 2   dateUpdated  9343 non-null   object 
 3   address      9343 non-null   object 
 4   categories   9343 non-null   object 
 5   city         9343 non-null   object 
 6   country      9343 non-null   object 
 7   keys         9343 non-null   object 
 8   latitude     9343 non-null   float64
 9   longitude    9343 non-null   float64
 10  name         9343 non-null   object 
 11  postalCode   9343 non-null   object 
 12  province     9343 non-null   object 
 13  sourceURLs   9343 non-null   object 
 14  websites     9343 non-null   object 
dtypes: float64(2), object(13)
memory usage: 1.1+ MB


## Zip code cleaning

In [7]:
# Check the minimum and maximum length of values in column postalCode
zip_check = restaurant_df['postalCode'].apply(len)
print(f"Max zip_code length is {zip_check.max()}.")
print(f"Min zip_code length is {zip_check.min()}.")

Max zip_code length is 11.
Min zip_code length is 5.


In [8]:
# A look into rows with length of values in column postalCode > 5 
restaurant_df[restaurant_df.postalCode.str.len() > zip_check.min()]

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,city,country,keys,latitude,longitude,name,postalCode,province,sourceURLs,websites
2759,AVwcifLrByjofQCxfXKm,2015-11-29T18:21:27Z,2018-06-12T21:04:05Z,1400 O St,Fast Food Restaurant,Lincoln,US,us/ne/lincoln/1400ost/959819527,40.813868,-96.700892,Gourmet Grill,68508-3807,NE,http://www.yellowpages.com/lincoln-ne/mip/gour...,"http://www.lincolngourmetgrill.com/,http://lin..."
5675,AWOnnZGLRxPSIh2Ru31g,2018-05-28T16:28:34Z,2018-05-28T16:28:34Z,3340 Mall Loop Dr Louis Joliet Mall Louis Joli...,Fast Food Restaurant,Joliet,US,us/il/joliet/3340mallloopdrlouisjolietmallloui...,41.575031,-88.156167,KFC,60431-1057,IL,https://foursquare.com/v/kfc/4bc8a87692b376b09...,http://www.kfc.com
5721,AVwdiC82IN2L1WUfwvBm,2016-06-04T19:42:46Z,2018-05-28T16:25:47Z,3826 US Highway 23,Fast Food Restaurant,Portsmouth,US,us/oh/portsmouth/3826ushighway23/106088,38.783775,-82.984373,KFC,45662-8620,OH,http://www.citysearch.com/profile/8242163/port...,http://api.citygridmedia.com/content/places/v2...
6049,AVwdR1x4IN2L1WUfuIpc,2015-10-20T00:19:17Z,2018-05-28T16:14:59Z,844B Carl Eller Rd,Fast Food Restaurant and Burger Joint,Mars Hill,US,us/nc/marshill/844bcarlellerrd/696490760,35.818208,-82.538384,Hardee's,28754-6003,NC,https://foursquare.com/v/hardees/4b9197eaf964a...,http://www.hardees.com
7990,AVwdiI2PIN2L1WUfwv2W,2017-02-01T03:27:41Z,2018-05-07T11:26:45Z,1814 Chicago Rd,Fast Food,Chicago Heights,US,us/il/chicagoheights/1814chicagord/-418103476,41.49855,-87.64039,Gyros Express,60411-3407,IL,https://foursquare.com/v/gyros-express/4cb94e2...,http://gyrosexpress.org
8146,AV1XXEZh-gnIPe8DWw8_,2017-07-18T20:20:51Z,2018-05-01T23:11:27Z,358 E Dupont Hwy Unit 10,Fast Food Restaurants,Millsboro,US,us/de/millsboro/358eduponthwyunit10/696490760,38.57831,-75.28907,Hardee's,19966-4737,DE,https://foursquare.com/v/hardees/4c2a9948ae682...,http://www.hardees.com
8960,AVwdXtyVIN2L1WUfvHLU,2016-06-04T20:48:35Z,2018-04-18T14:23:27Z,4850 SH 6,Fast Food Restaurant,Missouri City,US,us/tx/missouricity/4850sh6/-394404301,29.577967,-95.580184,Popeyes,77459-3990,TX,https://foursquare.com/v/popeyes/4c90f36e2626a...,http://popeyes.com


* As can be seen, some zip_codes are in xxxxx-xxxx format. They need to be diced to match the standard format. Digits after '-' will be dropped.

In [9]:
# Cleanse ZIP Code to retain only 5 characters
restaurant_df["postalCode"] = restaurant_df["postalCode"].apply(lambda x: x.split("-")[0].strip())

In [10]:
# Verify ZIP Codes all cleansed to 5 characters only
zip_check = restaurant_df['postalCode'].apply(len)
print(f"Max zip_code length is {zip_check.max()}.")
print(f"Min zip_code length is {zip_check.min()}.")

Max zip_code length is 5.
Min zip_code length is 5.


## Data normalisation

### Normalise zip_code

If keeping zip_code, city and state in the same table, there will be transitive dependency. City and state depend on the restaurant address but also on zip_code. Hence to achieve the Third Normal Form, we'll put zip, city and state into a seperate table.

In [11]:
# Extract ZIP Code, City and State to a separate table
zip_code_df = restaurant_df[['postalCode','city','province']]
zip_code_df.head()

Unnamed: 0,postalCode,city,province
0,70301,Thibodaux,LA
2,37863,Pigeon Forge,TN
3,37863,Pigeon Forge,TN
4,30260,Morrow,GA
5,48204,Detroit,MI


In [12]:
# Rename ZIP Code table column headings so they are more user friendly
zip_code_df = zip_code_df.rename(columns={"postalCode":"zip_code", "province":"state"})

In [13]:
# Identify if there are any duplicate ZIP Codes --> Yes
zip_code_df["zip_code"].nunique()

5426

In [14]:
# Drop duplicate ZIP Codes from the ZIP Code dataframe
zip_code_df.drop_duplicates(subset ='zip_code', keep = 'first', inplace=True) 

In [15]:
zip_code_df.head()

Unnamed: 0,zip_code,city,state
0,70301,Thibodaux,LA
2,37863,Pigeon Forge,TN
4,30260,Morrow,GA
5,48204,Detroit,MI
6,48235,Detroit,MI


In [16]:
zip_code_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5426 entries, 0 to 9996
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  5426 non-null   object
 1   city      5426 non-null   object
 2   state     5426 non-null   object
dtypes: object(3)
memory usage: 169.6+ KB


### Test merging the zip_code table against zip_code in the zip_to_zcta table

In [17]:
# Import zip_to_zcta mapping table
zip_to_zcta_df = pd.read_csv("../02_transform_zip_zcta/zcta_to_match.csv", dtype="str")
zip_to_zcta_df.head()

Unnamed: 0,zip_code,PO_NAME,STATE,zcta
0,501,Holtsville,NY,11742
1,544,Holtsville,NY,11742
2,601,Adjuntas,PR,601
3,602,Aguada,PR,602
4,603,Aguadilla,PR,603


In [18]:
# An overview of the zip_to_zcta mapping table
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41098 entries, 0 to 41097
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  41098 non-null  object
 1   PO_NAME   41098 non-null  object
 2   STATE     41098 non-null  object
 3   zcta      41098 non-null  object
dtypes: object(4)
memory usage: 1.3+ MB


In [19]:
# Merge the zip_to_zcta table with the zip_code table using inner merge to find the common zip_codes between the two tables
common_zip_mix = pd.merge(zip_code_df, zip_to_zcta_df, on="zip_code")
common_zip_mix.head()

Unnamed: 0,zip_code,city,state,PO_NAME,STATE,zcta
0,70301,Thibodaux,LA,Thibodaux,LA,70301
1,37863,Pigeon Forge,TN,Pigeon Forge,TN,37863
2,30260,Morrow,GA,Morrow,GA,30260
3,48204,Detroit,MI,Detroit,MI,48204
4,48235,Detroit,MI,Detroit,MI,48235


In [20]:
# Overview of the common zip_code table
common_zip_mix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5426 entries, 0 to 5425
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  5426 non-null   object
 1   city      5426 non-null   object
 2   state     5426 non-null   object
 3   PO_NAME   5426 non-null   object
 4   STATE     5426 non-null   object
 5   zcta      5426 non-null   object
dtypes: object(6)
memory usage: 296.7+ KB


> Comparing the equal number of records in the common_zip_mix table with the zip_code_df table (5426 vs. 5426), we can say that all of the zip_codes in the restaurant address table and in the zip_code table are covered in the zip_to_zcta table.

In [21]:
# Overview of the differences between values in city column and PO_NAME column in the common_zip_miz
common_zip_mix[common_zip_mix["city"] != common_zip_mix["PO_NAME"]]

Unnamed: 0,zip_code,city,state,PO_NAME,STATE,zcta
112,32435,DeFuniak Springs,FL,Defuniak Springs,FL,32435
308,32720,DeLand,FL,Deland,FL,32720
317,27103,Winston-Salem,NC,Winston Salem,NC,27103
486,60115,DeKalb,IL,Dekalb,IL,60115
620,83814,Coeur d'Alene,ID,Coeur D Alene,ID,83814
2883,96786,WahiawƒÅ,HI,Wahiawa,HI,96786
2887,96753,Kƒ´hei,HI,Kihei,HI,96753
3100,70634,DeRidder,LA,Deridder,LA,70634
3702,27105,Winston-Salem,NC,Winston Salem,NC,27105
4083,50266,West Des Moines,IA,West des Moines,IA,50266


> Most of the values between the city and PO_NAME columns in the common_zip_mix table are the same. Even with the above "word-styling" differences, they are actually the same.

> We can now confirm that all of the zip_codes in the zip_code table are in the zip_to_zcta table, and the city in the zip_code table are the same as the PO_NAME name in the zip_to_zcta table. Hence, we can turn the zip_code, PO_NAME, and STATE from the zip_to_zcta table into the zip_code table.

> We'd like to keep the ZCTA seperate from the zip_code table due to the subjective nature of the zip_to_zcta mapping. From our understanding of the differences between zip and ZCTA, one zip can have many ZCTA and vice versa. Also while ZCTAs are more stable, zip_code are more subject to changes by US Postal Service. ZCTAs are mainly used for census purpose, while zip_code for postal services. For more information, please click [here](https://atcoordinates.info/2020/05/11/the-trouble-with-zip-codes-solutions-for-data-analysis-and-mapping/).

In [22]:
# Extract zip_code, PO_NAME and STATE into a zip_code table and rename
final_zip_code = zip_to_zcta_df[["zip_code", "PO_NAME", "STATE"]].rename(columns={"PO_NAME": "city", "STATE": "state"})

In [23]:
final_zip_code.head()

Unnamed: 0,zip_code,city,state
0,501,Holtsville,NY
1,544,Holtsville,NY
2,601,Adjuntas,PR
3,602,Aguada,PR
4,603,Aguadilla,PR


In [24]:
# Overview of the final_zip_code table
final_zip_code.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41098 entries, 0 to 41097
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  41098 non-null  object
 1   city      41098 non-null  object
 2   state     41098 non-null  object
dtypes: object(3)
memory usage: 963.4+ KB


In [25]:
# Recheck zip_code uniqueness
final_zip_code["zip_code"].is_unique

True

In [26]:
# Export final_zip_code into csv file
final_zip_code.to_csv("final_zip_code.csv", index=False)

### Normalise restaurant name table

As can be observed from above analysis, there are many repeated restaurant names in the restaurant data. Hence we can seperate them into a seperate table with only restaurant name tracked by restaurant_id. **This allows us to enforce referential integrity between restaurant addresses and restaurant name in our sql database**. The restaurant_id will then be mapped back into the restaurant address table. The restaurant address table will be tracked by unique restaurant address IDs. The restaurant address IDs are set as serial type in the database, hence it will be auto-generated once the table is loaded into the database.

In [27]:
# Extract from the restaurant dataframe to create a restaurant address dataframe
restaurant_clean_df = restaurant_df[["name","address","postalCode"]]
restaurant_clean_df.head()

Unnamed: 0,name,address,postalCode
0,SONIC Drive In,800 N Canal Blvd,70301
2,Taco Bell,206 Wears Valley Rd,37863
3,Arby's,3652 Parkway,37863
4,Steak 'n Shake,2118 Mt Zion Parkway,30260
5,Wendy's,9768 Grand River Ave,48204


In [28]:
# Extract unique restaurant names to a seperate table
restaurant_uniques = pd.Series(restaurant_clean_df.name.unique(), name="restaurant_name").to_frame()
restaurant_uniques

Unnamed: 0,restaurant_name
0,SONIC Drive In
1,Taco Bell
2,Arby's
3,Steak 'n Shake
4,Wendy's
...,...
566,Ben & Jerry's
567,Mr. Gyros Greek Food & Pastry
568,Legends Burgers
569,Plato's Closet


In [29]:
# Add restaurant ID column
restaurant_uniques["restaurant_id"] = np.arange(1, len(restaurant_uniques)+1)

In [30]:
# Reorder columns in the restaurant name table
clean_restaurant = restaurant_uniques[["restaurant_id", "restaurant_name"]]
clean_restaurant.head()

Unnamed: 0,restaurant_id,restaurant_name
0,1,SONIC Drive In
1,2,Taco Bell
2,3,Arby's
3,4,Steak 'n Shake
4,5,Wendy's


In [31]:
# Export the restaurant name table to csv
clean_restaurant.to_csv('restaurant_id.csv', index=False)

In [32]:
# Merge restaurant address table against restaurant name in the restaurant name table
address_name_id_merge = pd.merge(restaurant_uniques, restaurant_clean_df, left_on="restaurant_name", right_on="name")

In [33]:
address_name_id_merge.head()

Unnamed: 0,restaurant_name,restaurant_id,name,address,postalCode
0,SONIC Drive In,1,SONIC Drive In,800 N Canal Blvd,70301
1,SONIC Drive In,1,SONIC Drive In,124 John R Rd,48083
2,SONIC Drive In,1,SONIC Drive In,909 N Wood,75644
3,SONIC Drive In,1,SONIC Drive In,97 Gateway Blvd,82901
4,SONIC Drive In,1,SONIC Drive In,6557 S Staples St,78413


In [34]:
# Remove restaurant_name column
short_address_name_id_merge = address_name_id_merge[["restaurant_id", "address", "postalCode"]]
short_address_name_id_merge.head()

Unnamed: 0,restaurant_id,address,postalCode
0,1,800 N Canal Blvd,70301
1,1,124 John R Rd,48083
2,1,909 N Wood,75644
3,1,97 Gateway Blvd,82901
4,1,6557 S Staples St,78413


In [35]:
# Seperate street No and street name from address for future use
street_no_name = short_address_name_id_merge.address.str.split(" ", n=1, expand=True)
street_no_name.head()

Unnamed: 0,0,1
0,800,N Canal Blvd
1,124,John R Rd
2,909,N Wood
3,97,Gateway Blvd
4,6557,S Staples St


In [36]:
# Merge the street_no_name back to the restaurant address table
concatenated_restaurant = pd.concat([short_address_name_id_merge, street_no_name], axis=1)
concatenated_restaurant.head()

Unnamed: 0,restaurant_id,address,postalCode,0,1
0,1,800 N Canal Blvd,70301,800,N Canal Blvd
1,1,124 John R Rd,48083,124,John R Rd
2,1,909 N Wood,75644,909,N Wood
3,1,97 Gateway Blvd,82901,97,Gateway Blvd
4,1,6557 S Staples St,78413,6557,S Staples St


In [37]:
# Remove the full address column
final_restaurant_address = concatenated_restaurant[["restaurant_id", 0, 1, "postalCode"]]
final_restaurant_address.head()

Unnamed: 0,restaurant_id,0,1,postalCode
0,1,800,N Canal Blvd,70301
1,1,124,John R Rd,48083
2,1,909,N Wood,75644
3,1,97,Gateway Blvd,82901
4,1,6557,S Staples St,78413


In [38]:
# Rename the columns
clean_restaurant_address = final_restaurant_address.rename(columns={"postalCode":"zip_code", 0: "street_no", 1: "street_name"})

In [39]:
# Overview of the clean restaurant address
clean_restaurant_address.head()

Unnamed: 0,restaurant_id,street_no,street_name,zip_code
0,1,800,N Canal Blvd,70301
1,1,124,John R Rd,48083
2,1,909,N Wood,75644
3,1,97,Gateway Blvd,82901
4,1,6557,S Staples St,78413


In [40]:
# Overview of the clean restaurant address
clean_restaurant_address.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9343 entries, 0 to 9342
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   restaurant_id  9343 non-null   int32 
 1   street_no      9343 non-null   object
 2   street_name    9343 non-null   object
 3   zip_code       9343 non-null   object
dtypes: int32(1), object(3)
memory usage: 328.5+ KB


In [41]:
# Export restaurant address data to CSV
clean_restaurant_address.to_csv('restaurant_address.csv', index=False)