In [1]:
# Dependencies
import pandas as pd

### Transform the zip_to_zcta dataframe

In [2]:
# Load the ZIP -> ZCTA conversion table. Every field is read as a string so that
# ZIP Codes keep their leading "00" instead of being parsed as integers.
zip_to_zcta_df = pd.read_excel(
    "../00_input/zip_to_zcta_2019.xlsx",
    dtype="str",
)

In [3]:
# Preview the first rows of the ZIP/ZCTA dataframe, in particular to check
# that ZIP Codes kept their leading "00"
zip_to_zcta_df.head()

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,501,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
1,544,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
2,601,Adjuntas,PR,Zip Code Area,601,Zip Matches ZCTA
3,602,Aguada,PR,Zip Code Area,602,Zip Matches ZCTA
4,603,Aguadilla,PR,Zip Code Area,603,Zip Matches ZCTA


In [4]:
# Review data types and non-null counts in the ZIP/ZCTA dataframe
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41107 entries, 0 to 41106
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ZIP_CODE       41107 non-null  object
 1   PO_NAME        41107 non-null  object
 2   STATE          41106 non-null  object
 3   ZIP_TYPE       41107 non-null  object
 4   ZCTA           41107 non-null  object
 5   zip_join_type  41107 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [5]:
# Check the minimum and maximum length of values in column ZIP_CODE.
# The vectorised .str accessor is used (as in the ZCTA length filter later in this
# notebook) instead of apply(len): it yields the same lengths for strings and does
# not raise if a value is missing.
zip_code_len = zip_to_zcta_df["ZIP_CODE"].str.len()
print(f"Max zip_code length is {zip_code_len.max()}.")
print(f"Min zip_code length is {zip_code_len.min()}.")

Max zip_code length is 5.
Min zip_code length is 5.


In [6]:
# Check the minimum and maximum length of values in column ZCTA.
# zcta_len is reused by later cells (via zcta_len.min()) to find over-long ZCTA values.
# The vectorised .str accessor replaces apply(len): same lengths for strings, and it
# does not raise if a value is missing.
zcta_len = zip_to_zcta_df["ZCTA"].str.len()
print(f"Max zcta length is {zcta_len.max()}.")
print(f"Min zcta length is {zcta_len.min()}.")

Max zcta length is 7.
Min zcta length is 5.


In [7]:
# Show the rows whose ZCTA value is longer than the shortest ZCTA length found above
zcta_too_long = zip_to_zcta_df["ZCTA"].str.len() > zcta_len.min()
zip_to_zcta_df[zcta_too_long]

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
41098,96939,Ngerulmud,PW,Zip Code Area,No ZCTA,"territory zip, no ZCTA available"
41099,96940,Koror,PW,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"
41100,96941,Pohnpei,FM,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"
41101,96942,Chuuk,FM,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"
41102,96943,Yap,FM,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"
41103,96944,Kosrae,FM,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"
41104,96960,Majuro,MH,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"
41105,96970,Ebeye,MH,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"
41106,96898,Minor Outlying Islands,,Post Office or large volume customer,No ZCTA,"territory zip, no ZCTA available"


In [8]:
# Drop the rows whose ZCTA is longer than the shortest ZCTA length
# (these are the rows holding the "No ZCTA" placeholder)
no_zcta_mask = zip_to_zcta_df["ZCTA"].str.len() > zcta_len.min()
zip_to_zcta_df = zip_to_zcta_df[~no_zcta_mask]

In [9]:
# Re-check the row count and non-null counts after dropping the rows without a ZCTA
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41098 entries, 0 to 41097
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ZIP_CODE       41098 non-null  object
 1   PO_NAME        41098 non-null  object
 2   STATE          41098 non-null  object
 3   ZIP_TYPE       41098 non-null  object
 4   ZCTA           41098 non-null  object
 5   zip_join_type  41098 non-null  object
dtypes: object(6)
memory usage: 2.2+ MB


In [10]:
# ZIP Codes must be unique in the conversion table (many ZIP Codes to one ZCTA relationship).
# Fail loudly if that assumption breaks, rather than relying on a manual comparison of
# the unique count below with the row count.
assert zip_to_zcta_df['ZIP_CODE'].is_unique, "Duplicate ZIP_CODE values found in zip_to_zcta_df"
zip_to_zcta_df['ZIP_CODE'].nunique()

41098

In [11]:
# Count distinct ZCTAs; a smaller number than the ZIP Code count is expected
# because several ZIP Codes can share one ZCTA
zip_to_zcta_df['ZCTA'].nunique()

33166

### Test merging against zip_code in the zip_code table

In [12]:
# Load the ZIP Code table written by the restaurant transform step,
# reading zip_code as a string
zip_code_df = pd.read_csv(
    "../02_transform_restaurant/zip_code.csv",
    dtype={"zip_code": "str"},
)
zip_code_df.head()

Unnamed: 0,zip_code,city,state
0,70301,Thibodaux,LA
1,37863,Pigeon Forge,TN
2,30260,Morrow,GA
3,48204,Detroit,MI
4,48235,Detroit,MI


In [13]:
# An overview of the restaurant ZIP Code table (data types and non-null counts)
zip_code_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  5426 non-null   object
 1   city      5426 non-null   object
 2   state     5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


In [14]:
# Inner-join the restaurant ZIP Codes to the conversion table on the ZIP Code columns
common_zip_mix = zip_code_df.merge(
    zip_to_zcta_df,
    how="inner",
    left_on="zip_code",
    right_on="ZIP_CODE",
)
common_zip_mix.head()

Unnamed: 0,zip_code,city,state,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,70301,Thibodaux,LA,70301,Thibodaux,LA,Zip Code Area,70301,Zip Matches ZCTA
1,37863,Pigeon Forge,TN,37863,Pigeon Forge,TN,Zip Code Area,37863,Zip Matches ZCTA
2,30260,Morrow,GA,30260,Morrow,GA,Zip Code Area,30260,Zip Matches ZCTA
3,48204,Detroit,MI,48204,Detroit,MI,Zip Code Area,48204,Zip Matches ZCTA
4,48235,Detroit,MI,48235,Detroit,MI,Zip Code Area,48235,Zip Matches ZCTA


In [15]:
# Inspect the row count and non-null counts of the merged frame
common_zip_mix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5426 entries, 0 to 5425
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   zip_code       5426 non-null   object
 1   city           5426 non-null   object
 2   state          5426 non-null   object
 3   ZIP_CODE       5426 non-null   object
 4   PO_NAME        5426 non-null   object
 5   STATE          5426 non-null   object
 6   ZIP_TYPE       5426 non-null   object
 7   ZCTA           5426 non-null   object
 8   zip_join_type  5426 non-null   object
dtypes: object(9)
memory usage: 423.9+ KB


In [16]:
# Rows where the restaurant city name and the conversion table's PO_NAME are not identical
city_differs = common_zip_mix["city"].ne(common_zip_mix["PO_NAME"])
common_zip_mix[city_differs]

Unnamed: 0,zip_code,city,state,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
112,32435,DeFuniak Springs,FL,32435,Defuniak Springs,FL,Zip Code Area,32435,Zip Matches ZCTA
308,32720,DeLand,FL,32720,Deland,FL,Zip Code Area,32720,Zip Matches ZCTA
317,27103,Winston-Salem,NC,27103,Winston Salem,NC,Zip Code Area,27103,Zip Matches ZCTA
486,60115,DeKalb,IL,60115,Dekalb,IL,Zip Code Area,60115,Zip Matches ZCTA
620,83814,Coeur d'Alene,ID,83814,Coeur D Alene,ID,Zip Code Area,83814,Zip Matches ZCTA
2883,96786,Wahiawā,HI,96786,Wahiawa,HI,Zip Code Area,96786,Zip Matches ZCTA
2887,96753,Kīhei,HI,96753,Kihei,HI,Zip Code Area,96753,Zip Matches ZCTA
3100,70634,DeRidder,LA,70634,Deridder,LA,Zip Code Area,70634,Zip Matches ZCTA
3702,27105,Winston-Salem,NC,27105,Winston Salem,NC,Zip Code Area,27105,Zip Matches ZCTA
4083,50266,West Des Moines,IA,50266,West des Moines,IA,Zip Code Area,50266,Zip Matches ZCTA


As shown above, nearly all "city" names in the zip_code table match the "PO_NAME" values in the zip_to_zcta table exactly. The ten rows that differ do so only in capitalization, punctuation, or diacritics (e.g. "DeLand" vs. "Deland", "Winston-Salem" vs. "Winston Salem"), so they refer to the same places.

Because the inner merge kept all 5,426 rows of the zip_code table, every zip_code in that table is present in the zip_to_zcta table. We can therefore use the zip_to_zcta table alone to map each zip_code in the restaurant address table to a ZCTA code in the census table.

In [17]:
# List the distinct ZIP_TYPE categories in the conversion table
zip_to_zcta_df["ZIP_TYPE"].unique()

array(['Post Office or large volume customer', 'Zip Code Area',
       'populated ZCTA, missing zip', 'Territory ZCTA Add'], dtype=object)

In [18]:
# List the distinct zip_join_type categories in the conversion table
zip_to_zcta_df["zip_join_type"].unique()

array(['Spatial join to ZCTA', 'Zip Matches ZCTA',
       'populated ZCTA, missing zip', 'Territory ZCTA Add'], dtype=object)

In [20]:
# Keep only the columns needed downstream, dropping ZIP_TYPE and zip_join_type
columns_to_keep = ["ZIP_CODE", "PO_NAME", "STATE", "ZCTA"]
final_zip_to_zcta = zip_to_zcta_df.loc[:, columns_to_keep]
final_zip_to_zcta.head()

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZCTA
0,501,Holtsville,NY,11742
1,544,Holtsville,NY,11742
2,601,Adjuntas,PR,601
3,602,Aguada,PR,602
4,603,Aguadilla,PR,603


In [21]:
# Rename ZIP_CODE, PO_NAME and STATE to lower-case names; ZCTA keeps its name
column_renames = {
    "ZIP_CODE": "zip_code",
    "PO_NAME": "city_or_po_name",
    "STATE": "state",
}
clean_zip_to_zcta = final_zip_to_zcta.rename(columns=column_renames)
clean_zip_to_zcta.head()

Unnamed: 0,zip_code,city_or_po_name,state,ZCTA
0,501,Holtsville,NY,11742
1,544,Holtsville,NY,11742
2,601,Adjuntas,PR,601
3,602,Aguada,PR,602
4,603,Aguadilla,PR,603


In [22]:
# Write the trimmed, renamed conversion table to CSV in the working directory, without the index
clean_zip_to_zcta.to_csv("clean_zip_to_zcta.csv", index=False)

### Test merging against ZCTA in the census table

In [None]:
# Import the cleaned census table; ZCTA is read as a string so it can be joined
# to the string-typed ZCTA column of the conversion table
census_df = pd.read_csv("../02_transform_census/clean_census.csv", dtype={'ZCTA': 'str'})
census_df.head()

In [None]:
# Review data types and non-null counts in the census table
census_df.info()

In [None]:
# Outer-join the census table with the renamed conversion table (clean_zip_to_zcta),
# whose ZIP column is called "zip_code". The following cells inspect
# zcta_census["zip_code"]; merging the un-renamed zip_to_zcta_df (column "ZIP_CODE")
# would make them raise a KeyError.
# NOTE(review): assumes census_df has no zip_code column of its own - its schema is
# not shown in this notebook; confirm.
# how="outer" keeps ZCTAs that appear on only one side so unmatched rows can be found.
zcta_census = pd.merge(census_df, clean_zip_to_zcta, how="outer", on="ZCTA")
zcta_census.head()

In [None]:
# Summarise the merged frame; the per-column non-null counts reveal rows that
# found no match on one side of the outer join
zcta_census.info()

Comparing the number of non-null ZCTA (41099) and non-null zip_code (41098), there is one ZCTA in the census table without a matching ZCTA in the zip_to_zcta table.

In [None]:
# Rows of the merged frame that have no zip_code value
missing_zip = zcta_census["zip_code"].isna()
zcta_census[missing_zip]

In [None]:
# Collect the ZCTA values of the rows that have no zip_code
no_zip_rows = zcta_census["zip_code"].isnull()
zcta_to_remove = zcta_census.loc[no_zip_rows, "ZCTA"].tolist()
zcta_to_remove

In [None]:
# Keep only the census rows whose ZCTA is not in the removal list
is_unmatched = census_df["ZCTA"].isin(zcta_to_remove)
census_df = census_df[~is_unmatched]

In [None]:
# Verify the census table after removing the unmatched ZCTA value(s)
census_df.info()

In [None]:
# Write the filtered census table back to CSV, without the index.
# NOTE(review): this is the same path census_df was read from earlier in this notebook,
# so the input file is overwritten and the unfiltered version is not preserved.
census_df.to_csv('../02_transform_census/clean_census.csv', index=False)

### Export cleaned zip_to_zcta_df to csv

In [None]:
# Export ZIP_to_ZCTA dataframe into CSV file (index omitted).
# NOTE(review): this writes zip_to_zcta_df - the filtered frame that still has its
# original column names and the ZIP_TYPE / zip_join_type columns - not the trimmed,
# renamed clean_zip_to_zcta exported earlier. Confirm that both files are needed.
zip_to_zcta_df.to_csv('zcta.csv', index=False)