In [1]:
# Dependencies
import pandas as pd

### Transform the zip_to_zcta dataframe

In [2]:
# Load the ZIP-to-ZCTA conversion table. Every field is read as a string so that
# ZIP Codes keep their leading "00" prefixes.
zip_to_zcta_df = pd.read_excel(
    "../00_input/zip_to_zcta_2019.xlsx",
    dtype="str",
)

In [3]:
# Preview the first rows of the ZIP/ZCTA dataframe, in particular to confirm that
# ZIP Codes retained their leading "00"
zip_to_zcta_df.head()

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,501,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
1,544,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
2,601,Adjuntas,PR,Zip Code Area,601,Zip Matches ZCTA
3,602,Aguada,PR,Zip Code Area,602,Zip Matches ZCTA
4,603,Aguadilla,PR,Zip Code Area,603,Zip Matches ZCTA


In [4]:
# Review column names, non-null counts, and data types in the ZIP/ZCTA dataframe
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41107 entries, 0 to 41106
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ZIP_CODE       41107 non-null  object
 1   PO_NAME        41107 non-null  object
 2   STATE          41106 non-null  object
 3   ZIP_TYPE       41107 non-null  object
 4   ZCTA           41107 non-null  object
 5   zip_join_type  41107 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [5]:
# Refine the dataset to the two columns needed for the ZIP -> ZCTA lookup.
# Selecting the columns into a new frame (instead of drop(..., inplace=True)) avoids
# mutating the loaded frame in place and lets this cell be re-run without a KeyError
# for already-dropped columns.
zip_to_zcta_df = zip_to_zcta_df[['ZIP_CODE', 'ZCTA']].copy()
zip_to_zcta_df.head()

Unnamed: 0,ZIP_CODE,ZCTA
0,501,11742
1,544,11742
2,601,601
3,602,602
4,603,603


In [6]:
# Check the minimum and maximum length of values in column ZIP_CODE.
# .str.len() is vectorized and returns NaN for missing values, whereas apply(len)
# raises TypeError on them; max()/min() then skip those NaN entries.
zip_code_len = zip_to_zcta_df["ZIP_CODE"].str.len()
print(f"Max zip_code length is {zip_code_len.max()}.")
print(f"Min zip_code length is {zip_code_len.min()}.")

Max zip_code length is 5.
Min zip_code length is 5.


In [7]:
# Check the minimum and maximum length of values in column ZCTA.
# .str.len() is vectorized and returns NaN for missing values, whereas apply(len)
# raises TypeError on them; max()/min() then skip those NaN entries.
zcta_len = zip_to_zcta_df["ZCTA"].str.len()
print(f"Max zcta length is {zcta_len.max()}.")
print(f"Min zcta length is {zcta_len.min()}.")

Max zcta length is 7.
Min zcta length is 5.


In [8]:
# Show the rows whose ZCTA value is longer than the shortest ZCTA length found above
zip_to_zcta_df.loc[zip_to_zcta_df["ZCTA"].str.len().gt(zcta_len.min())]

Unnamed: 0,ZIP_CODE,ZCTA
41098,96939,No ZCTA
41099,96940,No ZCTA
41100,96941,No ZCTA
41101,96942,No ZCTA
41102,96943,No ZCTA
41103,96944,No ZCTA
41104,96960,No ZCTA
41105,96970,No ZCTA
41106,96898,No ZCTA


In [9]:
# Remove all rows with ZCTA = "No ZCTA".
# The placeholder text is matched explicitly rather than by comparing lengths against
# zcta_len.min(), which was computed in an earlier cell and only identifies the
# placeholder indirectly.
zip_to_zcta_df = zip_to_zcta_df[zip_to_zcta_df["ZCTA"] != "No ZCTA"]
# Every remaining ZCTA is expected to be a 5-character code; fail loudly otherwise
assert zip_to_zcta_df["ZCTA"].str.len().eq(5).all(), "Unexpected ZCTA values remain after filtering"

In [10]:
# Re-check the row count, non-null counts, and data types of the ZIP/ZCTA dataframe
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41098 entries, 0 to 41097
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ZIP_CODE  41098 non-null  object
 1   ZCTA      41098 non-null  object
dtypes: object(2)
memory usage: 963.2+ KB


In [11]:
# All ZIP Codes should be unique (many ZIP Codes to one ZCTA relationship).
# Assert it explicitly so duplicates stop the notebook instead of relying on a manual
# comparison of the unique count against the row count.
assert zip_to_zcta_df['ZIP_CODE'].is_unique, "Duplicate ZIP Codes found in zip_to_zcta_df"
# Number of distinct ZIP Codes (equals the number of rows when the assertion holds)
zip_to_zcta_df['ZIP_CODE'].nunique()

41098

In [12]:
# Rename column ZIP_CODE to zip_code
zip_to_zcta_df = zip_to_zcta_df.rename({"ZIP_CODE": "zip_code"}, axis="columns")

### Test merging against zip_code in the restaurant table

In [14]:
# Load the cleaned restaurant addresses, reading zip_code as a string
restaurant_clean_df = pd.read_csv(
    "../02_transform_restaurant/restaurant_address.csv",
    dtype={"zip_code": "str"},
)
restaurant_clean_df.head()

Unnamed: 0,restaurant_id,street_no,street_name,zip_code
0,1,800,N Canal Blvd,70301
1,1,124,John R Rd,48083
2,1,909,N Wood,75644
3,1,97,Gateway Blvd,82901
4,1,6557,S Staples St,78413


In [15]:
# Overview of the restaurant data: row count, non-null counts, and data types per column
restaurant_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9343 entries, 0 to 9342
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   restaurant_id  9343 non-null   int64 
 1   street_no      9343 non-null   object
 2   street_name    9343 non-null   object
 3   zip_code       9343 non-null   object
dtypes: int64(1), object(3)
memory usage: 292.1+ KB


In [16]:
# Merge restaurant data with zip_to_zcta data based on zip_code.
# validate="many_to_one" raises MergeError if a zip_code appears more than once in
# zip_to_zcta_df, which would otherwise duplicate restaurant rows.
restaurant_df_with_zcta = pd.merge(
    restaurant_clean_df,
    zip_to_zcta_df,
    on="zip_code",
    validate="many_to_one",
)
# The default inner join drops restaurants whose zip_code has no ZCTA; with unique
# lookup keys, equal row counts mean every restaurant found a match.
assert len(restaurant_df_with_zcta) == len(restaurant_clean_df), (
    "Some restaurant zip_codes have no matching ZCTA"
)
restaurant_df_with_zcta.head()

Unnamed: 0,restaurant_id,street_no,street_name,zip_code,ZCTA
0,1,800,N Canal Blvd,70301,70301
1,17,1020,S Acadia Rd,70301,70301
2,96,204,N Canal Blvd,70301,70301
3,542,612,N Canal Blvd,70301,70301
4,1,124,John R Rd,48083,48083


In [17]:
# Overview of restaurant data after merging. The merged frame has the same number of
# rows as the restaurant data (9343) and no null ZCTA values
# --> all zip_codes in restaurant data have a matching ZCTA
restaurant_df_with_zcta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9343 entries, 0 to 9342
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   restaurant_id  9343 non-null   int64 
 1   street_no      9343 non-null   object
 2   street_name    9343 non-null   object
 3   zip_code       9343 non-null   object
 4   ZCTA           9343 non-null   object
dtypes: int64(1), object(4)
memory usage: 438.0+ KB


In [18]:
# List the merged rows where the zip_code is not the same as its ZCTA
restaurant_df_with_zcta.loc[lambda merged: merged["zip_code"].ne(merged["ZCTA"])]

Unnamed: 0,restaurant_id,street_no,street_name,zip_code,ZCTA
2100,2,3197,W 5400 S,84129,84118
2705,3,4303,Prospect Ave,62524,62526
3163,3,787,Erie Blvd W,13201,13202
4096,5,5757,Wayne Newton Blvd,89111,89119
6802,17,250,Summit Park Dr,15275,15108
6803,17,100,Davis Blvd,15275,15108
6970,17,9500,Euclid Ave,44195,44106
7494,26,6986,Chestnut St,95021,95020
8612,77,169,International Ctr,48824,48825
8656,77,1400,R St,68588,68508


### Export cleaned zip_to_zcta_df to csv

In [19]:
# Write the cleaned ZIP-to-ZCTA dataframe to a CSV file, leaving out the index column
zip_to_zcta_df.to_csv(path_or_buf='zcta.csv', index=False)