In [1]:
# Dependencies
import pandas as pd

### Transform the zip_to_zcta dataframe

In [2]:
# Import the ZIP to ZCTA conversion table. All fields are imported as strings to ensure
# the ZIP Codes retain their "00" prefixes.
zip_to_zcta_df = pd.read_excel("../00_input/zip_to_zcta_2019.xlsx", dtype = 'str')

In [3]:
# Preview the first rows of the ZIP/ZCTA dataframe, checking in particular that the
# ZIP Codes retained their "00" prefixes.
zip_to_zcta_df.head()

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,501,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
1,544,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
2,601,Adjuntas,PR,Zip Code Area,601,Zip Matches ZCTA
3,602,Aguada,PR,Zip Code Area,602,Zip Matches ZCTA
4,603,Aguadilla,PR,Zip Code Area,603,Zip Matches ZCTA


In [4]:
# Review column data types and non-null counts in the ZIP/ZCTA dataframe
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41107 entries, 0 to 41106
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ZIP_CODE       41107 non-null  object
 1   PO_NAME        41107 non-null  object
 2   STATE          41106 non-null  object
 3   ZIP_TYPE       41107 non-null  object
 4   ZCTA           41107 non-null  object
 5   zip_join_type  41107 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [5]:
# Remove unnecessary columns to refine dataset.
# The result is reassigned (no inplace mutation) and labels that are already gone are
# ignored, so re-running this cell does not raise a KeyError.
zip_to_zcta_df = zip_to_zcta_df.drop(
    columns=['PO_NAME', 'STATE', 'ZIP_TYPE', 'zip_join_type'],
    errors='ignore',
)
zip_to_zcta_df.head()

Unnamed: 0,ZIP_CODE,ZCTA
0,501,11742
1,544,11742
2,601,601
3,602,602
4,603,603


In [6]:
# Check the minimum and maximum length of values in column ZIP_CODE.
# The vectorised .str accessor is used instead of .apply(len): it is the same accessor
# the later ZCTA filter cells use, and it does not raise on a missing value.
zip_code_len = zip_to_zcta_df["ZIP_CODE"].str.len()
print(f"Max zip_code length is {zip_code_len.max()}.")
print(f"Min zip_code length is {zip_code_len.min()}.")

Max zip_code length is 5.
Min zip_code length is 5.


In [7]:
# Check the minimum and maximum length of values in column ZCTA.
# The vectorised .str accessor is used instead of .apply(len): it is the same accessor
# the later ZCTA filter cells use, and it does not raise on a missing value.
# zcta_len is reused by the following cells, so it is kept as a per-row length Series.
zcta_len = zip_to_zcta_df["ZCTA"].str.len()
print(f"Max zcta length is {zcta_len.max()}.")
print(f"Min zcta length is {zcta_len.min()}.")

Max zcta length is 7.
Min zcta length is 5.


In [8]:
# Inspect the rows whose ZCTA value is longer than the shortest ZCTA length (i.e. > 5)
zip_to_zcta_df.loc[zip_to_zcta_df["ZCTA"].str.len().gt(zcta_len.min())]

Unnamed: 0,ZIP_CODE,ZCTA
41098,96939,No ZCTA
41099,96940,No ZCTA
41100,96941,No ZCTA
41101,96942,No ZCTA
41102,96943,No ZCTA
41103,96944,No ZCTA
41104,96960,No ZCTA
41105,96970,No ZCTA
41106,96898,No ZCTA


In [9]:
# Remove all rows with ZCTA = "No ZCTA".
# The placeholder is matched directly rather than via "longer than the shortest ZCTA",
# so the filter states its intent and does not depend on zcta_len from an earlier cell.
zip_to_zcta_df = zip_to_zcta_df[zip_to_zcta_df["ZCTA"] != "No ZCTA"]

In [10]:
# Re-check the row count and non-null counts after removing the "No ZCTA" rows
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41098 entries, 0 to 41097
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ZIP_CODE  41098 non-null  object
 1   ZCTA      41098 non-null  object
dtypes: object(2)
memory usage: 963.2+ KB


In [11]:
# Count distinct ZIP Codes. All ZIP Codes should be unique (many ZIP Codes to one ZCTA
# relationship), so this count should equal the number of rows reported by .info() above;
# a smaller number would indicate duplicated ZIP Codes.
zip_to_zcta_df['ZIP_CODE'].nunique()

41098

In [32]:
# Count distinct ZCTAs, for comparison with the number of distinct ZIP Codes
zip_to_zcta_df['ZCTA'].nunique()

33166

In [12]:
# Rename column ZIP_CODE to zip_code so it matches the key used when merging on zip_code
zip_to_zcta_df = zip_to_zcta_df.rename({"ZIP_CODE": "zip_code"}, axis="columns")

### Test merging against zip_code in the zip_code table

In [13]:
# Import the cleaned restaurant zip_code table (zip_code, city, state).
# zip_code is read as string — presumably to keep leading zeros, as for the ZIP/ZCTA table; confirm.
zip_code_df = pd.read_csv("../02_transform_restaurant/zip_code.csv", dtype={'zip_code': 'str'})
zip_code_df.head()

Unnamed: 0,zip_code,city,state
0,70301,Thibodaux,LA
1,37863,Pigeon Forge,TN
2,30260,Morrow,GA
3,48204,Detroit,MI
4,48235,Detroit,MI


In [14]:
# Overview of the restaurant zip_code table: data types and non-null counts
zip_code_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  5426 non-null   object
 1   city      5426 non-null   object
 2   state     5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


In [15]:
# Outer-join the restaurant zip_code table with the ZIP/ZCTA table on zip_code,
# keeping unmatched rows from both sides so gaps show up as nulls
zip_zcta_to_zip = zip_code_df.merge(zip_to_zcta_df, on="zip_code", how="outer")
zip_zcta_to_zip.head()

Unnamed: 0,zip_code,city,state,ZCTA
0,70301,Thibodaux,LA,70301
1,37863,Pigeon Forge,TN,37863
2,30260,Morrow,GA,30260
3,48204,Detroit,MI,48204
4,48235,Detroit,MI,48235


In [16]:
# Overview of restaurant data after the outer merge. ZCTA has no nulls, so all zip_codes
# in the restaurant data have a matching ZCTA; city/state are null for ZIP Codes that
# exist only in the ZIP/ZCTA table.
zip_zcta_to_zip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41098 entries, 0 to 41097
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  41098 non-null  object
 1   city      5426 non-null   object
 2   state     5426 non-null   object
 3   ZCTA      41098 non-null  object
dtypes: object(4)
memory usage: 1.6+ MB


As shown above, every zip_code in the zip_code table has a matching zip_code in the zip_to_zcta table (ZCTA has no nulls after the outer merge). However, many zip_codes in the zip_to_zcta table have no equivalent zip_code in the zip_code table: city and state are populated for only 5,426 of the 41,098 rows.

### Test merging against ZCTA in the census table

In [17]:
# Import the cleaned census data, reading ZCTA as string so it can be merged with the
# string ZCTA column of the ZIP/ZCTA table
census_df = pd.read_csv("../02_transform_census/clean_census.csv", dtype={'ZCTA': 'str'})
census_df.head()

Unnamed: 0,ZCTA,population,median_age,median_household_income,per_capita_income,poverty_count,unemployment_count
0,601,17242,40.5,13092.0,6999.0,10772,2316
1,602,38442,42.3,16358.0,9277.0,19611,1927
2,603,48814,41.1,16603.0,11307.0,24337,3124
3,606,6437,43.3,12832.0,5943.0,4163,230
4,610,27073,42.1,19309.0,10220.0,11724,1290


In [18]:
# Overview of the census data: data types and non-null counts
census_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32525 entries, 0 to 32524
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ZCTA                     32525 non-null  object 
 1   population               32525 non-null  int64  
 2   median_age               32525 non-null  float64
 3   median_household_income  30891 non-null  float64
 4   per_capita_income        32456 non-null  float64
 5   poverty_count            32525 non-null  int64  
 6   unemployment_count       32525 non-null  int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 1.7+ MB


In [19]:
# Outer-join the census table with the ZIP/ZCTA table on ZCTA,
# keeping unmatched rows from both sides so gaps show up as nulls
zcta_census = census_df.merge(zip_to_zcta_df, on="ZCTA", how="outer")
zcta_census.head()

Unnamed: 0,ZCTA,population,median_age,median_household_income,per_capita_income,poverty_count,unemployment_count,zip_code
0,601,17242.0,40.5,13092.0,6999.0,10772.0,2316.0,601
1,602,38442.0,42.3,16358.0,9277.0,19611.0,1927.0,602
2,603,48814.0,41.1,16603.0,11307.0,24337.0,3124.0,603
3,603,48814.0,41.1,16603.0,11307.0,24337.0,3124.0,604
4,603,48814.0,41.1,16603.0,11307.0,24337.0,3124.0,605


In [20]:
# Overview of the merged frame: compare the non-null counts of ZCTA and zip_code
# to spot rows that did not match
zcta_census.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41099 entries, 0 to 41098
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ZCTA                     41099 non-null  object 
 1   population               40368 non-null  float64
 2   median_age               40368 non-null  float64
 3   median_household_income  38669 non-null  float64
 4   per_capita_income        40294 non-null  float64
 5   poverty_count            40368 non-null  float64
 6   unemployment_count       40368 non-null  float64
 7   zip_code                 41098 non-null  object 
dtypes: float64(6), object(2)
memory usage: 2.8+ MB


Comparing the non-null counts of ZCTA (41099) and zip_code (41098) shows that one ZCTA in the census table has no matching ZCTA in the zip_to_zcta table; it appears as the single row with a null zip_code.

In [22]:
# Show the census rows whose ZCTA found no zip_code in the ZIP/ZCTA table
zcta_census.loc[zcta_census["zip_code"].isna()]

Unnamed: 0,ZCTA,population,median_age,median_household_income,per_capita_income,poverty_count,unemployment_count,zip_code
38173,95314,95.0,19.4,,8711.0,0.0,0.0,


In [27]:
# Collect, as a plain list, the ZCTA values of the rows that have no matching zip_code
zcta_to_remove = zcta_census.loc[zcta_census["zip_code"].isna(), "ZCTA"].tolist()
zcta_to_remove

['95314']

In [28]:
# Keep only the census rows whose ZCTA is not in the list of unmatched ZCTAs
census_df = census_df.loc[~census_df["ZCTA"].isin(zcta_to_remove)]

In [29]:
# Re-check the census row count and non-null counts after removing the unmatched ZCTA(s)
census_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32524 entries, 0 to 32524
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ZCTA                     32524 non-null  object 
 1   population               32524 non-null  int64  
 2   median_age               32524 non-null  float64
 3   median_household_income  30891 non-null  float64
 4   per_capita_income        32455 non-null  float64
 5   poverty_count            32524 non-null  int64  
 6   unemployment_count       32524 non-null  int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 2.0+ MB


In [30]:
# Update the census csv with the filtered rows (index column not written).
# NOTE(review): this overwrites the same file that was read into census_df earlier in
# this notebook, so the input is modified in place and any later run starts from the
# already-filtered file.
census_df.to_csv('../02_transform_census/clean_census.csv', index=False)

### Export cleaned zip_to_zcta_df to csv

In [31]:
# Export the cleaned ZIP_to_ZCTA dataframe to 'zcta.csv' (path is relative to the
# current working directory), without writing the index column
zip_to_zcta_df.to_csv('zcta.csv', index=False)