In [8]:
"""
Merge the number of fast food restaurants, population, education level, and income datasets.
We only analyze areas that appear in the fast food restaurant data.
In other words, we merge on the zip codes for which we have at least
one restaurant location.
"""

import pandas as pd

# Each cleaned CSV is declared next to the frame that is loaded from it.

# Fast food restaurant data
food_file = "cleaned_fastFood3.csv"
food_data = pd.read_csv(food_file)

# Income data
income_file = "cleaned_income.csv"
income_data = pd.read_csv(income_file)

# Education data
edu_file = "cleaned_education2.csv"
edu_data = pd.read_csv(edu_file)

# Population data
pop_file = "cleaned_zip_population.csv"
pop_data = pd.read_csv(pop_file)

In [9]:

def _with_padded_zipcode(frame, source_col):
    """Return `frame` with `source_col` replaced by a string `zipcode` column.

    The values of `source_col` are converted to strings and left-padded with
    zeros to 5 characters, so that every dataset shares the same merge key.
    The new `zipcode` column is placed last and `source_col` is removed.

    If `source_col` is already gone but `zipcode` exists, the frame is
    returned unchanged. This makes the cell safe to re-run; without the guard
    a second execution would fail with a KeyError on the dropped column.

    Raises:
        KeyError: if neither `source_col` nor `zipcode` is present.
    """
    if source_col not in frame.columns:
        if "zipcode" in frame.columns:
            return frame
        raise KeyError(source_col)
    # Vectorised equivalent of apply(lambda x: str(x).zfill(5))
    zipcode = frame[source_col].astype(str).str.zfill(5)
    return frame.drop(columns=[source_col]).assign(zipcode=zipcode)


# Convert Zip Code from numpy.int64 to str and remove the original zip columns
food_data = _with_padded_zipcode(food_data, "zip_code")
pop_data = _with_padded_zipcode(pop_data, "zipcode_str")
edu_data = _with_padded_zipcode(edu_data, "zip code")
income_data = _with_padded_zipcode(income_data, "zip_code")

# Remove unnecessary columns (errors="ignore" keeps the cell re-runnable)
pop_data = pop_data.drop(columns=["Unnamed: 0"], errors="ignore")

# Rename columns: fix the misspelled population header.
# index=str also casts the row labels to strings; kept so pop_data's index
# stays the same as before.
pop_data = pop_data.rename(index=str, columns={"popultion": "population"})

In [10]:
# Show the first rows of the fast food frame
food_data.head()

Unnamed: 0,number_of_fastfood,zipcode
0,2,1020
1,1,1027
2,1,1040
3,1,1060
4,1,1073


In [11]:
# Show the first rows of the income frame
income_data.head()

Unnamed: 0,mean,median,stdev,lat,lon,zipcode
0,22496.33,13144.33,27148.67,18.1718,-66.7133,601
1,15078.4,11323.6,12503.6,18.3847,-67.1932,602
2,26046.8,15386.4,29969.6,18.4395,-67.123,603
3,27497.5,9461.0,41453.5,18.4723,-67.1297,605
4,19194.0,15233.0,15261.0,18.1922,-66.9809,606


In [12]:
# Show the first rows of the education frame
edu_data.head()

Unnamed: 0,graduate_or_professional,highSchool_or_higher,bachelors_or_higher,zipcode
0,5.0,92.1,14.1,49347
1,18.0,96.0,45.4,63126
2,10.1,88.7,23.9,21921
3,9.2,74.7,13.9,37880
4,20.8,95.0,51.5,27455


In [13]:
# Show the first rows of the population frame
pop_data.head()

Unnamed: 0,population,state,county,zipcode
0,17800,PR,Adjuntas Municipio,601
1,39716,PR,Aguada Municipio,602
2,51565,PR,Aguadilla Municipio,603
3,6320,PR,Maricao Municipio,606
4,27976,PR,Aasco Municipio,610


In [14]:
# Merge Fast Food and Population data.
# A left join keeps every zip code present in the fast food data.
food_pop = food_data.merge(pop_data, on="zipcode", how="left")

# Code to check rows for missing data
# food_pop[food_pop.isna().any(axis=1)]

# Drop rows with missing data (any NaN in the row, i.e. zip codes for which
# the left join found no population record or the record is incomplete)
food_pop = food_pop.dropna()
food_pop.count()

number_of_fastfood    5185
zipcode               5185
population            5185
state                 5185
county                5185
dtype: int64

In [18]:
# Merge with education data.
# Left join: zip codes without an education record are kept and get NaN in
# the education columns.
food_pop_edu = food_pop.merge(edu_data, on="zipcode", how="left")

# Code to check rows for missing data
# food_pop_edu[food_pop_edu.isna().any(axis=1)]

# Rows with missing education data are NOT dropped here; the dropna() that
# follows the income merge removes every incomplete row in one step.

# Most frequent zip codes after the merge: a count above 1 would mean the
# merge duplicated rows.
food_pop_edu["zipcode"].value_counts().head()

02215    1
38703    1
24614    1
80863    1
28403    1
Name: zipcode, dtype: int64

In [37]:
# Left-join the income data onto the food/population/education frame,
# matching rows on the shared "zipcode" column.
food_pop_edu_income1 = pd.merge(food_pop_edu, income_data, how="left", on="zipcode")

# Preview the merged dataset
food_pop_edu_income1.head()

Unnamed: 0,number_of_fastfood,zipcode,population,state,county,graduate_or_professional,highSchool_or_higher,bachelors_or_higher,mean,median,stdev,lat,lon
0,2,1020,30629.0,MA,Hampden County,6.1,87.9,16.2,63987.5,57298.5,40508.0,42.173,-72.5854
1,1,1027,17872.0,MA,Hampshire County,12.8,95.2,35.5,93036.5,190748.5,66716.5,42.26,-72.6939
2,1,1040,40280.0,MA,Hampden County,9.4,79.0,24.1,55908.5,79691.5,44096.83,42.2047,-72.6244
3,1,1060,15407.0,MA,Hampshire County,34.5,95.4,61.8,85456.0,300000.0,59449.0,42.3091,-72.6678
4,1,1073,6081.0,MA,Hampshire County,19.1,94.4,37.8,,,,,


In [38]:
# Check for duplicates: list every row whose zip code occurs more than once.
# The mask is computed on the same frame it filters. The earlier version
# built it from `food_pop_edu_income`, a name that is not defined anywhere in
# the notebook, so it failed on a fresh kernel or silently used stale data.
is_duplicate_zip = food_pop_edu_income1.duplicated(subset=["zipcode"], keep=False)
food_pop_duplicates = food_pop_edu_income1[is_duplicate_zip]
food_pop_duplicates.sort_values(by=["zipcode"])

Unnamed: 0,number_of_fastfood,zipcode,population,state,county,graduate_or_professional,highSchool_or_higher,bachelors_or_higher,mean,median,stdev,lat,lon


In [39]:
# Check the merged dataset for missing data: count() gives the number of
# non-null values per column, so a column whose count is below the others
# contains missing values.
food_pop_edu_income1.count()

number_of_fastfood          5185
zipcode                     5185
population                  5185
state                       5185
county                      5185
graduate_or_professional    5181
highSchool_or_higher        5181
bachelors_or_higher         5181
mean                        4526
median                      4526
stdev                       4526
lat                         4526
lon                         4526
dtype: int64

In [40]:
# Delete the rows with missing data: only rows that have a value in every
# column are kept (dropna defaults to how="any").
food_pop_edu_income_final = food_pop_edu_income1.dropna()

# Non-null count per column of the final dataset
food_pop_edu_income_final.count()

number_of_fastfood          4526
zipcode                     4526
population                  4526
state                       4526
county                      4526
graduate_or_professional    4526
highSchool_or_higher        4526
bachelors_or_higher         4526
mean                        4526
median                      4526
stdev                       4526
lat                         4526
lon                         4526
dtype: int64