In [1]:
# Dependencies
import requests
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
from statistics import mean

# Silence all Python warnings for the rest of the notebook.
# The warnings that prompted this were the "trying to set a value on a copy of
# a slice" kind (presumably pandas' SettingWithCopyWarning -- TODO confirm).
# The code works despite them, but this is a stop-gap: both calls below install
# a blanket "ignore" filter, so unrelated warnings are hidden as well.
# Writing code that not only works but works well will have to be a future task.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Google developer API key
from config import google_api_key

In [2]:
# Switch between live Google API calls and previously saved csv files.
#   True  -> call the google api (which also writes the csv files)
#   False -> assume the csv files already exist and read the data from them
api_needed = False

In [5]:
# Load the statewide NJ census data and slice it down to Essex county only,
# our sample set for correlation testing.
# The csv was generated via calls to the Census API, which is in the census_api.ipynb
census_csv = "Resources/NJ_Census_Coord_Data.csv"

census_nj_df = pd.read_csv(census_csv, index_col=0)

# .copy() makes the Essex subset an independent frame, so columns can be added
# to it later without pandas raising SettingWithCopyWarning
census_essex_df = census_nj_df[census_nj_df["County"] == "Essex"].copy()

In [6]:
# google api call function
# will write to a csv upon completion so we do not have to keep calling the api

def get_google_data(in_census_df, out_census_csv):
    """Add average nearby-restaurant price and rating columns to a census frame.

    For every row of ``in_census_df`` one Google Places "nearby search" request
    is made for restaurants, ranked by distance from the row's ``Latitude`` /
    ``Longitude``. The ``price_level`` and ``rating`` values found in the
    response are averaged per row and stored in two new columns,
    ``"Average Price"`` and ``"Average Rating"``. The frame is modified in
    place and then written to ``out_census_csv`` so the api does not have to
    be called again.

    Parameters
    ----------
    in_census_df : pandas.DataFrame
        One row per zip code; must contain ``Latitude`` and ``Longitude``.
    out_census_csv : str
        Path of the csv file to write the augmented frame to.

    Notes
    -----
    Every row contributes exactly one value to each new column, so the column
    lengths always match the frame, even when a search returns no results or
    the response has no ``"results"`` key. In those cases, and when no returned
    restaurant carries the field, the average is recorded as 0.
    NOTE(review): 0 therefore means "no data" rather than a real score, which
    can pull correlations down -- consider whether NaN would be a better marker.
    """
    # the endpoint is the same for every row, so define it once
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # one average per zip code, in the same order as the frame's rows
    avg_price_data = []
    avg_ratings_data = []

    for _, row in in_census_df.iterrows():
        params = {
            # google places cannot search by zip code, so pass its coordinates
            "location": str(row["Latitude"]) + ", " + str(row["Longitude"]),
            # rank by distance rather than google's default ranking, which can
            # pull in recommended restaurants from the next town over
            "rankby": "distance",
            "type": "restaurant",
            "key": google_api_key,
        }

        response_json = requests.get(base_url, params=params).json()

        # a response without a "results" key is treated as "no restaurants found"
        results = response_json.get("results", [])

        # restaurants without a price or rating are simply left out of the mean
        prices = [place["price_level"] for place in results if "price_level" in place]
        ratings = [place["rating"] for place in results if "rating" in place]

        # always append, even for an empty result set, to stay aligned with rows
        avg_price_data.append(_mean_or_zero(prices))
        avg_ratings_data.append(_mean_or_zero(ratings))

    # add averages for each zip code to input dataframe
    in_census_df["Average Price"] = avg_price_data
    in_census_df["Average Rating"] = avg_ratings_data

    # output to csv, so we do not have to run this api again
    in_census_df.to_csv(out_census_csv)


def _mean_or_zero(values):
    """Return the arithmetic mean of ``values``, or 0 when ``values`` is empty."""
    return mean(values) if values else 0

In [7]:
# add restaurant price and ratings data to the Essex frame,
# either via api or from previously built csv
essex_google_csv = "Resources/NJ_Census_Google_Essex.csv"

if api_needed:
    get_google_data(census_essex_df, essex_google_csv)
else:
    census_essex_df = pd.read_csv(essex_google_csv, index_col=0)

In [9]:
# Correlate the demographic columns against the restaurant columns for Essex.
# Baljit is doing graphing of this data that will tell story better,
# but I wanted some R numbers for the correlations

demographic_cols = ["Median Age", "Household Income", "Foreign Percentage"]
restaurant_cols = ["Average Price", "Average Rating"]

slice_df = census_essex_df[demographic_cols + restaurant_cols]

# full pairwise correlation, then keep only the restaurant columns
slice_corr_df = slice_df.corr().drop(columns=demographic_cols)
slice_corr_df

Unnamed: 0,Average Price,Average Rating
Median Age,0.471073,0.26942
Household Income,0.618979,0.429359
Foreign Percentage,-0.262561,-0.146734
Average Price,1.0,0.011307
Average Rating,0.011307,1.0


In [10]:
# Several zip codes with low population report a negative household income,
# usually accompanied by a few other fields with bad data.
# Slice them out. The data is incomplete and the populations are small enough
# that it should have minimal effect on our conclusions.

has_valid_income = census_nj_df["Household Income"] >= 0
clean_nj_df = census_nj_df[has_valid_income]

In [11]:
# sort nj zip codes by median age (ascending).
# place the 25 lowest and 25 highest zip codes in their own dataframes;
# .copy() makes them independent frames, so restaurant columns can be added
# later without pandas raising SettingWithCopyWarning

age_df = clean_nj_df.sort_values("Median Age")
age_young_df = age_df.head(25).copy()
age_old_df = age_df.tail(25).copy()

In [12]:
# sort nj zip codes by household income (ascending).
# place the 25 lowest and 25 highest zip codes in their own dataframes;
# .copy() makes them independent frames, so restaurant columns can be added
# later without pandas raising SettingWithCopyWarning

income_df = clean_nj_df.sort_values("Household Income")
income_poor_df = income_df.head(25).copy()
income_rich_df = income_df.tail(25).copy()

In [13]:
# sort nj zip codes by percentage of foreign born residents (ascending).
# place the 25 lowest and 25 highest zip codes in their own dataframes;
# .copy() makes them independent frames, so restaurant columns can be added
# later without pandas raising SettingWithCopyWarning

foreign_df = clean_nj_df.sort_values("Foreign Percentage")
foreign_low_df = foreign_df.head(25).copy()
foreign_high_df = foreign_df.tail(25).copy()

In [14]:
# add restaurant price and ratings data for the youngest zip codes,
# either via api or from previously built csv
age_young_csv = "Resources/NJ_Age_Young_Data.csv"

if api_needed:
    get_google_data(age_young_df, age_young_csv)
else:
    age_young_df = pd.read_csv(age_young_csv, index_col=0)

In [15]:
# add restaurant price and ratings data for the oldest zip codes,
# either via api or from previously built csv
age_old_csv = "Resources/NJ_Age_Old_Data.csv"

if api_needed:
    get_google_data(age_old_df, age_old_csv)
else:
    age_old_df = pd.read_csv(age_old_csv, index_col=0)

In [16]:
# add restaurant price and ratings data for the lowest-income zip codes,
# either via api or from previously built csv
income_poor_csv = "Resources/NJ_Income_Poor_Data.csv"

if api_needed:
    get_google_data(income_poor_df, income_poor_csv)
else:
    income_poor_df = pd.read_csv(income_poor_csv, index_col=0)

In [17]:
# add restaurant price and ratings data for the highest-income zip codes,
# either via api or from previously built csv
income_rich_csv = "Resources/NJ_Income_Rich_Data.csv"

if api_needed:
    get_google_data(income_rich_df, income_rich_csv)
else:
    income_rich_df = pd.read_csv(income_rich_csv, index_col=0)

In [18]:
# add restaurant price and ratings data for the zip codes with the lowest
# foreign-born percentage, either via api or from previously built csv
foreign_low_csv = "Resources/NJ_Foreign_Low_Data.csv"

if api_needed:
    get_google_data(foreign_low_df, foreign_low_csv)
else:
    foreign_low_df = pd.read_csv(foreign_low_csv, index_col=0)

In [19]:
# add restaurant price and ratings data for the zip codes with the highest
# foreign-born percentage, either via api or from previously built csv
foreign_high_csv = "Resources/NJ_Foreign_High_Data.csv"

if api_needed:
    get_google_data(foreign_high_df, foreign_high_csv)
else:
    # index_col=0 restores the index that to_csv wrote as the first column,
    # matching how every other cached csv in this notebook is loaded
    foreign_high_df = pd.read_csv(foreign_high_csv, index_col=0)