In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
from us import states

# Dependencies
import requests
import json

# Census API Key
# `config` is a local module that is not part of this notebook; it must expose
# `api_key` (handed to the Census client below) and `gkey` (sent as the `key`
# parameter of the Google Maps requests made in later cells).
from config import api_key,gkey
# Census client pinned to year=2018; later cells query it through the name `c`.
c = Census(api_key, year=2018)

ModuleNotFoundError: No module named 'config'

In [None]:
# Run Census Search to retrieve data on all zip codes (ACS5; the Census client above is created with year=2018)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels

In [None]:
 # Retrieve ACS5 data for every ZIP code tabulation area in Indiana, using the
# for/in geography format with the state identified by its FIPS code.
mdcheck = c.acs5.get(
    (
        "NAME",
        "B19013_001E",  # household income
        "B01003_001E",  # population
        "B01002_001E",  # median age
        "B19301_001E",  # per capita income
        "B17001_002E",  # poverty count
    ),
    geo={
        'for': 'zip code tabulation area:*',
        'in': 'state:{}'.format(states.IN.fips),
    },
)

# Convert to DataFrame
census_pd2 = pd.DataFrame(mdcheck)

# Only a cell's final expression is rendered, so print the row count and
# leave the preview as the last line.
print(len(census_pd2))
census_pd2.head()

In [None]:
# Derive the poverty rate as a percentage: poverty count over total population.
# Both source columns are cast to int before the arithmetic.
poverty_count = census_pd2["B17001_002E"].astype(int)
population = census_pd2["B01003_001E"].astype(int)
census_pd2["Poverty Rate"] = 100 * poverty_count / population

In [None]:
# Preview the frame, which now carries the derived "Poverty Rate" column
census_pd2.head()

In [None]:
# Replace the raw ACS variable codes with readable column names
column_labels = {
    "NAME": "Name",
    "zip code tabulation area": "Zipcode",
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19013_001E": "Household Income",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
}
census_pd2 = census_pd2.rename(columns=column_labels)

# Final DataFrame: keep only the reporting columns, in presentation order
final_columns = [
    "Zipcode",
    "Population",
    "Median Age",
    "Household Income",
    "Per Capita Income",
    "Poverty Count",
    "Poverty Rate",
]
census_pd_final = census_pd2[final_columns]

# Row count followed by a preview of the result
print(len(census_pd_final))
census_pd_final.head()

In [None]:
# Save as a csv
# Note to avoid any issues later, use encoding="utf-8"
# index=False keeps the DataFrame's row index out of the file.
# NOTE(review): the file name says 2014, but the Census client in the first
# cell is created with year=2018 — confirm which year the name should carry.
census_pd_final.to_csv("census_data_indiana_2014.csv", encoding="utf-8", index=False)

In [None]:
# Load the university listing, using the file's first column as the index
uni_df_raw = pd.read_csv('universities.csv', index_col=0)

# Preview the first rows with the index shown as an ordinary column
uni_df_raw.head().reset_index()

In [None]:
# re-read CSV file into DataFrame df for purposes of grabbing city only (this is inefficient, but it's already built, sooo)
# No index_col here, so the CSV's first column stays a regular column
# (uni_df_raw turns that column into the index instead).
df = pd.read_csv('universities.csv')
df

In [None]:
# Create new dataframe with just the city name
city_list = df[["City"]]

# Remove any duplicates before feeding it through the API.
# .copy() makes the result an independent frame: later cells add Lat/Lng
# columns and write into it with .loc, which on a frame derived from a slice
# can raise SettingWithCopyWarning.
city_list_dedup = city_list.drop_duplicates().copy()
city_list_dedup

In [None]:
# Add the coordinate columns, seeded with a single-space placeholder that the
# geocoding loop overwrites for every city it resolves
for coordinate_column in ("Lat", "Lng"):
    city_list_dedup[coordinate_column] = " "

city_list_dedup

In [None]:
# Geocode every unique city with the Google Geocoding API and write the
# resulting latitude/longitude back onto city_list_dedup.

# Cities for which the API returned no usable result
lonely_city = []

# The endpoint does not change between iterations
geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"

for index, row in city_list_dedup.iterrows():

    target_city_row = row["City"]

    # Qualify the city with the state name for the address lookup
    target_city = f"{target_city_row}, Indiana"

    # Pass the query values through `params` so requests URL-encodes them
    # (city names contain spaces and may contain other reserved characters)
    geo_data = requests.get(
        geocode_url,
        params={"address": target_city, "key": gkey},
    ).json()

    # Extract latitude and longitude from the first result
    try:
        location = geo_data["results"][0]["geometry"]["location"]
        city_list_dedup.loc[index, "Lat"] = location["lat"]
        city_list_dedup.loc[index, "Lng"] = location["lng"]
        print(f"Record found at {target_city}")
    except (KeyError, IndexError):
        print(f"Record could not be found at {target_city}")
        # Collect every unresolved city instead of replacing the list
        lonely_city.append(target_city_row)

In [None]:
# Inspect the city table after the geocoding loop has written Lat/Lng values
city_list_dedup

In [None]:
# Key the coordinate table by City so it can be joined onto the university data
city_list_dedup2 = city_list_dedup.set_index(keys='City')

# Attach Lat/Lng to each university by matching its City column against that
# index, then turn the university index back into a regular column
merged_df = uni_df_raw.join(other=city_list_dedup2, on='City')
merged_df2 = merged_df.reset_index()

# Persist the merged table; utf-8 avoids encoding issues when it is read back
merged_df2.to_csv("city_lat_long_ind.csv", index=False, encoding="utf-8")

merged_df2

In [None]:
# Look up each school with the Google Places Nearby Search API, searching by
# school name around its city's coordinates, and store the first result's
# place_id on merged_df2.

# Request settings that are identical for every row
base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
target_radius = 10000

# Create the column up front so it exists even if every lookup fails;
# later cells read row["place_id"] unconditionally.
if "place_id" not in merged_df2.columns:
    merged_df2["place_id"] = pd.Series(np.nan, index=merged_df2.index, dtype="object")

for index, row in merged_df2.iterrows():

    target_lat = row["Lat"]
    target_lng = row["Lng"]
    target_school = row["School"]

    # Cities that could not be geocoded still hold a blank placeholder (or
    # NaN after the join); a search without coordinates cannot succeed, so
    # report the miss without spending an API call.
    if any(pd.isna(value) or str(value).strip() == "" for value in (target_lat, target_lng)):
        print(f"Record could not be found at {target_school} at {target_lat} and {target_lng}")
        continue

    # geocoordinates
    target_coordinates = f"{target_lat}, {target_lng}"
    target_search = f"{target_school}"

    # set up a parameters dictionary
    params = {
        "location": target_coordinates,
        "keyword": target_search,
        "radius": target_radius,
        "key": gkey
    }

    # run a request using our params dictionary
    response = requests.get(base_url, params=params)

    places_data = response.json()

    # Extract place_id which will be used to find ZIP Code
    try:
        merged_df2.loc[index, "place_id"] = places_data["results"][0]["place_id"]
        print(f"Record found at {target_school}")
    except (KeyError, IndexError):
        print(f"Record could not be found at {target_school} at {target_lat} and {target_lng}")

In [None]:
# Remove Davenport College: further research shows it is in Michigan and online only.
# .copy() gives clean_merge its own data, so the "Zip Code" values written into
# it by later cells do not raise SettingWithCopyWarning on a slice of merged_df2.
clean_merge = merged_df2[merged_df2['School']!="DAVENPORT COLLEGE"].copy()
clean_merge.head(15)

In [None]:
def _postal_code(details):
    """Return the postal code from a Place Details response, or None.

    The address components are searched for the entry typed "postal_code"
    rather than read from a fixed position, because the number and order of
    components differ from place to place.

    Args:
        details: Parsed JSON body of a Place Details response.

    Returns:
        The component's long_name, or None when the response carries no
        result or no postal-code component.
    """
    for component in details.get("result", {}).get("address_components", []):
        if "postal_code" in component.get("types", []):
            return component.get("long_name")
    return None


# The endpoint does not change between iterations
details_url = "https://maps.googleapis.com/maps/api/place/details/json"

for index, row in clean_merge.iterrows():

    target_place_id = row["place_id"]

    # Schools without a Nearby Search match have no place_id to look up
    if pd.isna(target_place_id):
        print(f"Record could not be found for {target_place_id}")
        continue

    # Pass the query values through `params` so requests URL-encodes them
    zip_data = requests.get(
        details_url,
        params={"place_id": target_place_id, "key": gkey},
    ).json()

    # Extract the ZIP code and store it on the matching row
    zip_code = _postal_code(zip_data)
    if zip_code is not None:
        clean_merge.loc[index, "Zip Code"] = zip_code
        print(f"Record found at {target_place_id}")
    else:
        print(f"Record could not be found for {target_place_id}")


In [None]:
# Pull out the two schools whose ZIP lookup reported "record not found",
# identified by their place_id
place_ids = clean_merge['place_id']
first_missing = place_ids == "ChIJa8kQbDA_EogRaj5xfkug230"
second_missing = place_ids == "ChIJUe6QWPnLFogRinKUEbLx5oY"
missing_zips = clean_merge[first_missing | second_missing]
missing_zips

In [None]:
# These records list their address components in a different layout than the
# others, so the ZIP code is located by component type ("postal_code") instead
# of by position, and written back into clean_merge under the same index.

# The endpoint does not change between iterations
details_url = "https://maps.googleapis.com/maps/api/place/details/json"

for index, row in missing_zips.iterrows():

    target_place_id = row["place_id"]

    # Pass the query values through `params` so requests URL-encodes them
    zip_data = requests.get(
        details_url,
        params={"place_id": target_place_id, "key": gkey},
    ).json()

    # First component typed "postal_code", or None when there is none
    address_components = zip_data.get("result", {}).get("address_components", [])
    zip_code = next(
        (
            component.get("long_name")
            for component in address_components
            if "postal_code" in component.get("types", [])
        ),
        None,
    )

    if zip_code is not None:
        clean_merge.loc[index, "Zip Code"] = zip_code
    else:
        print(f"Record could not be found for {target_place_id}")


In [None]:
# Write the school table, now carrying a "Zip Code" column, to disk.
# The row index is left out of the file.
clean_merge.to_csv("colleges_unis_with_zips.csv", index=False, encoding="utf-8")

In [None]:
#start to get the normalized data
# Count schools per ZIP code; value_counts() keeps the ZIP codes in the index
schools_by_zip = clean_merge["Zip Code"].value_counts()
schools_by_zip_df = pd.DataFrame(schools_by_zip)

#schools summarized by zip
# Move the ZIP codes out of the index into a named column before writing:
# with index=False they would otherwise be left out of the file entirely.
schools_by_zip_export = (
    schools_by_zip
    .rename_axis("Zip Code")
    .reset_index(name="School Count")
)
schools_by_zip_export.to_csv('normalized_uni_zip_data.csv', encoding='utf-8', index=False)