In [2]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
from us import states

# Dependencies
import requests
import json

# Census API Key
from config import api_key,gkey
c = Census(api_key, year=2018)

In [3]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels

In [4]:
 # retrieve the census data using the for / in in the Fips format
    mdcheck = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                          "B19301_001E",
                          "B17001_002E"),
           geo={'for': 'zip code tabulation area:*',
                        'in': 'state:{}'.format(states.IN.fips)})

 # Convert to DataFrame
census_pd2 = pd.DataFrame(mdcheck)

census_pd2.head()
 len(census_pd2)

775

In [5]:
# Add in Poverty Rate (Poverty Count / Population)
census_pd2["Poverty Rate"] = 100 * \
    census_pd2["B17001_002E"].astype(
        int) / census_pd2["B01003_001E"].astype(int)

In [6]:
census_pd2.head()

Unnamed: 0,NAME,B19013_001E,B01003_001E,B01002_001E,B19301_001E,B17001_002E,state,zip code tabulation area,Poverty Rate
0,ZCTA5 46511,52570.0,4254.0,46.9,32929.0,348.0,18,46511,8.180536
1,ZCTA5 46526,54187.0,32513.0,36.0,24312.0,4495.0,18,46526,13.825239
2,ZCTA5 46528,63537.0,27165.0,32.6,26499.0,2439.0,18,46528,8.978465
3,ZCTA5 46544,47778.0,31359.0,38.0,25268.0,4324.0,18,46544,13.788705
4,ZCTA5 46553,62829.0,3212.0,36.7,24039.0,78.0,18,46553,2.428394


In [7]:
# Column Reordering
census_pd2 = census_pd2.rename(columns={"B01003_001E": "Population",
                                      "B01002_001E": "Median Age",
                                      "B19013_001E": "Household Income",
                                      "B19301_001E": "Per Capita Income",
                                      "B17001_002E": "Poverty Count",
                                      "NAME": "Name", "zip code tabulation area": "Zipcode"})
# Final DataFrame
census_pd_final = census_pd2[["Zipcode", "Population", "Median Age", "Household Income",
                       "Per Capita Income", "Poverty Count", "Poverty Rate"]]

# Visualize
print(len(census_pd_final))
census_pd_final.head()

775


Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate
0,46511,4254.0,46.9,52570.0,32929.0,348.0,8.180536
1,46526,32513.0,36.0,54187.0,24312.0,4495.0,13.825239
2,46528,27165.0,32.6,63537.0,26499.0,2439.0,8.978465
3,46544,31359.0,38.0,47778.0,25268.0,4324.0,13.788705
4,46553,3212.0,36.7,62829.0,24039.0,78.0,2.428394


In [8]:
# Save as a csv
# Note to avoid any issues later, use encoding="utf-8"
census_pd_final.to_csv("census_data_indiana_2014.csv", encoding="utf-8", index=False)

In [9]:
# Read CSV file into DataFrame df for university data
uni_df_raw = pd.read_csv('universities.csv', index_col=0)

# Show dataframe
uni_df_raw.reset_index().head()

Unnamed: 0,School,City,State
0,ANCILLA COLLEGE,DONALDSON,IN
1,ANDERSON UNIVERSITY,ANDERSON,IN
2,BALL STATE UNIVERSITY,MUNCIE,IN
3,BETHEL COLLEGE,MISHAWAKA,IN
4,BUTLER UNIVERSITY,INDIANAPOLIS,IN


In [14]:
# re-read CSV file into DataFrame df for purposes of grabbing city only (this is inefficient, but it's already built, sooo)
df = pd.read_csv('universities.csv')
df

Unnamed: 0,School,City,State
0,ANCILLA COLLEGE,DONALDSON,IN
1,ANDERSON UNIVERSITY,ANDERSON,IN
2,BALL STATE UNIVERSITY,MUNCIE,IN
3,BETHEL COLLEGE,MISHAWAKA,IN
4,BUTLER UNIVERSITY,INDIANAPOLIS,IN
...,...,...,...
58,UNIVERSITY OF NOTRE DAME,SOUTH BEND,IN
59,UNIVERSITY OF SOUTHERN INDIANA,EVANSVILLE,IN
60,VALPARAISO UNIVERSITY,VALPARAISO,IN
61,VINCENNES UNIVERSITY,VINCENNES,IN


In [19]:
# Create new dataframe with just the city name
city_list = df[["City"]]
city_list

# Remove any duplicates before feeding it through the API
city_list_dedup = city_list.drop_duplicates()
city_list_dedup

Unnamed: 0,City
0,DONALDSON
1,ANDERSON
2,MUNCIE
3,MISHAWAKA
4,INDIANAPOLIS
5,WHITING
6,FORT WAYNE
7,GRANGER
8,EVANSVILLE
9,GREENCASTLE


In [20]:
city_list_dedup['Lat']=" "
city_list_dedup['Lng']=" "

city_list_dedup

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_list_dedup['Lat']=" "
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_list_dedup['Lng']=" "


Unnamed: 0,City,Lat,Lng
0,DONALDSON,,
1,ANDERSON,,
2,MUNCIE,,
3,MISHAWAKA,,
4,INDIANAPOLIS,,
5,WHITING,,
6,FORT WAYNE,,
7,GRANGER,,
8,EVANSVILLE,,
9,GREENCASTLE,,


In [21]:
# Run a request to endpoint and convert result to json

lonely_city = []


for index,row in city_list_dedup.iterrows():
    
    target_city_row = row["City"]
    
    target_city = f"{target_city_row}, Indiana"

# Build the endpoint URL
    target_url = ('https://maps.googleapis.com/maps/api/geocode/json?'
    'address={0}&key={1}').format(target_city, gkey)
    
    geo_data = requests.get(target_url).json()
    
# Extract latitude and longitude
    try:
        city_list_dedup.loc[index,"Lat"] = geo_data["results"][0]["geometry"]["location"]["lat"]
        city_list_dedup.loc[index,"Lng"] = geo_data["results"][0]["geometry"]["location"]["lng"]
        print(f"Record found at {target_city}")
    except (KeyError, IndexError):
        print(f"Record could not be found at {target_city}")
        lonely_city = target_city_row

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Record found at DONALDSON, Indiana
Record found at ANDERSON, Indiana
Record found at MUNCIE, Indiana
Record found at MISHAWAKA, Indiana
Record found at INDIANAPOLIS, Indiana
Record found at WHITING, Indiana
Record found at FORT WAYNE, Indiana
Record found at GRANGER, Indiana
Record found at EVANSVILLE, Indiana
Record found at GREENCASTLE, Indiana
Record found at RICHMOND, Indiana
Record found at CHESTERTON, Indiana
Record found at FRANKLIN, Indiana
Record found at GOSHEN, Indiana
Record found at WINONA LAKE, Indiana
Record found at HANOVER, Indiana
Record found at SOUTH BEND, Indiana
Record found at HUNTINGTON, Indiana
Record found at CROWN POINT, Indiana
Record found at TERRE HAUTE, Indiana
Record found at KOKOMO, Indiana
Record found at GARY, Indiana
Record found at NEW ALBANY, Indiana
Record found at BLOOMINGTON, Indiana
Record found at MARION, Indiana
Record found at LAFAYETTE, Indiana
Record found at NORTH MANCHESTER, Indiana
Record found at JEFFERSONVILLE, Indiana
Record found at

In [22]:
city_list_dedup

Unnamed: 0,City,Lat,Lng
0,DONALDSON,41.364484,-86.444133
1,ANDERSON,40.10532,-85.680254
2,MUNCIE,40.193377,-85.38636
3,MISHAWAKA,41.661993,-86.158616
4,INDIANAPOLIS,39.768403,-86.158068
5,WHITING,41.679758,-87.494487
6,FORT WAYNE,41.079273,-85.139351
7,GRANGER,41.753382,-86.110838
8,EVANSVILLE,37.971559,-87.57109
9,GREENCASTLE,39.64449,-86.864732


In [25]:
#Set Index to City to complete the merge with uni_df
city_list_dedup2 = city_list_dedup.set_index('City')
city_list_dedup2

#complete the merge
merged_df = uni_df_raw.join(city_list_dedup2, on='City')
merged_df2 = merged_df.reset_index()

# Note to avoid any issues later, use encoding="utf-8"
merged_df2.to_csv("city_lat_long_ind.csv", encoding="utf-8", index=False)


merged_df2

Unnamed: 0,School,City,State,Lat,Lng
0,ANCILLA COLLEGE,DONALDSON,IN,41.364484,-86.444133
1,ANDERSON UNIVERSITY,ANDERSON,IN,40.10532,-85.680254
2,BALL STATE UNIVERSITY,MUNCIE,IN,40.193377,-85.38636
3,BETHEL COLLEGE,MISHAWAKA,IN,41.661993,-86.158616
4,BUTLER UNIVERSITY,INDIANAPOLIS,IN,39.768403,-86.158068
...,...,...,...,...,...
58,UNIVERSITY OF NOTRE DAME,SOUTH BEND,IN,41.676355,-86.25199
59,UNIVERSITY OF SOUTHERN INDIANA,EVANSVILLE,IN,37.971559,-87.57109
60,VALPARAISO UNIVERSITY,VALPARAISO,IN,41.473095,-87.061141
61,VINCENNES UNIVERSITY,VINCENNES,IN,38.677269,-87.528633


In [26]:
for index,row in merged_df2.iterrows():
    
    target_lat = row["Lat"]
    target_lng = row["Lng"]
    target_school = row["School"]

    # geocoordinates
    target_coordinates = f"{target_lat}, {target_lng}"
    target_search = f"{target_school}"
    target_radius = 10000

    # set up a parameters dictionary
    params = {
        "location": target_coordinates,
        "keyword": target_search,
        "radius": target_radius,
        "key": gkey
    }

    # base url
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # run a request using our params dictionary
    response = requests.get(base_url, params=params)
    
    places_data = response.json()
    
    # Extract place_id which will be used to find ZIP Code
    try:
        merged_df2.loc[index,"place_id"] = places_data["results"][0]["place_id"]
        print(f"Record found at {target_school}")
    except (KeyError, IndexError):
        print(f"Record could not be found at {target_school} at {target_lat} and {target_lng}")

Record found at ANCILLA COLLEGE
Record found at ANDERSON UNIVERSITY
Record found at BALL STATE UNIVERSITY
Record found at BETHEL COLLEGE
Record found at BUTLER UNIVERSITY
Record found at CALUMET COLLEGE
Record found at CONCORDIA THEOLOGICAL SEMINARY
Record could not be found at DAVENPORT COLLEGE at 41.7533819 and -86.11083769999999
Record found at DEACONESS HOSPITAL SCHOOL OF NURSING
Record found at DEPAUW UNIVERSITY
Record found at EARLHAM COLLEGE
Record found at FAIRHAVEN COLLEGE
Record found at FRANKLIN COLLEGE
Record found at GOSHEN COLLEGE
Record found at GRACE COLLEGE
Record found at GRACE THEOLOGICAL SEMINARY
Record found at HANOVER COLLEGE
Record found at HOLY CROSS JUNIOR COLLEGE
Record found at HUNTINGTON COLLEGE
Record found at HYLES-ANDERSON COLLEGE
Record found at INDIANA INSTITUTE OF TECHNOLOGY
Record found at INDIANA STATE UNIVERSITY
Record found at INDIANA UNIVERSITY AT KOKOMO
Record found at INDIANA UNIVERSITY AT SOUTH BEND
Record found at INDIANA UNIVERSITY EAST
Recor

In [36]:
#Remove Davenport college as further research shows it's in Michigan and and online only
clean_merge = merged_df2[merged_df2['School']!="DAVENPORT COLLEGE"]
clean_merge.head(15)

Unnamed: 0,School,City,State,Lat,Lng,place_id,Zip Code
0,ANCILLA COLLEGE,DONALDSON,IN,41.364484,-86.444133,ChIJNykO2EVaEYgR6iuHCTAnQGc,46563
1,ANDERSON UNIVERSITY,ANDERSON,IN,40.10532,-85.680254,ChIJvRvrHQzZFIgRaG89vy-ipXg,46012
2,BALL STATE UNIVERSITY,MUNCIE,IN,40.193377,-85.38636,ChIJIR58aWY9FYgR9ImfGJvu4OQ,47306
3,BETHEL COLLEGE,MISHAWAKA,IN,41.661993,-86.158616,ChIJf2dC58HNFogRb1W460Xop8g,46545
4,BUTLER UNIVERSITY,INDIANAPOLIS,IN,39.768403,-86.158068,ChIJr8OliPpTa4gRPkUtyy7TxQM,46208
5,CALUMET COLLEGE,WHITING,IN,41.679758,-87.494487,ChIJ4-zpriPZEYgRVKD5Q2T6y2k,46394
6,CONCORDIA THEOLOGICAL SEMINARY,FORT WAYNE,IN,41.079273,-85.139351,ChIJBZUXQdXiFYgR41Zsgf0DWw0,46825
8,DEACONESS HOSPITAL SCHOOL OF NURSING,EVANSVILLE,IN,37.971559,-87.57109,ChIJC31ykRrVcYgR82uJ6BZUfyA,47710
9,DEPAUW UNIVERSITY,GREENCASTLE,IN,39.64449,-86.864732,ChIJp5coR7LfbIgRmbxAHZy330s,46135
10,EARLHAM COLLEGE,RICHMOND,IN,39.828937,-84.890238,ChIJPbhwJrEBQIgRDMD-hfH0UW0,47374


In [37]:
for index,row in clean_merge.iterrows():
    
    target_place_id = row["place_id"]
    

# Build the endpoint URL
    target_url = (f'https://maps.googleapis.com/maps/api/place/details/json?place_id={target_place_id}&key={gkey}')
    
    zip_data = requests.get(target_url).json()
    
# Extract latitude and longitude
    try:
        if zip_data["result"]["address_components"][7]["long_name"] == "United States":
            clean_merge.loc[index,"Zip Code"] = zip_data["result"]["address_components"][8]["long_name"]
        else:
            clean_merge.loc[index,"Zip Code"] = zip_data["result"]["address_components"][7]["long_name"]
        print(f"Record found at {target_place_id}")
    except (KeyError, IndexError):
#         if IndexError:
#             lat_long_df2.loc[index,"Zip Code"] = zip_data["result"]["address_components"][6]["long_name"]
#             print(f"Record found with 6 index for {target_place_id}")
#         else:
        print(f"Record could not be found for{target_place_id}")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Record found at ChIJNykO2EVaEYgR6iuHCTAnQGc
Record found at ChIJvRvrHQzZFIgRaG89vy-ipXg
Record found at ChIJIR58aWY9FYgR9ImfGJvu4OQ
Record found at ChIJf2dC58HNFogRb1W460Xop8g
Record found at ChIJr8OliPpTa4gRPkUtyy7TxQM
Record found at ChIJ4-zpriPZEYgRVKD5Q2T6y2k
Record found at ChIJBZUXQdXiFYgR41Zsgf0DWw0
Record found at ChIJC31ykRrVcYgR82uJ6BZUfyA
Record found at ChIJp5coR7LfbIgRmbxAHZy330s
Record found at ChIJPbhwJrEBQIgRDMD-hfH0UW0
Record found at ChIJU8qfPU-jEYgRnX0LaK6qvc4
Record found at ChIJuWxvGOZua4gR79lD0GBEX7c
Record found at ChIJ0fC3UPTsFogRswXigO3sjZA
Record found at ChIJoQ-m_6GdFogRPu_aJZRUL_E
Record found at ChIJE4WWBMSdFogRcBmzi2VR5Hc
Record found at ChIJRyZwm94vaogRisLnrKcsabY
Record found at ChIJ275JTtHSFogRg_purimcnLU
Record found at ChIJk_5rlZbJFYgRbenfs0Si_Mc
Record found at ChIJXYPfxb3lEYgRcqOc71zBdHY
Record found at ChIJS0ChKLjkFYgR0t6rco0F5Oc
Record found at ChIJQ6w2DTplbYgReJ_3x_lCBPM
Record found at ChIJjcWkJ7OFFIgREhty2rlkKDo
Record found at ChIJT1rtZQHNFogR

In [59]:
#reviewing record not found issues
missing_zips = clean_merge[(clean_merge['place_id']=="ChIJa8kQbDA_EogRaj5xfkug230")|(clean_merge['place_id']=="ChIJUe6QWPnLFogRinKUEbLx5oY")]
missing_zips

Unnamed: 0,School,City,State,Lat,Lng,place_id,Zip Code
50,ST JOSEPH'S COLLEGE,RENSSELAER,IN,40.936704,-87.150856,ChIJa8kQbDA_EogRaj5xfkug230,47978
58,UNIVERSITY OF NOTRE DAME,SOUTH BEND,IN,41.676355,-86.25199,ChIJUe6QWPnLFogRinKUEbLx5oY,46556


In [60]:
# quick research shows that these 2 items have a different api dictionary range than the others, will fix with a separate call
for index,row in missing_zips.iterrows():
    
    target_place_id = row["place_id"]
    

# Build the endpoint URL
    target_url = (f'https://maps.googleapis.com/maps/api/place/details/json?place_id={target_place_id}&key={gkey}')
    
    zip_data = requests.get(target_url).json()
    
# Extract ZIP code using the secondary retry logic and placing it back into the clean_merge with the same index values
    try:
        if zip_data["result"]["address_components"][5]["long_name"] == "United States":
            clean_merge.loc[index,"Zip Code"] = zip_data["result"]["address_components"][6]["long_name"]
        else:
            clean_merge.loc[index,"Zip Code"] = zip_data["result"]["address_components"][5]["long_name"]
    except (KeyError, IndexError):
        print(f"Record could not be found for{target_place_id}")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [63]:
# final clean merge with a zip code for each school
clean_merge

clean_merge.to_csv("colleges_unis_with_zips.csv", encoding="utf-8", index=False)

In [72]:
#start to get the normalized data
schools_by_zip = clean_merge["Zip Code"].value_counts()
schools_by_zip_df = pd.DataFrame(schools_by_zip)

#schools summarized by zip
schools_by_zip_df

Unnamed: 0,Zip Code
46805,4
46556,3
47374,2
46590,2
47710,2
46208,2
46218,1
46202,1
46222,1
46750,1
