# Data Science Capstone

## Problem: 
The experience of people in Chicago, Illinois in the United States varies wildly from neighborhood to neighborhood. I'd like to explore the relationship between the most common venues in a neighborhood and its key economic and sociological indicators. 

Specifically

1. What venues are most common in the areas with the lowest per-capita income?
2. How many grocery stores are there in the highest- and lowest-income areas, per capita?

## Data:
1. Foursquare API: foursquare.com


This data is a JSON file that will set the boundaries that are fundamental to this analysis

2. Census Data - Selected socioeconomic indicators in Chicago, 2008 – 2012: https://data.cityofchicago.org/Health-Human-Services/Census-Data-Selected-socioeconomic-indicators-in-C/kn9c-c2s2

This data contains multiple measures of socioeconomic health, including education, housing and income. I will be leveraging the income data primarily.
3. Spreadsheet: 2010 Census Data Summarized :https://datahub.cmap.illinois.gov/dataset/2010-census-data-summarized-to-chicago-community-areas/resource/b30b47bf-bb0d-46b6-853b-47270fb7f626?inner_span=True

This contains the total population of each of the community areas related to my analysis.

This data will allow us to separate out locations and see which venues are most prevalent. I look forward to evaluating the data and gaining a deeper understanding of a premier global city. The venue data will come from the Foursquare API. The income information will be from the socioeconomic indicators and the population totals will come from the summarized census data.

In [58]:
import requests
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt 
import json 
import folium
from pandas.io.json import json_normalize

In [59]:
# Read in the three local data files.
#   censusData - 2010 census workbook; header=1 takes the column names from
#                the second row of the sheet.
#   socioData  - socioeconomic indicators CSV (the per-capita income column
#                is read from it further down).
#   Commareas  - community-area CSV; not referenced again in the cells
#                visible here.
censusData = pd.read_excel('2010_chicago_census.xlsx',header = 1)
socioData = pd.read_csv('Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012.csv')
Commareas = pd.read_csv('Commareas.csv')


In [60]:
# Preview the first two rows to check the column names and sample values.
socioData.head(2)

Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1.0,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2.0,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0


In [61]:
# Nominatim client used by the geocoding cells that follow.
geolocator = Nominatim(user_agent="foursquare_agent")

# Build the working table `a`: one row per community area, holding its name,
# its per-capita income and a free-text address to geocode.
# NOTE(review): the two sources are aligned on their default row index, which
# assumes both files list the community areas in the same order - confirm.
# (To analyse hardship instead of income, read socioData['HARDSHIP INDEX'].)
df1 = censusData.copy()
a = pd.DataFrame({
    'Neighborhood': df1['Geog'],
    # The trailing space is part of the column name being read.
    'Income': socioData['PER CAPITA INCOME '].copy(),
})
a['Location'] = a['Neighborhood'] + ', Chicago Illinois'

# Drop rows with a missing value, then renumber the rows 0..n-1. Later cells
# address rows as a[col][i] for i in range(len(a)); that lookup is by index
# label, so it only works while the labels are contiguous.
a.dropna(axis=0, inplace=True)
a.reset_index(drop=True, inplace=True)
a

Unnamed: 0,Neighborhood,Income,Location
0,Rogers Park,23939,"Rogers Park, Chicago Illinois"
1,West Ridge,23040,"West Ridge, Chicago Illinois"
2,Uptown,35787,"Uptown, Chicago Illinois"
3,Lincoln Square,37524,"Lincoln Square, Chicago Illinois"
4,North Center,57123,"North Center, Chicago Illinois"
5,Lake View,60058,"Lake View, Chicago Illinois"
6,Lincoln Park,71551,"Lincoln Park, Chicago Illinois"
7,Near North Side,88669,"Near North Side, Chicago Illinois"
8,Edison Park,40959,"Edison Park, Chicago Illinois"
9,Norwood Park,32875,"Norwood Park, Chicago Illinois"


In [62]:
# Geocode every neighborhood address, collecting latitude and longitude in
# two parallel lists that follow the row order of `a`.
locLat = []
locLong = []
total_rows = len(a)
for row in range(total_rows):
    place = geolocator.geocode(a['Location'][row])
    locLat.append(place.latitude)
    locLong.append(place.longitude)
    # Report progress on every third row.
    if row % 3 == 0:
        print(round(row / total_rows * 100,), '% complete')


0 % complete
4 % complete
8 % complete
12 % complete
16 % complete
19 % complete


GeocoderTimedOut: Service timed out

# Resume geocoding at the first row that has no coordinates yet.
# Restarting from fixed offsets (30, then 55) skips or repeats rows whenever
# the earlier run stopped anywhere else, which leaves locLat/locLong shorter
# or longer than `a` and misaligned with its rows.
from geopy.extra.rate_limiter import RateLimiter

# Nominatim's usage policy caps clients at one request per second. The
# limiter spaces the calls out and retries a request that failed with a geopy
# service error (GeocoderTimedOut is one) before giving up.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    max_retries=3,
    error_wait_seconds=5.0,
    swallow_exceptions=False,  # re-raise a persistent failure instead of returning None
)

for i in range(len(locLat), len(a)):
    x = geocode(a['Location'][i])
    if x is None:
        # geocode() returns None when the address has no match.
        raise ValueError('No geocoding result for ' + repr(a['Location'][i]))
    locLat.append(x.latitude)
    locLong.append(x.longitude)
    if i % 3 == 0:
        print( round(i/len(a)*100,),'% complete')
print('100% Complete')

In [None]:
# Attach the geocoded coordinates as new columns. The lists are positional,
# so they must hold exactly one entry per row of `a`, in row order.
a['Lat'] = locLat #add latitude and longitude to dataframe
a['Long'] = locLong
a.head()

In [None]:
# Foursquare API settings and the search URL for one test location.
# NOTE(review): the client id and secret below are committed in plain text.
# They should be rotated and loaded from the environment or a secrets store.
from urllib.parse import urlencode

CLIENT_ID = 'VYCOUITIGXOONWV3KJ34HKMDN2IAXOLBLRKF0CRIJSXXR1SJ' #  Foursquare ID
CLIENT_SECRET = 'H0ATIVHEOUNIBDCHAVN1YOT2K5GLUT4QT1HNO3VLPU00EIKB' #  Foursquare Secret
VERSION = '20180605' # Foursquare API version
search_query = 'Grocery'
radius = 3218  # search radius in meters (roughly 2 miles)
LIMIT = 50  # maximum number of venues returned per request
address = 'Lincoln Square ,Chicago IL'
Category= '4bf58dd8d48988d118951735'  # presumably the grocery-store category id - confirm

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the address has no match.
    raise ValueError('No geocoding result for ' + repr(address))
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

# The category filter must be sent as `categoryId`; the API ignores the
# differently-cased `CategoryID`, so no category filtering took place.
# urlencode() also escapes any character that is not URL-safe.
url = 'https://api.foursquare.com/v2/venues/search?' + urlencode({
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'll': '{},{}'.format(latitude, longitude),
    'v': VERSION,
    'query': search_query,
    'radius': radius,
    'limit': LIMIT,
    'categoryId': Category,
})


In [None]:
# Query Foursquare for the test location.
# The timeout keeps a stalled connection from hanging the cell, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface as a KeyError on the lookups below.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
# assign relevant part of JSON to venues
venues = results['response']['venues']

# transform venues into a dataframe (nested fields become dotted column names)
dataframe = json_normalize(venues)


In [None]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
# Take an explicit copy: the frame is assigned to further down, and writing
# into a selection of `dataframe` can trigger pandas' SettingWithCopyWarning.
dataframe_filtered = dataframe.loc[:, filtered_columns].copy()

# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Args:
        row: Mapping-like record (a dict or a DataFrame row) whose category
            list is stored under ``'categories'`` or, failing that, under
            ``'venue.categories'``.

    Returns:
        The ``'name'`` of the first category, or ``None`` when the category
        list is empty.

    Raises:
        KeyError: If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key should trigger the fallback; any other error is
        # a real problem and must not be masked.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

# Collapse each row's category list to the name of its first category.
first_category = dataframe_filtered.apply(get_category_type, axis=1)
dataframe_filtered['categories'] = first_category

# Shorten dotted column names to their final segment
# (e.g. 'location.lat' becomes 'lat').
short_names = []
for full_name in dataframe_filtered.columns:
    short_names.append(full_name.rsplit('.', 1)[-1])
dataframe_filtered.columns = short_names

dataframe_filtered

In [None]:
# Count the grocery-type venues Foursquare returns around each neighborhood.
# NOTE(review): the client id and secret below are committed in plain text.
# They should be rotated and loaded from the environment or a secrets store.
col1 = []  # name of neighborhood
col2 = []  # number of grocery stores
CLIENT_ID = 'VYCOUITIGXOONWV3KJ34HKMDN2IAXOLBLRKF0CRIJSXXR1SJ' #  Foursquare ID
CLIENT_SECRET = 'H0ATIVHEOUNIBDCHAVN1YOT2K5GLUT4QT1HNO3VLPU00EIKB' #  Foursquare Secret
VERSION = '20180605' # Foursquare API version
search_query = 'Grocery'
radius = 3218  # search radius in meters (roughly 2 miles)
# NOTE(review): each request returns at most LIMIT venues, so the counts
# below saturate at 50 for dense areas.
LIMIT = 50
# Comma-separated category ids with no spaces between them; the value is the
# same for every request, so it is defined once outside the loop.
Category = '4bf58dd8d48988d118951735,52f2ab2ebcbc57f1066b8b46,52f2ab2ebcbc57f1066b8b45'

# Iterate the columns in lockstep so rows are matched by position, whatever
# the index labels of `a` happen to be.
for name, place, latitude, longitude in zip(a['Neighborhood'], a['Location'], a['Lat'], a['Long']):
    # The category filter must be sent as `categoryId`; the API ignores the
    # differently-cased `CategoryID`. Passing `params` lets requests encode
    # every value.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'll': '{},{}'.format(latitude, longitude),
        'v': VERSION,
        'query': search_query,
        'radius': radius,
        'limit': LIMIT,
        'categoryId': Category,
    }
    response = requests.get('https://api.foursquare.com/v2/venues/search', params=params, timeout=30)
    response.raise_for_status()
    results = response.json()
    # assign relevant part of JSON to venues
    venues = results['response']['venues']
    # One list entry per venue, so the length is the venue count.
    nresult = len(venues)
    col1.append(name)
    col2.append(nresult)
    print(place,'  ',nresult)
    

In [None]:
# Remove, in place, any row of `a` that still contains a missing value.
a.dropna(axis=0,inplace=True)#drop the NA row

In [None]:
# Inspect the rows from position 30 onward.
a[30:]

In [None]:
# Assemble the per-neighborhood results: name, grocery count and per-capita
# income (PCI), in that column order.
dl = {'Neighborhood': col1}
dl['# Grocers'] = col2
dl['PCI'] = a['Income']
df_result = pd.DataFrame(dl)

In [None]:
# Preview the first rows of the assembled results.
df_result.head()

In [None]:
# Plot per-capita income against grocery count, then show the Pearson
# correlation of the same two columns.
y = df_result['PCI']
x = df_result['# Grocers']

plt.scatter(x, y)
plt.xlabel('# Grocers')
plt.ylabel('PCI')
# `df3` is not assigned in the cells shown here, so the correlation is taken
# from the frame being plotted. Only the numeric columns are selected, because
# newer pandas raises on text columns.
df_result[['# Grocers', 'PCI']].corr(method='pearson', min_periods=1)

In [None]:
# Centre the map on the mean coordinate of all neighborhoods.
mlat = a['Lat'].mean()
mlong = a['Long'].mean()
venues_map = folium.Map(location=[mlat, mlong], zoom_start=9)

# Draw one small blue dot per neighborhood (popups are left disabled).
marker_style = {
    'radius': 0.5,
    'color': 'blue',
    'fill': True,
    'fill_color': 'blue',
    'fill_opacity': 0.6,
}
for point in zip(a['Lat'], a['Long']):
    marker = folium.CircleMarker(list(point), **marker_style)
    marker.add_to(venues_map)

In [None]:
# Render the map as the cell output.
venues_map

In [None]:
# Pearson correlation between grocery count and per-capita income.
# Only the numeric columns are selected: pandas 2.0+ raises on the text
# 'Neighborhood' column where older versions silently skipped it.
df_result[['# Grocers', 'PCI']].corr(method='pearson', min_periods=1)

In [None]:
# NOTE(review): `df3` is not assigned in any cell visible here. The output
# below (columns '# Grocers' and 'Income') presumably came from an earlier
# version of the results frame - confirm, or switch this to df_result.
df3.corr(method='pearson', min_periods=1)

In [None]:
	# Grocers	Income
# Grocers	1.000000	0.118635