In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 25)
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import json
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 15)

# Function to return table from the html url passed to the function
def html_table_to_df(url, table_no, timeout=30):
    """Scrape one HTML table body from a web page into a DataFrame.

    The page at ``url`` is downloaded, parsed with the lxml parser, and the
    ``<tbody>`` element at position ``table_no`` is converted.

    Args:
        url: Address of the page to download.
        table_no: Position of the wanted ``<tbody>`` among all ``<tbody>``
            elements of the page (0-based).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with one row per ``<tr>`` of the selected table body.
        Column names are the stripped text of every ``<th>`` cell inside
        that body. A ``<tr>`` without any ``<td>`` cell contributes a row
        without values; it is deliberately kept so callers that remove it
        by position keep working.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        IndexError: If the page has no ``<tbody>`` at position ``table_no``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()
    page = BeautifulSoup(response.content, "lxml")
    tbody = page.find_all('tbody')[table_no]
    col_head = [cell.text.strip() for cell in tbody.find_all('th')]
    table = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in tbody.find_all('tr')
    ]
    return pd.DataFrame(table, columns=col_head)

# Data Extraction
# First table body of the Wikipedia list of Canadian postal codes starting with "M".
url_neighborhoods = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_neighborhoods = html_table_to_df(url_neighborhoods, 0)

# Data Cleaning
# Remove rows whose neighbourhood is 'Not assigned', renumber, and discard the
# row that ends up at label 0 (presumably the value-less header row produced
# by the scraper -- confirm against the page). The remaining rows are indexed
# by neighbourhood name and the two leftover columns get their final names.
df_neighborhoods = (
    df_neighborhoods[df_neighborhoods.Neighbourhood != 'Not assigned']
    .reset_index(drop=True)
    .drop([0])
    .set_index("Neighbourhood")
)
df_neighborhoods.columns = ['Postal Code', 'Borough']
df_neighborhoods

# Data Extraction
# Second table body of the Wikipedia page on Toronto neighbourhood demographics.
url_demographics = 'https://en.wikipedia.org/wiki/Demographics_of_Toronto_neighbourhoods'
df_demographics = html_table_to_df(url_demographics, 1)

# Data Cleaning
# Keep the five columns of interest, discard the rows labelled 0 and 1,
# give the columns their final names and index the frame by neighbourhood.
df_demographics = df_demographics[
    ['Name', 'Population', 'Land area (km2)', 'Density (people/km2)', 'Average Income']
].drop([0, 1])
df_demographics.columns = [
    'Neighborhood',
    'Population',
    'Land Area (Km Sq.)',
    'Density(People/Km Sq.)',
    'Average Income',
]
df_demographics = df_demographics.set_index('Neighborhood')



# Build a numeric population/density table: the scraped values are strings
# with thousands separators, so the commas are stripped before parsing.
dr_dg_barh = pd.DataFrame(
    data=[
        [name, int(population.replace(',', '')), int(density.replace(',', ''))]
        for name, population, density in zip(
            df_demographics.index,
            df_demographics['Population'],
            df_demographics['Density(People/Km Sq.)'],
        )
    ],
    columns=['Neighborhood', 'Population', 'Density(People/Km Sq.)'],
)

# Largest population first (density breaks ties), indexed by neighbourhood.
dr_dg_barh = (
    dr_dg_barh
    .sort_values(by=['Population', 'Density(People/Km Sq.)'], ascending=False)
    .set_index('Neighborhood')
)

# Scale both measures by their maximum so they can be compared as a ratio.
n_population = dr_dg_barh.Population / dr_dg_barh.Population.max()
n_density = dr_dg_barh["Density(People/Km Sq.)"] / dr_dg_barh["Density(People/Km Sq.)"].max()
dr_dg_barh['Population Normalized'] = n_population
dr_dg_barh['Density Normalized'] = n_density

dr_dg_barh['Population to Density Ratio'] = (n_population / n_density).round(2)
dr_dg_barh = dr_dg_barh.sort_values(by='Population to Density Ratio', ascending=False)

# Bar chart of the ten highest ratios; only the ratio column is plotted.
dr_dg_barh.head(10)[['Population to Density Ratio']].plot.bar(figsize=(15, 6))
plt.show()



dr_dg_barh.head(10)
# Data Joining
# Both frames are indexed by neighbourhood name, so this joins index on index;
# neighbourhoods without a population figure are then discarded.
df_toronto = df_neighborhoods.join(df_demographics)
df_toronto = df_toronto[df_toronto.Population.notnull()]
df_toronto

# Look up coordinates for every remaining neighbourhood by its name.
geolocator = Nominatim(user_agent="my-application")
geocoded_rows = []
for neighborhood in df_toronto.index:
    location = geolocator.geocode(neighborhood)
    if location is None:
        # geocode() returns None when nothing matches the query; keep the
        # neighbourhood with missing coordinates instead of crashing on
        # `location.latitude`.
        geocoded_rows.append([neighborhood, None, None])
    else:
        geocoded_rows.append([neighborhood, location.latitude, location.longitude])
df_neighborhoods_ll = pd.DataFrame(geocoded_rows, columns=['Neighborhood', 'Latitude', 'Longitude'])
df_neighborhoods_ll.set_index('Neighborhood', inplace=True)
df_neighborhoods_ll.head()

# NOTE: from here on df_neighborhoods_ll is rebound to the coordinates read
# from the CSV (matched on postal code); the geocoded frame built above is
# no longer referenced under this name.
df_geodata = pd.read_csv('http://cocl.us/Geospatial_data')
df_geodata.set_index("Postal Code", inplace=True)
df_neighborhoods_ll = df_neighborhoods.join(df_geodata, on="Postal Code")
df_neighborhoods_ll

# function that extracts the category of the venue
def get_venue_category_type(row):
    """Return the name of a venue's first category, or None if it has none.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding the
            category list under 'categories' or, failing that, under
            'venue.categories'. Each category must provide a 'name' entry.

    Returns:
        The 'name' of the first category, or None when the category list is
        empty or None.

    Raises:
        KeyError: If ``row`` has neither 'categories' nor 'venue.categories'.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates.
        categories_list = row['venue.categories']
    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    

# Collect the venues Foursquare reports around every neighbourhood.
#
# NOTE(security): the API credentials below are hard-coded in the source.
# They should be loaded from the environment or a secrets store, and the
# exposed pair should be rotated.
CLIENT_ID = 'UFNACJPWWMJG4FOMNIIWCBGLVD1Z3LB5HTUQBBB35U3KMAZI' # your Foursquare ID
CLIENT_SECRET = 'ZYGCW1HY5CXFPGIH5XBOPXFU2H5VRA5K3J0M33143PVM3HWW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
limit = 500 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']

# Per-neighbourhood frames are gathered in a list and concatenated once at the
# end. The list is seeded with an empty frame so the result is still a
# DataFrame when no neighbourhood yields any venue.
venue_frames = [pd.DataFrame()]

for neighborhood, row in df_neighborhoods_ll.iterrows():
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        row['Latitude'],
        row['Longitude'],
        radius,
        limit)
    try:
        results = requests.get(url, timeout=30).json()
        venues = results['response']['groups'][0]['items']
        nearby_venues = json_normalize(venues) # flatten JSON
        nearby_venues = nearby_venues.loc[:, filtered_columns]
        nearby_venues['venue.categories'] = nearby_venues.apply(get_venue_category_type, axis=1)
        nearby_venues.columns = ["Venue Name", "Venue Category", "Latitude", "Longitude"]
        nearby_venues['Neighborhood'] = neighborhood
        nearby_venues.set_index('Neighborhood', inplace=True)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as exc:
        # Best effort: a failed request, an unexpected payload, or a
        # neighbourhood without venues must not abort the whole run.
        print('No venues collected for {}: {!r}'.format(neighborhood, exc))
        continue
    venue_frames.append(nearby_venues)

df_neighborhoods_venues = pd.concat(venue_frames)
df_neighborhoods_venues


# Count how many venues fall into each category. A placeholder 'Count' column
# is added so that the per-group count has a column to land in.
df_ng_v_counts = (
    df_neighborhoods_venues.reset_index()[['Venue Category']]
    .assign(Count='')
    .groupby(by='Venue Category')
    .count()
)
pd.set_option('display.max_rows', 278)
df_ng_v_counts
df_ng_v_counts = df_ng_v_counts.sort_values(by=['Count'], ascending=False)

# Bar chart of the ten most frequent venue categories.
df_ng_v_counts.head(10).plot.bar(figsize=(15, 6))
plt.show()