In [2]:
import random
import math
from math import sin, cos, sqrt, atan2, radians
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import locale
from locale import atof
import requests
import json 
%matplotlib inline
import folium
import os 
import plotly.plotly as py
import plotly.graph_objs as go
from gmplot import gmplot

# Raw Data 

## Yelp

In [3]:
# Load the raw Yelp records and renumber the rows 0..n-1.
df_yelp = pd.read_json('data/yelp_tor_data.json').reset_index()
coordinates = df_yelp['coordinates']
# Each coordinates entry is indexed by 'latitude' / 'longitude'; split them into flat columns.
new_lat = [point['latitude'] for point in coordinates]
new_long = [point['longitude'] for point in coordinates]
df_yelp['lat'] = new_lat
df_yelp['long'] = new_long

## Neighbourhoods

In [4]:
# 'ANSI' is only recognised as a codec name on Windows (alias of 'mbcs'); on Linux/macOS
# it raises LookupError.  cp1252 is the Western-European Windows code page and works on
# every platform.
# NOTE(review): assumes the CSV was saved on a Western-locale Windows machine — confirm.
df_n_profiles = pd.read_csv('data/2016_neighbourhood_profiles-test.csv', encoding='cp1252')

## Neighborhood Coordinates

In [5]:
# Neighborhood reference coordinates (one row per neighborhood name).
df_neighborhoods = pd.read_csv('data/to_neighbor_coordinates.csv')
# Pick the profile columns named after each neighborhood and keep only the first
# profile row, which is used here as the neighborhood id.
hood_names = df_neighborhoods['name']
first_profile_row = df_n_profiles[hood_names].iloc[0:1]
neighborhood_ids = first_profile_row.T
df_neighborhoods['id'] = neighborhood_ids.values

## Join Yelp and Neighborhood Data

In [6]:
def calc_distance(lat1,lon1,lat2,lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula on a sphere of radius 6373 km.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance between the two points in kilometres.
    """
    # approximate radius of earth in km
    R = 6373.0
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

In [7]:
# Assign every Yelp business to the neighborhood whose reference point is closest.
hood_rows = list(zip(df_neighborhoods['lat'], df_neighborhoods['lng'],
                     df_neighborhoods['name'], df_neighborhoods['id']))
list_of_hoods = []
list_of_ids = []
for yelp_lat, yelp_long in zip(df_yelp['lat'], df_yelp['long']):
    # Defaults are kept when no neighborhood compares closer than the sentinel distance.
    best_hood = ''
    best_id = 0
    best_distance = 1000000000
    for hood_lat, hood_lng, hood_name, hood_id in hood_rows:
        distance = calc_distance(hood_lat, hood_lng, yelp_lat, yelp_long)
        # Strict comparison: on a tie the earlier neighborhood wins.
        if distance < best_distance:
            best_distance = distance
            best_hood = hood_name
            best_id = int(hood_id)
    list_of_hoods.append(best_hood)
    list_of_ids.append(best_id)
df_yelp['neighborhoods'] = list_of_hoods
df_yelp['neighborhood_ids'] = list_of_ids

# Geovisualizations

## Google Maps Plot to Verify

In [8]:
# Map centred at (43.652466, -79.378781), zoom level 13.
gmap = gmplot.GoogleMapPlotter(43.652466, -79.378781, 13)

# Neighborhood reference points (size 80, red) and Yelp businesses (size 40, purple).
gmap.scatter(df_neighborhoods['lat'], df_neighborhoods['lng'], '#ff0000', size=80, marker=False)
gmap.scatter(df_yelp['lat'], df_yelp['long'], '#3B0B39', size=40, marker=False)

# Single marker.
# NOTE(review): these coordinates are far from the map centre above (latitude 37.77 vs
# 43.65) — looks like a leftover from an example; confirm the intended location.
hidden_gem_lat, hidden_gem_lon = 37.770776, -122.461689
gmap.marker(hidden_gem_lat, hidden_gem_lon, 'cornflowerblue')

# Write the map to an HTML file.
gmap.draw("google_maps.html")

## Find the Neighborhoods Most Densely Populated with Restaurants

In [11]:
# Non-null counts of every column per neighborhood id; 'level_0' is kept as the
# restaurant count.
counts_by_hood = df_yelp.groupby(['neighborhood_ids'], as_index=False).count()
df_neighborhoods_agg = counts_by_hood[['neighborhood_ids', 'level_0']]
# Neighborhoods with more than 100 restaurants.
has_over_100 = df_neighborhoods_agg['level_0'] > 100
restaurants_greater_than_100 = df_neighborhoods_agg[has_over_100]

In [12]:
# Coordinate/name rows for the neighborhoods that passed the >100 restaurants filter.
dense_ids = restaurants_greater_than_100['neighborhood_ids']
in_dense_hood = df_neighborhoods['id'].isin(dense_ids)
restaurants_greater_than_100_df = df_neighborhoods[in_dense_hood]
restaurants_greater_than_100_df.head()

Unnamed: 0.1,Unnamed: 0,lat,lng,name,id
3,3,43.669833,-79.407585,Annex,95.0
14,14,43.675863,-79.338106,Blake-Jones,69.0
19,19,43.653203,-79.38114,Cabbagetown-South St. James Town,71.0
43,43,43.653226,-79.383184,Glenfield-Jane Heights,25.0
44,44,43.679545,-79.32684,Greenwood-Coxwell,65.0


## Plot Choropleth of Restaurant Density

In [11]:
# Path to the neighborhood boundary file (read as TopoJSON via the `topojson` argument).
neighborhoods_geo = os.path.join('data', 'toronto_geojson.json')

m = folium.Map(location=[43.652466, -79.378781], zoom_start=13)

# Open the geometry file in a context manager so the handle is closed afterwards
# instead of being leaked.
# NOTE(review): relies on folium reading the file during the choropleth() call —
# confirm for the installed folium version.
with open(neighborhoods_geo) as geo_file:
    m.choropleth(
        geo_data=geo_file,
        topojson='objects.toronto',
        name='choropleth',
        # Restaurant count per neighborhood id, matched to the geometry's `id` property.
        data=df_neighborhoods_agg,
        columns=['neighborhood_ids', 'level_0'],
        key_on='feature.properties.id',
        fill_color='YlGn',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name='Number of Restaurants'
    )

folium.LayerControl().add_to(m)
m.save('number_of_restaurants_map.html')

# Income

## Income for All of Toronto

In [29]:
# Income distribution table; the chart below reads its 'Category' and 'Amount' columns.
df_income_distribution = pd.read_csv(filepath_or_buffer='data/income_distribution.csv')

In [33]:
# Bar chart of the income distribution: one bar per income category.
axis_title_font = dict(size=16, color='rgb(107, 107, 107)')

income_bars = go.Bar(
    x=df_income_distribution['Category'],
    y=df_income_distribution['Amount'],
)
data = [income_bars]

layout = go.Layout(
    title='Toronto Income Distribution',
    xaxis=dict(title='Income Category ($)', titlefont=axis_title_font),
    yaxis=dict(title='Amount', titlefont=axis_title_font),
    width=1000,
    height=600,
    margin=go.Margin(l=100, r=100, b=150, t=50, pad=4),
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='income_bar')

## Average Income for Top Neighborhoods

In [15]:
# Average after-tax income for the restaurant-dense neighborhoods, highest first.
income_column = 'After-tax income: Average amount ($)'
# The Characteristic value is matched including its two leading spaces.
is_income_row = df_n_profiles.Characteristic == "  After-tax income: Average amount ($)"
list_of_neigborhoods = list(restaurants_greater_than_100_df['name'])
columns = ['Category','Topic','Data Source',"City of Toronto"]  # not used in this cell
# Keep only the neighborhood columns, then flip so each neighborhood becomes a row.
df_income = df_n_profiles.loc[is_income_row, list_of_neigborhoods].T
df_income.columns = [income_column]
df_income = df_income.sort_values(by=income_column, ascending=False)

In [34]:
# Bar chart of average income per neighborhood.
axis_title_font = dict(size=16, color='rgb(107, 107, 107)')

data = [go.Bar(
            x=df_income.index.values,
            # df_income has a single column; pass it as a 1-D array rather than the
            # 2-D (n, 1) array returned by DataFrame.values.
            y=df_income.iloc[:, 0].values
    )]

layout = go.Layout(
    title='Average Income by Neighborhood',
    xaxis=dict(title='Neighborhood', titlefont=axis_title_font),
    yaxis=dict(title='CAD', titlefont=axis_title_font),
    width=1000,
    height=600,
    margin=go.Margin(
        l=50,
        r=50,
        b=150,
        t=50,
        pad=4
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='income-by-hood')

# Ethnic Origin

In [16]:
# Ethnic-origin rows of the profile table for the restaurant-dense neighborhoods,
# reshaped to one row per neighborhood and one column per characteristic.
list_of_neigborhoods = list(restaurants_greater_than_100_df['name']) + ['Characteristic']
is_ethnic_row = df_n_profiles.Category == "Ethnic origin"
total_column = 'Total - Ethnic origin for the population in private households - 25% sample data'
df_ethnic_origin = (
    df_n_profiles.loc[is_ethnic_row, list_of_neigborhoods]
    .set_index('Characteristic')
    .T
    .drop([total_column], axis=1)  # drop the grand-total characteristic
)

## Top Ethnicities in Toronto

In [20]:
# Ethnic-origin totals; the chart below reads its 'Ethnic Origin' and 'Amount' columns.
df_ethnic_origins = pd.read_csv(filepath_or_buffer='data/ethnic_origins.csv')

In [21]:
# Display the ethnic-origin table loaded in the previous cell.
df_ethnic_origins

Unnamed: 0,Ethnic Origin,Amount
0,North American Aboriginal origins,35630
1,Other North American origins,345710
2,British Isles origins,597295
3,French origins,122870
4,Western European origins,187190
5,Northern European origins,36720
6,Eastern European origins,302485
7,Southern European origins,441485
8,Other European origins,42130
9,Caribbean origins,165735


In [32]:
# Bar chart of population amount per ethnic origin.
axis_title_font = dict(size=16, color='rgb(107, 107, 107)')

origin_bars = go.Bar(
    x=df_ethnic_origins['Ethnic Origin'],
    y=df_ethnic_origins['Amount'],
)
data = [origin_bars]

layout = go.Layout(
    title='Top 20 Ethnic Origins',
    xaxis=dict(title='Origin', titlefont=axis_title_font),
    yaxis=dict(title='Amount', titlefont=axis_title_font),
    width=1000,
    height=600,
    margin=go.Margin(l=100, r=50, b=150, t=50, pad=4),
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='ethnic_bar_chart')

## Ethnicities By Neighborhood

In [35]:
# 'ANSI' is only recognised as a codec name on Windows (alias of 'mbcs'); on Linux/macOS
# it raises LookupError.  cp1252 is the Western-European Windows code page and works on
# every platform.
# NOTE(review): assumes the CSV was saved on a Western-locale Windows machine — confirm.
df_ethnic_origin_by_hood = pd.read_csv('data/ethnic_origins_by_hood.csv', encoding='cp1252')

In [36]:
# Keep the restaurant-dense neighborhood columns plus 'Category', replace missing
# values with 0 and index the rows by category.
list_of_neigborhoods = list(restaurants_greater_than_100_df['name']) + ['Category']
df_ethnic_origin_by_hood = (
    df_ethnic_origin_by_hood[list_of_neigborhoods]
    .fillna(0)
    .set_index('Category')
)

In [59]:
# Heatmap: categories on the y axis, neighborhoods on the x axis.
trace = go.Heatmap(
    x=list(df_ethnic_origin_by_hood.columns),
    y=list(df_ethnic_origin_by_hood.index.values),
    z=df_ethnic_origin_by_hood.values,
    colorscale='Reds',
)
data = [trace]

# Wide left/bottom margins leave room for the long axis labels.
layout = go.Layout(
    title='Ethnic Origins Distribution by Neighborhood',
    width=1000,
    height=800,
    margin=go.Margin(l=300, r=150, b=250, t=50, pad=4),
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='labelled-heatmap')

In [96]:
# Export the 'categories' column on its own (used as the index, so it is the only
# column written).
categories_only = df_yelp[['categories']]
categories_only.set_index('categories').to_csv('categories_all.csv')

# Ratings

## Ratings Overall Toronto

In [22]:
# Non-null count of every column, grouped by rating value.
ratings_grouped = df_yelp.groupby('rating')
df_ratings = ratings_grouped.agg(['count'])

In [23]:
# Number of businesses per rating, as a single 'count' column indexed by rating.
# Derived from df_yelp directly so the cell can be re-run safely: the previous form
# rewrote df_ratings in place and failed on a second run because 'level_0' was gone.
df_ratings = df_yelp.groupby('rating')[['level_0']].count()
df_ratings.columns = ['count']

In [29]:
# Take the slice labels from the rating index itself so each label always lines up
# with its count, even when a rating value is absent from the data.
# '{:g}' renders 1.0 as '1' and 4.5 as '4.5'.
labels = ['{:g}'.format(float(rating)) for rating in df_ratings.index]
values = df_ratings['count'].values
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

trace = go.Pie(labels=labels, values=values,
               hoverinfo='label+percent', textinfo='value',
               textfont=dict(size=20),
               marker=dict(colors=colors,
                           line=dict(color='#000000', width=2)))

layout = go.Layout(
    title='Rating Distribution of Restaurants in Toronto from Yelp',
    width=800,
    height=800
)


data=[trace]

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='ratings_pie_chart')

## Ratings by Neighborhood

In [38]:
# Mean rating per restaurant-dense neighborhood, best-rated first.
dense_ids = restaurants_greater_than_100['neighborhood_ids']
df_ratings_hood = (
    df_yelp[df_yelp['neighborhood_ids'].isin(dense_ids)]
    .groupby('neighborhoods')['rating']
    .agg(['mean'])
    .sort_values(by='mean', ascending=False)
)

In [39]:
# Neighborhood x rating table: count of non-null 'transactions' entries, with
# combinations that never occur filled with 0.
df_neighborhoods_rating = df_yelp.pivot_table(
    values='transactions',
    index='neighborhoods',
    columns='rating',
    aggfunc='count',
).fillna(0)

In [40]:
# Keep only the restaurant-dense neighborhoods, in the order of the name column.
dense_hood_names = restaurants_greater_than_100_df['name']
df_neighborhoods_rating = df_neighborhoods_rating.loc[dense_hood_names]

In [43]:
# Heatmap: neighborhoods on the y axis, rating values on the x axis.
trace = go.Heatmap(
    x=list(df_neighborhoods_rating.columns),
    y=list(df_neighborhoods_rating.index.values),
    z=df_neighborhoods_rating.values,
    colorscale='Reds',
)
data = [trace]

# Wide left margin leaves room for the neighborhood names.
layout = go.Layout(
    title='Distribution of Ratings (1-5) by Neighborhood',
    width=1000,
    height=800,
    margin=go.Margin(l=300, r=150, b=150, t=50, pad=4),
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='ratings_heatmap')

## Ratings Box Plot

In [44]:
# Yelp rows that fall in one of the restaurant-dense neighborhoods.
dense_ids = restaurants_greater_than_100['neighborhood_ids']
df_ratings_hood2 = df_yelp[df_yelp['neighborhood_ids'].isin(dense_ids)]

In [45]:
# Distinct neighborhood names (in order of first appearance) and, for each one, the
# list of its ratings; the two lists are aligned by position.
list_of_hoods = list(df_ratings_hood2['neighborhoods'].unique())
list_of_lists = [
    list(df_ratings_hood2[df_ratings_hood2['neighborhoods'] == hood]['rating'].values)
    for hood in list_of_hoods
]

In [46]:
# One box trace per neighborhood.  Building the traces from the paired lists keeps the
# chart in step with however many neighborhoods passed the filter; the previous 21
# hand-numbered traces raised IndexError with fewer and silently dropped any extras.
data = [
    go.Box(y=hood_ratings, name=hood_name)
    for hood_name, hood_ratings in zip(list_of_hoods, list_of_lists)
]

layout = go.Layout(
    title='Rating Distribution of Restaurants in Toronto from Yelp',
    width=1000,
    height=800,
    margin=go.Margin(
        l=50,
        r=150,
        b=150,
        t=50,
        pad=4
    ),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='ratings_box')

# Prices

## Price level distribution for all of Toronto

In [48]:
# Translate the price symbols into numeric levels 1-4; any other value (including
# missing prices) becomes 0.
price_symbol_to_level = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
prices = df_yelp['price']
list_of_prices = [price_symbol_to_level.get(symbol, 0) for symbol in prices]
df_yelp['price_level'] = list_of_prices

In [49]:
# Non-null counts of every column per numeric price level, re-indexed by a readable
# label.  Labels are looked up from the level itself so the cell still works when a
# level is absent, and they use the same names as the price pie chart and heatmap
# ('Normal' / 'Expensive' / 'Very Expensive').
price_level_names = {0: 'Unknown', 1: 'Cheap', 2: 'Normal', 3: 'Expensive', 4: 'Very Expensive'}
df_price_level = df_yelp.groupby(['price_level']).count()
df_price_level['price'] = [price_level_names.get(level, str(level)) for level in df_price_level.index]
df_price_level = df_price_level.set_index('price')

In [97]:
# Pie chart of the number of businesses per price level.
labels = ['Unknown','Cheap','Normal','Expensive','Very Expensive']
values = df_price_level.level_0.values
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

slice_outline = dict(color='#000000', width=2)
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=20),
    marker=dict(colors=colors, line=slice_outline),
)
data = [trace]

layout = go.Layout(
    title='Price Distribution of Restaurants in Toronto from Yelp',
    width=800,
    height=800,
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='prices_piechart')

In [None]:
# Same distribution drawn with pandas/matplotlib, sized by the 'neighborhoods' count
# column; the trailing semicolon suppresses the axes repr.
df_price_level.plot.pie(
    y='neighborhoods',
    figsize=(15, 15),
    title="Price Distribution of Restaurants in Toronto from Yelp",
);

## Prices by Toronto Neighborhood

In [50]:
# Neighborhood x price-symbol table of business counts, restricted to the
# restaurant-dense neighborhoods.
df_neighborhoods_price = df_yelp.pivot_table(index='neighborhoods', columns='price', values='price_level',aggfunc='count')
df_neighborhoods_price = df_neighborhoods_price.fillna(0)
# Rename each column by its own symbol rather than by position, so the labels stay
# correct (and the assignment does not fail) if a price tier is missing from the data.
price_symbol_names = {'$': 'Cheap', '$$': 'Normal', '$$$': 'Expensive', '$$$$': 'Very Expensive'}
df_neighborhoods_price.columns = [price_symbol_names.get(symbol, symbol) for symbol in df_neighborhoods_price.columns]
df_neighborhoods_price = df_neighborhoods_price.loc[restaurants_greater_than_100_df['name']]

In [53]:
# Heatmap: neighborhoods on the y axis, price tiers on the x axis.
trace = go.Heatmap(
    x=list(df_neighborhoods_price.columns),
    y=list(df_neighborhoods_price.index.values),
    z=df_neighborhoods_price.values,
    colorscale='Reds',
)
data = [trace]

# Wide left margin leaves room for the neighborhood names.
layout = go.Layout(
    title='Price Level Distribution by Neighborhood',
    width=1000,
    height=800,
    margin=go.Margin(l=300, r=150, b=150, t=50, pad=4),
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='prices_heatmap')

## Prices Box Plot

In [55]:
# Yelp rows that fall in one of the restaurant-dense neighborhoods.
dense_ids = restaurants_greater_than_100['neighborhood_ids']
df_neighborhoods_price2 = df_yelp[df_yelp['neighborhood_ids'].isin(dense_ids)]

In [56]:
# Distinct neighborhood names (in order of first appearance) and, for each one, the
# list of its numeric price levels; the two lists are aligned by position.
list_of_hoods = list(df_neighborhoods_price2['neighborhoods'].unique())
list_of_lists = [
    list(df_neighborhoods_price2[df_neighborhoods_price2['neighborhoods'] == hood]['price_level'].values)
    for hood in list_of_hoods
]

In [57]:
# One box trace per neighborhood.  Building the traces from the paired lists keeps the
# chart in step with however many neighborhoods passed the filter; the previous 21
# hand-numbered traces raised IndexError with fewer and silently dropped any extras.
data = [
    go.Box(y=hood_price_levels, name=hood_name)
    for hood_name, hood_price_levels in zip(list_of_hoods, list_of_lists)
]

layout = go.Layout(
    title='Price Distribution of Restaurants in Toronto from Yelp',
    width=1000,
    height=800,
    margin=go.Margin(
        l=50,
        r=150,
        b=150,
        t=50,
        pad=4
    ),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='prices_box')