In [1]:
# Import Dependencies
import os
import pandas as pd
import numpy as np
import requests
from yelpapi import YelpAPI
from pprint import pprint

# Import API key
from config import yelp_api_key

# Extraction

## 1) Toronto Restaurant Data

In [2]:
# Read Restaurant data source file
restaurant_data = pd.read_csv("Resources/Restaurant_Data.csv")

# Converted to Dataframe
restaurant_df = pd.DataFrame(restaurant_data)
restaurant_df.head()

Unnamed: 0,Category,Restaurant Address,Restaurant Name,Restaurant Phone,Restaurant Price Range,Restaurant Website,Restaurant Yelp URL,Restaurant Latitude,Restaurant Longitude
0,Afghan,"14 Prince Arthur Avenue\r\nToronto, ON M5R 1A9",The Host,(416) 962-4678,$11-30,welcometohost.com,https://www.yelp.ca/adredir?ad_business_id=OFA...,43.669935,-79.395858
1,Afghan,"259 Wellington St W\r\nToronto, ON M5V",Aanch Modernist Indian Cuisine,(647) 558-1508,$11-30,aanch.ca,https://www.yelp.ca/adredir?ad_business_id=SZu...,43.644708,-79.39067
2,Afghan,"736 Bay Street\r\nToronto, ON M5G 2J8",Silk Road Kabob House,,Under $10,,https://www.yelp.ca/biz/silk-road-kabob-house-...,43.659816,-79.385591
3,Afghan,"691 Yonge Street\r\nToronto, ON M4Y 2B2",Naan & Kabob,(416) 972-6623,$11-30,naanandkabob.ca,https://www.yelp.ca/biz/naan-and-kabob-toronto-5,43.669058,-79.3861
4,Afghan,"66 Overlea Boulevard\r\nUnit 62\r\nToronto, ON...",Afghan Cuisine,(416) 422-5858,$11-30,afghancuisinerestaurant.com,https://www.yelp.ca/biz/afghan-cuisine-toronto,43.70807,-79.341508


## 2) Toronto Neighbourhood Data

In [3]:
#Read Neighbourhoods data source file
neighbourhood = pd.read_csv('Resources/Neighbourhoods.csv')
neighbourhood.head()

Unnamed: 0,_id,AREA_ID,AREA_ATTR_ID,PARENT_AREA_ID,AREA_SHORT_CODE,AREA_LONG_CODE,AREA_NAME,AREA_DESC,X,Y,LONGITUDE,LATITUDE,OBJECTID,Shape__Area,Shape__Length,geometry
0,8401,25886861,25926662,49885,94,94,Wychwood (94),Wychwood (94),,,-79.425515,43.676919,16491505,3217960.0,7515.779658,"{u'type': u'Polygon', u'coordinates': (((-79.4..."
1,8402,25886820,25926663,49885,100,100,Yonge-Eglinton (100),Yonge-Eglinton (100),,,-79.40359,43.704689,16491521,3160334.0,7872.021074,"{u'type': u'Polygon', u'coordinates': (((-79.4..."
2,8403,25886834,25926664,49885,97,97,Yonge-St.Clair (97),Yonge-St.Clair (97),,,-79.397871,43.687859,16491537,2222464.0,8130.411276,"{u'type': u'Polygon', u'coordinates': (((-79.3..."
3,8404,25886593,25926665,49885,27,27,York University Heights (27),York University Heights (27),,,-79.488883,43.765736,16491553,25418210.0,25632.335242,"{u'type': u'Polygon', u'coordinates': (((-79.5..."
4,8405,25886688,25926666,49885,31,31,Yorkdale-Glen Park (31),Yorkdale-Glen Park (31),,,-79.457108,43.714672,16491569,11566690.0,13953.408098,"{u'type': u'Polygon', u'coordinates': (((-79.4..."


## 3) Toronto Ethnicity Data

In [4]:
# Read Ethnicity data from api
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
params = { "id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}
package = requests.get(url, params = params).json()

In [5]:
# Final Solution
offset = 0
total_record = 0
combined_dataframes = []

for resource in (package["result"]["resources"]):
    if resource["datastore_active"]:
        url = f'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/datastore_search?id={resource["id"]}&offset=0'
        while True:
            data = requests.get(url).json()
            next_page = data['result']['_links']['next']
            records = data['result']['records']
            if next_page and records:
                dataframe = pd.DataFrame(records)
                combined_dataframes.append(dataframe)
                url = f'https://ckan0.cf.opendata.inter.prod-toronto.ca{next_page}'
            else:
                break
    break

result = pd.concat(combined_dataframes).sort_index()

In [6]:
# Put filters on Category and Topic to get Ethnic population in Toronto neighbourhoods
ethnicity_df = result.loc[((result["Category"] == "Ethnic origin") & (result["Topic"] == "Ethnic origin population"))]
ethnicity_df.head()

Unnamed: 0,_id,Category,Topic,Data Source,Characteristic,City of Toronto,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,...,Willowdale West,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park
0,1401,Ethnic origin,Ethnic origin population,Census Profile 98-316-X2016001,Byelorussian,4085,0,15,25,70,...,45,0,10,0,0,25,10,45,30,30
0,1501,Ethnic origin,Ethnic origin population,Census Profile 98-316-X2016001,Liberian,170,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
0,1601,Ethnic origin,Ethnic origin population,Census Profile 98-316-X2016001,Cambodian (Khmer),2965,10,0,0,30,...,10,10,0,0,0,0,0,0,160,0
1,1602,Ethnic origin,Ethnic origin population,Census Profile 98-316-X2016001,Chinese,332830,16950,11455,420,2400,...,3825,655,4385,1260,805,1000,1100,645,2105,1130
1,1502,Ethnic origin,Ethnic origin population,Census Profile 98-316-X2016001,Malian,175,0,0,0,0,...,0,0,10,0,0,0,0,0,10,0


## 4) Toronto Neighbourhood Income Data

In [7]:
# Read Neighbourhoods Income data source file
income_df = pd.read_excel('Resources/neighbourhood-income-data-2011.xlsx')
income_df.head()

Unnamed: 0,Category,Topic,Attribute,City of Toronto,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,Banbury-Don Mills,Bathurst Manor,...,Willowdale West,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park
0,Population,Population,"Population, 2011",2615060.0,30279.0,21988.0,11904.0,29177.0,26918.0,15434.0,...,15004.0,21343.0,53350.0,11703.0,7826.0,13986.0,10578.0,11652.0,27713.0,14687.0
1,Population,Population,"Population, 2006",2503281.0,30156.0,21562.0,11656.0,27482.0,25439.0,14945.0,...,12517.0,20907.0,52461.0,11550.0,8051.0,14194.0,10497.0,11235.0,26140.0,14830.0
2,Population,Population,"Population percentage change, 2006 to 2011",4.5,,,,,,,...,,,,,,,,,,
3,Population,Population,Population density per square kilometre,4149.5,,,,,,,...,,,,,,,,,,
4,Population,Dwellings,Total private dwellings,1107851.0,9341.0,7861.0,4840.0,17172.0,12118.0,6320.0,...,6931.0,8336.0,19181.0,5391.0,3645.0,6002.0,5550.0,7128.0,11722.0,5444.0


## 5) Toronto Neighbourhood Crime Data

In [8]:
# Read Neighbourhood Crime Rates
crime_csv = "Resources/Neighbourhood_Crime_Rates.csv"
crime_df = pd.read_csv(crime_csv)
crime_df.head()

Unnamed: 0,_id,OBJECTID,Neighbourhood,Hood_ID,Population,Assault_2014,Assault_2015,Assault_2016,Assault_2017,Assault_2018,...,TheftOver_2016,TheftOver_2017,TheftOver_2018,TheftOver_2019,TheftOver_AVG,TheftOver_CHG,TheftOver_Rate_2019,Shape__Area,Shape__Length,geometry
0,1,16,South Parkdale,85,21849,202,226,231,229,220,...,9,10,9,22,10.0,1.44,100.7,2286974.0,10802.83216,"{u'type': u'Polygon', u'coordinates': (((-79.4..."
1,2,17,South Riverdale,70,27876,215,207,236,243,304,...,22,27,24,21,21.3,-0.13,75.3,10964570.0,43080.7247,"{u'type': u'Polygon', u'coordinates': (((-79.3..."
2,3,18,St.Andrew-Windfields,40,17812,53,41,48,45,55,...,8,7,6,6,8.5,0.0,33.7,7299580.0,13025.99746,"{u'type': u'Polygon', u'coordinates': (((-79.3..."
3,4,19,Taylor-Massey,61,15683,127,92,97,107,123,...,5,2,4,3,3.5,-0.25,19.1,1062970.0,5940.70005,"{u'type': u'Polygon', u'coordinates': (((-79.2..."
4,5,20,Humber Summit,21,12416,76,89,118,116,109,...,18,18,15,22,17.3,0.47,177.2,7966905.0,12608.57312,"{u'type': u'Polygon', u'coordinates': (((-79.5..."


## 6) Restaurants Rating & Review Data from Yelp API

In [18]:
# Due to the daily limit of API call, we will use pandas read_csv to read the dataframe we create by using Yelp API
# The yelp_api extraction using the below code can be found in the 'data_cleaning' folder. 
rating_df = pd.read_csv('clean_data/rating_df.csv')
rating_df.drop('Unnamed: 0', axis='columns', inplace=True)
rating_df.head()

Unnamed: 0,id,name,category,ratings,review_counts,zip_code
0,e41TP5cXZqSrz50xCBJqZw,Insomnia Restaurant & Lounge,Lounges,4.0,923,M5S 1Y6
1,r_BrIgzYcwo1NAuG9dLbpg,Pai Northern Thai Kitchen,Thai,4.5,2895,M5H 3G8
2,Uq-GOs9_IqweUsB5MdII9w,Emma's Country Kitchen,Breakfast & Brunch,4.0,394,M6C 1B6
3,iGEvDk6hsizigmXhDKs2Vg,Seven Lives Tacos y Mariscos,Mexican,4.5,1323,M5T 2K1
4,-ICGmF2qUVKdvOehVNgPbg,Lamesa Filipino Kitchen,Filipino,4.0,352,M6C 1A9


In [15]:
# This code was used to extract restaurant information in the 'yelp_api.ipynb' file under 'data_cleaning' folder.
# This code will not be run in this notebook due to the api limit
'''
yelp_api = YelpAPI(yelp_api_key)

neighbourhood_lat_long = pd.read_csv('Resources/Neighbourhoods.csv')

# Create a set of neighbourhood lat and lng combinations
lng_lats = []

lngs = neighbourhood_lat_long['LONGITUDE']
lats = neighbourhood_lat_long['LATITUDE']

lng_lats = zip(lngs, lats)

offset = 1
limit = 49
total_num_queries = 20

# Create lists to hold the information
ids = []
names = []
ratings = []
review_counts = []
zip_code = []

# Query restaurants information using neighbourhoods latitude & longitude
# We will try to find about 500 restaurants per each neighbourhood including duplicates
for lng_lats in zip(lngs, lats):
    
    for i in range(total_num_queries):
        
        response = yelp_api.search_query(latitude=lng_lats[1], longitude=lng_lats[0], radius=5000, limit=limit, offset=offset)

        for business in range(len(response['businesses'])):
            
            try:
                ids.append(response['businesses'][business]['id'])
                names.append(response['businesses'][business]['name'])
                ratings.append(response['businesses'][business]['rating'])
                review_counts.append(response['businesses'][business]['review_count'])
                zip_code.append(response['businesses'][business]['location']['zip_code'])
                
            except:
                pass
        
        offset = offset + limit
    
    offset = 1
'''

"\nyelp_api = YelpAPI('o2kVySxuJlQHYpsXVpPZPqV_VpfD1eACBU9JURPwZlbLiKOI3xWLEevsCB7Sq09OPUdxGumOHYE5ib0dkzR-X4Uvs3w0Cp1bJD3SEaCa0X307SNFXfiPW5hyQA3VX3Yx')\n\nneighbourhood_lat_long = pd.read_csv('Resources/Neighbourhoods.csv')\n\n# Create a set of neighbourhood lat and lng combinations\nlng_lats = []\n\nlngs = neighbourhood_lat_long['LONGITUDE']\nlats = neighbourhood_lat_long['LATITUDE']\n\nlng_lats = zip(lngs, lats)\n\noffset = 1\nlimit = 49\ntotal_num_queries = 20\n\n# Create lists to hold the information\nids = []\nnames = []\nratings = []\nreview_counts = []\nzip_code = []\n\n# Query restaurants information using neighbourhoods latitude & longitude\n# We will try to find about 500 restaurants per each neighbourhood including duplicates\nfor lng_lats in zip(lngs, lats):\n    \n    for i in range(total_num_queries):\n        \n        response = yelp_api.search_query(latitude=lng_lats[1], longitude=lng_lats[0], radius=5000, limit=limit, offset=offset)\n\n        for business in range(