__Scrape Toronto Neighborhood Data__

In [3]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim 
import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#import folium # map rendering library
import requests
import lxml.html as lh

#------------------------
# Download the Wikipedia list of Canadian postal codes beginning with "M" and
# collect every table row (<tr>) found anywhere in the document.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
page.raise_for_status()
doc = lh.fromstring(page.content)
# '//tr' matches rows of *every* table on the page, not only the postal-code table.
tr_elements = doc.xpath('//tr')

#---------------------------------
# Build one (header title, empty value list) pair per cell of the first table row.
# Header text is stripped and embedded newlines are turned into spaces so the
# titles can be used directly as DataFrame column names later on.
col = [
    (header_cell.text_content().strip().replace('\n', ' '), [])
    for header_cell in tr_elements[0]
]
#-------------------------------------------

#Since our first row is the header, data is stored on the second row onwards
# Fill the per-column value lists in `col` from the data rows.
# The first row is the header, so data rows start at index 1.
for j in range(1, len(tr_elements)):
    # T is the j'th <tr> element
    T = tr_elements[j]

    # A row whose cell count differs from the header's is not part of the table
    # being scraped (tr_elements holds the rows of every table on the page), so
    # stop reading at the first such row.
    if len(T) != len(col):
        break

    # i is the column index of the current cell
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content().strip().replace('\n', ' ')
        # For every column except the first, store purely numeric text as int.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the cleaned-up string unchanged.
                pass
        # Append the value to the list belonging to the i'th column
        col[i][1].append(data)
#---------------------------------------------------------
# Assemble the scraped columns into a DataFrame: one column per header title.
Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)

# Discard rows that have no assigned borough or an empty postal code.
df = df[(df['Borough'] != 'Not assigned') & (df['Postal code'] != '')]

# Merge rows that share a postal code and borough into a single row whose
# neighborhoods are comma-joined. The result has to be assigned back: groupby
# does not modify `df` in place. sort=False keeps the order in which the postal
# codes were scraped, and dict.fromkeys removes duplicate names while keeping a
# deterministic (first-seen) order, which set() does not guarantee.
df = (
    df.groupby(['Postal code', 'Borough'], sort=False, as_index=False)
      .agg({'Neighborhood': lambda names: ','.join(dict.fromkeys(map(str, names)))})
)

# Normalise the separator between neighborhood names: '/' becomes ','.
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)

# A postal code without an assigned neighborhood takes its borough's name.
df['Neighborhood'] = np.where(
    df['Neighborhood'] == 'Not assigned', df['Borough'], df['Neighborhood']
)

__Load Toronto Geospatial Data__

In [8]:
# Read the postal code -> latitude/longitude table and rename its key column so it
# matches the 'Postal code' column of the scraped neighborhood dataframe.
url1 = 'http://cocl.us/Geospatial_data'
trnt_geo_data = pd.read_csv(url1).rename(columns={'Postal Code': 'Postal code'})
trnt_geo_data.head()


Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


__Validate number of unique postal codes__

In [5]:
# Report how many distinct postal codes the geospatial table contains.
# The column is looked up as 'Postal code' because the loading cell renames
# 'Postal Code' to that spelling; the old name raises KeyError on a fresh run.
print('The dataframe has {} unique Postal codes'.format(
        len(trnt_geo_data['Postal code'].unique())
    )
)

The dataframe has 103 unique Postal codes


__Join Neighborhood Dataframe with Geospatial Dataframe__

In [9]:
# Attach latitude/longitude to each neighborhood row. A left join keeps every row
# of `df`, including any postal code that has no match in the geospatial table.
df_trnt = pd.merge(df, trnt_geo_data, how='left', on='Postal code')
df_trnt.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
