# Toronto Neighborhoods

This notebook retrieves the list of Toronto neighborhoods from the Wikipedia page of Toronto postal codes, cleans the data, and then adds geospatial data.

In [2]:
#import all dependencies
import pandas as pd
import numpy as np 
import urllib.request
from bs4 import BeautifulSoup

### Obtain Data via BeautifulSoup
Create a soup object from the Wikipedia page and initialize a dataframe. Loop through the content of the wiki table and save each row to the dataframe.

In [3]:
# Source page listing the postal codes that start with "M".
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the page inside a context manager so the HTTP response is always closed,
# and name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a GuessedAtParserWarning.
with urllib.request.urlopen(wiki_url) as page:
    soup = BeautifulSoup(page, 'html.parser')

In [16]:
# Find the first <tbody> on the page (presumably the postal-code table) and
# collect its rows.
table = soup.find('tbody')
if table is None:
    raise ValueError('No <tbody> element found on the page; the layout may have changed.')
content = table.find_all('tr')

# Data columns expected in every table row, in page order.
col_names = ['PostalCode', 'Borough', 'Neighborhood']

# Gather one record per data row. Header rows are built from <th> cells, so they
# yield no <td> values and are skipped, as is any row of unexpected width
# (the previous reshape-based approach crashed on such rows).
records = []
for row in content:
    cells = [cell.get_text().strip() for cell in row.find_all('td')]
    if len(cells) == len(col_names):
        records.append(cells)

if not records:
    raise ValueError('No three-column data rows found in the table; the layout may have changed.')

# Build the frame in one step: DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row is quadratic. Rows are numbered from 0 in page
# order and the index keeps the name 'Index'.
neighborhoods = pd.DataFrame(records, columns=col_names)
neighborhoods.index.name = 'Index'

### Drop Unassigned and Roll Up by Code
Drop any rows with unassigned boroughs, then fill in unassigned neighborhoods with the borough name. Group the dataframe by postal code, rolling up neighborhoods with the same code.

In [17]:
# Remove rows with unassigned boroughs. Take an explicit copy so the assignment
# below modifies an independent frame and does not raise SettingWithCopyWarning.
assigned = neighborhoods.loc[neighborhoods['Borough'] != 'Not assigned'].copy()

# Where the neighborhood is unassigned, fall back to the borough name.
# Label-based and vectorized: no row loop and no positional column numbers.
unnamed = assigned['Neighborhood'] == 'Not assigned'
assigned.loc[unnamed, 'Neighborhood'] = assigned.loc[unnamed, 'Borough']

# Roll up by postal code: one row per (PostalCode, Borough) pair, with the
# neighborhoods sharing that code gathered into a list.
grouped = assigned.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(list)
neighborhoods = grouped.reset_index()

In [18]:
# Collapse each postal code's list of neighborhoods into one comma-separated
# string. Values that are already strings are passed through unchanged, so
# re-running this cell does not split them into individual characters.
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].apply(
    lambda names: names if isinstance(names, str) else ', '.join(names)
)
neighborhoods.head(n=30)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [None]:
# Display the (rows, columns) dimensions of the rolled-up neighborhoods frame.
neighborhoods.shape