In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from urllib.request import urlopen
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans

from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [3]:
%matplotlib inline

# extracting the table from the web page

In [7]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')

In [13]:
rows = soup.find_all('tr')
rows

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td></tr>, <tr>
 <td>M6A</td>
 <td

In [14]:
list_rows = []
for row in rows:
    row_td = row.find_all('td')
    clean_text = BeautifulSoup(str(row_td), "lxml").get_text()
    list_rows.append(clean_text)

In [15]:
list_rows

['[]',
 '[M1A, Not assigned, Not assigned\n]',
 '[M2A, Not assigned, Not assigned\n]',
 '[M3A, North York, Parkwoods\n]',
 '[M4A, North York, Victoria Village\n]',
 '[M5A, Downtown Toronto, Harbourfront\n]',
 '[M5A, Downtown Toronto, Regent Park\n]',
 '[M6A, North York, Lawrence Heights\n]',
 '[M6A, North York, Lawrence Manor\n]',
 "[M7A, Queen's Park, Not assigned\n]",
 '[M8A, Not assigned, Not assigned\n]',
 '[M9A, Etobicoke, Islington Avenue\n]',
 '[M1B, Scarborough, Rouge\n]',
 '[M1B, Scarborough, Malvern\n]',
 '[M2B, Not assigned, Not assigned\n]',
 '[M3B, North York, Don Mills North\n]',
 '[M4B, East York, Woodbine Gardens\n]',
 '[M4B, East York, Parkview Hill\n]',
 '[M5B, Downtown Toronto, Ryerson\n]',
 '[M5B, Downtown Toronto, Garden District\n]',
 '[M6B, North York, Glencairn\n]',
 '[M7B, Not assigned, Not assigned\n]',
 '[M8B, Not assigned, Not assigned\n]',
 '[M9B, Etobicoke, Cloverdale\n]',
 '[M9B, Etobicoke, Islington\n]',
 '[M9B, Etobicoke, Martin Grove\n]',
 '[M9B, Etobi

# preprocessing the data

In [16]:
df = pd.DataFrame(list_rows)
df_splitted = df[0].str.split(',', expand=True)
df_splitted = df_splitted.iloc[:, 0:3]
df_splitted.dropna(axis=0, thresh=3, inplace=True)
df_splitted.reset_index(drop=True, inplace=True)
df_splitted.drop([288, 289, 290], inplace=True)
df_splitted = df_splitted.apply(lambda x: x.str.strip('[]\n '))
df_splitted

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [17]:
column_names = ['Postcode', 'Borough', 'Neighborhood']
df_splitted.columns = column_names
df_splitted

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [18]:
df_assigned = df_splitted[df_splitted.Borough != 'Not assigned']
df_assigned

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [49]:
def clean_list(x):
    cleaned = []
    for i in x:
        if np.unique(i).shape[0] > 1:
            cleaned.append(', '.join(i))
        else:
            cleaned.append(i[0])
    return cleaned

df_grouped = df_assigned.groupby('Postcode', as_index=False).agg(list)
df_grouped[['Borough', 'Neighborhood']] = \
        df_grouped[['Borough', 'Neighborhood']].apply(clean_list)

#df_grouped['Neighborhood'] = list(map(str, df_grouped['Neighborhood']))
#df_grouped= df_grouped.apply(lambda x: x.str.strip('[]'))

is_not_assigned_neighbour = df_grouped.Neighborhood == 'Not assigned'
for idx in range(df_grouped.shape[0]):
    if is_not_assigned_neighbour[idx]:
        df_grouped.at[idx, 'Neighborhood'] = df_grouped.at[idx, 'Borough']

df_grouped

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [22]:
df_grouped.shape

(103, 3)

# merging with geospatial data

In [26]:
df_geo_coordinates = pd.read_csv('../data/Geospatial_Coordinates.csv')
df_geo_coordinates.rename(columns={'Postal Code': 'Postcode'}, inplace=True)

In [27]:
df_geo_coordinates

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [51]:
df_canada = df_grouped.merge(df_geo_coordinates, on='Postcode')
df_canada

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# finding boroughs in Toronto

In [55]:
df_toronto = df_canada[df_canada['Borough'].str.find('Toronto') != -1]
df_toronto.reset_index(drop=True, inplace=True)
df_toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


# classifying neighborhoods by postcode

In [62]:
k = len(df_toronto['Borough'].unique())
k_means_clf = KMeans(init='k-means++', n_clusters=k, n_init=10)
k_means_clf.fit(df_toronto[['Latitude', 'Longitude']])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [66]:
geo_locator = Nominatim(user_agent='foursquare_agent')
location = geo_locator.geocode('Toronto')
toronto_latitude = location.latitude
toronto_longitude = location.longitude

In [72]:
map_toronto = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=10)

x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colours_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colours_array]

marker_colours = []
for lat, lon, poi, cluster in zip(
                        df_toronto['Latitude'], df_toronto['Longitude'], 
                        df_toronto['Neighborhood'], k_means_clf.labels_):
    label = folium.Popup(str(poi) + ' Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7
    ).add_to(map_toronto)
    
map_toronto