<h1> 1. Create Dataframe using the wikipedia link </h1>

In [71]:
from bs4 import BeautifulSoup # For Web scraping to get the table

import pandas as pd # library for data analsysis
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


<h3> First read the html page and create a dataframe </h3>

In [40]:
source_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source_url, 'xml')
table = soup.find('table',{'class':'wikitable sortable'})

In [41]:
# Loop through the table to build the DataFrame
table_rows = table.find_all('tr')
data = []
for row in table_rows:
    td=[]
    for t in row.find_all('td'):
        td.append(t.text.strip())
    data.append(td)
df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [42]:
for tr_cell in table.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        df.loc[len(df)] = row_data

In [43]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


<h3> Data preprocessing and Cleaning -  </h3>

In [44]:
df = df[~df['Borough'].isnull()]  # to filter out bad rows
df.drop(df[df.Borough == 'Not assigned'].index, inplace=True) # Drop rows with a borough that is Not assigned.
df.reset_index(drop=True, inplace=True)
df = df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda x: ','.join(x)).reset_index()
df['Neighborhood'].replace('Not assigned',df['Borough'],inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge,Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek,Rouge H..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill,Guildwood, M..."
3,M1G,Scarborough,"Woburn,Woburn"
4,M1H,Scarborough,"Cedarbrae,Cedarbrae"
5,M1J,Scarborough,"Scarborough Village,Scarborough Village"
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park,Ke..."
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge,Golden Mile, C..."
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village Wes..."
9,M1N,Scarborough,"Birch Cliff, Cliffside West,Birch Cliff, Cliff..."


In [45]:
df.shape

(103, 3)

<h3> Import csv file conatining the latitudes and longitudes of neighbourhoods in Canada </h3>

In [46]:
geo_coor = pd.read_csv('https://cocl.us/Geospatial_data')
geo_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h3> Merge the original DataFrame with the one with the coordinates </h3?

In [50]:
geo_coor.rename(columns={'Postal Code':'PostalCode'},inplace=True)
geo_coor.head()
df1 = pd.merge(df,geo_coor,on='PostalCode')
df1

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge,Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek,Rouge H...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill,Guildwood, M...",43.763573,-79.188711
3,M1G,Scarborough,"Woburn,Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae,Cedarbrae",43.773136,-79.239476
5,M1J,Scarborough,"Scarborough Village,Scarborough Village",43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park,Ke...",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge,Golden Mile, C...",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village Wes...",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West,Birch Cliff, Cliff...",43.692657,-79.264848


<h1> Exploring and clustering the neighborhoods in Toronto</h1>

<h3>Getting all the neighborhoods that contain 'Toronto' in them</h3>

In [51]:
df2 = df1[df1['Borough'].str.contains('Toronto',regex=False)]
df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,"The Beaches,The Beaches",43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale,The Danforth West...",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West,India Bazaar, T...",43.668999,-79.315572
43,M4M,East Toronto,"Studio District,Studio District",43.659526,-79.340923
44,M4N,Central Toronto,"Lawrence Park,Lawrence Park",43.72802,-79.38879
45,M4P,Central Toronto,"Davisville North,Davisville North",43.712751,-79.390197
46,M4R,Central Toronto,"North Toronto West, Lawrence Park,North Toron...",43.715383,-79.405678
47,M4S,Central Toronto,"Davisville,Davisville",43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East,Moore Park, Summer...",43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


<h3> Map visualization using Folium </h3>

In [72]:
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=11)

for lat,lng,borough,neighborhood in zip(df2['Latitude'],df2['Longitude'],df2['Borough'],df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

<h1> Cluster Neighborhoods </h1>

<h3> Using K-Means Clustering </h3>

In [75]:
k=5
toronto_clustering = df2.drop(['PostalCode','Borough','Neighborhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_clustering)
kmeans.labels_
df2.insert(0, 'ClusterLabels', kmeans.labels_)

In [76]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighborhood, cluster in zip(df2['Latitude'], df2['Longitude'], df2['Neighborhood'], df2['ClusterLabels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h3> To view the maps please use the links below </h3>

<a href="https://github.com/pranaylohkare/Coursera_Capstone/blob/master/Screen%20Shot%202020-07-06%20at%205.35.19%20PM.png">Map1</a>
<a href="https://github.com/pranaylohkare/Coursera_Capstone/blob/master/Screen%20Shot%202020-07-06%20at%205.34.40%20PM.png">Map2</a>