In [None]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
# make_blobs is exported from sklearn.datasets; the old `samples_generator` submodule path was
# deprecated in scikit-learn 0.22 and removed in 0.24, so importing from it fails on current releases.
from sklearn.datasets import make_blobs

print('Libraries imported.')

# Scrape Data using Pandas

In [None]:
# Wikipedia page for Canadian postal codes starting with "M"; read_html returns one DataFrame
# per HTML table found on the page, and the first of them is the one used below.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
scraped_tables = pd.read_html(postal_codes_url)
df = scraped_tables[0]

In [None]:
# Preview the first rows of the scraped table.
df.head()

In [None]:
# (rows, columns) of the scraped table before any cleaning.
df.shape

# Clean the data

### Remove rows where Borough is Not assigned

In [None]:
# Keep only the rows whose Borough is not the placeholder 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df.shape

### Combine neighbourhoods that share a postcode into one comma-separated row

In [None]:
def group_apply(x):
    """Collapse one group of rows into a single-row DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group; must provide 'Borough' and 'Neighbourhood' columns.

    Returns
    -------
    pandas.DataFrame
        One row with columns 'Borough' (the group's first Borough value) and
        'Neighbourhood' (all Neighbourhood values of the group joined with ',').
    """
    first_borough = x['Borough'].tolist()[0]
    joined_neighbourhoods = ','.join(x['Neighbourhood'])
    single_row = [[first_borough, joined_neighbourhoods]]
    return pd.DataFrame(single_row, columns=['Borough', 'Neighbourhood'])

# Build one row per postcode, then move the postcode out of the index into a column
# and discard the remaining inner index level left over from the per-group frames.
per_postcode = df.groupby('Postcode').apply(group_apply)
postcode_as_column = per_postcode.reset_index(level=0)
grouped_df = postcode_as_column.reset_index(drop=True)

In [None]:
# Preview the frame after collapsing each postcode into a single row.
grouped_df.head()

In [None]:
# (rows, columns) after grouping by postcode.
grouped_df.shape

### Use the Borough as the Neighbourhood when the Neighbourhood is 'Not assigned'

In [None]:
# Work on a copy so grouped_df stays untouched.
final_df = grouped_df.copy()

# Rows whose Neighbourhood is the placeholder take their Borough value instead.
unassigned = final_df['Neighbourhood'] == 'Not assigned'
final_df.loc[unassigned, 'Neighbourhood'] = final_df.loc[unassigned, 'Borough']

In [None]:
# Preview the frame after filling in 'Not assigned' neighbourhoods.
final_df.head()

In [None]:
# (rows, columns) after the neighbourhood fill; the fill only rewrites values, so this matches grouped_df.shape.
final_df.shape

# Get latitude/longitude for each postcode from the geospatial CSV file

In [None]:
# CSV path is relative to the notebook's working directory.
geospatial_csv = 'Geospatial_Coordinates.csv'
geospatial_data = pd.read_csv(geospatial_csv)

In [None]:
# Preview the coordinate table loaded from the CSV.
geospatial_data.head()

In [None]:
# Left join: every row of final_df is kept, matched to the CSV rows whose
# 'Postal Code' equals its 'Postcode'.
final_df = final_df.merge(
    geospatial_data,
    how='left',
    left_on='Postcode',
    right_on='Postal Code',
)

In [None]:
# Drop the right-hand join key, which is redundant with 'Postcode' after the merge.
# Reassigning instead of inplace=True, together with errors='ignore', lets this cell be
# re-run without raising KeyError once the column is already gone.
final_df = final_df.drop(columns='Postal Code', errors='ignore')
final_df.head()

In [None]:
# (rows, columns) after merging in the coordinates and dropping the join key.
final_df.shape

# Cluster the neighbourhoods

In [None]:
# NOTE: matplotlib.pyplot is already imported as `plt` in the first cell; this repeat import is redundant.
from matplotlib import pyplot as plt

In [None]:
# Longitude goes on x and latitude on y so the points appear in the usual map orientation
# (the reverse transposes the map); axis labels and a title let the figure stand alone.
fig, ax = plt.subplots()
ax.scatter(final_df['Longitude'], final_df['Latitude'])
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Postcode locations');

In [None]:
# set number of clusters
kclusters = 5

# Features are every column except the identifying text columns.
# - `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
# - 'Cluster' is dropped as well (errors='ignore' covers the first run, when it does not
#   exist yet) so re-running this cell after the labels were attached to final_df does
#   not feed the previous labels back in as a feature.
final_df_clustering = (
    final_df
    .drop(columns=['Neighbourhood', 'Postcode', 'Borough'])
    .drop(columns='Cluster', errors='ignore')
)

# run k-means clustering; n_init is pinned to the long-standing default of 10 because newer
# scikit-learn releases changed the default, which would otherwise alter the result
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(final_df_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

In [None]:
# Attach each row's cluster label; labels_ is in the same row order as final_df because
# the clustering input was derived from final_df by dropping columns only.
final_df['Cluster'] = kmeans.labels_

In [None]:
# Same scatter coloured by cluster label. Longitude goes on x and latitude on y so the
# points appear in the usual map orientation; labels and a title let the figure stand alone.
fig, ax = plt.subplots()
ax.scatter(final_df['Longitude'], final_df['Latitude'], c=final_df['Cluster'])
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Postcode locations coloured by k-means cluster');

In [None]:
# Preview the final frame, now including the 'Cluster' column.
final_df.head()