## Part 1

In [1]:
!pip install BeautifulSoup4
!pip install requests



In [5]:
#Import Beautiful Soup, lxml, requests to scrap data from Toronto Neighborhood in Wikipedia

#!pip install wikipedia
#import wikipedia as wp

import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 

#conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

from IPython.display import display_html

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from bs4 import BeautifulSoup
import requests                  

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

!pip install folium
import folium # map rendering library

print("Libraries Imported!")

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 8.7MB/s ta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Libraries Imported!


### Extracting data from wikipedia page

In [6]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source,'lxml')
# print(soup.title)
# If page has scraped successfully, then the table of postal codes of Canada is printed.
table = str(soup.table)
display_html(table,raw=True)

Postal Code,Borough,Neighborhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


### Cleaning and preprocessing : converting html table to Pandas DataFrame

In [7]:
canada_df = pd.read_html(table)
df = canada_df[0]

# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood 
df.columns=['Postcode','Borough','Neighbourhood']

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data preprocessing and cleaning:
    - Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned
    - If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [8]:
# Dropping the rows where Borough is 'Not assigned'
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)

# Combining the neighbourhoods with same Postalcode
df2 = df.groupby(['Postcode','Borough'], sort=False).agg(', '.join)
df2.reset_index(inplace=True)

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
df2.loc[df2['Neighbourhood']=="Not assigned",'Neighbourhood']=df2.loc[df2['Neighbourhood']=="Not assigned",'Borough']
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
df2.shape

(103, 3)

## Part 2:

### Importing the csv file containing the latitudes and longitudes for various neighbourhoods 

In [10]:
geo_data=pd.read_csv("https://cocl.us/Geospatial_data")
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


#### Merging the two tables for getting the Latitudes and Longitudes 

In [11]:
df2['Latitude']=geo_data['Latitude'].values
df2['Longitude']=geo_data['Longitude'].values

df2

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.744734,-79.239476
6,M1B,Scarborough,"Malvern, Rouge",43.727929,-79.262029
7,M3B,North York,Don Mills,43.711112,-79.284577
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.716316,-79.239476
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848


## Part 3 : Clustering the neighborhoods in Toronto

### Getting all the rows from the data frame which contains Toronto in their Borough

In [12]:
df3 = df2[df2['Borough'].str.contains('Toronto',regex=False)]
df3

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848
15,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389
19,M4E,East Toronto,The Beaches,43.786947,-79.385975
20,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714
24,M5G,Downtown Toronto,Central Bay Street,43.782736,-79.442259
25,M6G,Downtown Toronto,Christie,43.753259,-79.329656
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.737473,-79.464763
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.739015,-79.506944


### Visualizing the data

In [15]:
# Find the center of all the locations and prepare the folium map

center_lat=df2.Latitude.mean()
center_long=df2.Longitude.mean()

map_toronto = folium.Map(location=[center_lat, center_long], zoom_start=13)   # generate map 
for lat,lng,borough,neighbourhood in zip(df3['Latitude'],df3['Longitude'],df3['Borough'],df3['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(str(label), parse_html=True)
    #folium.features.CircleMarker
    folium.CircleMarker(
    [lat, lng],
    radius= 5,
    color= 'blue',
    popup=label,
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6).add_to(map_toronto)

map_toronto


In [16]:
# save the map as HTML file
map_toronto.save('map_toronto.html')

### Using KMeans clustering

In [17]:
# set number of clusters
kclusters = 5

toronto_clustering = df3.drop(['Postcode','Borough','Neighbourhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# add clustering labels
df3.insert(0, 'Cluster Labels', kmeans.labels_)

In [18]:
df3

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
4,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
9,2,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848
15,0,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389
19,0,M4E,East Toronto,The Beaches,43.786947,-79.385975
20,0,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714
24,0,M5G,Downtown Toronto,Central Bay Street,43.782736,-79.442259
25,0,M6G,Downtown Toronto,Christie,43.753259,-79.329656
30,3,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.737473,-79.464763
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.739015,-79.506944


In [19]:
center_lat=df2.Latitude.mean()
center_long=df2.Longitude.mean()

# create map
map_clusters = folium.Map(location=[center_lat, center_long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Neighbourhood'], df3['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [20]:
# save the map as HTML file
map_clusters.save('map_clusters.html')