# Part One

In [461]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [462]:
#use old Wikipedia file
req = requests.get("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=942851379")
soup = BeautifulSoup(req.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))

neigh=pd.DataFrame(df[0])
neigh

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


In [463]:
#drop boroughs that are not assigned
neigh.drop(neigh[neigh['Borough'] == "Not assigned"].index, inplace = True)

In [464]:
#group duplicate boroughs; reset index
neigh = neigh.groupby(['Postcode','Borough'], sort=False).agg(', '.join)
neigh.reset_index(inplace=True)


In [465]:
#replace name of unassigned neighborhoods to that of borough
neigh.Neighbourhood.replace("Not assigned", neigh.Borough, inplace=True)


In [476]:
neigh.rename(columns ={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}, inplace=True)

# Part Two

In [468]:
geo = pd.read_csv("Geospatial_Coordinates.csv")

In [499]:
geo

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [500]:
geo.rename(columns = {'Postal Code': 'PostalCode'},inplace = True)

In [501]:
#merge latitude and longitude base on postal code
neigh2 = pd.merge(neigh, geo, on = 'PostalCode')

In [502]:
neigh2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


# Part Three

In [503]:
#only look at Toronto
toronto = neigh2[neigh2['Borough'].str.contains('Toronto', regex = False)]

In [555]:
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [505]:
import folium
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim

### Define Toronto address and get latitude/longitude

In [496]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude,longitude)


43.6534817 -79.3839347


### Create map

In [579]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)
map_toronto

### Plot neighborhoods

In [580]:
for lat, lng, borough, neighborhood in zip(neigh2['Latitude'], neigh2['Longitude'], neigh2['Borough'], neigh2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [582]:
neigh_loc = neigh2.drop(['PostalCode', 'Borough', 'Neighborhood',],1)
neigh_loc.head()


Unnamed: 0,Latitude,Longitude
5,43.667856,-79.532242
6,43.806686,-79.194353
7,43.745906,-79.352188
8,43.706397,-79.309937
9,43.657162,-79.378937


### k-means

In [583]:
kclusters = 5
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(neigh_loc)

# run k-means clustering
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 2, 3, 0, 0, 3, 4, 2, 3, 0])

In [584]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
map_clusters

In [586]:
# add clustering labels
neigh2.insert(0, 'Cluster Labels', kmeans.labels_)

ValueError: cannot insert Cluster Labels, already exists

### Map the 5 neighborhood clusters

In [587]:
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(neigh2['Latitude'], neigh2['Longitude'], neigh2['Neighborhood'], neigh2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters