<b>Coursera Peer Review Assignment
    
Segment and Cluster Toronto Neighborhoods</b>



 <b>First Step: Use the Notebook to build the code to scrape the following Wikipedia page. </b>

In [23]:
#Import needed items
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
from pandas.io.json import json_normalize  
from sklearn.cluster import KMeans

In [24]:
# Scrape the Toronto postal-code table from Wikipedia and build a DataFrame,
# skipping every row whose borough is "Not assigned".
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
NOT_ASSIGNED = "Not assigned\n"  # cell text still carries its trailing newline here

response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
source = response.text
soup = BeautifulSoup(source, 'lxml')

table = soup.find("table")
if table is None or table.tbody is None:
    raise ValueError("No postal code table found at " + WIKI_URL)
table_rows = table.tbody.find_all("tr")

res = []
for tr in table_rows:
    # Rows without <td> cells give an empty list; anything shorter than the
    # three expected columns is skipped rather than raising IndexError.
    row = [cell.text for cell in tr.find_all("td")]
    if len(row) < 3 or row[1] == NOT_ASSIGNED:
        continue
    # A neighborhood that is not assigned takes the borough's name.
    if NOT_ASSIGNED in row[2]:
        row[2] = row[1]
    res.append(row)

df = pd.DataFrame(res, columns = ["Postal Code", "Borough", "Neighborhood"])
df.head()



Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [26]:
# Clean up the "\n" left on the scraped fields.
# regex=False states the literal replacement explicitly, so the result does
# not depend on the pandas version's default for `regex`.
for column in ["Postal Code", "Borough", "Neighborhood"]:
    df[column] = df[column].str.replace("\n", "", regex=False)
df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood,PostalCode
0,M3A,North York,Parkwoods,M3A
1,M4A,North York,Victoria Village,M4A
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A


In [27]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of the original values.
neighborhoods_by_code = df.groupby(["Postal Code", "Borough"])["Neighborhood"]
df = neighborhoods_by_code.apply(", ".join).reset_index()
print("Shape: ", df.shape)
df.head()

Shape:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<b>Second Step : Get Latitude/Longitude and make a new data frame</b>

In [28]:
# geocoder did not work for me, so the coordinates come from the alternate
# CSV given in class.
geo_data_url = "http://cocl.us/Geospatial_data"
geo_data = pd.read_csv(geo_data_url)

geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [74]:
# Left-join the coordinates onto the neighborhood table by postal code, then
# discard the join key so one combined table remains.
df_toronto = (
    df.merge(geo_data, how='left', on='Postal Code')
      .drop(columns="Postal Code")
)
df_toronto.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


<b>Third Step : Explore and cluster the neighborhoods in Toronto</b>

Time to explore!

In [93]:
# Limit to the boroughs with "Toronto" in the name.
# na=False treats a missing Borough as "no match" so the mask stays boolean.
# .copy() makes the result an independent frame, because a later cell adds a
# column to it in place.
is_toronto = df_toronto['Borough'].str.contains('Toronto', regex=False, na=False)
df_torontoonly = df_toronto[is_toronto].copy()
df_torontoonly


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
37,East Toronto,The Beaches,43.676357,-79.293031
41,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,East Toronto,Studio District,43.659526,-79.340923
44,Central Toronto,Lawrence Park,43.72802,-79.38879
45,Central Toronto,Davisville North,43.712751,-79.390197
46,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,Central Toronto,Davisville,43.704324,-79.38879
48,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


Create a Toronto Map

This involves importing folium, which is a mapping tool.

In [94]:
import folium #mapping tool


# Base map; the centre coordinates are hard-coded.
toronto_center = [43.651070, -79.347015]
map_toronto = folium.Map(location=toronto_center, zoom_start=10)

# One blue circle marker per row, with a "neighborhood, borough" popup.
marker_fields = df_torontoonly[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)
map_toronto



Next we will cluster the neighborhoods

In [95]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
# Preview the first rows of the Toronto-only frame before clustering.
df_torontoonly.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
37,East Toronto,The Beaches,43.676357,-79.293031
41,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,East Toronto,Studio District,43.659526,-79.340923
44,Central Toronto,Lawrence Park,43.72802,-79.38879


The code below runs k-means and then inserts the resulting cluster labels into the Toronto-only table.

In [96]:
# Number of clusters for k-means.
k=5

# Cluster on the remaining (coordinate) columns only. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts. 'Cluster' is
# dropped too (errors='ignore' covers the first run, when it does not exist
# yet) so re-running this cell never feeds old labels back in as a feature.
toronto_clustering = df_torontoonly.drop(
    columns=['Borough', 'Neighborhood', 'Cluster'], errors='ignore')

# n_init is pinned because scikit-learn's default for it varies by version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clustering)

# Remove any Cluster column left by an earlier run (insert() raises if the
# column already exists), then put the fresh labels in the first column.
df_torontoonly = df_torontoonly.drop(columns='Cluster', errors='ignore')
df_torontoonly.insert(0, 'Cluster', kmeans.labels_)

In [97]:
# Display the frame to check the Cluster column added in the previous cell.
df_torontoonly

Unnamed: 0,Cluster,Borough,Neighborhood,Latitude,Longitude
37,4,East Toronto,The Beaches,43.676357,-79.293031
41,4,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,4,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,4,East Toronto,Studio District,43.659526,-79.340923
44,2,Central Toronto,Lawrence Park,43.72802,-79.38879
45,2,Central Toronto,Davisville North,43.712751,-79.390197
46,2,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,2,Central Toronto,Davisville,43.704324,-79.38879
48,2,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,2,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


Success - all the neighborhoods have a Cluster assigned

Next let's map the clusters

In [98]:
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# One hex colour per cluster, evenly spaced across the rainbow colormap.
# (The previous x/ys arrays were only ever used for their length, which is k.)
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One circle marker per row, coloured by cluster; the popup names the
# neighbourhood as well as its cluster number.
for lat, lon, neighbourhood, cluster in zip(df_torontoonly['Latitude'], df_torontoonly['Longitude'], df_torontoonly['Neighborhood'], df_torontoonly['Cluster']):
    label = folium.Popup(str(neighbourhood) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 is kept from the original so the rendered colours do not
    # change: label 0 indexes rainbow[-1], the last colour, and every label
    # still gets its own distinct colour.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Analysis:
    The purple cluster is the downtown Toronto area.
    The other clusters are dispersed around downtown Toronto.
    These clusters will help us further analyze this area in terms of which locations are near each other.