# Coursera Capstone Project - Notebook 
The notebook scrapes the Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M  to process into a dataframe.  The dataframe is then cleaned and put into a usable form for Week 3. 


# Decision 
I decided to "Explore and Cluster the Neighborhoods in Toronto" and follow the pattern observed in the labs. 

Elements covered are: 

    to add enough Markdown cells to explain what you decided to do and to report any observations you make. (( included here and down below ))
    to generate maps to visualize your neighborhoods and how they cluster together.
    

In [4]:
# Imports to load the utilities needed to parse and create the data frame 
import xml.etree.ElementTree as ET
import requests
import pandas as pd

In [8]:
# Downloads the Object
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)
r

<Response [200]>

In [9]:
df = pd.DataFrame([], columns=['Postcode','Borough','Neighbourhood'])

tree = ET.fromstring(r.content)
tables = tree.findall(".//*[@id]/div/table")[0]

rows = tables.findall('.//tbody/tr')
for row in rows: 
    if not 'Not assigned' in str(row[1].text):
        borough = str(row[1].text).replace('\n','')
        if 'None' in borough: 
            borough = row[1][0].text
        
        neighbourhood = str(row[2].text).replace('\n','')
        if 'None' in neighbourhood: 
            neighbourhood = row[2][0].text
        
        if 'Not assigned' in neighbourhood: 
            neighbourhood = borough
        
        df = df.append(
            {'Postcode': str(row[0].text).replace('\n',''), 
             'Borough' : borough, 
             'Neighbourhood' : neighbourhood}, 
            ignore_index=True)

# remove the header data element
df = df.drop(df.index[0])
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,212,212,212
unique,103,11,210
top,M9V,Etobicoke,Runnymede
freq,8,45,2


In [10]:
df_g = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()
df_g.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [11]:
df_g[df_g['Borough'].str.contains('Downtown Toronto')]

Unnamed: 0,Postcode,Borough,Neighbourhood
50,M4W,Downtown Toronto,Rosedale
51,M4X,Downtown Toronto,"Cabbagetown,St. James Town"
52,M4Y,Downtown Toronto,Church and Wellesley
53,M5A,Downtown Toronto,"Harbourfront,Regent Park"
54,M5B,Downtown Toronto,"Ryerson,Garden District"
55,M5C,Downtown Toronto,St. James Town
56,M5E,Downtown Toronto,Berczy Park
57,M5G,Downtown Toronto,Central Bay Street
58,M5H,Downtown Toronto,"Adelaide,King,Richmond"
59,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station"


In [12]:
df_g[df_g['Borough'].str.contains('Queen')]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [13]:
print("The number of rows are ", df_g.shape[0])

The number of rows are  103


In [14]:
!curl -L https://cocl.us/Geospatial_data > geozip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   524    0   524    0     0    936      0 --:--:-- --:--:-- --:--:--   935
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100     4  100     4    0     0      2      0  0:00:02  0:00:01  0:00:01     0
100  2891  100  2891    0     0   1310      0  0:00:02  0:00:02 --:--:--  1310


In [16]:
# Check the head of the geo location - lat/lon
df_geo = pd.read_csv('geozip')
df_geo.columns = ['Postcode', 'Latitude','Longitude']
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
df_join = pd.merge(df_g, df_geo, how='left', on=['Postcode'])
df_join.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [18]:
df_join

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


# Decision 
I decided to "Explore and Cluster the Neighborhoods in Toronto" and follow the pattern observed in the labs. 

Elements covered are: 

    to add enough Markdown cells to explain what you decided to do and to report any observations you make. (( included here and down below ))
    to generate maps to visualize your neighborhoods and how they cluster together.

In [19]:
import folium

In [20]:
# Create the initial map
latitude = 43.6532 
longitude = -79.3832
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
map_toronto

In [24]:

# add markers to map
for lat, lng, borough, neighborhood in zip(df_join['Latitude'], df_join['Longitude'], df_join['Borough'], df_join['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [25]:
from sklearn.cluster import KMeans

In [45]:
# Refine the dataframe
df_join_clustering = df_join.drop('Neighbourhood', 1)
df_join_clustering = df_join_clustering.drop('Postcode', 1)
df_join_clustering = df_join_clustering.drop('Borough', 1)
df_join_clustering.head()

Unnamed: 0,Cluster,Cluster Labels,Latitude,Longitude
0,0,0,43.806686,-79.194353
1,0,0,43.784535,-79.160497
2,0,0,43.763573,-79.188711
3,0,0,43.770992,-79.216917
4,0,0,43.773136,-79.239476


In [46]:
# per the pattern when looking at new york, let's try 5 clusters with toronto
kclusters = 5

# k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_join_clustering)

# Size of the Shape (results 0 to the top of the kclusters)
kmeans.labels_[0:103] 

array([4, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0,
       0, 0, 0, 3, 3, 3, 0, 0, 0, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 3,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [49]:
df_resx = df_join
df_resx.insert(0, 'Cluster Label', kmeans.labels_)
df_resx

Unnamed: 0,Cluster Label,Cluster,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,4,0,0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,4,0,0,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,4,0,0,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,4,0,0,M1G,Scarborough,Woburn,43.770992,-79.216917
4,4,0,0,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,4,0,0,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,4,0,0,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,3,2,2,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,4,0,0,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,3,2,2,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [50]:
df_resx.loc[df_resx['Cluster Labels'] == 0, df_resx.columns[[1] + list(range(5, df_resx.shape[1]))]]

Unnamed: 0,Cluster,Neighbourhood,Latitude,Longitude
0,0,"Rouge,Malvern",43.806686,-79.194353
1,0,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,0,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,0,Woburn,43.770992,-79.216917
4,0,Cedarbrae,43.773136,-79.239476
5,0,Scarborough Village,43.744734,-79.239476
6,0,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
8,0,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
10,0,"Dorset Park,Scarborough Town Centre,Wexford He...",43.75741,-79.273304
11,0,"Maryvale,Wexford",43.750072,-79.295849


In [51]:
df_resx.loc[df_resx['Cluster Labels'] == 1, df_resx.columns[[1] + list(range(5, df_resx.shape[1]))]]

Unnamed: 0,Cluster,Neighbourhood,Latitude,Longitude
31,1,Downsview West,43.739015,-79.506944
32,1,Downsview Central,43.728496,-79.495697
33,1,Downsview Northwest,43.761631,-79.520999
79,1,"Maple Leaf Park,North Park,Upwood Park",43.713756,-79.490074
80,1,"Del Ray,Keelesdale,Mount Dennis,Silverthorn",43.691116,-79.476013
81,1,"The Junction North,Runnymede",43.673185,-79.487262
84,1,"Runnymede,Swansea",43.651571,-79.48445
86,1,Canada Post Gateway Processing Centre,43.636966,-79.615819
88,1,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321
89,1,"Alderwood,Long Branch",43.602414,-79.543484


In [52]:
df_resx.loc[df_resx['Cluster Labels'] == 2, df_resx.columns[[1] + list(range(5, df_resx.shape[1]))]]

Unnamed: 0,Cluster,Neighbourhood,Latitude,Longitude
7,2,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
9,2,"Birch Cliff,Cliffside West",43.692657,-79.264848
25,2,Parkwoods,43.753259,-79.329656
26,2,Don Mills North,43.745906,-79.352188
27,2,"Flemingdon Park,Don Mills South",43.7259,-79.340923
34,2,Victoria Village,43.725882,-79.315572
35,2,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
36,2,Woodbine Heights,43.695344,-79.318389
37,2,The Beaches,43.676357,-79.293031
38,2,Leaside,43.70906,-79.363452


In [53]:
df_resx.loc[df_resx['Cluster Labels'] == 3, df_resx.columns[[1] + list(range(5, df_resx.shape[1]))]]

Unnamed: 0,Cluster,Neighbourhood,Latitude,Longitude
47,3,Davisville,43.704324,-79.38879
48,3,"Moore Park,Summerhill East",43.689574,-79.38316
49,3,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049
50,3,Rosedale,43.679563,-79.377529
51,3,"Cabbagetown,St. James Town",43.667967,-79.367675
52,3,Church and Wellesley,43.66586,-79.38316
53,3,"Harbourfront,Regent Park",43.65426,-79.360636
54,3,"Ryerson,Garden District",43.657162,-79.378937
55,3,St. James Town,43.651494,-79.375418
56,3,Berczy Park,43.644771,-79.373306


In [54]:
df_resx.loc[df_resx['Cluster Labels'] == 4, df_resx.columns[[1] + list(range(5, df_resx.shape[1]))]]

Unnamed: 0,Cluster,Neighbourhood,Latitude,Longitude
17,4,Hillcrest Village,43.803762,-79.363452
18,4,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
19,4,Bayview Village,43.786947,-79.385975
20,4,"Silver Hills,York Mills",43.75749,-79.374714
21,4,"Newtonbrook,Willowdale",43.789053,-79.408493
22,4,Willowdale South,43.77012,-79.408493
23,4,York Mills West,43.752758,-79.400049
24,4,Willowdale West,43.782736,-79.442259
28,4,"Bathurst Manor,Downsview North,Wilson Heights",43.754328,-79.442259
29,4,"Northwood Park,York University",43.76798,-79.487262


In [58]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [61]:
# df_resx
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_resx['Latitude'], df_resx['Longitude'], df_resx['Neighbourhood'], df_resx['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters