# Coursera Capstone
This notebook will be used for the IBM Data Science capstone project.

##Imports

In [0]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
#!pip install geocoder
import geocoder
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

## Segmenting and Clustering Neighborhoods in Toronto

### Initial dataframe

In [383]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(URL).text
soup = BeautifulSoup(html, 'lxml')
table = soup.find('table', attrs={'class':'wikitable sortable'})

new_table = pd.DataFrame(columns=range(0,3), index = [0])
table_body = table.find('tbody')

data = []
rows = table_body.find_all('tr')
for row in rows:
  cols = row.find_all('td')
  row = [row.text for row in cols]
  # Ignore cells with a borough that is Not assigned.
  if "Not assigned" not in row:
    data.append(row)

df = pd.DataFrame(data, columns=['Postcode', 'Borough', 'Neighborhood_'])
df = df.replace(r'\n',' ', regex=True) 
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood_
0,,,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Downtown Toronto,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


###Grouping neighborhoods in the same borough 

More than one neighborhood can exist in one postal code area. For example, in the table above, you will notice that M1B is listed twice and has two neighborhoods: Rouge and Malvern. These two rows will be combined into one row with the neighborhoods separated with a comma.

In [384]:
df_neigh = df.groupby('Postcode')['Neighborhood_'].agg({'Neighborhood_':'first', 'Neighborhood':', '.join})
df = pd.merge(df, df_neigh, how='inner', on=['Neighborhood_'] )
df.drop('Neighborhood_', axis=1, inplace=True)
df.head(10)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights , Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge , Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens , Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson , Garden District"


In [385]:
df.shape

(105, 3)

###Adding coordinates

In [386]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.rename(columns={'Postal Code':'Postcode'}, inplace=True)
df_coord = pd.merge(df, df_geo, on='Postcode')
df_coord.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson , Garden District",43.657162,-79.378937


###Clustering

In [387]:
# set number of clusters
kclusters = 5


df_coord1 = df_coord.drop(['Postcode','Borough','Neighborhood'], 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_coord1)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 2, 4, 2, 0, 1, 3, 3, 2], dtype=int32)

In [388]:
df_coord.insert(0, 'Cluster Labels', kmeans.labels_)
df_coord.head(10)

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighborhood,Latitude,Longitude
0,3,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,4,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763
4,2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,0,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,1,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
7,3,M3B,North York,Don Mills North,43.745906,-79.352188
8,3,M4B,East York,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937
9,2,M5B,Downtown Toronto,"Ryerson , Garden District",43.657162,-79.378937


In [389]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_coord['Latitude'], df_coord['Longitude'], df_coord['Neighborhood'], df_coord['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters