# Coursera assignment - Applied Data Science - Week 3

## Importing required libraries

In [274]:
import io
import time

import numpy as np
import pandas as pd
import requests
#import urllib
#import json

## Pulling the table from Wiki Link

In [275]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop here instead of parsing an HTTP error page
html = response.content

# Parse the tables on the page; the postal-code table is taken to be the first one.
# header=0 promotes the table's first row to column labels, so no row has to be
# sliced off by position afterwards (a positional slice drops a real data row
# whenever pandas has already recognised the header on its own).
# The downloaded bytes are wrapped in a file-like object before parsing.
df_list = pd.read_html(io.BytesIO(html), header=0)
df = df_list[0]
df.columns = ["Postcode", "Borough", "Neighbourhood"]

## Cleaning up unassigned rows and combining neighbourhoods that share a postcode

In [276]:
# Keep only the rows whose borough is something other than the "Not assigned" placeholder.
df = df.loc[df['Borough'].ne("Not assigned")]

In [277]:
def neighborhood_list(grouped):
    """Join one group's neighbourhood names into a single string.

    Parameters
    ----------
    grouped : mapping-like with a 'Neighbourhood' entry exposing ``tolist()``
        In this notebook, the sub-frame handed over by ``groupby().apply``.

    Returns
    -------
    str
        The names in ascending sorted order, separated by ', '.
    """
    names = grouped['Neighbourhood'].tolist()
    names.sort()
    return ', '.join(names)
                    
# Collapse every (Postcode, Borough) pair into one row whose 'Neighbourhood'
# value is the joined string returned by neighborhood_list for that group.
df_grp = df.groupby(by=['Postcode', 'Borough'])
df2 = (
    df_grp
    .apply(neighborhood_list)
    .reset_index(name='Neighbourhood')
)

In [278]:
# Where the neighbourhood is the "Not assigned" placeholder, use the borough name instead.
df2['Neighbourhood'] = df2['Neighbourhood'].mask(
    df2['Neighbourhood'].eq("Not assigned"),
    df2['Borough'],
)

## Checking the number of rows and columns in the data and sample data

In [279]:
# (rows, columns) of the grouped postcode table.
df2.shape

(103, 3)

In [280]:
# Preview the first rows of the grouped postcode table.
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Importing geocoder and defining function to pull lat long data

In [281]:
import geocoder

def pull_latlng(Postcode, max_attempts=10, retry_delay=1.0):
    """Look up coordinates for a Toronto postcode through the ArcGIS geocoder.

    The lookup is retried while the geocoder reports no coordinates, but only
    up to ``max_attempts`` times and with a pause between tries, so a postcode
    that never resolves raises an error instead of looping forever.

    Parameters
    ----------
    Postcode : str
        Postcode to look up; it is queried as '<Postcode>, Toronto, Ontario'.
    max_attempts : int, default 10
        Maximum number of geocoder calls. Must be at least 1.
    retry_delay : float, default 1.0
        Seconds to wait after a failed call before trying again.

    Returns
    -------
    The geocoder result's ``latlng`` value (the rest of the notebook treats
    it as a latitude/longitude pair).

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If every attempt came back without coordinates.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    query = '{}, Toronto, Ontario'.format(Postcode)
    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # Pause only when another attempt will actually follow.
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

## Pulling coordinates for every postcode using the function defined above

In [282]:
# Geocode each postcode in df2, keeping the results in row order.
pl_data = list(map(pull_latlng, df2['Postcode'].tolist()))

## Cleaning the coordinate data and merging it with the original dataframe

In [283]:
# Turn the list of coordinate pairs into a two-column frame, then copy each
# column onto df2 (the assignment aligns on the row index).
coord_columns = ['Latitude', 'Longitude']
df_pl_data = pd.DataFrame(pl_data, columns=coord_columns)
for coord_column in coord_columns:
    df2[coord_column] = df_pl_data[coord_column]

## Checking the number of rows and columns in the data and sample data

In [284]:
# (rows, columns) of df2 after the Latitude and Longitude columns were added.
df2.shape

(103, 5)

In [285]:
# Preview the first five rows, now including the coordinates.
df2.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.78573,-79.15875
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


## Visualizing the base data on a map

In [286]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [287]:
# Centre the map on the mean coordinate of all postcodes.
latitude = df2['Latitude'].mean()
longitude = df2['Longitude'].mean()

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one marker per postcode. This cell runs before any clustering, so it
# reads straight from df2 and the popup shows only the neighbourhood names;
# the clustered frame and its labels are not created until a later cell.
for lat, lon, poi in zip(df2['Latitude'], df2['Longitude'], df2['Neighbourhood']):
    label = folium.Popup(str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Running multiple iterations of k-means with different Ks and viewing results on map
### Eventually 6 seemed to be a good number after viewing results of each iteration on a map

In [291]:
from sklearn.cluster import KMeans

In [296]:
# Clustering input: only the two coordinate columns, copied so df2 is left untouched.
base = df2[['Latitude', 'Longitude']].copy()

# Fit k-means on the coordinates, then attach the resulting label and the
# neighbourhood names (added after fitting so they are not used as features).
k_clusters = 6
kmeans = KMeans(n_clusters=k_clusters, random_state=0)
kmeans.fit(base)
base['cluster'] = kmeans.labels_
base['Neighbourhood'] = df2['Neighbourhood']

In [297]:
# Centre the map on the mean coordinate of the clustered points.
latitude = base['Latitude'].mean()
longitude = base['Longitude'].mean()

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per postcode, coloured by its cluster label. The labels are
# used directly as palette indices (k-means labels start at 0), so cluster 0
# gets the first colour instead of wrapping around to the last one.
for lat, lon, poi, cluster in zip(base['Latitude'], base['Longitude'], base['Neighbourhood'], base['cluster']):
    label = folium.Popup(str(poi) + ' cluster ' + str(cluster), parse_html=True)
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters