# Capstone Project 
## This Notebook is intended for the Capstone Project exercises
<br>
Author: Robert Jürgens

# The Toronto Neighbourhood Project with Geospatial Data

In [1]:
%matplotlib inline

## Import required Libraries

In [2]:
import pandas as pd
import numpy as np
import itertools

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

## Retrieve Postal Codes Table from Wikipedia
***
1. Read with pandas read_html method directly from source into dataframe
    - Restrict to table with attribute 'class wikitable sortable' as revealed by inspecting source
    - Use row 0 for the column names


2. Create unique list of postal codes


3. Copy dataframe with rows removed for which column 'Borough' has the value 'Not assigned'


4. Iterate through the set of postal codes to generate a row with
    - The Postal Code
    - The unique Borough Name
    - A list of Neighbourhoods for the postal code


5. Append that row to final data frame

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(url,attrs={'class':'wikitable sortable'},header=0)
df_tmp = df[0][df[0]['Borough']!='Not assigned']

pc = set(df_tmp['Postcode'])

df_pc = pd.DataFrame(columns=['PostalCode','Borough','Neighbourhood','Latitude','Longitude'])

for p in pc:
    br = np.array(df_tmp[df_tmp['Postcode']==p]['Borough'].unique())[0]
    nh = np.array(df_tmp[df_tmp['Postcode']==p]['Neighbourhood'])
    if len(nh) == 1:
        if nh == 'Not assigned':
            nh = [br]
    nhs = ''
    for n in nh:
        nhs += n+','
    
    nhs = nhs[:-1]
    
    ndf = pd.DataFrame([[p,br,nhs,0.0,0.0]],columns=['PostalCode','Borough','Neighbourhood','Latitude','Longitude'])
    df_pc = df_pc.append(ndf)
    
df_pc.reset_index(drop=True,inplace=True)

## Read Geospatial Data from csv as google api induces cost

In [4]:
gd_df = pd.read_csv('geospatial_data.csv')
gd_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:
for i in df_pc.index:
    postal_code = df_pc.iloc[i,:]['PostalCode']

    latitude = float(gd_df.loc[gd_df['Postal Code']==postal_code]['Latitude'])
    longitude = float(gd_df.loc[gd_df['Postal Code']==postal_code]['Longitude'])
    
    df_pc.at[i,'Latitude'] = latitude
    df_pc.at[i,'Longitude'] = longitude
    

    
df_pc

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M9L,North York,Humber Summit,43.756303,-79.565963
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M5M,North York,"Bedford Park,Lawrence Manor East",43.733283,-79.419750
3,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
4,M4N,Central Toronto,Lawrence Park,43.728020,-79.388790
5,M3C,North York,"Flemingdon Park,Don Mills South",43.725900,-79.340923
6,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
7,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493
8,M1V,Scarborough,"Agincourt North,L'Amoreaux East,Milliken,Steel...",43.815252,-79.284577
9,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849
