### Segmenting and Clustering Neighborhoods in Toronto

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium
import folium # map rendering library

print('Libraries imported.')

In [None]:
# Download the Wikipedia page listing the "M" postal codes and parse it into `soup`.
# Requires the beautifulsoup4 package and the lxml parser to be installed.
from bs4 import BeautifulSoup

wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status stops here on an HTTP error instead of parsing an error page.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
url = response  # the original cell bound the response to `url`; name kept for compatibility
soup = BeautifulSoup(response.text, 'lxml')

In [None]:
# Write every three-cell table row found in `soup` to Toronto_Codes.csv.
import csv

# newline='' is what the csv module expects for files it writes, and utf-8 matches the
# default encoding pandas.read_csv uses when the file is loaded back.
# The with-block closes the file even if parsing fails part-way through.
with open('Toronto_Codes.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])
    # [1:] skips the first <tr> on the page (the header row)
    for tr in soup.find_all('tr')[1:]:
        tds = tr.find_all('td')
        # Only rows with exactly three cells are written. The write must live inside
        # this check: otherwise any other row would re-emit the previous row's values
        # (or fail with NameError if it came first).
        if len(tds) == 3:
            Postcode = tds[0].text
            Borough = tds[1].text
            Neighbourhood = tds[2].text
            csv_writer.writerow([Postcode, Borough, Neighbourhood])

### Load the table from the cleaned Toronto_Codes.csv file

In [2]:
# Read the scraped postal code table back from disk into a dataframe
import pandas as pd

df_cd = pd.read_csv(filepath_or_buffer='Toronto_Codes.csv')
print('Data loaded')

Data loaded


In [3]:
# Preview the first rows of the loaded table
df_cd.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# Remove leading/trailing newline characters from the text cells of the table.
# Only string columns are touched: calling .str on a numeric column (for example an
# index or coordinate column present in the CSV) raises AttributeError.
df_cd = df_cd.apply(
    lambda col: col.str.strip('\n') if pd.api.types.is_string_dtype(col) else col
)
df_cd.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Ignore cells with a borough that is Not assigned

In [5]:
# Keep only the rows whose borough is assigned.
# .copy() gives df_ab its own data, so later in-place changes to df_ab neither
# touch df_cd nor trigger pandas' SettingWithCopyWarning.
df_ab = df_cd[df_cd.Borough != 'Not assigned'].copy()
df_ab.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### More than one neighborhood can exist in one postal code area
For example, M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. Such rows are combined into one row, with the neighborhoods separated by a comma.

In [6]:
# Combine the neighbourhoods that share a postal code and borough into a single
# comma-separated row; `res` is indexed by (Postcode, Borough).
key_cols = ['Postcode', 'Borough']
# Re-assign rather than set_index(..., inplace=True), and skip the step when the keys
# are already in the index, so re-running this cell does not fail with a KeyError.
if not set(key_cols).issubset(df_ab.index.names):
    df_ab = df_ab.set_index(key_cols)
# sort=False keeps the groups in their order of first appearance
res = df_ab.groupby(level=key_cols, sort=False).agg(','.join)

In [7]:
# Display res with Postcode and Borough as ordinary columns again.
# The result is not assigned, so res itself keeps its (Postcode, Borough) index.
res.reset_index() # to remove set index

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


### If a cell has a borough but a Not assigned neighborhood, the neighborhood will be the same as the borough.
So for the 9th cell in the table on the Wikipedia page, the value of both the Borough and the Neighborhood columns will be Queen's Park.

In [8]:
# Where the neighbourhood is 'Not assigned', use that row's own borough instead.
# Each row takes the Borough value from its own index entry; copying the borough of the
# first match to every match would mislabel rows from other boroughs, and indexing the
# first match fails with IndexError when nothing is unassigned.
unassigned = (res['Neighbourhood'] == 'Not assigned').values
boroughs = res.index.get_level_values('Borough').values
res.loc[unassigned, 'Neighbourhood'] = boroughs[unassigned]
res.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront,Regent Park"
M6A,North York,"Lawrence Heights,Lawrence Manor"
M7A,Queen's Park,Queen's Park


In [9]:
# Turn the (Postcode, Borough) index back into ordinary columns; df is the flat table
# used by the cells below.
df=res.reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### Use the .shape attribute to print the number of rows of the dataframe.

In [40]:
# (rows, columns) of the dataframe
df.shape

(103, 6)

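Now, you can see that we have 103 rows. At this point in a top-to-bottom run the dataframe has 3 columns (Postcode, Borough, Neighbourhood); the output above shows 6 because this cell was last executed after the Address, Latitude and Longitude columns had been added.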

In [35]:
# Build the geocoding query text for each row as "<Borough>, <Postcode>"
borough_and_code = df['Borough'].add(", ").add(df['Postcode'])
df["Address"] = borough_and_code
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Address,Latitude,Longitude
0,M3A,North York,Parkwoods,"North York, M3A",43.753259,-79.329656
1,M4A,North York,Victoria Village,"North York, M4A",43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park","Downtown Toronto, M5A",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor","North York, M6A",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,"Queen's Park, M7A",43.664366,-79.392328


### Generate list of urls using my API key first

In [49]:
# Build one Google Geocoding request URL per row of df, in row order.
import os
from urllib.parse import quote_plus

# Read the key from the environment so it never has to be pasted into the notebook;
# the 'YOUR-KEY' placeholder remains the fallback when the variable is not set.
api_key = os.environ.get('GOOGLE_API_KEY', 'YOUR-KEY')
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json?address={},+Toronto+Ontario,+CA&key={}'
# quote_plus escapes spaces, commas and other reserved characters in the address text
url2 = [geocode_url.format(quote_plus(str(address)), api_key) for address in df['Address']]

#### Now loop through the URLs to get the latitude and longitude from the Google Geocoding API

In [None]:
# Request every URL in url2 and store the returned coordinates on df.
# Values are collected in lists and assigned once, instead of writing through
# df['col'].iloc[i] (chained assignment), which pandas does not guarantee to write to df.
latitudes = []
longitudes = []
missing = float('nan')  # recorded for addresses the API returns no result for
for url in url2:
    # The address and key are already part of each URL, so no extra query
    # parameters are passed. The timeout prevents one stalled request from hanging the cell.
    r = requests.get(url, timeout=30)
    results = r.json()['results']
    if results:
        location = results[0]['geometry']['location']
        latitudes.append(location['lat'])
        longitudes.append(location['lng'])
    else:
        # Record a gap rather than re-using the previous row's location
        print ("no results")
        latitudes.append(missing)
        longitudes.append(missing)
df['Latitude'] = latitudes
df['Longitude'] = longitudes

In [41]:
# Show up to the first 50 rows of df, including the Latitude/Longitude columns
df.head(50)

Unnamed: 0,Postcode,Borough,Neighbourhood,Address,Latitude,Longitude
0,M3A,North York,Parkwoods,"North York, M3A",43.753259,-79.329656
1,M4A,North York,Victoria Village,"North York, M4A",43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park","Downtown Toronto, M5A",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor","North York, M6A",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,"Queen's Park, M7A",43.664366,-79.392328
5,M9A,Etobicoke,Islington Avenue,"Etobicoke, M9A",43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern","Scarborough, M1B",43.806686,-79.194353
7,M3B,North York,Don Mills North,"North York, M3B",43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill","East York, M4B",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District","Downtown Toronto, M5B",43.657162,-79.378937


Write the dataframe to CSV so that the geocoding loop does not have to be run again.

In [None]:
# Persist df, including the geocoded coordinates, to disk.
# NOTE(review): to_csv writes the index as an extra unnamed first column by default, and
# this filename differs from the 'Toronto_Codes.csv' read earlier only in letter case.
# On a case-insensitive filesystem it would overwrite that file. Confirm this is intended.
df.to_csv('Toronto_codes.csv')