### Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab *already downloaded
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.12.5  |       ha878542_0         137 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    _openmp_mutex-4.5          |            1_gnu          22 KB  conda-forge
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    libgcc-ng-9.3.0            |      h2828fa1_18         7.8 MB  conda-forge
    openssl-1.1.1i             |       h7f98852_0         2.1 MB  conda-forge
    libgomp-9.3.0              |      h2828fa1_18         376 KB  conda-forge
    geopy-2.1.0                |     pyhd3deb0d_0          64 KB  conda-forge
    certifi-2020.12.5          |   py36h5fab9bb_1         143 KB  conda-forge
    python_abi-3.

In [10]:
#Scrape the data from wiki
url='https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M'

df = pd.read_html(url)

#Transfer to data frame
df_postcodes=df[0]

print('Imported dataframe has', df_postcodes['Postal Code'].count(), "entries")

df_postcodes.head()

Imported dataframe has 180 entries


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [13]:
#process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

df_postcodes['Borough'].unique()
df_postcodes['Neighbourhood'].unique()


df_postcodes_valid = df_postcodes[df_postcodes["Borough"] != "Not assigned"]

#Replace the Not assigned values of Neighborhood with the Borough Value
df_postcodes_valid["Neighbourhood"].replace("Not assigned", df_postcodes_valid["Borough"], inplace=True)

df_postcodes_valid.head(20)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [14]:
#Now combine all the neighbourhoods with the same postal codes
df_clean = df_postcodes_valid.groupby(["Postal Code","Borough"])["Neighbourhood"].apply(list)
df_clean = df_clean.sample(frac=1).reset_index()
df_clean["Neighbourhood"] = df_clean["Neighbourhood"].str.join(', ')
df_clean.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M7R,Mississauga,Canada Post Gateway Processing Centre
1,M2J,North York,"Fairview, Henry Farm, Oriole"
2,M5B,Downtown Toronto,"Garden District, Ryerson"
3,M2K,North York,Bayview Village
4,M9P,Etobicoke,Westmount
5,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
6,M6C,York,Humewood-Cedarvale
7,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park"
8,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
9,M4N,Central Toronto,Lawrence Park


## Data cleaned. See dimensions below

In [18]:
df_clean.shape

(103, 3)

#### PART2  I have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. 

In [27]:
#!pip install geocoder

#import geocoder

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
  #g = geocoder.google('{}, Toronto, Ontario'.format(df_clean.at[i,'Postal Code']))
  #lat_lng_coords = g.latlng
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

lat_lon = pd.read_csv('https://cocl.us/Geospatial_data')
lat_lon.head()
    

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the 2 tables for final data frame

In [33]:
df_final = pd.merge(df_clean,lat_lon,on="Postal Code")
df_final.rename(columns={'Postal Code':'PostalCode'},inplace=True)
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M2K,North York,Bayview Village,43.786947,-79.385975
4,M9P,Etobicoke,Westmount,43.696319,-79.532242


### PART 3: Clustering and the plotting of the neighbourhoods of Canada which contain Toronto in their Borough

In [39]:
df_toronto = df_final[df_final['Borough'].str.contains('Toronto',regex=False)]
df_toronto


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
7,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049
9,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
11,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
18,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
21,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
25,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445
29,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
31,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
32,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817


### Visualize the data on Map using Folium

In [35]:
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

for lat,lng,borough,neighbourhood in zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Borough'],df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

### Using KMeans clustering for the clsutering of the neighbourhoods

In [40]:
k=5
toronto_clustering = df_toronto.drop(['PostalCode','Borough','Neighbourhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_clustering)
kmeans.labels_
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)

df_toronto

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
7,1,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049
9,3,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
11,1,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
18,2,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
21,1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
25,4,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445
29,1,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
31,1,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
32,1,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817


In [41]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters