# Segmenting and Clustering Neighborhoods in Toronto

### This is the peer-graded assignment for week three.

### Part One: Retrieving the postal codes, boroughs, and neighborhoods of Toronto.

In [117]:
# We perform our imports.
import numpy as np
import pandas as pd

In [118]:
# Pull every HTML table found on the Wikipedia page of "M" postal codes into a list of dataframes.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_list = pd.read_html(postal_codes_url)

In [119]:
# We locate the correct dataframe. Instead of trusting its position on the page,
# we take the first table that carries the three columns the rest of the notebook
# uses, and stop with a clear message if no table on the page provides them.
expected_columns = {'Postal Code', 'Borough', 'Neighbourhood'}
matching_tables = [table for table in df_list if expected_columns.issubset(table.columns)]
if not matching_tables:
    raise ValueError('No table with the columns {} was found on the page.'.format(sorted(expected_columns)))
df = matching_tables[0]

In [120]:
# Build a boolean mask that is True for every row whose borough has been assigned.
b_filter = df['Borough'].ne('Not assigned')

# Keep only those rows and renumber the index from zero, discarding the old index.
df = df.loc[b_filter].reset_index(drop = True)

In [121]:
# Report the (rows, columns) dimensions of the filtered dataframe.
print(df.shape)

# Preview the first five rows.
df.head(5)

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Part Two: Retrieving the Geographical Coordinates of Neighborhoods in Toronto

In [122]:
# We originally planned to look up the coordinates with the geocoder module:
#! pip install geocoder
# That did not work out well, so we read the coordinates in from a CSV file instead.

In [123]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [124]:
# Join the coordinates onto the borough/neighbourhood table by postal code,
# so that every location carries its latitude and longitude.
column_order = ['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
merged_df = pd.merge(lat_long_df, df, on = 'Postal Code')

# Put the descriptive columns first and the coordinates last, then preview the result.
merged_df = merged_df[column_order]
merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Part Three: Toronto Clustering Analysis

### Data Standardization:

In [125]:
# We import the packages needed for scaling the data and for the k-means clustering analysis.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Latitude and longitude are the only numerical columns in the dataframe,
# so they are the features we cluster on.
coordinate_columns = ['Latitude', 'Longitude']
features = merged_df[coordinate_columns].values

# Standardize the features: each value becomes the number of standard deviations
# it lies above or below the mean of its column.
scaler = StandardScaler()
cluster_dataset = scaler.fit_transform(features)

# Show the first five standardized rows.
cluster_dataset[:5]

array([[1.95523876, 2.09777597],
       [1.53094905, 2.44798852],
       [1.12942801, 2.15613628],
       [1.271543  , 1.86437197],
       [1.3126078 , 1.6310228 ]])

### K-Means Modeling:

In [126]:
# We create the k-means model with four clusters and twelve initialisations.
# k-means starts from randomly chosen centroids, so without a fixed seed the cluster
# numbers can change from run to run. Fixing random_state keeps the labels, and the
# colours that are assigned to them on the map, the same every time the notebook runs.
k_means_model = KMeans(init = "k-means++", n_clusters = 4, n_init = 12, random_state = 42)

# It's time to fit the model on the standardized coordinates.
k_means_model.fit(cluster_dataset)

# Let's look at our predicted labels (one per row of the dataset).
print(k_means_model.labels_, '\n')

# Let's look at our cluster centers (expressed in standardized units).
print(k_means_model.cluster_centers_)

# Let's assign the labels to our merged dataframe so each neighbourhood carries its cluster number.
merged_df['Label'] = k_means_model.labels_

# We take a look at our df.
merged_df.head()

[2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 2 2 2 0 0 0 0 0 0 2 1 1
 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1
 1 1 1 1 1 0 3 3 3 3 3 1 3 1 3 3 3 3 3 3 3 3 0 0 0 3 3 0 3] 

[[ 0.73256303 -0.64730861]
 [-0.65914393  0.21933192]
 [ 1.18851287  1.27863113]
 [-0.9394901  -1.30790562]]


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Label
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,2
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,2
3,M1G,Scarborough,Woburn,43.770992,-79.216917,2
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2


### Drawing Maps Using Folium:

In [127]:
# folium must already be installed in the kernel's environment
# (for example with `%pip install folium`).

# We import folium.
import folium

# The latitude and longitude of Toronto are assigned below.
lat = 43.651070
long = -79.347015

# It's time to create our first map of Toronto, centred on the city.
map_toronto = folium.Map(location = [lat, long], zoom_start = 11)

# We prepare a color dictionary that will plot different points with different colors based on their cluster number.
cluster_dict = {0:'blue', 1:'green', 2:'red', 3:'yellow'}

# Let's add one circle marker per neighbourhood. The loop variables get their own
# names so that they do not overwrite the Toronto coordinates defined above.
for point_lat, point_lon, neigh, cluster_num in zip(merged_df['Latitude'], merged_df['Longitude'],
                                                    merged_df['Neighbourhood'], merged_df['Label']):

    # The popup text shows the neighbourhood name(s) and the cluster number.
    label = str(neigh) + ', Cluster ' + str(cluster_num)
    marker_color = cluster_dict[cluster_num]

    # We create circle markers for our neighborhoods.
    folium.CircleMarker(location = [point_lat, point_lon], radius = 5, popup = label, fill = True,
                        color = marker_color, fill_color = marker_color, fill_opacity = 0.7).add_to(map_toronto)

# To find the centroid coordinates, we group our dataframe by cluster and take the mean
# of each cluster. Only the two coordinate columns are averaged: the remaining columns
# hold text, and recent pandas versions raise an error when asked for the mean of text.
centroid_df = merged_df.groupby('Label')[['Latitude', 'Longitude']].mean()

# We add a larger orange marker at each centroid.
for centroid_lat, centroid_lon in zip(centroid_df['Latitude'], centroid_df['Longitude']):
    folium.CircleMarker(location = [centroid_lat, centroid_lon], radius = 10, fill = True,
                        color = 'orange', fill_color = 'orange', fill_opacity = 0.7).add_to(map_toronto)

# We display our map.
map_toronto

### In the above map, you can see that we have successfully performed a cluster analysis on the various neighborhoods of Toronto by grouping them into four main clusters.

### Each cluster was given a different color, with the cluster's respective points plotted in that color.

### The centroid points themselves were plotted in orange.