#                          Segmenting and Clustering Neighborhoods in Toronto

## Importing all Libraries before I start.

In [4]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
Libraries imported.


### Obtaining the HTML Code for the webpage which contains the Canada Neighbourhood data [Link](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [19]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

### BeautifulSoup needs quite a bit of coding to parse the table but it can be done in pandas prety easy hence using Pandas to parse the table from the webpage

The below code gives me a list of the table as displayed

In [20]:
dfs = pd.read_html(source,header=0)
type(dfs)
dfs

[    Postcode           Borough  \
 0        M1A      Not assigned   
 1        M2A      Not assigned   
 2        M3A        North York   
 3        M4A        North York   
 4        M5A  Downtown Toronto   
 5        M5A  Downtown Toronto   
 6        M6A        North York   
 7        M6A        North York   
 8        M7A      Queen's Park   
 9        M8A      Not assigned   
 10       M9A         Etobicoke   
 11       M1B       Scarborough   
 12       M1B       Scarborough   
 13       M2B      Not assigned   
 14       M3B        North York   
 15       M4B         East York   
 16       M4B         East York   
 17       M5B  Downtown Toronto   
 18       M5B  Downtown Toronto   
 19       M6B        North York   
 20       M7B      Not assigned   
 21       M8B      Not assigned   
 22       M9B         Etobicoke   
 23       M9B         Etobicoke   
 24       M9B         Etobicoke   
 25       M9B         Etobicoke   
 26       M9B         Etobicoke   
 27       M1C       

Converting the list to a DataFrame and displaying the head of the Dataframe

In [21]:
Canada_df = dfs[0]
Canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [22]:
#Check the current shape of the DataFrame
Canada_df.shape

(289, 3)

The below code filters out rows from **Borough** which contain **Not assigned**

In [23]:
CDF = Canada_df[~Canada_df.Borough.str.contains("Not assigned")]
CDF.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [24]:
#Shape shows that rows with Not assigned in Brough has been dropped
CDF.shape

(212, 3)

In [25]:
#Creating a copy of the Dataframe
CDF2 = CDF

### Grouping data by the Postcode and Borough

In [26]:
## Initially I am grouping the data by the first two columns and applying the join function to concat the strings to once 
## cell in Neighbourhood column
Canada_Series = CDF2.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x)) ##
type(Canada_Series)
## o/p will be a pandas series

pandas.core.series.Series

In [27]:
## Converting the Pd series to a Dataframe
CDF3 = Canada_Series.to_frame()
CDF3.reset_index(level=['Postcode','Borough'], inplace=True) #Resetting the index 
CDF3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Checking if there are any "Not assigned" values in the "Neighbourhood" Column

In [28]:
CDF4 = CDF3[CDF3.Neighbourhood.str.contains("Not assigned")]
CDF4 # Found one row

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


As there was only one row with Not assigned value changing the changing the vaule manually.

In [29]:
CDF3.loc[85,'Neighbourhood'] = CDF3.loc[85,'Borough'] #Copying the vaule in Borough to Neighbourhood
#Displaying the changes
CDF3.loc[85]

Postcode                  M7A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 85, dtype: object

## Loading the Geospatial Coordinates data to a dataframe

In [30]:
datapath = "http://cocl.us/Geospatial_data"
geospac = pd.read_csv(datapath)
geospac.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Now as the data is loaded to a DF. I am Merging the two DataFrames into one.

In [31]:
# The merge function lets me merge two dataframe on an same column with same data.
dfinal = CDF3.merge(geospac, how = 'inner',left_on='Postcode', right_on='Postal Code')
dfinal = dfinal.drop(labels='Postal Code',axis=1) # as the above line adds both the columns dropping one of the column
dfinal.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Displaying the shape of the Data Frame

In [32]:
dfinal.shape

(103, 5)

## Displaying all the markers from the map

In [48]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfinal['Latitude'], dfinal['Longitude'], dfinal['Borough'], dfinal['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

In [45]:
#Filtering just the Rows which contain "toronto" in Borough column
dfinal1 = dfinal[dfinal['Borough'].str.contains('Toronto')]
dfinal1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Code to display markers of Toronto map where Borough contains "Toronto"

In [46]:
# create map of New York using latitude and longitude values
map_toronto_only = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfinal1['Latitude'], dfinal1['Longitude'], dfinal1['Borough'], dfinal1['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_only)  
map_toronto_only