## Week 3 assignment - Segmenting and Clustering neighborhoods in Toronto <br>

### Part 1 - Setting up neighborhoods data into a pandas dataframe 

In [6]:
# Scrape the table of Toronto ("M") postal codes from Wikipedia

import pandas as pd
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
d = pd.read_html(WIKI_URL)   # one DataFrame per HTML table found on the page
df_toronto = d[0]            # the first table is the one holding the postal codes
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
# Count the rows lacking a neighbourhood and inspect which borough values they carry
unassigned = df_toronto['Neighbourhood'] == 'Not assigned'
unassigned_rows = df_toronto[unassigned]
print("Number of not assigned neighbourhoods: ", len(unassigned_rows))
print("Borough value for such rows:", unassigned_rows['Borough'].value_counts())

Number of not assigned neighbourhoods:  77
Borough value for such rows: Not assigned    77
Name: Borough, dtype: int64


The output above shows that every row with an unassigned neighbourhood also has an unassigned borough, so these rows carry no usable information and we drop them.

In [8]:
# Keep only the rows that have a neighbourhood assigned, then renumber them from 0.
# Filtering builds a new frame instead of editing in place, so the raw scraped
# table that df_toronto was taken from (d[0]) stays untouched.
has_neighbourhood = df_toronto['Neighbourhood'] != 'Not assigned'
df_toronto = df_toronto[has_neighbourhood].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [9]:
# Collapse repeated postal codes into a single row each: keep the first borough
# encountered and join the neighbourhood names into one comma-separated string
merge_rules = {'Borough': 'first', 'Neighbourhood': ', '.join}
df_toronto = df_toronto.groupby('Postcode', as_index=False).agg(merge_rules)
df_toronto.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# After grouping there is one row per postal code, so the row count is the postcode count
print("Number or postcodes/ rows:", len(df_toronto))

Number or postcodes/ rows: 103


<br>
<br>

### Part 2 - Add latitude and longitude coordinates of neighborhoods

In [16]:
# Load the latitude/longitude of every postal code and align its key column
# name with the one used in df_toronto.
df_ll = pd.read_csv('http://cocl.us/Geospatial_data').rename(columns={'Postal Code': 'Postcode'})

# Drop coordinate columns left behind by an earlier run of this cell; without
# this, running the cell twice merges again and yields Latitude_x/Latitude_y
# style duplicate columns.
df_toronto = df_toronto.drop(columns=['Latitude', 'Longitude'], errors='ignore')

# validate='one_to_one' makes pandas raise if a postal code is repeated on
# either side instead of silently duplicating rows.
df_toronto = pd.merge(df_toronto, df_ll, on='Postcode', validate='one_to_one')
df_toronto.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


<br>
<br>

### Part 3 - Exploring and clustering the neighborhoods in Toronto 

In [19]:
# Report the number of distinct boroughs and the number of rows in the table
borough_count = df_toronto['Borough'].nunique(dropna=False)
row_count = len(df_toronto)
summary = 'Toronto has {} boroughs and {} neighborhoods.'
print(summary.format(borough_count, row_count))

Toronto has 10 boroughs and 103 neighborhoods.


In [20]:
# Tally how many rows fall under each borough
borough_counts = df_toronto['Borough'].value_counts()
borough_counts

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
York                 5
East York            5
Mississauga          1
Name: Borough, dtype: int64

<br>
Now let us visualize the ten boroughs on a map. To do this, I approximate the location of each borough by the mean latitude and longitude of all its neighborhoods. 

In [30]:
# Approximate each borough's position by the mean latitude and longitude of its rows.
# The two coordinate columns are selected explicitly so that the non-numeric
# Postcode column never reaches mean(); recent pandas versions raise on such
# columns instead of silently dropping them.
df_map = df_toronto.groupby('Borough')[['Latitude', 'Longitude']].mean().reset_index()

In [41]:
# Display the per-borough averaged coordinates
df_map

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654597,-79.383972
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Scarborough,43.766229,-79.249085
8,West Toronto,43.652653,-79.44929
9,York,43.690797,-79.472633


In [35]:
!pip install geopy
!pip install folium



In [36]:
from geopy.geocoders import Nominatim
import folium

In [40]:
# Get coordinates of Toronto

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ttt")
# Give the public Nominatim service more time to answer than geopy's short default.
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [45]:
# Build a map centred on Toronto and pin one marker per borough
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for row in df_map.itertuples(index=False):
    # Each marker sits at the borough's averaged coordinates and shows its name on click
    marker = folium.Marker(location=[row.Latitude, row.Longitude], popup=row.Borough)
    marker.add_to(map_toronto)

map_toronto