# This assignment deals with clustering and segmenting Toronto neighborhoods.

### Let's start by importing relevant libraries.

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  25.23 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  35.88 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  53.36 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.81 MB/s
vincent-0.4.4- 100% |###################

### Scrap the wikipedia file to extract the table of postal codes of Toronto area using pandas read_html() method and display the data frame.

In [4]:
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header = 0)

df = pd.DataFrame(data[0])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Let's see the total number of columns in the scrapped table of data.

In [5]:
df.shape

(289, 3)

### Eliminate entries from the second column "Borough" that has the phrase "Not assigned".

In [6]:
df= df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Let's check whether the number of rows reduced or not.

In [7]:
df.shape

(212, 3)

### Group the neighborhoods and boroughs with identical postal codes.

In [8]:
df = df.groupby(['Postcode', 'Borough', 'Neighbourhood']).agg({'Postcode':lambda x: ', '.join(tuple(x.tolist())),

                                     'Neighbourhood':lambda x: ', '.join(tuple(x.tolist()))}
                                   )
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Postcode,Neighbourhood
Postcode,Borough,Neighbourhood,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,Malvern,M1B,Malvern
M1B,Scarborough,Rouge,M1B,Rouge
M1C,Scarborough,Highland Creek,M1C,Highland Creek
M1C,Scarborough,Port Union,M1C,Port Union
M1C,Scarborough,Rouge Hill,M1C,Rouge Hill


### Merging all those neighborhoods with the same postal codes.

In [9]:
df = df.groupby('Borough').agg({'Postcode':'first', 
                             'Neighbourhood': ', '.join 
                              }).reset_index()
df.head(20)

Unnamed: 0,Borough,Postcode,Neighbourhood
0,Central Toronto,M4N,"Lawrence Park, Davisville North, North Toronto..."
1,Downtown Toronto,M4W,"Rosedale, Cabbagetown, St. James Town, Church ..."
2,East Toronto,M4E,"The Beaches, Riverdale, The Danforth West, Ind..."
3,East York,M4B,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,Etobicoke,M8V,"Humber Bay Shores, Mimico South, New Toronto, ..."
5,Mississauga,M7R,Canada Post Gateway Processing Centre
6,North York,M2H,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,Queen's Park,M7A,Not assigned
8,Scarborough,M1B,"Malvern, Rouge, Highland Creek, Port Union, Ro..."
9,West Toronto,M6H,"Dovercourt Village, Dufferin, Little Portugal,..."


### Re-arrange the columns by shifting "Postcode" in the first column.

In [10]:
columnsList=["Postcode","Borough", "Neighbourhood"]
df=df.reindex(columns=columnsList)
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto..."
1,M4W,Downtown Toronto,"Rosedale, Cabbagetown, St. James Town, Church ..."
2,M4E,East Toronto,"The Beaches, Riverdale, The Danforth West, Ind..."
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto, ..."
5,M7R,Mississauga,Canada Post Gateway Processing Centre
6,M2H,North York,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,M7A,Queen's Park,Not assigned
8,M1B,Scarborough,"Malvern, Rouge, Highland Creek, Port Union, Ro..."
9,M6H,West Toronto,"Dovercourt Village, Dufferin, Little Portugal,..."


### Remove the phrase "Not assigned" from the "Neighbourhood" column and replace with "Queen's Park" that is same as the entry in the "Borough" column.

In [11]:
df.replace({'Neighbourhood': 'Not assigned'}, {'Neighbourhood': "Queen's Park"}, regex=True)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto..."
1,M4W,Downtown Toronto,"Rosedale, Cabbagetown, St. James Town, Church ..."
2,M4E,East Toronto,"The Beaches, Riverdale, The Danforth West, Ind..."
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto, ..."
5,M7R,Mississauga,Canada Post Gateway Processing Centre
6,M2H,North York,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,M7A,Queen's Park,Queen's Park
8,M1B,Scarborough,"Malvern, Rouge, Highland Creek, Port Union, Ro..."
9,M6H,West Toronto,"Dovercourt Village, Dufferin, Little Portugal,..."


### Let's check the total number of rows now.

In [12]:
df.shape

(11, 3)