# Segmenting and Clustering Neighborhoods in Toronto

This notebook is part of the Applied Data Science Capstone Week 3 Peer-Graded Assignment.

#### Loading Libraries

In [9]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# for webscraping import Beautiful Soup 
from bs4 import BeautifulSoup

import xml

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8.4

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8.4

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Libraries imported.


#### Scrape Wikipedia

In [12]:
import requests

# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as if it were the postal-code table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# NOTE: despite its name, `url` holds the page's HTML text (name kept unchanged).
url = response.text
soup = BeautifulSoup(url, 'html.parser')

In [13]:
# Take the first <tbody> element in the parsed page.
# NOTE(review): this assumes the first <tbody> is the postal-code table — confirm
# if the page layout changes.
table = soup.find('tbody')
if table is None:
    # find() returns None when nothing matches; fail with a clear message rather
    # than an AttributeError on the next line.
    raise ValueError("No <tbody> element found in the scraped page; the page layout may have changed.")

# Every <tr> under that body, then the raw text of each row
# (cell texts are separated by newlines, which the next cell splits on).
rows = table.select('tr')
row = [r.get_text() for r in rows]

#### Creating & Cleaning the Dataframe

In [14]:
# Single-column frame: each cell holds the text of one scraped table row.
df = pd.DataFrame(data=row)

# Break each row's text apart on newlines, one piece per column.
df1 = df[0].str.split(pat='\n', expand=True)

# The first split row carries the header labels: map the positional column
# labels to those values, then discard that row from the data.
df2 = df1.rename(columns=df1.iloc[0])
df3 = df2.drop(index=df2.index[0])

df3.head()

Unnamed: 0,Unnamed: 1,Postal Code,Unnamed: 3,Borough,Unnamed: 5,Neighbourhood,Unnamed: 7
1,,M1A,,Not assigned,,Not assigned,
2,,M2A,,Not assigned,,Not assigned,
3,,M3A,,North York,,Parkwoods,
4,,M4A,,North York,,Victoria Village,
5,,M5A,,Downtown Toronto,,"Regent Park, Harbourfront",


#### Ignore Cells with a Borough that is Not Assigned

In [15]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df4 = df3.loc[df3['Borough'].ne('Not assigned')]
df4.head()

Unnamed: 0,Unnamed: 1,Postal Code,Unnamed: 3,Borough,Unnamed: 5,Neighbourhood,Unnamed: 7
3,,M3A,,North York,,Parkwoods,
4,,M4A,,North York,,Victoria Village,
5,,M5A,,Downtown Toronto,,"Regent Park, Harbourfront",
6,,M6A,,North York,,"Lawrence Manor, Lawrence Heights",
7,,M7A,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government",


#### Combine Neighborhoods with the Same Postcode

In [23]:
# Collapse rows sharing a postal code and borough into one row; the remaining
# text columns are comma-joined within each group. sort=False keeps the groups
# in order of first appearance. reset_index() turns the group keys back into
# ordinary columns.
df5 = (
    df4
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(','.join)
    .reset_index()
)
df5.head(10)

Unnamed: 0,Postal Code,Borough,Unnamed: 3,Unnamed: 4,Unnamed: 5,Neighbourhood,Unnamed: 7
0,M3A,North York,,,,Parkwoods,
1,M4A,North York,,,,Victoria Village,
2,M5A,Downtown Toronto,,,,"Regent Park, Harbourfront",
3,M6A,North York,,,,"Lawrence Manor, Lawrence Heights",
4,M7A,Downtown Toronto,,,,"Queen's Park, Ontario Provincial Government",
5,M9A,Etobicoke,,,,"Islington Avenue, Humber Valley Village",
6,M1B,Scarborough,,,,"Malvern, Rouge",
7,M3B,North York,,,,Don Mills,
8,M4B,East York,,,,"Parkview Hill, Woodbine Gardens",
9,M5B,Downtown Toronto,,,,"Garden District, Ryerson",


#### Change Value of Not Assigned Neighborhood to Borough

In [24]:
# Where a neighbourhood is 'Not assigned', use that row's borough name instead.
# (Previously every 'Not assigned' value was replaced with the hard-coded string
# "Queen's Park", which is only right for one specific borough/postal code.)
df6 = df5.assign(
    Neighbourhood=df5['Neighbourhood'].mask(
        df5['Neighbourhood'] == 'Not assigned',  # rows to overwrite
        df5['Borough'],                          # replacement taken from the same row
    )
)
df6.head(10)

Unnamed: 0,Postal Code,Borough,Unnamed: 3,Unnamed: 4,Unnamed: 5,Neighbourhood,Unnamed: 7
0,M3A,North York,,,,Parkwoods,
1,M4A,North York,,,,Victoria Village,
2,M5A,Downtown Toronto,,,,"Regent Park, Harbourfront",
3,M6A,North York,,,,"Lawrence Manor, Lawrence Heights",
4,M7A,Downtown Toronto,,,,"Queen's Park, Ontario Provincial Government",
5,M9A,Etobicoke,,,,"Islington Avenue, Humber Valley Village",
6,M1B,Scarborough,,,,"Malvern, Rouge",
7,M3B,North York,,,,Don Mills,
8,M4B,East York,,,,"Parkview Hill, Woodbine Gardens",
9,M5B,Downtown Toronto,,,,"Garden District, Ryerson",


#### Print the number of Rows in the Dataframe

In [25]:
# .shape is a (number_of_rows, number_of_columns) tuple; the first entry is the row count.
df6.shape

(103, 7)