# Segmenting and Clustering Neighborhoods in Toronto

First, import the required Python packages.

In [61]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import folium 
import requests

Scrape the Toronto neighborhoods from Wikipedia. First, download the page using the `requests` package. Then use the `BeautifulSoup` web-scraping package to extract the list of Toronto neighborhoods.

In [62]:
wikipedia_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops the notebook on an HTTP error instead of
# parsing an error page as if it were the article.
response = requests.get(wikipedia_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# find first table on the html
postal_codes = soup.find_all('table')[0]

# iterate over rows in the table body; html.parser does not insert a <tbody>
# that is missing from the markup, so fall back to the table element itself
table_body = postal_codes.find('tbody')
if table_body is None:
    table_body = postal_codes

# one tuple of stripped cell texts per row; rows without any <td> cell
# (e.g. a header row made only of <th> cells) are skipped
data = []
rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    if cols:
        data.append(tuple(elem.text.strip() for elem in cols))


Use the list of `(postal_code, borough, neighborhood)` tuples to create a Pandas dataframe. Clean up the dataframe by removing all rows in which the borough is not assigned. After that, replace any unassigned neighborhood with the borough name.

In [63]:
# Build the cleaned dataframe in one chain. Every step returns a new frame, so
# no column is ever assigned on a filtered slice (which is what triggers
# pandas' SettingWithCopyWarning).
df = (
    # create Pandas dataframe
    pd.DataFrame(data, columns=["PostalCode", "Borough", "Neighborhood"])
    # drop if borough is not assigned
    .loc[lambda frame: frame["Borough"] != "Not assigned"]
    # if neighborhood is not assigned use the borough name
    .assign(
        Neighborhood=lambda frame: frame["Neighborhood"].where(
            frame["Neighborhood"] != "Not assigned", frame["Borough"]
        )
    )
)

Finally, combine the neighborhoods that share a postal code into a single comma-separated entry, so that each postal code appears in exactly one row.

In [64]:
# combine postal codes: one comma-separated neighborhood string per postal code
combined = df.groupby("PostalCode")["Neighborhood"].agg(",".join)

# Keep one row per postal code and attach the combined neighborhoods.
# `df` itself is left untouched so this cell can be re-run: removing
# "Neighborhood" from `df` in place would make the groupby above raise a
# KeyError on a second execution.
toronto_df = (
    df.drop(columns=["Neighborhood"])
    .drop_duplicates(subset="PostalCode")
    .join(combined, on="PostalCode")
)

# print out shape
toronto_df.shape

(103, 3)

In [65]:
# preview the first five rows of the combined dataframe
toronto_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront,Regent Park"
6,M6A,North York,"Lawrence Heights,Lawrence Manor"
8,M7A,Queen's Park,Queen's Park
