# Segmenting and Clustering Neighborhoods in Toronto#

In [1]:
!pip install folium
from bs4 import BeautifulSoup
import requests
import numpy as np
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe

import folium # map rendering library

import pandas as pd
# import k-means from clustering stage
from sklearn.cluster import KMeans



## Extract the list of neighborhoods from Wikipedia

In [2]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source, 'lxml')

table = soup.find("table")
table_rows = table.tbody.find_all("tr")
res = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [tr.text for tr in td]
    
    # Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
    if row != [] and row[1] != "Not assigned":
        # If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.
        if "Not assigned" in row[2]: 
            row[2] = row[1]
        res.append(row)

# Dataframe with 3 columns
df_wiki = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df_wiki.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [3]:
df_wiki["PostalCode"]=df_wiki["PostalCode"].str.replace("\n","")
df_wiki["Borough"]=df_wiki["Borough"].str.replace("\n","")
df_wiki["Neighborhood"]=df_wiki["Neighborhood"].str.replace("\n","")
df_wiki.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
df_wiki.to_csv("./CanadaNeighborhood_List.csv",index=False)

## Extracting the Coordinates for all the neighborhoods.

In [5]:
toronto_geocsv = 'https://cocl.us/Geospatial_data'
!wget -q -O 'toronto_m.geospatial_data.csv' toronto_geocsv
geocsv_data = pd.read_csv(toronto_geocsv).set_index("Postal Code")
geocsv_data.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [6]:
filtered_df = df_wiki[df_wiki["Borough"] != "Not Assigned"]

In [7]:
TorontoNeighborhoodCoordinates_df = pd.DataFrame(filtered_df).rename(columns={"PostalCode":"Postal Code"})
TorontoNeighborhoodCoordinates_df = pd.DataFrame(TorontoNeighborhoodCoordinates_df).set_index("Postal Code")
#TorontoNeighborhoodCoordinates_df.head()
TorontoNeighborhoodCoordinates_df = TorontoNeighborhoodCoordinates_df.join(geocsv_data)

In [8]:
TorontoNeighborhoodCoordinates_df.shape

(180, 4)

In [9]:
TorontoNeighborhoodCoordinates_df.to_csv("./TorontoNeighborhoodCoordinates_df.csv",index=False)