# Segmenting and Clustering Neighborhoods in Toronto - Part 1

In [11]:
from bs4 import BeautifulSoup
import requests
import numpy as np
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe

import pandas as pd

## Extract the list of neighborhoods from Wikipedia

In [12]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source, 'lxml')

table = soup.find("table")
table_rows = table.tbody.find_all("tr")
res = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [tr.text for tr in td]
    
    # Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
    if row != [] and row[1] != "Not assigned":
        # If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.
        if "Not assigned" in row[2]: 
            row[2] = row[1]
        res.append(row)

# Dataframe with 3 columns
df_wiki = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df_wiki.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [13]:
df_wiki["PostalCode"]=df_wiki["PostalCode"].str.replace("\n","")
df_wiki["Borough"]=df_wiki["Borough"].str.replace("\n","")
df_wiki["Neighborhood"]=df_wiki["Neighborhood"].str.replace("\n","")
df_wiki.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Filtered data with Borough "Not Assigned"

In [16]:
df_wiki_filtered = df_wiki[df_wiki["Borough"]!="Not assigned"]

In [17]:
df_wiki_filtered.to_csv("./CanadaNeighborhood_List.csv",index=False)
df_wiki_filtered.shape

(103, 3)