# **Segmenting and Clustering Neighborhoods in Toronto (First Comment)** 

## **1. Importing Libraries** 

In [1]:
# Import libraries to handle data
import requests 
import numpy as np
import pandas as pd
import json
#!conda install -c conda-forge/label/gcc7 geopy --yes 
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize

print("Work Done!")

Work Done!


In [2]:
# Import Matplotlib and associated plotting modules 
import matplotlib.cm as cm 
import matplotlib.colors as colors 

print("Work Done!")

Work Done!


In [3]:
# Import K-Means for clustering 
from sklearn.cluster import KMeans

print("Work Done!")

Work Done!


In [9]:
# Import webscraping tool from Beautiful Soup 
#!conda install -c anaconda beautifulsoup4 --yes
from bs4 import BeautifulSoup as bts
import xml 
import folium 

## **2. Scrape Data from Wikipedia into a DataFrame**

In [33]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook here on an HTTP error instead of
# handing an error page to the parser in the next cell.
response = requests.get(wikipedia_link, timeout=30)
response.raise_for_status()
data_html = response.text

In [34]:
# Parse the downloaded HTML into a BeautifulSoup tree using Python's built-in parser
soup = bts(markup=data_html, features='html.parser')

In [35]:
# One independent, initially empty list per table column
postalcodeList, boroughList, neighborhoodList = [], [], []

In [36]:
# Walk every row of the first <table> on the page and collect its first three
# <td> cells into the postal code / borough / neighborhood lists.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. a header row) are skipped. Rows with fewer
    # than three cells are skipped as well, so a short row cannot raise
    # IndexError after a partial append and leave the lists with unequal lengths.
    if len(cells) >= 3:
        # Trim surrounding whitespace/newlines from every cell, not only the
        # neighborhood, so later exact comparisons such as
        # Borough != "Not assigned" match reliably.
        postalcodeList.append(cells[0].text.strip())
        boroughList.append(cells[1].text.strip())
        neighborhoodList.append(cells[2].text.strip())

In [37]:
# Assemble the three scraped lists into one DataFrame, one column per list
toronto_columns = {
    "PostalCode": postalcodeList,
    "Borough": boroughList,
    "Neighborhood": neighborhoodList,
}
df_toronto = pd.DataFrame(data=toronto_columns)

df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## **3. Ignore cells with a borough that is *Not assigned***

In [38]:
# Keep only the rows whose Borough is something other than "Not assigned",
# then renumber the index from 0 so it has no gaps.
has_borough = df_toronto["Borough"].ne("Not assigned")
df_toronto_drop = df_toronto.loc[has_borough].reset_index(drop=True)
df_toronto_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## **4. Combine Neighborhoods with the same Postal Code and Borough**

In [39]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row whose
# Neighborhood is the comma-separated list of the neighborhoods in that group.
grouping_keys = ["PostalCode", "Borough"]
rows_by_code = df_toronto_drop.groupby(grouping_keys, as_index=False)
df_toronto_comb = rows_by_code.agg(lambda names: ", ".join(names))
df_toronto_comb.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## **5. Use the Borough name for Neighborhoods that are *Not assigned***

In [42]:
# Where Neighborhood is "Not assigned", use the Borough name instead.
# The assignment goes through .loc so it writes to df_toronto_comb itself; the
# row Series yielded by iterrows() may be a copy, so writing to it is not
# guaranteed to change the DataFrame.
unnamed = df_toronto_comb["Neighborhood"] == "Not assigned"
df_toronto_comb.loc[unnamed, "Neighborhood"] = df_toronto_comb.loc[unnamed, "Borough"]

df_toronto_comb.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## **6. Show the cleaning result and decide whether all steps are correct**

In [47]:
# Build a small DataFrame holding the rows for a hand-picked set of postal
# codes, in the order given by checking_list, to spot-check the cleaning steps.
column_names = ["PostalCode", "Borough", "Neighborhood"]

checking_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching slice for each code and concatenate once.
# DataFrame.append was removed in pandas 2.0, and growing a frame one append
# at a time copies the accumulated rows on every iteration.
matches = [df_toronto_comb[df_toronto_comb["PostalCode"] == postcode]
           for postcode in checking_list]
df_toronto_clean = pd.concat(matches, ignore_index=True)[column_names]

In [48]:
# Display the spot-check DataFrame (the rows selected via checking_list)
df_toronto_clean

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Woodbine Gardens, Parkview Hill"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Maryvale, Wexford"
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."


## **7. Use the .shape attribute to print the number of rows**

In [49]:
# .shape is a (rows, columns) tuple; the first entry is the number of rows
df_toronto_comb.shape

(103, 3)