# Segmenting and Clustering Neighborhoods in Toronto - part 1

## import packages

In [1]:
import pandas as pd
import numpy as np
import urllib.request

In [2]:
# Install Beautiful Soup from the conda-forge channel via a shell escape;
# --yes answers the confirmation prompt automatically.
!conda install -c conda-forge beautifulsoup4 --yes
print('finished!')
# bs4 is imported in this cell, after the install command above has run.
import bs4 as bs

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

finished!


## retrieve dataframe from wikipedia web page

In [3]:
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page; the context manager closes the HTTP response after reading.
with urllib.request.urlopen(WIKI_URL) as response:
    source = response.read()

# Name the parser explicitly so parsing does not depend on which optional
# parsers are installed, and so bs4 does not warn about a guessed parser.
soup = bs.BeautifulSoup(source, 'html.parser')

# Work on the first <table> element found in the page.
table = soup.find('table')
table_rows = table.find_all('tr')

# One list of cell texts per <tr>. A row without any <td> cells (e.g. a header
# row built from <th>) yields an empty list, which pandas turns into an
# all-empty row; that row is intentionally kept here.
wiki_table = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]

df = pd.DataFrame(wiki_table, columns=["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


## drop the first row

In [4]:
# Remove rows in which every column is empty (the blank first row shown above).
# Unlike a positional slice such as df.iloc[1:], this is safe to re-run:
# running the cell a second time removes nothing further.
df = df.dropna(how='all')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n


## remove all whitespace (including the trailing \n)

In [5]:
# Strip every whitespace character from every string cell.
# NOTE: r'\s' matches ordinary spaces as well as the trailing '\n', so
# "North York" becomes "NorthYork" and "Not assigned" becomes "Notassigned".
# Reassigning (instead of inplace=True) avoids mutating a frame that was
# obtained by slicing an earlier one.
df = df.replace(to_replace=r'\s', value='', regex=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Notassigned,Notassigned
2,M2A,Notassigned,Notassigned
3,M3A,NorthYork,Parkwoods
4,M4A,NorthYork,VictoriaVillage
5,M5A,DowntownToronto,Harbourfront


## remove rows where Borough is 'Notassigned'

In [6]:
# Keep only the rows whose Borough differs from the placeholder "Notassigned".
has_borough = df["Borough"].ne("Notassigned")
df = df[has_borough]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,NorthYork,Parkwoods
4,M4A,NorthYork,VictoriaVillage
5,M5A,DowntownToronto,Harbourfront
6,M6A,NorthYork,LawrenceHeights
7,M6A,NorthYork,LawrenceManor


## reset index

In [7]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as an extra column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,NorthYork,Parkwoods
1,M4A,NorthYork,VictoriaVillage
2,M5A,DowntownToronto,Harbourfront
3,M6A,NorthYork,LawrenceHeights
4,M6A,NorthYork,LawrenceManor


## merge rows that share a postal code into a single row

In [8]:
# Collapse all rows that share a PostalCode into one row: keep the first
# Borough seen for that code and join the Neighborhood names with ", ".
# sort=False keeps the postal codes in their order of first appearance.
# A single groupby replaces row-by-row chained assignment
# (df.iloc[i]['Neighborhood'] += ...), which pandas does not guarantee to
# write back to the frame.
df = (
    df.groupby('PostalCode', sort=False)
      .agg({'Borough': 'first', 'Neighborhood': ', '.join})
      .reset_index()
)[['PostalCode', 'Borough', 'Neighborhood']]

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,NorthYork,Parkwoods
1,M4A,NorthYork,VictoriaVillage
2,M5A,DowntownToronto,Harbourfront
3,M6A,NorthYork,"LawrenceHeights, LawrenceManor"
4,M7A,Queen'sPark,Notassigned


In [77]:
# Show the merged frame through the notebook's rich display rather than print().
df

    PostalCode          Borough  \
0          M3A        NorthYork   
1          M4A        NorthYork   
2          M5A  DowntownToronto   
3          M6A        NorthYork   
4          M7A      Queen'sPark   
..         ...              ...   
98         M8X        Etobicoke   
99         M4Y  DowntownToronto   
100        M7Y      EastToronto   
101        M8Y        Etobicoke   
102        M8Z        Etobicoke   

                                          Neighborhood  
0                                            Parkwoods  
1                                      VictoriaVillage  
2                                         Harbourfront  
3                       LawrenceHeights, LawrenceManor  
4                                          Notassigned  
..                                                 ...  
98           TheKingsway, MontgomeryRoad, OldMillNorth  
99                                  ChurchandWellesley  
100        BusinessReplyMailProcessingCentre969Eastern  
101  Humb

## give a proper name to 'Notassigned' Neighborhoods

In [9]:
# Wherever Neighborhood still holds the placeholder "Notassigned", use the
# row's Borough name instead. The boolean mask covers every row, including the
# last one, and avoids chained assignment through df.iloc[i][col], which
# pandas does not guarantee to write back to the frame.
unnamed = df['Neighborhood'].eq('Notassigned')
df = df.assign(Neighborhood=df['Neighborhood'].mask(unnamed, df['Borough']))

df.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,NorthYork,Parkwoods
1,M4A,NorthYork,VictoriaVillage
2,M5A,DowntownToronto,Harbourfront
3,M6A,NorthYork,"LawrenceHeights, LawrenceManor"
4,M7A,Queen'sPark,Queen'sPark
5,M9A,Queen'sPark,Queen'sPark
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,NorthYork,DonMillsNorth
8,M4B,EastYork,"WoodbineGardens, ParkviewHill"
9,M5B,DowntownToronto,"Ryerson, GardenDistrict"


## output

In [10]:
# Preview the first 11 rows of the cleaned frame.
df.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,NorthYork,Parkwoods
1,M4A,NorthYork,VictoriaVillage
2,M5A,DowntownToronto,Harbourfront
3,M6A,NorthYork,"LawrenceHeights, LawrenceManor"
4,M7A,Queen'sPark,Queen'sPark
5,M9A,Queen'sPark,Queen'sPark
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,NorthYork,DonMillsNorth
8,M4B,EastYork,"WoodbineGardens, ParkviewHill"
9,M5B,DowntownToronto,"Ryerson, GardenDistrict"
