## Program for Segmenting and Clustering Neighborhoods in Toronto

### 1. Start importing required libraries

In [41]:
## Import requests, pandas, numpy etc libraries
import requests 
from pandas import DataFrame, read_csv
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

In [35]:
## Install BeautifulSoup library 
! pip install beautifulsoup4 
from bs4 import BeautifulSoup



In [36]:
## Install lxml and html5lib parsers
! pip install lxml
! pip install html5lib
import lxml



### 2. Read in the Wikipedia page containing Toronto neighborhood data

In [37]:
## Download the Wikipedia page that lists the "M" (Toronto) postal codes.
## The timeout keeps the cell from hanging indefinitely on a stalled connection,
## and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
## letting an error page be parsed as if it were the article.
wikipage_url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(wikipage_url, timeout=30)
response.raise_for_status()
page = response.text

## Use BeautifulSoup with the "html.parser" backend to parse the html content from "page"
soup = BeautifulSoup(page, "html.parser")

### 3. Parse the table header row and data rows and write the parsed data to a CSV file

In [38]:
## Find the table hook
my_table = soup.find("table", class_="wikitable sortable")
if my_table is None:
    ## soup.find returns None when nothing matches; fail with a clear message
    ## instead of an AttributeError on the first find_all call below.
    raise ValueError('No table with class "wikitable sortable" was found in the page')

## Write the parsed table to MyFile.csv.
## - The "with" block closes the file even if parsing fails part-way through.
## - The file is written as UTF-8, which is what pd.read_csv assumes by default.
## - The separators and padding around each value are intentionally unchanged:
##   later cells compare against padded values such as " Not assigned ".
## - No explicit line break is written; presumably the text of the last cell in
##   each row ends with a newline, which is what separates the CSV lines.
with open("MyFile.csv", "w", encoding="utf-8") as myfile:
    ## Header row: cells are written three per line
    for position, th in enumerate(my_table.find_all('th')):
        if position % 3 < 2:
            myfile.write('%s ,' % th.text)
        else:
            myfile.write('%s ' % th.text)

    ## Data rows: visit every <tr> once and write only that row's own <td> cells.
    ## (Searching the whole table for <td> inside the per-row loop would write
    ## the complete table again for every row.)
    for tr in my_table.find_all('tr'):
        for position, td in enumerate(tr.find_all('td')):
            if position % 3 < 2:
                myfile.write('%s , ' % (td.text))
            else:
                myfile.write('%s ' % (td.text))

### 4. Create a DataFrame from CSV file that was created in previous step

In [42]:
## Load MyFile.csv into a pandas dataframe.
## header=0 treats the first line of the file as a header row, and "names"
## replaces those labels with our own column names.
filename = r'MyFile.csv'
df = pd.read_csv(
    filename,
    header=0,
    names=['Postcode', 'Borough', 'Neighborhood'],
)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 5. Clean up Borough and Neighborhood data for each postal code

In [46]:
## Drop the rows that have Borough value as "Not assigned".
## The comparison is made on whitespace-stripped values so the filter does not
## depend on the exact padding around the value in the CSV (the stored values
## themselves are left untouched).
## .copy() makes the result an independent frame, so the column assignments in
## later cells do not trigger SettingWithCopyWarning.
df = df[df.Borough.str.strip() != "Not assigned"].copy()
df.head(10)
## Observe that Borough column doesn't have "Not assigned" anymore but Neighborhood column has

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [47]:
## If Neighborhood is "Not assigned" then copy the row's Borough value to Neighborhood.
## This is one vectorized replacement instead of a Python loop over the index:
## Series.mask swaps in the Borough value wherever the condition is True.
## The match ignores surrounding whitespace so it does not depend on CSV padding,
## and assign() returns a new frame rather than writing into a filtered slice.
is_unassigned = df['Neighborhood'].str.strip() == "Not assigned"
df = df.assign(Neighborhood=df['Neighborhood'].mask(is_unassigned, df['Borough']))
df.head(10)
## Note for Postcode 'M7A' neighborhood changed from "Not assigned" to Queen's Park, which is the corresponding Borough

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### 6. Create temporary and target dataframes needed to complete further processing 

In [52]:
## Remove surrounding whitespace from Postcode so it can be used as a grouping key.
## assign() builds a new frame instead of writing a column into df, which may be
## a filtered slice of an earlier frame (in-place assignment there triggers
## SettingWithCopyWarning).
df = df.assign(Postcode=df['Postcode'].str.strip())

## Create df2 as guiding dataframe to hold Postcodes to drive further processing:
## one row per Postcode, the other columns hold the non-null counts per group
df2 = df.groupby('Postcode').count().reset_index()

## Create a copy of df2 as the final dataframe, df3, to hold fully processed data
df3 = df2.copy()
## Set Borough and Neighborhood as "str" data type, since a later cell
## overwrites these count placeholders with text values
df3.Borough = df3.Borough.astype(str)
df3.Neighborhood = df3.Neighborhood.astype(str)

### 7. Create the final dataframe with Postcode, Borough and Neighborhood containing the list of corresponding neighborhoods.

In [55]:
## Build the final dataframe, df3: one row per Postcode holding its Borough and
## a comma separated list of its distinct Neighborhoods.
##
## - A single groupby pass replaces re-filtering df once per Postcode.
##   groupby sorts the keys by default, so rows come out ordered by Postcode.
## - The list is joined from the de-duplicated values themselves. Counting the
##   distinct values and then taking that many leading rows gives a wrong list
##   whenever a repeated value appears before a later distinct one.
records = []
for pcode, group in df.groupby('Postcode'):
    records.append({
        'Postcode': pcode,
        'Borough': group['Borough'].iloc[0],  ## just copy one value
        ## drop_duplicates keeps the first occurrence of each value, in order
        'Neighborhood': ",".join(group['Neighborhood'].drop_duplicates()),
    })
df3 = pd.DataFrame(records, columns=['Postcode', 'Borough', 'Neighborhood'])

df3

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Rich..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate,..."


In [56]:
## Check the size of the final dataframe; .shape gives (number of rows, number of columns)
df3.shape

(103, 3)