<h1>IBM Data Science Professional Certificate Capstone</h1>
<h3>Week 3 - Segmenting and Clustering Toronto neighborhoods</h3>

<h4> 1 - Import Libraries</h4>

In [1]:
import numpy as np
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup

<h4> 2 - Scrape Wikipedia data using BeautifulSoup</h4>

In [2]:
# Download the Wikipedia page named in the URL below (postal codes of Canada: M).
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page flow into the parser.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
data = response.text

# Parse the HTML with BeautifulSoup, using Python's built-in parser
soup = BeautifulSoup(data, 'html.parser')

In [3]:
# Locate the first <table> in the parsed page; fail with a clear message when
# the page has none (otherwise the failure is an opaque AttributeError on None).
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Collect the text of the first three <td> cells of every table row.
# Rows with fewer than three <td> cells are skipped (presumably header rows,
# which the original `len(cells) > 0` check also filtered out) so that indexing
# the third cell can never raise IndexError.
scrapeData = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) >= 3:
        scrapeData.append([cell.text for cell in cells[:3]])

# Preview the first five scraped rows (cell text is still un-stripped here)
scrapeData[0:5]

[['M1A', 'Not assigned', 'Not assigned\n'],
 ['M2A', 'Not assigned', 'Not assigned\n'],
 ['M3A', 'North York', 'Parkwoods\n'],
 ['M4A', 'North York', 'Victoria Village\n'],
 ['M5A', 'Downtown Toronto', 'Harbourfront\n']]

<h4> 3 - Create Data Frame</h4>

In [4]:
# Column labels for the neighborhoods table
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from an empty frame that carries only those column labels
neighborhoods = pd.DataFrame(columns=column_names)

In [5]:
# Build the DataFrame from all scraped rows in a single call.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies the
# whole frame on every iteration; a list of records avoids both problems.
# The loop variable is named `scraped_row` so it does not overwrite `data`
# (the page text fetched earlier).
records = [
    {
        'PostalCode': scraped_row[0].rstrip(),  # rstrip() drops trailing spaces and newlines
        'Borough': scraped_row[1].rstrip(),
        'Neighborhood': scraped_row[2].rstrip(),
    }
    for scraped_row in scrapeData
]

# Passing columns= keeps the three labelled columns even when no rows were scraped
neighborhoods = pd.DataFrame(records, columns=column_names)

In [6]:
# Preview the first rows of the populated frame
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
# Keep only the rows whose Borough is something other than "Not assigned",
# then renumber the index from zero
neighborhoods = (
    neighborhoods
    .loc[neighborhoods['Borough'].ne("Not assigned")]
    .reset_index(drop=True)
)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [8]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row whose
# Neighborhood is the comma-joined text of the group's neighborhoods
neighborhoods = (
    neighborhoods
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)

# Spot-check a postal code that had more than one neighborhood row
neighborhoods[neighborhoods['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront,Regent Park"


In [9]:
# Copy Borough into Neighborhood when not assigned
def set_Neighborhood(row):
    """Return the neighborhood label for ``row``, falling back to its borough.

    ``row`` is anything indexable by the keys 'Neighborhood' and 'Borough'.
    When the neighborhood equals the placeholder "Not assigned", the borough
    value is returned; otherwise the neighborhood is returned unchanged.
    """
    neighborhood = row['Neighborhood']
    return row['Borough'] if neighborhood == "Not assigned" else neighborhood

# Row-wise pass: each row's Neighborhood becomes whatever set_Neighborhood returns for it
neighborhoods['Neighborhood'] = neighborhoods.apply(set_Neighborhood, axis=1)

# Spot-check postal code M7A (Queen's Park)
neighborhoods[neighborhoods['PostalCode'] == 'M7A']

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [12]:

# Build a small test dataframe holding the rows for a hand-picked set of postal codes
column_names = ["PostalCode", "Borough", "Neighborhood"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# iteration. Instead, gather the matching rows for each code (in test_list order;
# codes with no match contribute nothing) and concatenate them once.
# The trailing [column_names] pins the column order.
test_df = pd.concat(
    [neighborhoods[neighborhoods["PostalCode"] == postcode] for postcode in test_list],
    ignore_index=True,
)[column_names]

test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Woodbine Gardens,Parkview Hill"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Maryvale,Wexford"
7,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf..."


In [10]:
# Show the (rows, columns) shape of the cleaned dataframe
neighborhoods.shape

(103, 3)