#### This notebook is used for the Coursera Capstone Project. The project is about segmenting and clustering neighborhoods in Toronto.

##### Importing libraries

In [1]:
import pandas as pd
import numpy as np

# To access html page
import requests

# To parse html page
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

### Getting Toronto Data

In [2]:
# Request the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# handing an error page to the parsing code.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
page = response.text

In [3]:
# Strainer that limits parsing to <table> elements whose class attribute is
# 'wikitable sortable', so only that table ends up in the soup
only_table = SoupStrainer(name='table', attrs={'class': 'wikitable sortable'})
soup = BeautifulSoup(page, features='html.parser', parse_only=only_table)

# Strainer that selects the <tr> (row) elements
only_rows = SoupStrainer(name='tr')
def only_data(lst):
    """Extract the cell text from each row element.

    Parameters
    ----------
    lst : iterable
        Row elements; each must provide ``find_all`` returning objects
        that have a ``text`` attribute.

    Returns
    -------
    list of list of str
        One inner list per row, holding the text of every ``td`` cell with
        trailing whitespace removed. A row without ``td`` cells yields an
        empty inner list.
    """
    table = []
    for row in lst:
        cells = []
        for cell in row.find_all(['td']):
            # Only trailing whitespace is stripped; leading whitespace is kept.
            cells.append(cell.text.rstrip())
        table.append(cells)
    return table

# Collect the row elements from the parsed table, then pull the cell text
# out of each one
table_rows = soup.find_all(only_rows)
data = only_data(table_rows)

###### Creating a DataFrame from the scraped data with the columns PostalCode, Borough and Neighborhood

In [4]:
# Build the frame from the scraped rows.
# only_data() collects <td> cells only, so a row without any (the table's
# header row) arrives as an empty list and becomes an all-missing row here.
# Dropping all-missing rows removes it without assuming it is the first row,
# which a positional [1:] slice would do (and which would discard a real
# record if the header were absent). The original index labels are kept, so
# label-based lookups further down still address the same rows.
toronto_data = pd.DataFrame(
    data=data,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
).dropna(how='all')

### Cleaning Toronto Data

##### Finding cells where Borough is assigned and Neighborhood is not assigned

In [5]:
# Rows that have a borough but whose neighborhood is still the
# 'Not assigned' placeholder
borough_assigned = toronto_data['Borough'] != 'Not assigned'
neighborhood_missing = toronto_data['Neighborhood'] == 'Not assigned'
toronto_data[borough_assigned & neighborhood_missing]

Unnamed: 0,PostalCode,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [6]:
# Where a borough is assigned but the neighborhood is not, use the borough
# name as the neighborhood. Selecting by condition rather than by a
# hard-coded row label and literal name keeps the fix correct if the scraped
# table is reordered or gains further rows of this kind.
needs_neighborhood = (
    (toronto_data['Borough'] != 'Not assigned')
    & (toronto_data['Neighborhood'] == 'Not assigned')
)
toronto_data.loc[needs_neighborhood, 'Neighborhood'] = (
    toronto_data.loc[needs_neighborhood, 'Borough']
)

##### Removing rows where Borough is Not assigned

In [7]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = toronto_data['Borough'] != 'Not assigned'
toronto_data = toronto_data.loc[has_borough]

###### Combining the neighborhoods that share a postal code into one row

In [8]:
# One row per (PostalCode, Borough) pair; the neighborhoods in each group
# are joined into a single comma-separated string
toronto_data = (
    toronto_data
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

##### Printing the shape and the first few rows of the data

In [9]:
# Print the (rows, columns) tuple, then display the first ten rows
frame_shape = toronto_data.shape
print(frame_shape)
toronto_data.head(n=10)

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
