#### Scrape the Wikipedia page for the Toronto neighborhood data

In [21]:
import pandas as pd 
import requests 
from bs4 import BeautifulSoup 
req = requests.get("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1012118802") 
soup = BeautifulSoup(req.content,'lxml') 
table = soup.find_all('table')[0]  
df = pd.read_html(str(table)) 
neighborhood=pd.DataFrame(df[0]) 

#### check head and shape of dataframe

In [24]:
print(neighborhood.shape)
neighborhood.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Remove all rows with Borough = 'Not assigned' strings

In [26]:
# drops those rows where 'Not assigned' appears in column '[Borough]'
Data=neighborhood[~neighborhood.Borough.str.contains("Not assigned")]
Data=Data.reset_index(drop=True)
Data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Replace 'Not assigned' neighborhoods with the name of the Borough

In [27]:
Data.loc[Data['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = Data['Borough']
Data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Find How many Unique Postal codes exists

In [29]:
postalcodes = Data['Postal Code'].nunique()
boroughs = Data['Borough'].nunique()
neighbourhoods= Data['Neighbourhood'].nunique()
print('Unique Postalcodes : ' + str(postalcodes))
print('Unique Boroughs  : '+ str(boroughs))
print('Unique Neighbourhoods  :' + str(neighbourhoods))

Unique Postalcodes : 103
Unique Boroughs  : 11
Unique Neighbourhoods  :99


#### Consolidate the dataframe to each unique PostalCodes and aggregated Neighbourhoods

In [30]:
nrows=len(Data)-1
nrows
n=0

while n < nrows :
    post1=Data.iloc[n,0]
    #post1
    m=n+1
    post2=Data.iloc[m,0]
    #post2
    neigh1=Data.iloc[n,2]
    neigh2=Data.iloc[m,2]
    if post1==post2:
        Data.Neighbourhood[n,2] = neigh1=neigh1+','+neigh2
        #df2 = df2[df2.Neighbourhood != 'neigh2']
        Data=Data.drop(Data.index[m])
        nrows=nrows-1
        Data = Data.reset_index(drop=True)
    else:
        n=n+1

Data.index

RangeIndex(start=0, stop=103, step=1)

#### Consolidated Dataframe with unique postal codes and well cleaned data

In [33]:
Data.sort_values('Postal Code')
Data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [34]:
print(Data.shape)

(103, 3)


## END OF PART 1 OF PROJECT. SAVING URL FOR UPLOADING