# Getting the postal code table from Wikipedia

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
res = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

# Parse the first <table> element on the page into a DataFrame.
table = soup.find_all('table')[0]
# read_html is given a file-like object: passing literal HTML text is
# deprecated in pandas >= 2.1 and emits a FutureWarning.
df = pd.read_html(StringIO(str(table)))[0]

# Pull each column out as a plain Python list.
postcode = df["Postcode"].tolist()
borough = df["Borough"].tolist()
neigh = df["Neighbourhood"].tolist()

# Creating the dataframe

In [2]:
# Assemble the three scraped columns into a single frame; the postal code
# column is stored under the label "Postal Code".
column_data = {
    "Postal Code": postcode,
    "Borough": borough,
    "Neighbourhood": neigh,
}
neighborhoods = pd.DataFrame(column_data)

# Cleaning the data

In [3]:
# Only process the cells that have an assigned borough; rows whose borough is
# "Not assigned" are ignored.
# A boolean mask built from `neighborhoods` itself (instead of an in-place drop
# by labels looked up in `df`) keeps this cell idempotent: re-running it no
# longer raises KeyError for labels that were already removed.
neighborhoods = neighborhoods[neighborhoods['Borough'] != "Not assigned"]

In [4]:
# More than one neighbourhood can exist in one postal code area.
# For example, in the table on the Wikipedia page M5A is listed twice and has
# two neighbourhoods: Harbourfront and Regent Park. Such rows are combined into
# one row per postal code, with the neighbourhoods separated by a comma.
#
# Series.unique() drops duplicates while keeping first-appearance order, so the
# joined strings are identical on every run. Joining a set() is not stable:
# string hashing is randomized per Python process, so the order can change
# between kernel restarts.
neighborhoods1 = neighborhoods.groupby("Postal Code").agg(lambda x: ','.join(x.unique()))

# If a cell has a borough but a "Not assigned" neighbourhood, the neighbourhood
# is set to the borough name (e.g. Queen's Park on the Wikipedia page).
unassigned = neighborhoods1['Neighbourhood'] == "Not assigned"
neighborhoods1.loc[unassigned, 'Neighbourhood'] = neighborhoods1.loc[unassigned, 'Borough']

neighborhoods1.shape



(103, 2)

In [5]:
# Display the grouped table (indexed by "Postal Code") as the cell output.
neighborhoods1

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,West Hill,Morningside"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Kennedy Park,Ionview"
M1L,Scarborough,"Clairlea,Oakridge,Golden Mile"
M1M,Scarborough,"Scarborough Village West,Cliffside,Cliffcrest"
M1N,Scarborough,"Cliffside West,Birch Cliff"


# Loading the geospatial coordinates from the CSV file

In [6]:
# Read the CSV that lists a latitude and longitude for each postal code,
# then show it as the cell output.
geo_data = pd.read_csv(
    filepath_or_buffer="https://cocl.us/Geospatial_data",
)
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


# Creating the new dataframe

In [7]:
# Attach coordinates by matching on the postal code rather than by row position.
# neighborhoods1 is indexed by "Postal Code", so assigning Series indexed the
# same way aligns every row with its own coordinates even when geo_data is
# ordered differently or has a different number of rows.
geo_by_code = geo_data.set_index('Postal Code')
neighborhoods1['Latitude'] = geo_by_code['Latitude']
neighborhoods1['Longitude'] = geo_by_code['Longitude']

# Fail loudly if any postal code found no match in geo_data, instead of
# silently carrying NaN coordinates forward.
missing_coords = neighborhoods1.index[
    neighborhoods1[['Latitude', 'Longitude']].isna().any(axis=1)
]
assert missing_coords.empty, f"No coordinates found for: {list(missing_coords)}"

neighborhoods1

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood,West Hill,Morningside",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"East Birchmount Park,Kennedy Park,Ionview",43.727929,-79.262029
M1L,Scarborough,"Clairlea,Oakridge,Golden Mile",43.711112,-79.284577
M1M,Scarborough,"Scarborough Village West,Cliffside,Cliffcrest",43.716316,-79.239476
M1N,Scarborough,"Cliffside West,Birch Cliff",43.692657,-79.264848
