### Import the necessary libraries

In [465]:
import requests 
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup

### Scrape the URL and get the table data

In [468]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page; fail fast on a hung connection or an HTTP error status
# instead of silently parsing an error page.
response = requests.get(URL, timeout=30)
response.raise_for_status()
res = response.text
soup = BeautifulSoup(res, 'lxml')

table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError('No "wikitable sortable" table found at {}'.format(URL))

# Collect one (postcode, borough, neighbourhood) tuple per table row.
df_list = []
# The first <tr> is skipped, as in the original slice; presumably it is the header row.
for items in table.find_all('tr')[1:]:
    data = items.find_all('td')
    # A row with fewer than three <td> cells cannot be unpacked. Skip it
    # explicitly: swallowing the IndexError and appending anyway would re-add
    # the previous row's values (or raise NameError on the very first row).
    if len(data) < 3:
        continue
    # strip() removes surrounding whitespace from every field, so the later
    # comparisons against 'Not assigned' are not defeated by stray newlines.
    postcode, borough, neighbourhood = (cell.get_text().strip() for cell in data[:3])
    df_list.append((postcode, borough, neighbourhood))

print(df_list[0:3])

[('M1A', 'Not assigned', 'Not assigned'), ('M2A', 'Not assigned', 'Not assigned'), ('M3A', 'North York', 'Parkwoods')]


In [469]:
df_list[0:10]

[('M1A', 'Not assigned', 'Not assigned'),
 ('M2A', 'Not assigned', 'Not assigned'),
 ('M3A', 'North York', 'Parkwoods'),
 ('M4A', 'North York', 'Victoria Village'),
 ('M5A', 'Downtown Toronto', 'Harbourfront'),
 ('M5A', 'Downtown Toronto', 'Regent Park'),
 ('M6A', 'North York', 'Lawrence Heights'),
 ('M6A', 'North York', 'Lawrence Manor'),
 ('M7A', "Queen's Park", 'Not assigned'),
 ('M8A', 'Not assigned', 'Not assigned')]

### Convert the above list data into PANDAS DataFrame

In [470]:
# Build the frame from the scraped (postcode, borough, neighbourhood) tuples.
column_names = ['Postal Code', 'Borough', 'Neighbourhood']
df_data = pd.DataFrame.from_records(df_list, columns=column_names)

In [471]:
df_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Check the shape of the DataFrame

In [472]:
df_data.shape

(289, 3)

### Get the count of Column Borough = Not assigned

In [473]:
df_data['Borough'].eq('Not assigned').sum()

77

### Get the count of Column Neighbourhood = Not assigned

In [474]:
df_data['Neighbourhood'].eq('Not assigned').sum()

78

### Remove the rows where Borough = Not assigned 

In [475]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df_data = df_data.loc[df_data['Borough'].ne('Not assigned')]

### Shape of the DataFrame after removing the rows where Borough is 'Not assigned'

In [476]:
df_data.shape

(212, 3)

### Count and inspect the rows where Neighbourhood is still 'Not assigned'

In [477]:
df_data['Neighbourhood'].eq('Not assigned').sum()

1

In [478]:
df_data.loc[df_data['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


### Copy the value of Borough column to the Neighbourhood column, if Neighbourhood == Not assigned 

In [480]:
# Where Neighbourhood is 'Not assigned', fall back to the Borough name;
# every other row keeps its own Neighbourhood.
# assign() returns a new frame, so nothing is written in place into df_data,
# which at this point is the result of a boolean filter (in-place column
# assignment on such a frame is what triggers SettingWithCopyWarning).
needs_fill = df_data['Neighbourhood'] == 'Not assigned'
df_data = df_data.assign(
    Neighbourhood=df_data['Neighbourhood'].mask(needs_fill, df_data['Borough'])
)

In [481]:
df_data.loc[df_data['Postal Code'] == 'M7A']

Unnamed: 0,Postal Code,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


In [482]:
df_data['Neighbourhood'].eq('Not assigned').sum()

0

### Merge the Neighbourhoods for the same Postal Code 

In [483]:
# Collapse to one row per (Postal Code, Borough) pair, joining that pair's
# neighbourhood names into a single comma-separated string.
df_data = (
    df_data
    .groupby(['Postal Code', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

## Final Shape of the DataFrame

In [485]:
df_data.shape

(103, 3)

## Load the Geo Coordinates CSV File

In [486]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [487]:
df_cor.shape

(103, 3)

## Merge the 2 DataFrames - df_data and df_cor

In [488]:
# Inner join on the shared 'Postal Code' column: only postal codes present
# in both frames end up in df_final.
df_final = df_data.merge(df_cor, how='inner', on='Postal Code')

In [489]:
df_final.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Exploring and Clustering the Neighbourhoods in Toronto

## Considering only those Boroughs whose names contain the word Toronto

In [490]:
# Select the rows whose Borough name contains the substring 'Toronto'.
toronto_mask = df_final['Borough'].str.contains('Toronto')
df_toronto = df_final.loc[toronto_mask]

In [491]:
df_toronto.shape

(38, 5)

In [492]:
df_toronto['Borough'].value_counts()

Downtown Toronto    18
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

### Import the necessary libraries

In [493]:
from geopy.geocoders import Nominatim

### Get the latitude and longitude values of Toronto City.

In [494]:
address = 'Toronto'
# Pass an explicit user agent identifying this application: geopy deprecated
# the default agent for Nominatim in 1.x and refuses to use it in 2.x.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message
# rather than failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create map of Toronto using latitude and longitude values

In [496]:
# install and import folium library 
!pip -q install folium
import folium 

In [497]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

## Superimpose the neighborhoods on the Toronto map 

In [498]:
# Draw one circle marker per row of df_toronto, with a
# "<neighbourhood>, <borough>" popup, then display the map.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighborhood in df_toronto[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto