# 1. Import relevant libraries

In [1]:
import numpy as np 

import pandas as pd 
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


import matplotlib.cm as cm
import matplotlib.colors as colors



from sklearn.cluster import KMeans

import folium

print("Libraries imported.")

Libraries imported.


# 2. Data scraping from Wikipedia

In [2]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of passing an
# error page on to the parser.
response=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',timeout=30)
response.raise_for_status()
data=response.text

The cell above sends an HTTP GET request to the Wikipedia page and stores the returned HTML as text

In [3]:
# Parse the downloaded HTML text with the 'html.parser' backend so the page can be searched by tag.
soup = BeautifulSoup(markup=data, features='html.parser')

In [4]:
# One empty accumulator list per table column; the scraping loop appends to them.
postal_code, borough, neighborhood = [], [], []

### BeautifulSoup functions

In HTML, a table is marked with the 'table' tag and each of its rows is marked as 'tr'

*  soup.find('table').find_all('tr') finds all rows of the first table in the webpage 

In HTML, the data cells of a table row are marked as 'td'. Hence, to obtain the data, we need to go through the td cells of each row as shown below

* for rows in soup.find('table').find_all('tr'):


    cells=rows.find_all('td')


In [5]:
# Walk every row of the first table on the page and collect its first three cells.
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("No <table> element found in the downloaded page")

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (presumably the header row) are skipped.
    # Requiring three cells also prevents an IndexError on cells[1]/cells[2]
    # if a row is shorter than expected.
    if len(cells) >= 3:
        postal_code.append(cells[0].text)
        borough.append(cells[1].text)
        neighborhood.append(cells[2].text)

### Let's take a look at the lists we have scraped from the webpage

In [6]:
# Remove every newline character from the scraped postal codes.
postal_code_list = [code.replace('\n', '') for code in postal_code]

# Preview the first ten cleaned codes.
postal_code_list[:10]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B']

In [7]:
# Strip the trailing newline from each scraped borough name.
# rstrip('\n') only removes newline characters, so a cell whose text has no
# trailing newline keeps its last letter (slicing with [0:-1] would drop it).
borough_list = [name.rstrip('\n') for name in borough]

# Preview the first ten cleaned names.
borough_list[0:10]

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto',
 'North York',
 'Downtown Toronto',
 'Not assigned',
 'Etobicoke',
 'Scarborough']

In [8]:
# Strip the trailing newline from each scraped neighborhood entry.
# rstrip('\n') only removes newline characters, so a cell whose text has no
# trailing newline keeps its last letter (slicing with [0:-1] would drop it).
neighborhood_list = [name.rstrip('\n') for name in neighborhood]

# Preview the first ten cleaned entries.
neighborhood_list[0:10]

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Regent Park, Harbourfront',
 'Lawrence Manor, Lawrence Heights',
 "Queen's Park, Ontario Provincial Government",
 'Not assigned',
 'Islington Avenue, Humber Valley Village',
 'Malvern, Rouge']

### Now, let us transform the above lists into a pandas dataframe for better readability

# 3. Dataframe creation

In [9]:
# Assemble the three cleaned lists into one table, one column per list.
column_data = {
    'Postal code': postal_code_list,
    'Borough': borough_list,
    'Neighborhood': neighborhood_list,
}
df = pd.DataFrame(column_data)

# Preview the first rows.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


 However, we need to preprocess the data, since there are many "Not assigned" entries in the Borough and Neighborhood columns

In [10]:
# Count the rows whose neighborhood is the placeholder 'Not assigned'.
df['Neighborhood'].eq('Not assigned').sum()

77

In [11]:
# Count the rows whose borough is the placeholder 'Not assigned'.
df['Borough'].eq('Not assigned').sum()

77

We shall drop all the entries where the Borough name is "Not assigned". If the Borough name is assigned but the Neighborhood is not, then the Borough name is used as the Neighborhood name

In [12]:
# Keep only the rows that have a borough assigned.
# The explicit .copy() gives df_processed its own data, so later assignments
# to it do not raise SettingWithCopyWarning.
df_processed = df[df['Borough'] != 'Not assigned'].copy()

# Show a sample rather than the whole frame: display.max_rows is set to
# unlimited in the import cell, so a bare df_processed would print every row.
df_processed.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [13]:
# True if any remaining row still has the placeholder neighborhood.
df_processed['Neighborhood'].eq('Not assigned').any()

False

Hence, it can be seen that after clearing the unassigned boroughs, there are no unassigned neighborhoods

# 4. Grouping the rows with same postal code 

In [14]:
# Collapse rows that share a postal code and borough into a single row whose
# remaining values are joined with ', '.
rows_by_code = df_processed.groupby(['Postal code', 'Borough'], as_index=False)
df_grouped = rows_by_code.agg(lambda names: ', '.join(names))

# Preview the first rows.
df_grouped.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# 5. Unassigned neighborhoods as Boroughs

Although thankfully we did not face the issue of having unassigned neighborhoods after removing all the unassigned Boroughs, we can test the code for taking care of the unassigned neighborhoods as follows.

If a neighborhood is unassigned, it should be set to the borough name. An example is shown below.

In [15]:
# Give the first two rows made-up borough names so the fallback rule has
# something visible to act on. .loc writes directly into df; the chained form
# df['Borough'][0] = ... assigns through an intermediate object and is not
# guaranteed to update df.
df.loc[0, 'Borough'] = 'sample1'
df.loc[1, 'Borough'] = 'sample2'

# Wherever the neighborhood is unassigned, copy the borough name into it.
# A boolean mask with .loc updates df itself, whereas writing to the row
# objects yielded by iterrows() may only change a temporary copy.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']
           

In [16]:
# Preview the first five rows to confirm the sample boroughs were copied over.
df.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,sample1,sample1
1,M2A,sample2,sample2
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# 6. Comparing the dataframe

In [17]:
# Postal codes to spot-check, in the order they should appear in the result.
postal_test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Select the matching row(s) for each code, in list order, then stack them once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies it on every pass. A single pd.concat returns the same rows with
# their df_grouped index labels kept.
test_rows = [df_grouped[df_grouped['Postal code'] == pcodes] for pcodes in postal_test_list]
df_test = pd.concat(test_rows)

In [18]:
# Renumber the rows in place; the previous index labels are kept as a new 'index' column.
df_test.reset_index(drop=False, inplace=True)

In [19]:
# Remove the 'index' column in place.
df_test.drop(columns=['index'], inplace=True)

In [20]:
# Display the spot-check table for the postal codes in postal_test_list.
df_test

Unnamed: 0,Postal code,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


# 7. Merging postal codes with Latitudes and Longitudes

In [26]:
# Load the coordinates file and rename its key column to 'Postal code',
# the spelling used by the scraped data.
df_lat_lon = pd.read_csv('Geospatial_Coordinates.csv').rename(columns={'Postal Code': 'Postal code'})

# Preview the first rows.
df_lat_lon.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


As we can see, each postal code has its own latitude and longitude. Hence, we can merge the two dataframes on the postal code using the pandas merge function.

In [29]:
# Attach latitude and longitude to each grouped row by matching on the postal code.
df_locations = pd.merge(df_grouped, df_lat_lon, on='Postal code')

# Preview the first rows.
df_locations.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Let us check specifically for the postal codes which were asked in the assignment.

In [38]:
# Select the located row(s) for each spot-check postal code, in list order,
# then stack them once with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies it on every pass; a single pd.concat replaces it.
# NOTE(review): concatenating only the matched slices should keep the
# coordinate columns at the dtype they have in df_locations - confirm on re-run.
location_rows = [df_locations[df_locations['Postal code'] == pcodes] for pcodes in postal_test_list]
df_test1 = pd.concat(location_rows, ignore_index=True)

In [39]:
# Display the spot-check table with the coordinates attached.
df_test1

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
