Week 3 - Toronto Clustering

In [244]:
import pandas as pd
import numpy as np
import requests
#Importing the BeautifulSoup library (already installed in JupyterLab)
from bs4 import BeautifulSoup 

### Using requests to fetch the web page and BeautifulSoup to parse it

In [245]:
# Fetch the Wikipedia page listing the "M" (Toronto) postal codes and parse it with BeautifulSoup.
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever if the server does not answer.
page = requests.get(link, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content,'html.parser')

column_names = ['PostCode','Borough','Neighborhood']

# The page contains several tables; only the first "wikitable" holds the postal
# codes, so parsing is restricted to it rather than scanning every <tr> on the page.
postal_table = soup.find('table', class_='wikitable')
if postal_table is None:
    raise ValueError('Postal code table not found at {}'.format(link))

# Collect the rows first and build the DataFrame once; growing a DataFrame
# cell by cell with .loc is slow and easy to get wrong.
rows = []
for tr in postal_table.find_all('tr'):
    cells = [td.text for td in tr.find_all('td')]
    # The header row uses <th>, so it yields no <td> cells and is skipped here.
    if len(cells) == len(column_names):
        rows.append(cells)

if not rows:
    raise ValueError('No postal code rows with {} columns found at {}'.format(len(column_names), link))

postal_df = pd.DataFrame(rows, columns=column_names)

In [246]:
# Sanity check: (rows, columns) of the freshly scraped table.
postal_df.shape

(288, 3)

### Cleaning the data

In [247]:
# Remove the newline character ("\n") from the values of the last column,
# then renumber the rows from zero.
postal_df = (
    postal_df
    .assign(Neighborhood=lambda frame: frame['Neighborhood'].str.replace('\n',''))
    .reset_index(drop=True)
)
postal_df.head(15)


Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [248]:
# Drop the rows that have no borough at all ("Not assigned").
assigned = postal_df[postal_df['Borough'] != 'Not assigned'].copy()

# A row with a borough but no neighborhood name takes the borough name as its neighborhood.
unnamed = assigned['Neighborhood'] == 'Not assigned'
assigned.loc[unnamed, 'Neighborhood'] = assigned.loc[unnamed, 'Borough']

# Merge rows that share a postal code: keep the first borough and join the
# neighborhood names with ", ". sort=False keeps the postal codes in their
# original order of first appearance, and as_index=False yields a fresh 0..n-1 index.
postal_df = (
    assigned
    .groupby('PostCode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

In [249]:
# Preview the cleaned table: one row per postal code, neighborhood names joined with ", ".
postal_df.head(15)

Unnamed: 0,PostCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [250]:
# (rows, columns) after dropping unassigned boroughs and merging duplicate postal codes.
postal_df.shape

(103, 3)

### Getting the geo coordinates for each Postal Code

In [251]:
!pip install geocoder
import geocoder
print("Geocoder is ready!")

Geocoder is ready!


In [252]:
# Geocoding each postal code with geocoder.google('<PostCode>, Toronto, Ontario')
# is disabled for now: the lookup was retried in a `while lat_lng_coords is None`
# loop with no retry limit, so whenever no coordinates came back the cell never
# finished and the kernel hung.
# The coordinates are read from a prepared CSV file instead.
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [253]:
# Peek at the first rows of the coordinates table loaded from the CSV file.
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [254]:
# Left-join the coordinates onto the postal code table (PostCode <-> 'Postal Code'),
# then drop the duplicated key column that comes from the coordinates table.
postal_geo_df = (
    postal_df
    .merge(geo_df, how='left', left_on='PostCode', right_on='Postal Code')
    .drop(columns='Postal Code')
)

In [255]:
# Preview the merged table: postal data plus the Latitude and Longitude columns.
postal_geo_df.head(20)

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [256]:
# (rows, columns) of the merged table.
postal_geo_df.shape

(103, 5)