## Part 1 - Webscraping

In [1]:
import pandas as pd
import numpy as np

In [2]:
# downloading beautifulsoup4 which is used for webscraping
!pip install bs4

from bs4 import BeautifulSoup



In [3]:
# upgrading it to the latest update
!python -m pip install --upgrade pip

Requirement already up-to-date: pip in c:\users\ritwik\anaconda3\lib\site-packages (20.0.2)


In [4]:
# parsers to parse the HTML
!pip install lxml



In [5]:
# parsers to parse the HTML (various parsers available in the beautifulsoup4 documentation)
!pip install html5lib



In [6]:
!pip install requests



In [7]:
# Get the HTML Code of the webpage using 'lxml' parser

import requests

source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source,'lxml')

soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XnCt2QpAAD4AACwWvMwAAABF","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":945633050,"wgRevisionId":945633050,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short descr

In [8]:
# Getting the HTML for the table in the Webpage after inspecting the webpage

My_table = soup.find('table',{'class':'wikitable sortable'})
My_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

In [9]:
table_rows = My_table.find_all('tr')

In [10]:
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df = pd.DataFrame(l, columns=["Postcode", "Borough", "Neighborhood"])
df

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Downtown Toronto,Queen's Park\n
9,M8A,Not assigned,Not assigned\n


In [11]:
# Removing '\n' from the values in the neighborhood column

df['Neighborhood'] = df['Neighborhood'].str.replace("\n", "")

df

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Downtown Toronto,Queen's Park
9,M8A,Not assigned,Not assigned


In [12]:
# Removing Borough with Not assigned values

df2 = df[df['Borough'] != 'Not assigned']

df2

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Downtown Toronto,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [13]:
#Exporting into a CSV to cross-verify

df2.to_csv (r'C:\Users\Ritwik\Desktop\export_raw_dataframe.csv', index = False, header=True)

In [14]:
# If Neighborhood is 'Not Assigned' but has a Borough value
# then Neighborhood is assigned the Borough value

#Trial 1 : df3 = df2[df2['Neighborhood'].str.replace("Not assigned",df2['Borough'])]

#Trial 2 : df3 = df2[df2['Neighborhood' == 'Not assigned'] = df2['Borough']]

#Trial 3 : df2.Neighborhood[df2.Neighborhood == 'Not assigned'] = df2['Borough'] - Gives warning

#Trial 4: df2['Neighborhood'] = np.where(df['Neighborhood'] == 'Not assigned', df['Borough'], df['Neighborhood'])

#Trial 5: df2['Neighborhood'] = np.where((df.Neighborhood == 'Not assigned'),df.Borough,df.Neighborhood)

#df2['Neighborhood'] = df2['Borough'].where(df2['Neighborhood'] == 'Not assigned')

df2.loc[df2['Neighborhood'] == 'Not assigned', 'Neighborhood'] = 'Borough'

df2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Downtown Toronto,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [15]:
# Grouping Postcode values and concatenating Neighborhood values

df3 = df2.groupby(['Postcode','Borough'])['Neighborhood'].apply(','.join).reset_index()

In [16]:
df3

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [17]:
df3.shape

(103, 3)

In [32]:
# writing to excel to cross-check

df3.to_csv (r'C:\Users\Ritwik\Desktop\export_dataframe.csv', index = False, header=True)

----

## Part 2 - Getting the co-ordinates (lat,long) using Geocoder

In [18]:
!pip install geocoder



In [19]:
# trial 

import geocoder
g = geocoder.google('Toronto')
g.latlng

#Output - None??

In [20]:
import geocoder

# initialize your variable to None
lat_lng_coords = None

In [21]:
# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(df['Postcode'])
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

SyntaxError: invalid syntax (<ipython-input-21-3bfc43cfaf11>, line 4)

In [22]:
#Uploading zip with long and lat data

dfloc = pd.read_csv(r'C:\Users\Ritwik\Desktop\Geospatial_Coordinates.csv')

dfloc

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [23]:
# Merging with the oridingal data to get the master database with Pincode, Borough, Neighborhood, Long, Lat

dfmerge = pd.merge(df3, dfloc, left_on='Postcode', right_on='Postal Code', how='left')

dfmaster = dfmerge[['Postcode','Borough','Neighborhood','Latitude','Longitude']]

dfmaster

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


----

## Part 3

In [24]:
#Installing folium

!pip install folium
import folium



In [25]:
from geopy.geocoders import Nominatim

In [26]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="my_application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [27]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

from IPython.display import display
display(map_toronto)

In [28]:
# add markers to map
for lat, lng, label in zip(dfmaster['Latitude'], dfmaster['Longitude'], dfmaster['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

---