# Web Scraping using BeautifulSoup

### Importing all required packages

In [21]:
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup
import geocoder # import geocoder
#import numpy as np
from geopy.geocoders import Nominatim 
import folium

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = requests.get(url).text

### Collecting required data and converting to dictionary then to dataframe

In [3]:
soup = BeautifulSoup(result, 'lxml')
col = []
values = []
table= soup.find('table', class_ = 'wikitable')

for match in table.find_all('th'):
    head = match.text.rstrip('\n')
    col.append((head,[]))

for item in table.find_all('td'):
    data = item.text.rstrip('\n')
    values.append(data)

for i in range(len(values)):
    col[i%3][1].append(values[i])
    i+=1

Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)
df.tail()

Unnamed: 0,Postal code,Borough,Neighborhood
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...
179,M9Z,Not assigned,


### Refining the dataframe

In [4]:
df = df[df.Borough!='Not assigned'] # Deleting rows having Not assigned values in Borough column

In [5]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [6]:
df.reset_index() # Resetting the index

Unnamed: 0,index,Postal code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Regent Park / Harbourfront
3,5,M6A,North York,Lawrence Manor / Lawrence Heights
4,6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...,...
98,160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,165,M4Y,Downtown Toronto,Church and Wellesley
100,168,M7Y,East Toronto,Business reply mail Processing CentrE
101,169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


In [7]:
df['Neighborhood'] = df['Neighborhood'].str.replace('/',',') # Replacing # with ,

In [8]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [9]:
df['Neighborhood'] = df['Neighborhood'].str.replace(' , ',', ')

In [10]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
df[df['Neighborhood']==''] #Checking if column Neighborhood has any blank values

Unnamed: 0,Postal code,Borough,Neighborhood


In [12]:
df.shape

(103, 3)

### Uploading geospatial data and merging with original dataframe

In [13]:
geo_df = pd.read_csv('Geospatial_Coordinates.csv') #Creating geo_df with geospatial data
geo_df.rename(columns={"Postal Code":"Postal code"}, inplace = True) # Updating column names so that it will be easy for merge
geo_df.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
df = pd.merge(df, geo_df, on ='Postal code', how ='left') # Merging two datafrmaes 

In [15]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Checking unique values of boroughs and neighborhoods

In [16]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


### Finding out coordinates of Toronto city

In [19]:
address = 'Toronto'

geolocator = Nominatim(user_agent="t_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


### Creating  map of Toronto and neighborhood points

In [26]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto