## Importing and installing the libraries that will be used later on


In [1]:
import os

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
!pip install geopy
!pip install geocoder
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import geocoder



## Loading the DataFrame saved during the previous step.

In [16]:
# CSV saved during the previous step, addressed relative to the current working directory.
scraped_df_path = os.path.join(".", "raw_scraped_canadian_postal_codes.csv")
raw_df = pd.read_csv(filepath_or_buffer = scraped_df_path)
# Show the column labels to see what the saved file contains.
raw_df.columns

Index(['Unnamed: 0', 'PostalCode', 'Borough', 'Neighbourhood'], dtype='object')

In [17]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# errors="ignore" keeps this cell re-runnable: a second run no longer fails
# with a KeyError because the column is already gone. The former `axis = 0`
# argument was dropped since it has no effect when `columns=` is given.
raw_df = raw_df.drop(columns = ["Unnamed: 0"], errors = "ignore")
# Give the row index an explicit name.
raw_df.index.name = "id"

## Getting the latitude and longitude values
This didn't go as smoothly as planned. 
1. Firstly, following the instructions given by IBM, I tried to use the geocoder library. That ended up in an infinite loop (I was advised by IBM to create one for some weird reason). Therefore I commented out their version.
2. Secondly, I thought I would resort to geopy's Nominatim class, but it could only retrieve a fraction of the geocodes (20). Perhaps the requests are limited?
3. Therefore, as a final attempt, I downloaded the geocode file provided by IBM and merged it with the raw scraped data.
    

In [19]:
# Nominatim geocoder; the user agent string identifies this notebook to the service.
address_translator = Nominatim(user_agent = "IBM_data_science_course")
# Throttle the lookups to one request per second. RateLimiter also retries failed
# requests and, with its default settings, returns None instead of raising when
# the service keeps failing. Expect the loop to take at least one second per row.
rate_limited_geocode = RateLimiter(address_translator.geocode, min_delay_seconds = 1)

# Create the coordinate columns up front (all NaN) so that they exist even if
# no lookup succeeds, and so that a re-run does not keep stale coordinates.
raw_df["Latitude"] = np.nan
raw_df["Longitude"] = np.nan

# NOTE: the geocoder.google retry loop from the course instructions was removed;
# it retried for as long as the result was None and therefore never terminated.
# NOTE(review): the recorded output shows identical coordinates for several
# different postal codes, so some queries may resolve to a generic Toronto
# location rather than to the postal code itself - verify before relying on them.
for i in raw_df.index:
    postcode = raw_df.loc[i, "PostalCode"]
    location = rate_limited_geocode('{}, Toronto, Ontario'.format(postcode))
    if location is None:
        # No match (or the request failed): leave this row's coordinates as NaN.
        continue
    raw_df.loc[i, "Latitude"] = location.latitude
    raw_df.loc[i, "Longitude"] = location.longitude
    print(raw_df.loc[i, ["PostalCode", "Longitude", "Latitude"]])

PostalCode        M3A
Longitude    -79.3839
Latitude      43.6535
Name: 0, dtype: object
PostalCode        M7A
Longitude    -79.3839
Latitude      43.6535
Name: 4, dtype: object
PostalCode        M1B
Longitude    -79.3839
Latitude      43.6535
Name: 6, dtype: object
PostalCode        M1C
Longitude    -79.3839
Latitude      43.6535
Name: 12, dtype: object
PostalCode        M9C
Longitude    -79.5889
Latitude      43.6441
Name: 17, dtype: object
PostalCode        M5E
Longitude    -79.3774
Latitude      43.6421
Name: 20, dtype: object
PostalCode        M1G
Longitude    -79.2219
Latitude      43.7657
Name: 22, dtype: object
PostalCode        M5H
Longitude    -79.3837
Latitude      43.6499
Name: 30, dtype: object
PostalCode        M2J
Longitude    -79.3662
Latitude      43.7798
Name: 33, dtype: object
PostalCode        M5J
Longitude    -79.3828
Latitude      43.6393
Name: 36, dtype: object
PostalCode        M6K
Longitude     -79.435
Latitude      43.6371
Name: 43, dtype: object
PostalCode   

As you can see from the results below, 83 out of 103 requests were not successful.

In [20]:
# Boolean mask of the rows whose latitude is still missing.
latitude_missing = raw_df["Latitude"].isna()
# count() reports the number of non-null entries per column for those rows.
raw_df.loc[latitude_missing].count()

PostalCode       83
Borough          83
Neighbourhood    83
Latitude          0
Longitude         0
dtype: int64

Geopy has let me down as well. Therefore, I have decided to resort to a ready-made file.

In [10]:
# Ready-made CSV with the coordinates of each postal code.
provided_url = "https://cocl.us/Geospatial_data"
downloaded_df = pd.read_csv(filepath_or_buffer = provided_url)
# Preview the first rows of the downloaded table.
downloaded_df.head(n = 5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


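Getting rid of the partially filled latitude and longitude columns, so that they do not clash with the identically named columns of the downloaded file during the merge.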

In [11]:
# Discard the coordinate columns filled in by the geocoding attempt.
# errors="ignore" keeps the cell re-runnable, and lets it run when the columns
# were never created, instead of failing with a KeyError.
raw_df = raw_df.drop(columns = ["Latitude", "Longitude"], errors = "ignore")

### Merging the two dataframes

In [13]:
# Left join: keep every scraped row and attach the downloaded coordinates by
# postal code. The right-hand key column is dropped afterwards because it
# duplicates "PostalCode".
merged_df = (
    raw_df
    .merge(downloaded_df, how = "left", left_on = "PostalCode", right_on = "Postal Code")
    .drop(columns = "Postal Code")
)
merged_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M6A,North York,Lawrence Manor,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


In [14]:
# (rows, columns) of the merged frame, as a quick check of the merge result.
merged_df.shape

(103, 5)

In [15]:
# Save the merged table for the next step. The row index is written as well
# (the to_csv default), so it appears as an extra first column in the file.
output_csv_name = "latlng_scraped_canadian_postal_codes.csv"
merged_df.to_csv(output_csv_name)