In [39]:
import requests
import json
import pandas as pd
import numpy as np
import time
from os.path import exists
from datetime import datetime

# Libraries for GC API
import os
from dotenv import load_dotenv

# For testing
import mitosheet

In [40]:
# The GC API needs a valid key (docs: https://api.canada.ca/en/homepage).
# Create a .env file in this repository containing GC_API_KEY="YOUR_GC_API_KEY"
# and make sure .env files are listed in .gitignore so the key is never committed.
load_dotenv()
gc_key = os.getenv("GC_API_KEY")

# temporarily suppresses SettingWithCopyWarning
pd.set_option("mode.chained_assignment", None)

In [41]:
# Location of the CSV to geocode (relative to this notebook)
input_data = "../5a-Matching/output/matched_processed.csv"

# Read every column as a string so no value is reinterpreted as a number
df_input = pd.read_csv(input_data, dtype="str", low_memory=False)

# For testing: df_input keeps a handle on the full frame; df is the working name
df = df_input

In [51]:
# For testing: point df back at the full input frame so the filtering cell
# below can be re-run from the unfiltered data.
df = df_input

In [52]:
# Number of distinct idx values; compare with len(df_input) to check that idx
# is unique per row (both were 803491 in the recorded run).
len(set(df_input['idx']))

803491

In [53]:
# Total number of rows in the input data
len(df_input)

803491

In [54]:
# Filter the data down to the rows that still need geocoding

print('Original length: ', len(df))

# keep only rows that have both a street number and a street name
has_address = df.street_no.notna() & df.formatted_en.notna()
df = df[has_address]
print('After removing those with no address data: ', len(df))

# keep only rows that do not have a latitude yet
missing_coords = df.latitude.isna()
df = df[missing_coords]
print('After removing those already with lat/lon: ', len(df))

# split off repeated addresses: df_dup holds the repeats, df keeps the first occurrence
address_key = ['street_no', 'formatted_en', 'province', 'city']
is_repeat = df.duplicated(subset=address_key, keep='first')
df_dup = df[is_repeat]
df = df[~is_repeat]
print('After removing duplicates: ', len(df))

# For testing: work on a small random sample only
df = df.sample(5)
print('sample: ', len(df))

Original length:  803491
After removing those with no address data:  695084
After removing those already with lat/lon:  239396
After removing duplicates:  158337
sample:  5


In [46]:
# Analyse what's left to geocode

# By province
# NOTE(review): the trailing .sum() collapses the per-province counts into a
# single total; drop it if the per-province breakdown is what is wanted here.
df['province'].value_counts().sum()

5

In [47]:
# By source: number of remaining rows per value of the localfile column
# (values in the recorded output are file names such as QC_Etablissements.csv)
df['localfile'].value_counts()

QC_Etablissements.csv               3
ON_Toronto_Business_Licences.csv    2
Name: localfile, dtype: int64

In [55]:
# Set up our GC API call

# endpoint of the GC address search API (National Address Register host)
url_gc = 'https://national-address-register-statcan.api.canada.ca:443/v2/addresses/search'

# clean dataset for queries: replace missing values with empty strings so the
# string concatenation below never yields NaN. Reassign instead of mutating in
# place: df is the result of earlier filtering/sampling, and in-place edits on
# such frames are what SettingWithCopyWarning is about.
df = df.fillna('')

# define api query string: "<street_no>, <street name>, <city>, <province>"
df['gc_request_street'] = df['street_no'] + ', ' + df['formatted_en'] + ', ' + df['city'] + ', ' + df['province']

# create list of provinces in the dataset (in order of first appearance)
provinces = df['province'].unique()

# number of rows that will be sent to the API
print(len(df))

5


In [59]:
# For testing: shrink the working set further to 2 random rows.
# NOTE: `provinces` was computed before this sample, so it can still contain
# provinces that no longer have any rows in df (the geocoding loop then
# reports "Number to geocode:  0" for them).
df = df.sample(2)

In [65]:
# Geocode every address with the GC API, one province at a time.
# Each row yields exactly one result object; a failed lookup yields an object
# that only carries 'idx', so results stay aligned one-to-one with the rows.


def _gc_lookup(query, max_results=3, timeout=30):
    """Query the GC address search API for a single address string.

    Parameters
    ----------
    query : str
        Address text sent as the `qstr` request parameter.
    max_results : int
        Number of leading entries of the response's 'data' list to keep.
    timeout : float
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    dict
        The decoded JSON response (with 'data' truncated), or an empty dict
        when the request failed, the status was not 200, or the body was not
        a JSON object.
    """
    try:
        reply = requests.get(
            url_gc,
            params={'qstr': query},
            headers={'user_key': gc_key},
            timeout=timeout,  # without a timeout a stalled connection blocks the whole run
        )
    except requests.exceptions.ConnectionError:
        print("Connection refused for query: ", query)
        return {}
    except requests.exceptions.RequestException as err:
        # timeouts, invalid URLs, too many redirects, ...
        print("Request failed for query: ", query, ' - ', err)
        return {}

    if reply.status_code != 200:
        return {}

    try:
        payload = reply.json()
    except ValueError:
        # 200 response whose body is not valid JSON
        return {}

    if not isinstance(payload, dict):
        return {}

    # Keep only the first few candidate matches
    if isinstance(payload.get('data'), list):
        payload['data'] = payload['data'][:max_results]
    return payload


# Set a timer
t1 = time.time()

# Make sure the output folder exists before spending time on API calls
os.makedirs('data', exist_ok=True)

# Reset arrays for collecting our results
JSONS = []
JSONS_CITIES = []
JSONS_ALL = []

# loop through dataframe by province
for province in provinces:

    # Reset arrays for collecting this province's results
    JSONS = []
    JSONS_CITIES = []

    df2 = df[df['province'] == province]
    print(province)
    print('Number to geocode: ', len(df2))
    reqs_gc = list(df2['gc_request_street'])
    idxs = list(df2['idx'])

    # Loop through each request
    for i, (query_gc, idx) in enumerate(zip(reqs_gc, idxs)):

        # Set query interval (one request per second)
        time.sleep(1)

        # Print a message every 1000 queries
        if (i % 1000 == 0 and i > 1):
            print(str(i), ' of ', str(len(df2)), ' queries completed')
            t2 = time.time()
            print('seconds elapsed: ', str(round(t2-t1, 2)), '\n')

        # Send API request; always a dict, empty when nothing usable came back
        resp = _gc_lookup(query_gc)

        # Add data id's to json
        resp['idx'] = idx

        # Append to array of results
        JSONS.append(resp)
        JSONS_ALL.append(resp)

    # once each province is complete, create a json file to save the results object
    json_name = 'data/geocoded_' + str(province) + '.json'
    with open(json_name, 'w', encoding='utf-8') as f:
        json.dump(JSONS, f, ensure_ascii=False, indent=4)

# create a big json dump at the end
with open('data/geocoded.json', 'w', encoding='utf-8') as f:
    json.dump(JSONS_ALL, f, ensure_ascii=False, indent=4)

t2 = time.time()
print('DONE. Seconds elapsed: ', str(round(t2-t1, 2)))

QC
Number to geocode:  2
BC
Number to geocode:  0
DONE. Seconds elapsed:  2.93


In [172]:
# # define query function
# def gc_query(query, query_type):
#     # set interval for requests
#     time.sleep(1) 
#     coords = requests.get(url, params=params, headers=headers)
#     print("osm query " + query_type + ": " + query)
#     coords_gc = requests.get(url_gc, params=params_gc, headers=headers_gc)
#     return coords.json()

In [29]:
# Reload saved geocoding results from disk and report how many entries they hold.
# NOTE(review): the geocoding loop in this notebook writes files named
# 'data/geocoded_<province>.json' (with an underscore); confirm this file name
# is the intended one.
with open('data/geocodedPE.json', mode='r', encoding='utf-8') as results_file:
    JSONS = json.load(results_file)
print(len(JSONS))

34306


In [30]:
# read json request results into our dataframe

def append_blank(index):
    """Record that the row at position `index` has no usable GC API result.

    Sets that row's 'geo_source' to "no_gc" and appends an empty string to
    every result list, so each list keeps one entry per processed row.

    Parameters
    ----------
    index : int
        Positional (iloc) row number in the global dataframe `df`.
    """
    # One positional assignment on df itself. The chained form
    # df['geo_source'].iloc[index] = ... may write to a temporary copy and
    # leave df unchanged.
    df.iloc[index, df.columns.get_loc('geo_source')] = "no_gc"
    for results in (LATS, LONS, NAME, ST_NO, ST_NAME, CITY, PROV, POST, COUNTRY, TYPE, CLASS):
        results.append('')

# with open('Nominatim.json', 'r', encoding='utf-8') as f:
#     JSONS=json.load(f)    

# One list per output column; each receives one value per processed response
LATS = []
LONS = []
NAME = []
ST_NO = []
ST_NAME = []
CITY = []
PROV = []
POST = []
COUNTRY = []
TYPE = []
CLASS = []

In [32]:
# Drop the entries at positions 9-12 of the `provinces` array, then keep only
# the rows whose province is still in that array, and show the row count.
# NOTE(review): the positions are hard-coded and depend on the order produced
# by an earlier df['province'].unique() call. The cell is not idempotent (each
# re-run removes further entries) and will likely fail when `provinces` has
# fewer than 13 entries - confirm which provinces are meant to be excluded.
provinces = np.delete(provinces, [9,10,11,12])
df = df[df['province'].isin(provinces)]
len(df)

34306

In [33]:
# Refresh the list of provinces present in df, then order the rows by province

provinces = df['province'].unique()

df = df.sort_values('province')

# Show the sorted frame in a Mito sheet
mitosheet.sheet(df, analysis_to_replay="id-vwdfxtprrm")

MitoWidget(analysis_data_json='{"analysisName": "id-erseurrrwq", "analysisToReplay": {"analysisName": "id-vwdf…

In [34]:
# osm_healthcare_types = ['hospital', 'clinic', 'pharmacy']

def _dig(obj, *keys, default=''):
    """Follow `keys` through nested dicts; return `default` as soon as a level is missing."""
    for key in keys:
        if not isinstance(obj, dict) or key not in obj:
            return default
        obj = obj[key]
    return obj


def _first_gc_match(element):
    """Return the first entry of a GC API response's 'data' list, or None if there is none."""
    if not isinstance(element, dict) or 'meta' not in element:
        return None
    results = element.get('data')
    if not isinstance(results, list) or not results:
        return None
    return results[0]


# Results are matched to rows by position, so the two must have the same length
if len(JSONS) != len(df):
    raise ValueError(
        'JSONS has ' + str(len(JSONS)) + ' entries but df has ' + str(len(df)) + ' rows'
    )

for index, element in enumerate(JSONS):
    match = _first_gc_match(element)

    # Every response must add exactly one value to every list, otherwise the
    # lists no longer line up with the dataframe rows. Responses with no
    # result, or whose first result is not in Canada, are recorded as blanks.
    if match is None or _dig(match, 'country', 'code') != 'CA':
        append_blank(index)
        continue

    # format gc api results
    COUNTRY.append(_dig(match, 'country', 'code'))
    LATS.append(_dig(match, 'location', 'geoCoordinates', 'latitude'))
    LONS.append(_dig(match, 'location', 'geoCoordinates', 'longitude'))

    # NOTE(review): street name and civic number are read from the top level
    # of the response, as before; if the API returns them inside each 'data'
    # entry these two columns will always be blank - confirm against the API
    # response schema.
    ST_NAME.append(_dig(element, 'streetName'))
    ST_NO.append(_dig(element, 'civicNumber', 'number'))

    # NOTE is there some equivalent?
    TYPE.append('')
    CLASS.append('')
    NAME.append('')

    # Look the fields up on the same object they are read from (the first
    # result), not on the response envelope or the 'data' list itself.
    PROV.append(_dig(match, 'mailingAddress', 'province', 'code'))
    POST.append(_dig(match, 'mailingAddress', 'postalCode'))
    CITY.append(_dig(match, 'cityName', 'en'))

# append results to dataframe
df['gc_name'] = NAME
df['gc_street_no'] = ST_NO
df['gc_street_name'] = ST_NAME
df['gc_city'] = CITY
df['gc_province'] = PROV
df['gc_postal_code'] = POST
df['gc_class'] = CLASS
df['gc_type'] = TYPE
df['gc_lat'] = LATS
df['gc_lon'] = LONS

In [35]:
# Columns needed to compare each input address with the GC API result
review_columns = [
    'business_name',
    'latitude',
    'longitude',
    'street_no',
    'formatted_en',
    'geo_source',
    'gc_request_street',
    'gc_street_no',
    'gc_name',
    'gc_street_name',
    'gc_city',
    'gc_lat',
    'gc_lon',
]
df2 = df[review_columns]


In [None]:
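# Post-processing (TODO)

# - match on civic number and street name
#   (presumably the input address against the API result)
# - street names need normalising before matching: remove hyphens (-) and apostrophes (')
# - check the first two or three responses, not only the first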

In [36]:
# Open the review subset (df2) in a Mito sheet for manual inspection
mitosheet.sheet(df2, analysis_to_replay="id-jtutpdnplx")

MitoWidget(analysis_data_json='{"analysisName": "id-jtutpdnplx", "analysisToReplay": null, "code": [], "stepSu…