In [2]:
import requests
import json
import pandas as pd
import numpy as np
import time
from os.path import exists
from datetime import datetime



In [5]:
# Libraries for GC API
import os
from dotenv import load_dotenv

In [14]:
import mitosheet

In [6]:
# The GC API requires a valid key (docs: https://api.canada.ca/en/homepage).
# Create a .env file in this repository containing GC_API_KEY="YOUR_GC_API_KEY"
# and make sure .env files are listed in .gitignore so the key is not committed.
load_dotenv()
gc_key = os.getenv("GC_API_KEY")  # None when the variable is not set

# Switch off pandas' SettingWithCopyWarning (stays off until the option is reset).
pd.set_option("mode.chained_assignment", None)

In [44]:
# Load the matched + processed file from the 5a-Matching output folder.
# Every column is read as a string.
input_data = "../5a-Matching/output/matched_processed.csv"
df_input = pd.read_csv(input_data, dtype="str", low_memory=False)
# `df` starts out as a second name for the same frame; later cells rebind it.
df = df_input

In [37]:
# Files from earlier pipeline stages (matched, then formatted), read as strings.
df2, df3 = (
    pd.read_csv(path, low_memory=False, dtype="str")
    for path in (
        "../5a-Matching/output/matched.csv",
        "../5a-Matching/data/formatted.csv",
    )
)

In [100]:
# Point the working frame back at the full input (same object, not a copy).
df = df_input

In [101]:
# Number of rows in the working frame.
df.shape[0]

803491

In [45]:
# Rows still lacking coordinates at each stage: formatted input, matched
# (neither latitude nor x), and matched + processed.

print('pasrsed: ', df3.latitude.isna().sum())
print('matched: ', (df2.latitude.isna() & df2.x.isna()).sum())
print('match processed: ', df.latitude.isna().sum())

# len(df[df.geo_source == 'oda_match'])
# len(df[df.latitude.isna()])

pasrsed:  580990
matched:  463005
match processed:  345150


In [102]:
# Rows without a street number cannot be geocoded, so set them aside first.
df4 = df[df.street_no.notna()]

print('after no street no: ', df4.latitude.isna().sum())


after no street no:  241060


In [103]:
# Keep the first row for each distinct (street number, street, province, city).
df5 = df4.drop_duplicates(subset=['street_no', 'formatted_en', 'province', 'city'], keep='first')



In [104]:
# Number of distinct addresses that have a street number.
df5.shape[0]

362666

In [106]:
# NOTE(review): the label is reused from an earlier cell. The number printed is
# the count of de-duplicated rows that already HAVE a latitude.
print('after no street no: ', df5.latitude.notna().sum())


after no street no:  203746


In [114]:
# Rows with no street number, then the subset of those that also lack a latitude.
df_no_street = df.loc[df.street_no.isna()]
df_no_address = df_no_street.loc[df_no_street.latitude.isna()]

In [50]:
# De-duplicated rows with a street number that still have no latitude.
df6 = df5.loc[df5.latitude.isna()]

In [115]:
# Number of rows that have neither a street number nor a latitude.
df_no_address.shape[0]

104090

In [116]:
# Open the rows without an address in a Mito spreadsheet widget, passing the
# saved analysis id "id-azxoxgzple" as the analysis to replay.
mitosheet.sheet(df_no_address, analysis_to_replay="id-azxoxgzple")

MitoWidget(analysis_data_json='{"analysisName": "id-fszccbqjtk", "analysisToReplay": {"analysisName": "id-azxo…

In [117]:
# Which source files do the rows without an address come from?
df_no_address.localfile.value_counts()

BC_Vancouver_Business_Licences.csv                      54198
ON_Toronto_Business_Licences.csv                        26167
AB_Edmonton_Business_Licences.csv                       13511
QC_Etablissements.csv                                    2855
ON_Greater_Sudbury_Business_Licences.csv                 1511
BC_Langley_Business_Licences.csv                         1322
BC_Nanaimo_Business_Licences.csv                          984
BC_Squamish_Business_Licence_Annual_2021.csv              933
Indigenous_Business_Directory.csv                         544
BC_Prince_George_Business_Licence.csv                     426
AB_Banff_Business_Licences.csv                            351
BC_Burnaby_Business_Licences.csv                          320
BC_Liquor_licences.csv                                    280
ON_oebd.csv                                               249
NU_Indigenous_Business.csv                                171
NT_Yellowknife_Business_Directory.csv                      94
BC_New_W

In [79]:
# Province breakdown of the rows that still need geocoding.
df6.province.value_counts()

QC    123562
ON     19944
BC     13755
AB       220
NT        85
SK        66
MB        49
NB        20
NS         5
NL         4
YT         3
NU         2
PE         1
Name: province, dtype: int64

In [67]:
# Work on a random sample when the input has more rows than requested.
sample_size = 100
if len(df_input) <= sample_size:
    # Input is already small: `df` is left as it is and its length is recorded.
    # NOTE(review): this measures `df`, while the condition tests `df_input`.
    sample_size = len(df)
else:
    df = df_input.sample(sample_size)

In [52]:
# Open the rows that still need geocoding in a Mito spreadsheet widget, passing
# the saved analysis id "id-rmmopdyskz" as the analysis to replay.
mitosheet.sheet(df6, analysis_to_replay="id-rmmopdyskz")

MitoWidget(analysis_data_json='{"analysisName": "id-jjtwdsqjtn", "analysisToReplay": {"analysisName": "id-rmmo…

In [None]:
# Mito replay code for analysis "id-rmmopdyskz" (same id as the df6 sheet cell).
# NOTE(review): the star import brings every public mitosheet name into the
# notebook namespace.
from mitosheet import *; register_analysis("id-rmmopdyskz");
    
# Reordered column keep_match
# Move 'keep_match' to list position 51; list.insert places it last when the
# frame has fewer than 51 other columns.
df6_columns = [col for col in df6.columns if col != 'keep_match']
df6_columns.insert(51, 'keep_match')
df6 = df6[df6_columns]


In [None]:
# Mito replay code for analysis "id-heawxepiyk", applied to the working frame `df`.
# NOTE(review): the star import brings every public mitosheet name into the
# notebook namespace.
from mitosheet import *; register_analysis("id-heawxepiyk");
    
# Reordered column keep_match
# Move 'keep_match' to list position 51; list.insert places it last when the
# frame has fewer than 51 other columns.
df_columns = [col for col in df.columns if col != 'keep_match']
df_columns.insert(51, 'keep_match')
df = df[df_columns]


In [118]:
# Build a test set for the geocoder from rows that ALREADY have a latitude, so
# the API results can be compared with the known coordinates.

# Keep rows with a street number, a street name and an existing latitude.
df = df_input[
    df_input.street_no.notna()
    & df_input.formatted_en.notna()
    & df_input.latitude.notna()
]

# One row per distinct address.
df = df.drop_duplicates(subset=['street_no', 'formatted_en', 'province', 'city'], keep='first')

print(len(df))

# Random sample of 10 rows (unseeded, so it differs between runs).
df = df.sample(10)

203717


In [119]:
# Search endpoint of the National Address Register API (GC API, not OSM).
url_gc = 'https://national-address-register-statcan.api.canada.ca:443/v1/addresses/search'

# Accumulators for raw JSON responses.
JSONS, JSONS_CITIES = [], []

In [121]:
# Replace missing values with '' so the concatenation below never yields NaN.
df.fillna('', inplace=True)

# Query string sent to the API: "<street_no>, <street>, <city>, <province>".
df['gc_request_street'] = df['street_no'].str.cat(
    [df['formatted_en'], df['city'], df['province']], sep=', '
)

# TODO: adapt for Quebec - rows with province 'QC' should build the query from
# 'formatted_fr' instead of 'formatted_en'.

reqs_gc = df['gc_request_street'].tolist()

In [83]:
# TODO: restrict geocoding to rows that meet all of the following:
#   - no latitude/longitude yet
#   - a well-parsed address
#   - not already geocoded (use idx to check)

In [122]:
# define query
def gc_query(query, query_type):
    """Send one address search to the GC API and return the parsed JSON.

    Args:
        query: Address string, sent as the ``qstr`` request parameter.
        query_type: Free-text label for this query; used only in the log line.

    Returns:
        The decoded JSON body when the API answers with HTTP 200, otherwise
        ``[]`` (the value the parsing cell treats as "no result").

    Reads the module-level ``url_gc`` and ``gc_key``.
    """
    # Pause before every request so consecutive calls are at least 1s apart.
    time.sleep(1)
    print("gc query " + query_type + ": " + query)
    response = requests.get(
        url_gc,
        params={'qstr': query},
        headers={'user_key': gc_key},
    )
    if response.status_code != 200:
        return []
    return response.json()

In [123]:
# Query the GC address API once per request string and collect the results.
# JSONS receives exactly one entry per request so that it stays aligned with
# the rows of `df`; a failed request is stored as [] (the parsing cell treats
# [] as "no result").
headers_gc = {'user_key': gc_key}

for i, query_gc in enumerate(reqs_gc):
    # Pause between requests.
    time.sleep(1)

    print(str(i + 1) + ': ' + query_gc)

    params_gc = {'qstr': query_gc}
    coords_gc = requests.get(url_gc, params=params_gc, headers=headers_gc)

    print(coords_gc)
    if coords_gc.status_code == 200:
        # NOTE(review): HTTP 200 only means the request succeeded; whether an
        # address was actually matched is decided when the JSON is parsed.
        resp = coords_gc.json()
        print('[3] gc street address found')
    else:
        # Reset explicitly: otherwise the previous row's response would be
        # appended again (or `resp` would be undefined on the first request).
        resp = []
        print('no gc street address found')

    print("\n")
    JSONS.append(resp)

#     if resp!=[]:
#         print("\n")
# #         print(resp)

1: 1150, raymur av, Vancouver, BC
<Response [200]>
[3] gc street address found


2: 1625, rue de la seigneurie, Québec (Québec), QC
<Response [200]>
[3] gc street address found


3: 7667, 6th st, Burnaby, BC
<Response [200]>
[3] gc street address found


4: 25, rue paquin, Notre-Dame-de-Lourdes (Québec), QC
<Response [200]>
[3] gc street address found


5: 23650, 112 av, Maple Ridge, BC
<Response [200]>
[3] gc street address found


6: 4638, burke st, Burnaby, BC
<Response [200]>
[3] gc street address found


7: 6270, av henri julien, Montréal (Québec), QC
<Response [200]>
[3] gc street address found


8: 1665, the collegeway, Mississauga, ON
<Response [200]>
[3] gc street address found


9: 6681, arcola st, Burnaby, BC
<Response [200]>
[3] gc street address found


10: 4655, highlawn dr, Burnaby, BC
<Response [200]>
[3] gc street address found




In [124]:
# next we save and then load a json

# Write the raw API responses to geocoded.json (UTF-8, non-ASCII kept as is).
with open('geocoded.json', mode='w', encoding='utf-8') as f:
    f.write(json.dumps(JSONS, ensure_ascii=False, indent=4))

In [125]:
# Read the saved responses back from disk and report how many there are.
with open('geocoded.json', encoding='utf-8') as f:
    JSONS = json.loads(f.read())
print(len(JSONS))

10


In [126]:
# read json request results into our dataframe

def append_blank(index):
    """Record an empty geocoding result for the row at position ``index``.

    Sets the row's ``geo_source`` to "no_osm" and appends an empty string to
    each module-level result list, so every list gains exactly one entry.

    Args:
        index: Zero-based row position in the module-level ``df``.

    Raises:
        KeyError: if ``df`` has no ``geo_source`` column.
    """
    # Single positional write. The chained form
    # df['geo_source'].iloc[index] = ... assigns into an intermediate object
    # and does not update df when pandas Copy-on-Write is active.
    df.iloc[index, df.columns.get_loc('geo_source')] = "no_osm"

    for results in (LATS, LONS, NAME, ST_NO, ST_NAME, CITY,
                    PROV, POST, COUNTRY, TYPE, CLASS):
        results.append('')

# with open('Nominatim.json', 'r', encoding='utf-8') as f:
#     JSONS=json.load(f)    
        
# One independent, empty result list per output field.
(LATS, LONS, NAME, ST_NO, ST_NAME, CITY,
 PROV, POST, COUNTRY, TYPE, CLASS) = ([] for _ in range(11))

In [127]:
# osm_healthcare_types = ['hospital', 'clinic', 'pharmacy']

# Turn each raw API response into one entry in every result list.
# Exactly one entry must be appended per response; otherwise the lists stop
# lining up with the rows of `df` when they are assigned as columns below.
for index, element in enumerate(JSONS):
    # A usable response carries a 'meta' key and a non-empty 'data' list.
    # [] (failed request) and anything else counts as "no result".
    has_payload = 'meta' in element and 'data' in element
    records = element['data'] if has_payload else []
    record = records[0] if records else None  # only the first match is used

    # Non-Canadian matches are recorded as blanks too, so this row still gets
    # an entry in every list.
    if record is None or record['country']['code'] != 'CA':
        append_blank(index)
        continue

    mailing = record.get('mailingAddress', {})

    COUNTRY.append(record['country']['code'])
    LATS.append(record['location']['geoCoordinates']['latitude'])
    LONS.append(record['location']['geoCoordinates']['longitude'])

    # NOTE(review): street name and civic number are looked up on the
    # top-level response, unlike every other field, which is read from
    # data[0]. Confirm against the API schema; kept as in the original.
    ST_NAME.append(element['streetName'] if 'streetName' in element else '')
    ST_NO.append(element['civicNumber']['number'] if 'civicNumber' in element else '')

    # NOTE is there some equivalent?
    TYPE.append('')
    CLASS.append('')
    NAME.append('')

    # Presence is tested on the same object the value is read from.
    PROV.append(mailing['province']['code'] if 'province' in mailing else '')
    POST.append(mailing['postalCode'] if 'postalCode' in mailing else '')
    CITY.append(record['cityName']['en'] if 'cityName' in record else '')

# Append results to the dataframe. The osm_ prefix is kept because the cell
# that builds df2 selects 'osm_lat' and 'osm_lon' by name.
df['osm_name'] = NAME
df['osm_street_no'] = ST_NO
df['osm_street_name'] = ST_NAME
df['osm_city'] = CITY
df['osm_province'] = PROV
df['osm_postal_code'] = POST
df['osm_class'] = CLASS
df['osm_type'] = TYPE
df['osm_lat'] = LATS
df['osm_lon'] = LONS

In [129]:
# Side-by-side view of the known coordinates and the API's coordinates.
# NOTE: this rebinds `df2`, which earlier held the matched.csv frame.
df2 = df.loc[:, [
    'business_name',
    'latitude',
    'longitude',
    'street_no',
    'formatted_en',
    'geo_source',
    'gc_request_street',
    'osm_lat',
    'osm_lon',
]]


In [130]:
# Open the comparison frame in a Mito spreadsheet widget, passing the saved
# analysis id "id-zxopmludbv" as the analysis to replay.
mitosheet.sheet(df2, analysis_to_replay="id-zxopmludbv")

MitoWidget(analysis_data_json='{"analysisName": "id-zxopmludbv", "analysisToReplay": null, "code": [], "stepSu…

In [None]:
# Mito replay code for analysis "id-zxopmludbv" (same id as the df2 sheet cell).
# Each step removes one column name from the list, re-inserts it at a fixed
# list position, and reselects the frame in that order.
# NOTE(review): the star import brings every public mitosheet name into the
# notebook namespace.
from mitosheet import *; register_analysis("id-zxopmludbv");
    
# Reordered column latitude
df2_columns = [col for col in df2.columns if col != 'latitude']
df2_columns.insert(5, 'latitude')
df2 = df2[df2_columns]

# Reordered column latitude
# Second move of the same column; this position replaces the one set above.
df2_columns = [col for col in df2.columns if col != 'latitude']
df2_columns.insert(7, 'latitude')
df2 = df2[df2_columns]

# Reordered column longitude
df2_columns = [col for col in df2.columns if col != 'longitude']
df2_columns.insert(5, 'longitude')
df2 = df2[df2_columns]

# Reordered column longitude
# Second move of the same column; this position replaces the one set above.
df2_columns = [col for col in df2.columns if col != 'longitude']
df2_columns.insert(8, 'longitude')
df2 = df2[df2_columns]


In [None]:
# Mito replay code for analysis "id-xdcdirwill", applied to the working frame `df`.
# NOTE(review): the star import brings every public mitosheet name into the
# notebook namespace.
from mitosheet import *; register_analysis("id-xdcdirwill");
    
# Reordered column latitude
df_columns = [col for col in df.columns if col != 'latitude']
df_columns.insert(21, 'latitude')
df = df[df_columns]

# Reordered column latitude
# Second move of the same column; list.insert places it last when the list
# has fewer than 31 other entries.
df_columns = [col for col in df.columns if col != 'latitude']
df_columns.insert(31, 'latitude')
df = df[df_columns]
