In [74]:
import numpy as np
import pandas as pd
import pickle

from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from geopy.location import Location

In [75]:
regions = pd.read_csv('../data/wikipedia/administrative_regions.csv')
mtl_boroughs = pd.read_csv('../data/wikipedia/mtl-boroughs.csv')['Borough']

In [76]:
regions.head()

Unnamed: 0,City,Region
0,Rimouski,Bas-Saint-Laurent
1,Rivière-du-Loup,Bas-Saint-Laurent
2,Matane,Bas-Saint-Laurent
3,Alma,Saguenay-Lac-Saint-Jean
4,Saguenay,Saguenay-Lac-Saint-Jean


In [77]:
df = pd.DataFrame(mtl_boroughs).rename(columns={'Borough': 'City'})
df['Region'] = df['City']

In [78]:
regions = regions.append(df, ignore_index=True)

In [79]:
geolocator = Nominatim(user_agent='housing-qc')

In [80]:
region_dicts = []

for _, row in regions.iterrows():
    print(row['City'])
    geocode = geolocator.geocode(row['City'] + ', QC')
    region_dicts.append({'City': row['City'], 'Region': row['Region'], 'Latitude': geocode.latitude, 'Longitude': geocode.longitude})

Rimouski
Rivière-du-Loup
Matane
Alma
Saguenay
Quebec City
Shawinigan
Trois-Rivières
Sherbrooke
Magog
Gatineau
Rouyn-Noranda
Val-d'Or
Baie-Comeau
Sept-Îles
Chibougamau
Matagami
Puvirnituq
Gaspé
Îles-de-la-Madeleine
Lévis
Saint-Georges
Laval
Joliette
Repentigny
Terrebonne
Blainville
Mirabel
Saint-Jérôme
Sainte-Thérèse
Granby
Brossard
Candiac
Longueuil
Saint-Jean-sur-Richelieu
Saint-Hyacinthe
Sorel
Vaudreuil-Dorion
Drummondville
Victoriaville
Ahuntsic-Cartierville
Anjou
Côte-des-Neiges–Notre-Dame-de-Grâce
Lachine
LaSalle
Le Plateau-Mont-Royal
Le Sud-Ouest
L'Île-Bizard–Sainte-Geneviève
Mercier–Hochelaga-Maisonneuve
Montréal-Nord
Outremont
Pierrefonds-Roxboro
Rivière-des-Prairies–Pointe-aux-Trembles
Rosemont–La Petite-Patrie
Saint-Laurent
Saint-Léonard
Verdun
Ville-Marie
Villeray–Saint-Michel–Parc-Extension


In [81]:
regions = pd.DataFrame(region_dicts)
regions.head()

Unnamed: 0,City,Region,Latitude,Longitude
0,Rimouski,Bas-Saint-Laurent,48.450155,-68.529968
1,Rivière-du-Loup,Bas-Saint-Laurent,47.835816,-69.536802
2,Matane,Bas-Saint-Laurent,48.846863,-67.529598
3,Alma,Saguenay-Lac-Saint-Jean,48.548887,-71.651459
4,Saguenay,Saguenay-Lac-Saint-Jean,48.405959,-71.069183


In [82]:
home_df = pd.read_csv('../data/raw/home_raw_listings.csv')
condo_df = pd.read_csv('../data/raw/condo_raw_listings.csv')
raw_df = pd.concat([home_df, condo_df], axis=0, ignore_index=True)
raw_locations = list(raw_df['location'].value_counts().to_dict().keys())

In [83]:
def find_matching_region(geocode: Location, regions: pd.DataFrame):
    distances = []
    for _, row in regions.iterrows():
        distances.append(geodesic((geocode.latitude, geocode.longitude), (row['Latitude'], row['Longitude'])))
    return regions.iloc[np.argmin(distances)]['Region']

In [84]:
def build_location_mapper(locations: list, regions: pd.DataFrame):
    location_mapper = {}
    unknown_locations = []

    for location in locations:
        geocode = geolocator.geocode(str(location) + ', QC', timeout=None)

        if geocode is None:
            unknown_locations.append(location)
            print('UNKNOWN LOCATION: ' + location)
        else:
            region = find_matching_region(geocode, regions)
            
            location_mapper[location] = region
            print('LOCATION: ' + location)
            print('MAPPED TO REGION: ' + region)
    return location_mapper, unknown_locations

In [85]:
location_mapper, unknown_locations = build_location_mapper(raw_locations, regions)

LOCATION: Gatineau
MAPPED TO REGION: Outaouais
LOCATION: Trois-Rivières
MAPPED TO REGION: Mauricie
LOCATION: Beauport
MAPPED TO REGION: Capitale-Nationale
LOCATION: Mercier
MAPPED TO REGION: Lachine
LOCATION: Alma
MAPPED TO REGION: Saguenay-Lac-Saint-Jean
LOCATION: Fabreville
MAPPED TO REGION: Laurentides
LOCATION: Sherbrooke
MAPPED TO REGION: Estrie
LOCATION: Ste-Marthe-Sur-Le-Lac
MAPPED TO REGION: L'Île-Bizard–Sainte-Geneviève
LOCATION: Stoneham
MAPPED TO REGION: Capitale-Nationale
LOCATION: St-Ferréol-les-Neiges
MAPPED TO REGION: Capitale-Nationale
LOCATION: Longueuil
MAPPED TO REGION: Montérégie
LOCATION: Deschambault
MAPPED TO REGION: Mauricie
LOCATION: St-Raymond
MAPPED TO REGION: Capitale-Nationale
LOCATION: Charlesbourg
MAPPED TO REGION: Capitale-Nationale
LOCATION: Terrebonne
MAPPED TO REGION: Lanaudière
LOCATION: Drummondville
MAPPED TO REGION: Centre-du-Québec
LOCATION: Ste-Foy
MAPPED TO REGION: Capitale-Nationale
LOCATION: Rosemont
MAPPED TO REGION: Outremont
LOCATION: St-J

In [86]:
town_dict = {}
town_dict["L'Ile Des Soeurs"] = "Ile des soeurs"
town_dict["St-Joseph-De-Ham-Sud"] = "Ham-Sud"
town_dict["St-Mathieu-De-Laprairie"] = "Saint-Mathieu"
town_dict["St-Denis-sur-Mer"] = "Saint-Denis"
town_dict["St-Isidore-De-Laprairie"] = "Saint-Isodore"
town_dict["St-Stanislas-De-Champlain"] = "Saint-Denis"
town_dict["St-Sebastien-De-Frontenac"] = "Saint-Sebastien"
town_dict["St-Simon-De-Rimouski"] = "Saint-Simon"
town_dict["Ste-Francoise-De-Lotbiniere"] = "Saint-Francoise"
town_dict["St-Guillaume-D'Upton"] = "Saint-Guillaume"
town_dict["St-Adelphe-De-Champlain"] = "Saint-Adelphe"

def fix_unknown_location(unknown_location: str):
    if unknown_location in town_dict.keys():
        return town_dict[unknown_location]
    elif "ND-" in unknown_location:
        return unknown_location.replace("ND-", "Notre-Dame-")
    elif "JC" in unknown_location:
        return unknown_location.replace("JC", "Jacques-Cartier")
    elif "St-" in unknown_location:
        return unknown_location.replace("St-", "Saint-")
    elif "Ste-" in unknown_location:
        return unknown_location.replace("Ste-", "Sainte-")
    elif "Sts-" in unknown_location:
        return unknown_location.replace("Sts-", "Saints-")
    else:
        return unknown_location

In [87]:
fixed_locations = list(map(fix_unknown_location, unknown_locations))
unknown_locations_mapper = {fixed_locations[i]: unknown_locations[i] for i in range(len(unknown_locations))}

In [88]:
location_mapper2, unknown_locations2 = build_location_mapper(fixed_locations, regions)

LOCATION: Ile des soeurs
MAPPED TO REGION: Montérégie
LOCATION: Ste-Catherine-de-la-Jacques-Cartier
MAPPED TO REGION: Capitale-Nationale
LOCATION: Notre-Dame-Des-Prairies
MAPPED TO REGION: Lanaudière
LOCATION: Notre-Dame-De-L'Ile-Perrot
MAPPED TO REGION: Montérégie
LOCATION: Notre-Dame-Du-Mont-Carmel
MAPPED TO REGION: Mauricie
LOCATION: Saint-Mathieu
MAPPED TO REGION: Montérégie
LOCATION: Notre-Dame-Du-Bon-Conseil
MAPPED TO REGION: Centre-du-Québec
LOCATION: Notre-Dame-Du-Laus
MAPPED TO REGION: Outaouais
LOCATION: Saint-Isodore
MAPPED TO REGION: Lanaudière
LOCATION: Notre-Dame-De-Lourdes
MAPPED TO REGION: Lanaudière
LOCATION: Saint-Magloire-De-Bellechasse
MAPPED TO REGION: Chaudière-Appalaches
LOCATION: Saints-Anges
MAPPED TO REGION: Chaudière-Appalaches
LOCATION: Notre-Dame-De-La-Merci
MAPPED TO REGION: Laurentides
LOCATION: Saint-Donat-De-Rimouski
MAPPED TO REGION: Bas-Saint-Laurent
LOCATION: Notre-Dame-De-La-Salette
MAPPED TO REGION: Outaouais
LOCATION: Saint-Denis
MAPPED TO REGION:

In [89]:
unknown_locations2

[]

In [90]:
location_mapper2 = { unknown_locations_mapper.get(k, k): v for k, v in location_mapper2.items() }
location_mapper.update(location_mapper2)

In [91]:
with open('../data/processed/region_mapper.pkl', 'wb') as f:
    pickle.dump(location_mapper, f)