In [127]:
import pandas as pd
import numpy as np
import pickle

from geopy.geocoders import Nominatim
from geopy.distance import geodesic

Scraped Locations

In [108]:
raw_df = pd.read_csv('../data/raw/raw_listings.csv')
locations = list(raw_df['location'].value_counts().to_dict().keys())

Population centres

In [109]:
population_centres = pd.read_csv('../data/wikipedia/population_centres.csv')
population_centres.head()

Unnamed: 0,Rank,Population centre[2],Size group[2],Population (2016)[2],Population (2011)[2],% Change[2]
0,1,Montreal,Large urban,3519595,3387653,+3.9%
1,2,Ottawa - Gatineau (QC/ON),Large urban,989567,945592,+4.7%
2,3,Quebec City,Large urban,705103,681804,+3.4%
3,4,Sherbrooke,Large urban,139565,133673,+4.4%
4,5,Trois-Rivières,Large urban,114203,112626,+1.4%


In [110]:
population_centres = population_centres.drop(population_centres.columns[[0, 2, 3, 4, 5]], axis=1)
population_centres = population_centres.rename(columns={'Population centre[2]': 'Name'})
population_centres.head()

Unnamed: 0,Name
0,Montreal
1,Ottawa - Gatineau (QC/ON)
2,Quebec City
3,Sherbrooke
4,Trois-Rivières


Dropping Quebec, Montreal & Laval

In [111]:
population_centres = population_centres[population_centres['Name'] != 'Montreal']
population_centres = population_centres[population_centres['Name'] != 'Quebec City']

Fixing town names

In [112]:
d = {'Ottawa - Gatineau (QC/ON)': 'Gatineau', 'Chicoutimi - Jonquière': 'Chicoutimi', 'Hawkesbury (QC/ON)': 'Grenville',
     'Campbellton (QC/NB)': 'Pointe-à-la-Croix', 'Port-Alfred-Bagotville': 'Bagotville', 'Sainte-Agathe-des-Monts -Val-David': 'Sainte-Agathe-des-Monts', 'Terrasse-des-Pins': 'Prévost', 'Domaine-C.-L.-C.': 'Saint-Lin–Laurentides'}
population_centres['Name'] = population_centres['Name'].replace(d)


Adding Quebec, Montreal & Laval Boroughs

In [113]:
mtl = pd.read_csv('../data/wikipedia/mtl-boroughs.csv')
laval = pd.read_csv('../data/wikipedia/laval-boroughs.csv')
quebec = pd.read_csv('../data/wikipedia/qc-boroughs.csv')

In [114]:
centres = list(population_centres['Name']) + list(mtl['Borough']) + list(laval['name']) + list(quebec['name'])

Finding geographical position

In [115]:
geolocator = Nominatim(user_agent='housing-qc')

In [116]:
# centre_dicts = []

# for centre in centres:
#     print(centre)
#     geocode = geolocator.geocode(centre + ', QC')
#     centre_dicts.append({'Name': centre, 'Latitude': geocode.latitude, 'Longitude': geocode.longitude})

# population_centres = pd.DataFrame(centre_dicts)
# population_centres.head()

In [117]:
# population_centres.to_csv('../data/processed/population_centres.csv', index=False)

In [118]:
population_centres = pd.read_csv('../data/processed/population_centres.csv')

Associating locations to populations centres

In [119]:
def find_closest_population_centre(locations, population_centres):
    location_mapper = {}
    unknown_locations = []
    for location in locations:
        print('LOCATION: ' + location)

        distances = []
        geocode = geolocator.geocode(str(location) + ', QC', timeout=None)

        if geocode is None:
            unknown_locations.append(location)
            print('UNKNOWN LOCATION: ' + location)
        else:
            for _, centre in population_centres.iterrows():
                distances.append(geodesic((geocode.latitude, geocode.longitude), (centre['Latitude'], centre['Longitude'])))

            closest_centre = population_centres.iloc[np.argmin(distances)]['Name']
            location_mapper[location] = closest_centre
            print('CLOSEST POPULATION CENTRE: ' + closest_centre)
    return location_mapper, unknown_locations

In [120]:
location_mapper, unknown_locations = find_closest_population_centre(locations, population_centres)

LOCATION: Gatineau
CLOSEST POPULATION CENTRE: Gatineau
LOCATION: Trois-Rivières
CLOSEST POPULATION CENTRE: Trois-Rivières
LOCATION: Beauport
CLOSEST POPULATION CENTRE: Beauport
LOCATION: Mercier
CLOSEST POPULATION CENTRE: Châteauguay
LOCATION: Alma
CLOSEST POPULATION CENTRE: Alma
LOCATION: Fabreville
CLOSEST POPULATION CENTRE: Fabreville
LOCATION: Ste-Marthe-Sur-Le-Lac
CLOSEST POPULATION CENTRE: Laval-sur-le-Lac
LOCATION: Stoneham
CLOSEST POPULATION CENTRE: Stoneham
LOCATION: St-Ferréol-les-Neiges
CLOSEST POPULATION CENTRE: Beaupré
LOCATION: Deschambault
CLOSEST POPULATION CENTRE: Portneuf
LOCATION: Sherbrooke
CLOSEST POPULATION CENTRE: Sherbrooke
LOCATION: St-Raymond
CLOSEST POPULATION CENTRE: Saint-Raymond
LOCATION: Longueuil
CLOSEST POPULATION CENTRE: Mercier–Hochelaga-Maisonneuve
LOCATION: Drummondville
CLOSEST POPULATION CENTRE: Drummondville
LOCATION: Charlesbourg
CLOSEST POPULATION CENTRE: Charlesbourg
LOCATION: Terrebonne
CLOSEST POPULATION CENTRE: Filion
LOCATION: Berthier-Sur

In [121]:
town_dict = {}
town_dict["L'Ile Des Soeurs"] = "Ile des soeurs"
town_dict["St-Joseph-De-Ham-Sud"] = "Ham-Sud"
town_dict["St-Mathieu-De-Laprairie"] = "Saint-Mathieu"
town_dict["St-Denis-sur-Mer"] = "Saint-Denis"
town_dict["St-Isidore-De-Laprairie"] = "Saint-Isodore"
town_dict["St-Stanislas-De-Champlain"] = "Saint-Denis"
town_dict["St-Sebastien-De-Frontenac"] = "Saint-Sebastien"
town_dict["St-Simon-De-Rimouski"] = "Saint-Simon"
town_dict["Ste-Francoise-De-Lotbiniere"] = "Saint-Francoise"
town_dict["St-Guillaume-D'Upton"] = "Saint-Guillaume"
town_dict["St-Adelphe-De-Champlain"] = "Saint-Adelphe"

def fix_unknown(unknown: str):
    if unknown in town_dict.keys():
        return town_dict[unknown]
    elif "ND-" in unknown:
        return unknown.replace("ND-", "Notre-Dame-")
    elif "JC" in unknown:
        return unknown.replace("JC", "Jacques-Cartier")
    elif "St-" in unknown:
        return unknown.replace("St-", "Saint-")
    elif "Ste-" in unknown:
        return unknown.replace("Ste-", "Sainte-")
    elif "Sts-" in unknown:
        return unknown.replace("Sts-", "Saints-")
    else:
        return unknown

In [122]:
fixed_locations = list(map(fix_unknown, unknown_locations))
unknown_locations_mapper = {fixed_locations[i]: unknown_locations[i] for i in range(len(unknown_locations))}

In [123]:
location_mapper2, unknown_locations2 = find_closest_population_centre(fixed_locations, population_centres)

LOCATION: Ste-Catherine-de-la-Jacques-Cartier
CLOSEST POPULATION CENTRE: Sainte-Catherine-de-la-Jacques-Cartier
LOCATION: Notre-Dame-Des-Prairies
CLOSEST POPULATION CENTRE: Joliette
LOCATION: Notre-Dame-De-L'Ile-Perrot
CLOSEST POPULATION CENTRE: Melocheville
LOCATION: Notre-Dame-Du-Mont-Carmel
CLOSEST POPULATION CENTRE: Notre-Dame-du-Mont-Carmel
LOCATION: Ile des soeurs
CLOSEST POPULATION CENTRE: Les Cèdres
LOCATION: Saint-Mathieu
CLOSEST POPULATION CENTRE: Saint-Michel
LOCATION: Notre-Dame-Du-Bon-Conseil
CLOSEST POPULATION CENTRE: Notre-Dame-du-Bon-Conseil
LOCATION: Notre-Dame-Du-Laus
CLOSEST POPULATION CENTRE: Maniwaki
LOCATION: Saint-Isodore
CLOSEST POPULATION CENTRE: Saint-Roch-de-l'Achigan
LOCATION: Notre-Dame-De-Lourdes
CLOSEST POPULATION CENTRE: Joliette
LOCATION: Saint-Magloire-De-Bellechasse
CLOSEST POPULATION CENTRE: Sainte-Justine
LOCATION: Saints-Anges
CLOSEST POPULATION CENTRE: Vallée-Jonction
LOCATION: Notre-Dame-De-La-Merci
CLOSEST POPULATION CENTRE: Saint-Donat-de-Montc

In [124]:
unknown_locations2

[]

In [125]:
location_mapper2 = { unknown_locations_mapper.get(k, k): v for k, v in location_mapper2.items() }

In [126]:
location_mapper.update(location_mapper2)

In [128]:
with open('../data/processed/location_mapper.pkl', 'wb') as f:
    pickle.dump(location_mapper, f)