# Location
Raw data contains over a thousand different locations, some with very low occurences. This reduces the quality of the model. We must pick a handful of population centres as location data. Population centre for every raw location is chosen based on distance. 

In [60]:
import numpy as np
import pandas as pd
import pickle

from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from geopy.location import Location

### Raw Locations

In [61]:
home_df = pd.read_csv('../data/raw/home_raw_listings.csv')
condo_df = pd.read_csv('../data/raw/condo_raw_listings.csv')
raw_df = pd.concat([home_df, condo_df], axis=0, ignore_index=True)
raw_locations = list(raw_df['location'].value_counts().to_dict().keys())

### Population centers

In [62]:
population_centers = pd.read_csv('../data/wikipedia/cities.csv')
population_centers.head()

Unnamed: 0,Name
0,Montréal
1,Québec
2,Laval
3,Longueuil
4,Gatineau


Dropping Quebec, Montreal & Laval

In [63]:
population_centers = population_centers[population_centers['Name'] != 'Montréal']
population_centers = population_centers[population_centers['Name'] != 'Québec']
population_centers = population_centers[population_centers['Name'] != 'Laval']

Fixing town names

In [64]:
d = {'Ottawa - Gatineau (QC/ON)': 'Gatineau', 'Chicoutimi - Jonquière': 'Chicoutimi', 'Hawkesbury (QC/ON)': 'Grenville',
     'Campbellton (QC/NB)': 'Pointe-à-la-Croix', 'Port-Alfred-Bagotville': 'Bagotville', 'Sainte-Agathe-des-Monts -Val-David': 'Sainte-Agathe-des-Monts', 'Terrasse-des-Pins': 'Prévost', 'Domaine-C.-L.-C.': 'Saint-Lin–Laurentides'}
population_centers['Name'] = population_centers['Name'].replace(d)

Adding Quebec, Montreal & Laval Boroughs

In [65]:
mtl = pd.read_csv('../data/wikipedia/mtl-boroughs.csv')
laval = pd.read_csv('../data/wikipedia/laval-boroughs.csv')
quebec = pd.read_csv('../data/wikipedia/qc-boroughs.csv')

In [66]:
centers = list(population_centers['Name']) + list(mtl['Borough']) + list(laval['name']) + list(quebec['name'])

Finding geographical position for all population centers

In [67]:
geolocator = Nominatim(user_agent='housing-qc')

geolocator = Nominatim(user_agent='housing-qc')geolocator = Nominatim(user_agent='housing-qc')

In [None]:
center_dicts = []

for center in centers:
    print(center)
    geocode = geolocator.geocode(center + ', QC')
    center_dicts.append({'Name': center, 'Latitude': geocode.latitude, 'Longitude': geocode.longitude})

population_centers = pd.DataFrame(center_dicts)
population_centers.head()

In [None]:
population_centers.to_csv('../data/processed/population_centers.csv', index=False)

In [68]:
population_centers = pd.read_csv('../data/processed/population_centers.csv')

### Matching Raw Locations with Population Centers

In [69]:
def find_closest_population_center(geocode: Location, population_centers: pd.DataFrame):
    distances = []

    for _, center in population_centers.iterrows():
        distances.append(geodesic((geocode.latitude, geocode.longitude), (center['Latitude'], center['Longitude'])))
    return population_centers.iloc[np.argmin(distances)]['Name']

In [70]:
def build_location_mapper(locations: list, population_centers: pd.DataFrame):
    location_mapper = {}
    unknown_locations = []

    for location in locations:
        geocode = geolocator.geocode(str(location) + ', QC', timeout=None)

        if geocode is None:
            unknown_locations.append(location)
            print('UNKNOWN LOCATION: ' + location)
        else:
            closest_center = find_closest_population_center(geocode, population_centers)
            location_mapper[location] = closest_center
            print('LOCATION: ' + location)
            print('CLOSEST POPULATION center: ' + closest_center)

    return location_mapper, unknown_locations

In [71]:
location_mapper, unknown_locations = build_location_mapper(raw_locations, population_centers)

LOCATION: Gatineau
CLOSEST POPULATION center: Gatineau
LOCATION: Trois-Rivières
CLOSEST POPULATION center: Trois-Rivières
LOCATION: Beauport
CLOSEST POPULATION center: Beauport
LOCATION: Mercier
CLOSEST POPULATION center: Mercier
LOCATION: Alma
CLOSEST POPULATION center: Alma
LOCATION: Fabreville
CLOSEST POPULATION center: Fabreville
LOCATION: Sherbrooke
CLOSEST POPULATION center: Sherbrooke
LOCATION: Ste-Marthe-Sur-Le-Lac
CLOSEST POPULATION center: Sainte-Marthe-sur-le-Lac
LOCATION: Stoneham
CLOSEST POPULATION center: La Haute-Saint-Charles
LOCATION: St-Ferréol-les-Neiges
CLOSEST POPULATION center: Montmagny
LOCATION: Longueuil
CLOSEST POPULATION center: Longueuil
LOCATION: Deschambault
CLOSEST POPULATION center: Pont-Rouge
LOCATION: St-Raymond
CLOSEST POPULATION center: Saint-Raymond
LOCATION: Charlesbourg
CLOSEST POPULATION center: Charlesbourg
LOCATION: Terrebonne
CLOSEST POPULATION center: Terrebonne
LOCATION: Drummondville
CLOSEST POPULATION center: Drummondville
LOCATION: Ste-Fo

Fixing Unknown Locations

In [73]:
town_dict = {}
town_dict["L'Ile Des Soeurs"] = "Ile des soeurs"
town_dict["St-Joseph-De-Ham-Sud"] = "Ham-Sud"
town_dict["St-Mathieu-De-Laprairie"] = "Saint-Mathieu"
town_dict["St-Denis-sur-Mer"] = "Saint-Denis"
town_dict["St-Isidore-De-Laprairie"] = "Saint-Isodore"
town_dict["St-Stanislas-De-Champlain"] = "Saint-Denis"
town_dict["St-Sebastien-De-Frontenac"] = "Saint-Sebastien"
town_dict["St-Simon-De-Rimouski"] = "Saint-Simon"
town_dict["Ste-Francoise-De-Lotbiniere"] = "Saint-Francoise"
town_dict["St-Guillaume-D'Upton"] = "Saint-Guillaume"
town_dict["St-Adelphe-De-Champlain"] = "Saint-Adelphe"

def fix_unknown_location(unknown_location: str):
    if unknown_location in town_dict.keys():
        return town_dict[unknown_location]
    elif "ND-" in unknown_location:
        return unknown_location.replace("ND-", "Notre-Dame-")
    elif "JC" in unknown_location:
        return unknown_location.replace("JC", "Jacques-Cartier")
    elif "St-" in unknown_location:
        return unknown_location.replace("St-", "Saint-")
    elif "Ste-" in unknown_location:
        return unknown_location.replace("Ste-", "Sainte-")
    elif "Sts-" in unknown_location:
        return unknown_location.replace("Sts-", "Saints-")
    else:
        return unknown_location

In [74]:
fixed_locations = list(map(fix_unknown_location, unknown_locations))
unknown_locations_mapper = {fixed_locations[i]: unknown_locations[i] for i in range(len(unknown_locations))}

In [75]:
location_mapper2, unknown_locations2 = build_location_mapper(fixed_locations, population_centers)

LOCATION: Ile des soeurs
CLOSEST POPULATION center: Salaberry-de-Valleyfield
LOCATION: Ste-Catherine-de-la-Jacques-Cartier
CLOSEST POPULATION center: Pont-Rouge
LOCATION: Notre-Dame-Des-Prairies
CLOSEST POPULATION center: Joliette
LOCATION: Notre-Dame-De-L'Ile-Perrot
CLOSEST POPULATION center: Notre-Dame-de-l'Île-Perrot
LOCATION: Notre-Dame-Du-Mont-Carmel
CLOSEST POPULATION center: Shawinigan
LOCATION: Saint-Mathieu
CLOSEST POPULATION center: Saint-Constant
LOCATION: Notre-Dame-Du-Bon-Conseil
CLOSEST POPULATION center: Drummondville
LOCATION: Notre-Dame-Du-Laus
CLOSEST POPULATION center: Val-des-Monts
LOCATION: Saint-Isodore
CLOSEST POPULATION center: Mascouche
LOCATION: Notre-Dame-De-Lourdes
CLOSEST POPULATION center: Saint-Charles-Borromée
LOCATION: Saint-Magloire-De-Bellechasse
CLOSEST POPULATION center: Montmagny
LOCATION: Saints-Anges
CLOSEST POPULATION center: Sainte-Marie
LOCATION: Notre-Dame-De-La-Merci
CLOSEST POPULATION center: Sainte-Agathe-des-Monts
LOCATION: Saint-Donat-De

In [76]:
unknown_locations2

[]

In [77]:
location_mapper2 = { unknown_locations_mapper.get(k, k): v for k, v in location_mapper2.items() }
location_mapper.update(location_mapper2)

### Saving Final Location Mapper

In [79]:
with open('../data/processed/population_center_mapper.pkl', 'wb') as f:
    pickle.dump(location_mapper, f)