# Location Mapping

In [101]:
import os
import pickle
from os import path

import numpy as np
import pandas as pd
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from tqdm import tqdm

In [102]:
# Shared Nominatim client used by compute_location_coordinates.
# An explicit timeout is set because geopy's default is short (1 s per the geopy
# docs) and a timed-out request would otherwise be caught downstream and
# reported as an unknown location.
geolocator = Nominatim(user_agent='housing-qc', timeout=10)

### Read Data

In [103]:
# Load the two listing exports and stack them row-wise under a fresh 0..n-1 index.
listing_frames = [
    pd.read_csv('../data/raw/home_listings.csv'),
    pd.read_csv('../data/raw/condo_listings.csv'),
]
home_df, condo_df = listing_frames
raw_df = pd.concat(listing_frames, axis=0, ignore_index=True)

In [104]:
# Distinct values of the 'location' column, in order of first appearance.
raw_locations = list(pd.unique(raw_df['location']))

In [105]:
# Hand-made reference table of population centers.
population_centers_df = pd.read_csv('../data/references/handmade/qc-population-centers.csv')
# Seeded so the preview is identical on every run; capped so a table with fewer
# than five rows does not raise.
population_centers_df.sample(min(5, len(population_centers_df)), random_state=0)

Unnamed: 0,Location,City,RCM,Region,Bounding Territory,Display Name,Bounding Type,Bounding Population,GeoPy Index
24,Saint-Apollinaire,Saint-Apollinaire,Lotbinière,Chaudière-Appalaches,Lotbinière,Lotbinière,Regional County Municipality (RCM),30000,1
113,Saint-Calixte,Saint-Calixte,Montcalm,Lanaudière,Montcalm,Montcalm,Regional County Municipality (RCM),50000,1
128,Blainville,Blainville,Thérèse-De Blainville,Laurentides,Thérèse-De Blainville,Thérèse-De Blainville,Regional County Municipality (RCM),160000,0
129,Saint-Jérôme,Saint-Jérôme,La Rivière-du-Nord,Laurentides,La Rivière-du-Nord,La Rivière-du-Nord,Regional County Municipality (RCM),130000,0
126,Mirabel,Mirabel,Mirabel,Laurentides,"[Mirabel,-Deux-Montagnes,-La Rivière-du-Nord,-...",Mirabel,Municipality,60000,0


### Methods

In [106]:
def compute_location_coordinates(location: str):
    """Geocode one place name with the module-level ``geolocator``.

    Args:
        location: Place name; ``', QC'`` is appended before the query is sent.

    Returns:
        dict with keys ``'Location'`` (the name as passed in), ``'Latitude'``
        and ``'Longitude'`` taken from the geocoder result.

    Raises:
        ValueError: if the geocoder returns no result for the query.
    """
    geocode = geolocator.geocode(location + ', QC')
    # geopy returns None (rather than raising) when nothing matches; fail with a
    # message that names the location instead of an AttributeError on None.
    if geocode is None:
        raise ValueError(f'No geocoding result for {location!r}')
    return {'Location': location, 'Latitude': geocode.latitude, 'Longitude': geocode.longitude}

In [107]:
def compute_locations_coordinates(locations: list):
    """Geocode every name in ``locations``, collecting the ones that fail.

    Each name is passed to ``compute_location_coordinates``. A name whose lookup
    raises is recorded as unknown and reported on stdout; the loop then continues
    with the next name.

    Args:
        locations: Iterable of place names.

    Returns:
        Tuple ``(coordinates, unknown_locations)`` where ``coordinates`` is a
        DataFrame with columns ``Location``, ``Latitude``, ``Longitude`` (one row
        per successful lookup, possibly zero rows) and ``unknown_locations`` is
        the list of names that failed.
    """
    location_coordinates = []
    unknown_locations = []
    for location in tqdm(locations, desc='Finding Location Coordinates'):
        try:
            location_coordinates.append(compute_location_coordinates(location))
        # Catch Exception rather than everything, so a kernel interrupt (Ctrl-C)
        # still stops this long-running loop instead of being logged as a miss.
        except Exception as error:
            unknown_locations.append(location)
            print(f'Unable to find Geocode for: {location} ({type(error).__name__}: {error})')

    # Explicit columns keep the frame's schema stable even when nothing was found.
    columns = ['Location', 'Latitude', 'Longitude']
    return pd.DataFrame(location_coordinates, columns=columns), unknown_locations

In [108]:
def compute_coordinates(locations: list, output: str):
    """Return coordinates for ``locations``, using ``output`` as a CSV cache.

    If ``output`` already exists it is read and returned as-is: ``locations`` is
    not consulted, and the returned failure list is always empty because the
    failures of the run that produced the file are not stored in it. Delete the
    file to force a fresh geocoding pass.

    Otherwise the locations are geocoded with ``compute_locations_coordinates``
    and the result is written to ``output`` (without the index) before being
    returned.

    Args:
        locations: Place names to geocode on a cache miss.
        output: Path of the CSV cache file.

    Returns:
        Tuple ``(coordinates, unknown_locations)``: the coordinates DataFrame and
        the list of names that could not be geocoded (``[]`` on a cache hit).
    """
    if path.exists(output):
        return pd.read_csv(output), []

    coordinates, unknown_locations = compute_locations_coordinates(locations)

    # Make sure the target folder exists, so a long geocoding run is not thrown
    # away by a missing directory at save time. dirname is '' for a bare filename.
    parent_directory = path.dirname(output)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)
    coordinates.to_csv(output, index=False)

    return coordinates, unknown_locations

In [109]:
# Given geographical latitude and longitude, find the closest location in the given list.
def find_closest_location(latitude: float, longitude: float, locations: pd.DataFrame):
    """Return the name of the reference location nearest to a point.

    Distances are geodesic distances computed with geopy and compared in
    kilometres.

    Args:
        latitude: Latitude of the query point.
        longitude: Longitude of the query point.
        locations: DataFrame with ``Location``, ``Latitude`` and ``Longitude``
            columns.

    Returns:
        The ``Location`` value of the closest row (the first one on ties).

    Raises:
        ValueError: if ``locations`` has no rows.
    """
    if locations.empty:
        raise ValueError('Cannot find the closest location: no reference locations were given.')

    origin = (latitude, longitude)
    # Plain floats, so argmin does not depend on comparing geopy Distance objects.
    distances_km = [
        geodesic(origin, (reference_latitude, reference_longitude)).km
        for reference_latitude, reference_longitude in zip(locations['Latitude'], locations['Longitude'])
    ]

    # argmin is a position, not an index label, hence iloc.
    return locations['Location'].iloc[int(np.argmin(distances_km))]

In [110]:
# Map all locations to a reference location based on closest distance.
def build_location_mapper(locations: pd.DataFrame, reference_locations: pd.DataFrame):
    """Build a ``{location name: closest reference name}`` dictionary.

    Every row of ``locations`` is matched with ``find_closest_location`` against
    ``reference_locations``. Both frames are read through their ``Location``,
    ``Latitude`` and ``Longitude`` columns. If a name occurs in several rows, the
    last row determines its entry.
    """
    rows = tqdm(locations.iterrows(), desc="Building Location Mapper", total=locations.shape[0])
    return {
        row['Location']: find_closest_location(row['Latitude'], row['Longitude'], reference_locations)
        for _, row in rows
    }

### Format Location Strings

In [111]:
# Expand the abbreviations found in the listing names into full words.
# Each pair is applied to every name, in this order; str.replace substitutes
# every occurrence, wherever it appears in the name.
abbreviation_expansions = (
    ('St-', 'Saint-'),
    ('Ste-', 'Sainte-'),
    ('Sts-', 'Saints-'),
    ('ND-', 'Notre-Dame-'),
    ('JC', 'Jacques-Cartier'),
)

# Non-string entries are stringified first so .replace is always available.
raw_locations = [str(location) for location in raw_locations]
for abbreviation, expansion in abbreviation_expansions:
    raw_locations = [name.replace(abbreviation, expansion) for name in raw_locations]

In [112]:
# Hand-written substitutions: listing name -> name to geocode instead
# (presumably names the geocoder could not resolve as written).
# NOTE: coordinates are cached on disk by compute_coordinates; after editing this
# table, delete the cached raw-location CSV so the new names are geocoded.
unknown_locations_dict = {
    "Saint-Denis-sur-Mer": "Saint-Denis",
    "Saint-Simon-De-Rimouski": "Saint-Simon",
    "Saint-Guillaume-D'Upton": "Saint-Guillaume",
    "Saint-Joseph-De-Ham-Sud": "Ham-Sud",
    "Saint-Adelphe-De-Champlain": "Saint-Adelphe",
    "Saint-Mathieu-De-Laprairie": "Saint-Mathieu",
    "Saint-Isidore-De-Laprairie": "Saint-Isidore",
    # Was "Saint-Denis", a copy-paste slip that sent this town to another one.
    "Saint-Stanislas-De-Champlain": "Saint-Stanislas",
    "Saint-Sebastien-De-Frontenac": "Saint-Sebastien",
    # Was "Saint-Francoise" (missing the feminine "e").
    "Sainte-Francoise-De-Lotbiniere": "Sainte-Francoise",
    "Saint-Donat-De-Rimouski": "Rimouski",
    "Sheenboro": "Pontiac",
    "Mercier": "Hochelaga",
    "Mont-Royal": "Ville de Mont-Royal",
    "Rosemont": "Rosemont-La Petite-Patrie",
    "Rivière des Prairies": "Rivière-des-Prairies-Pointe-aux-Trembles",
    "Pointe-Aux-Trembles": "Rivière-des-Prairies-Pointe-aux-Trembles",
    "Côte-des-Neiges": "Côte-des-Neiges–Notre-Dame-de-Grâce"
}

# Names without an entry are kept unchanged.
raw_locations = [unknown_locations_dict.get(x, x) for x in raw_locations]

### Compute Coordinates

Raw Location Coordinates

In [113]:
# Coordinates for every cleaned listing location (read from the CSV below when it
# already exists, geocoded and saved there otherwise).
output = '../data/processed/locations/raw_location_coordinates.csv'
raw_location_coordinates, unknown_locations = compute_coordinates(raw_locations, output)

if not unknown_locations:
    print("All coordinates successfully calculated.")

raw_location_coordinates.head()

All coordinates successfully calculated.


Unnamed: 0,Location,Latitude,Longitude
0,Beauport,46.907111,-71.212797
1,Deschambault,46.6492,-71.929893
2,Hochelaga,45.542133,-73.54763
3,Stoneham,46.999608,-71.369475
4,Trois-Rivières,46.371592,-72.600502


Population Center Coordinates

In [114]:
# Coordinates for each distinct population center (same cache-or-geocode path as
# the raw locations, with its own CSV).
output = '../data/processed/locations/population_centers_coordinates.csv'
population_center_names = population_centers_df["Location"].unique()
population_centers_coordinates, unknown_locations = compute_coordinates(population_center_names, output)

if not unknown_locations:
    print("All coordinates successfully calculated.")

population_centers_coordinates.head()

Finding Location Coordinates: 100%|██████████| 137/137 [01:08<00:00,  2.00it/s]

All coordinates successfully calculated.





Unnamed: 0,Location,Latitude,Longitude
0,Rimouski,48.450155,-68.529968
1,Gaspé,48.658056,-65.752778
2,La Cité-Limoilou,46.829408,-71.225593
3,Les Rivières,46.839562,-71.30457
4,Sainte-Foy-Sillery-Cap-Rouge,46.782012,-71.349279


### Map Raw Locations to Population Centers

In [115]:
# Assign every raw location to its nearest population center.
location_mapper = build_location_mapper(
    locations=raw_location_coordinates,
    reference_locations=population_centers_coordinates,
)

Building Location Mapper: 100%|██████████| 1168/1168 [00:31<00:00, 37.09it/s]


To .csv

In [116]:
# Two-column table (one row per mapper entry), saved without the index.
mapper_rows = list(location_mapper.items())
location_mapper_df = pd.DataFrame(mapper_rows, columns=['location', 'mapping'])
location_mapper_df.to_csv('../data/processed/locations/location_mapper.csv', index=False)
display(location_mapper_df.head())

Unnamed: 0,location,mapping
0,Beauport,Beauport
1,Deschambault,Cap-Santé
2,Hochelaga,Hochelaga-Maisonneuve
3,Stoneham,Lac-Beauport
4,Trois-Rivières,Trois-Rivières


To .pkl

In [117]:
# Also store the mapper dict itself as a pickle, next to the CSV export.
with open('../data/processed/locations/location_mapper.pkl', 'wb') as pickle_file:
    pickle.dump(location_mapper, pickle_file)