# Location Mapping

In [35]:
import numpy as np
import pandas as pd
import pickle

import geopandas as gpd
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from os import path
from shapely.geometry import Point
from tqdm import tqdm

In [36]:
# Shared Nominatim geocoder client used by compute_location_coordinates below;
# 'housing-qc' is the user agent string it is constructed with.
geolocator = Nominatim(user_agent='housing-qc')

### Read Data

In [37]:
# Load the home and condo listing files, then stack them row-wise into one
# frame whose index is renumbered from zero.
listing_files = ['../data/raw/home-listings.csv', '../data/raw/condo-listings.csv']
home_df, condo_df = (pd.read_csv(csv_path) for csv_path in listing_files)
raw_df = pd.concat([home_df, condo_df], axis=0, ignore_index=True)

In [38]:
# Distinct values of the 'location' column across all listings, as a list.
raw_locations = list(raw_df['location'].unique())

### Methods

In [39]:
def compute_location_coordinates(location: str) -> dict:
    """Geocode a single location name with the shared ``geolocator``.

    The name is suffixed with ', QC' before being sent to the geocoder.

    Args:
        location: Place name to look up.

    Returns:
        A dict with keys 'Location' (the name exactly as passed in),
        'Latitude' and 'Longitude'.

    Raises:
        ValueError: If the geocoder returns no match for the name.
    """
    geocode = geolocator.geocode(location + ', QC')
    if geocode is None:
        # The geocoder hands back None when nothing matches; fail with a
        # readable message instead of an AttributeError on the lines below.
        raise ValueError(f'No geocoding result for: {location!r}')
    return {
        'Location': location,
        'Latitude': geocode.latitude,
        'Longitude': geocode.longitude,
    }

In [40]:
def compute_locations_coordinates(locations: list):
    """Geocode every location, collecting the ones that could not be resolved.

    Args:
        locations: Place names to look up, one geocoder call per entry.

    Returns:
        A tuple ``(coordinates, unknown_locations)``: a DataFrame with the
        columns 'Location', 'Latitude' and 'Longitude' (one row per resolved
        name), and the list of names whose lookup failed.
    """
    location_coordinates = []
    unknown_locations = []
    for location in tqdm(locations, desc='Finding Location Coordinates'):
        try:
            coordinates = compute_location_coordinates(location)
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop this long loop.
        except Exception as error:
            unknown_locations.append(location)
            # Include the reason so a network or rate-limit failure is not
            # mistaken for a name the geocoder simply does not know.
            print(
                'Unable to find Geocode for: ' + str(location)
                + f' ({type(error).__name__}: {error})'
            )
        else:
            location_coordinates.append(coordinates)
    # Naming the columns keeps the frame's schema stable even when nothing
    # was resolved, so a CSV written from it still has a header row.
    coordinates_df = pd.DataFrame(
        location_coordinates,
        columns=['Location', 'Latitude', 'Longitude'],
    )
    return coordinates_df, unknown_locations

In [41]:
def compute_coordinates(locations: list, output: str):
    """Return coordinates for ``locations``, using ``output`` as a CSV cache.

    If ``output`` already exists it is read back instead of geocoding again;
    otherwise the locations are geocoded and the result is written to
    ``output``.

    Args:
        locations: Place names whose coordinates are wanted.
        output: Path of the CSV file used as the cache.

    Returns:
        A tuple ``(coordinates, unknown_locations)``: the coordinates frame
        and the list of requested names that have no coordinates in it.
    """
    if path.exists(output):
        coordinates = pd.read_csv(output)
        # Previously this branch always reported an empty list, hiding names
        # that are absent from the cached file. Compare as strings so values
        # that round-trip through the CSV (e.g. 'nan') still match.
        if 'Location' in coordinates.columns:
            known_locations = set(coordinates['Location'].astype(str))
        else:
            known_locations = set()
        unknown_locations = [
            location for location in locations
            if str(location) not in known_locations
        ]
    else:
        coordinates, unknown_locations = compute_locations_coordinates(locations)
        coordinates.to_csv(output, index=False)

    return coordinates, unknown_locations

In [50]:
def extract_parent_location(gdf: pd.DataFrame, contains: list) -> str:
    """Return the name of the first territory flagged as containing a point.

    Args:
        gdf: Frame of territories with a 'location' column (a GeoDataFrame
            in this notebook; only plain DataFrame operations are used).
        contains: One truth value per row of ``gdf``. A list is matched to
            the rows by position; a Series is aligned on ``gdf``'s index.

    Returns:
        The 'location' value of the first flagged row, or
        "Unknown Location" when no row is flagged.
    """
    # Keep the flags in a separate mask instead of writing a 'contains'
    # column into the caller's frame, so the input is left untouched.
    # Building the Series on gdf's index mirrors how column assignment
    # would have matched the values to rows.
    mask = pd.Series(contains, index=gdf.index).eq(True)
    parent_location = gdf.loc[mask, "location"]
    if parent_location.empty:
        return "Unknown Location"
    return parent_location.iloc[0]

### Format Location Strings

In [44]:
# Substring replacements applied to every location name, in this order.
abbreviation_expansions = (
    ('St-', 'Saint-'),
    ('Ste-', 'Sainte-'),
    ('Sts-', 'Saints-'),
    ('ND-', 'Notre-Dame-'),
    ('JC', 'Jacques-Cartier'),
)

# Coerce every entry to text first so that .replace() is always available.
raw_locations = [str(entry) for entry in raw_locations]
for short_form, long_form in abbreviation_expansions:
    raw_locations = [name.replace(short_form, long_form) for name in raw_locations]

In [45]:
# Manual substitutions: each key is a location name that gets replaced by the
# value before geocoding; names without an entry are kept as they are.
unknown_locations_dict = {
    "Saint-Denis-sur-Mer": "Saint-Denis",
    "Saint-Simon-De-Rimouski": "Saint-Simon",
    "Saint-Guillaume-D'Upton": "Saint-Guillaume",
    "Saint-Joseph-De-Ham-Sud": "Ham-Sud",
    "Saint-Adelphe-De-Champlain": "Saint-Adelphe",
    "Saint-Mathieu-De-Laprairie": "Saint-Mathieu",
    "Saint-Isidore-De-Laprairie": "Saint-Isidore",
    # NOTE(review): maps to "Saint-Denis" although the neighbouring entries
    # only drop the suffix - possibly meant "Saint-Stanislas"; confirm.
    "Saint-Stanislas-De-Champlain": "Saint-Denis",
    "Saint-Sebastien-De-Frontenac": "Saint-Sebastien",
    # NOTE(review): value starts with "Saint-" while the key starts with
    # "Sainte-" - possibly a typo; confirm.
    "Sainte-Francoise-De-Lotbiniere": "Saint-Francoise",
    "Saint-Donat-De-Rimouski": "Rimouski",
    "Sheenboro": "Pontiac",
    "Mercier": "Hochelaga",
    "Mont-Royal": "Ville de Mont-Royal",
    "Rosemont": "Rosemont-La Petite-Patrie",
    "Rivière des Prairies": "Rivière-des-Prairies-Pointe-aux-Trembles",
    "Pointe-Aux-Trembles": "Rivière-des-Prairies-Pointe-aux-Trembles",
    "Côte-des-Neiges": "Côte-des-Neiges–Notre-Dame-de-Grâce",
}

raw_locations = [unknown_locations_dict.get(name, name) for name in raw_locations]

### Compute Coordinates

Raw Location Coordinates

In [46]:
# Look up coordinates for the cleaned location names; compute_coordinates
# reads this CSV back if it already exists and writes it otherwise.
output = '../data/processed/locations/raw-location-coordinates.csv'
raw_location_coordinates, unknown_locations = compute_coordinates(
    locations=raw_locations,
    output=output,
)

if not unknown_locations:
    print("All coordinates successfully calculated.")

# Last expression of the cell: shows the first rows in the notebook output.
raw_location_coordinates.head()

Finding Location Coordinates: 100%|██████████| 1170/1170 [09:45<00:00,  2.00it/s]

All coordinates successfully calculated.





Unnamed: 0,Location,Latitude,Longitude
0,Beauport,46.907111,-71.212797
1,Deschambault,46.662647,-71.944288
2,Hochelaga,45.542133,-73.54763
3,Stoneham,46.999608,-71.369475
4,Trois-Rivières,46.371592,-72.600502


### Map Raw Locations to Bounding Territories

In [49]:
# Load the territory polygons; the cells below use this frame's 'geometry'
# column for the containment test and its 'location' column for the name.
polygons_gdf = gpd.read_file('../data/processed/locations/location-polygons.gpkg')

In [51]:
# Maps each geocoded location name to the name of the polygon that contains
# its coordinates (or to whatever extract_parent_location returns otherwise).
location_mapper = {}

coordinate_rows = zip(
    raw_location_coordinates["Location"],
    raw_location_coordinates["Longitude"],
    raw_location_coordinates["Latitude"],
)
for name, longitude, latitude in tqdm(
    coordinate_rows,
    desc="Building Location Mapper",
    total=raw_location_coordinates.shape[0],
):
    # Point takes (x, y), i.e. longitude first.
    inside = polygons_gdf["geometry"].contains(Point(longitude, latitude))
    # Pass a copy so the helper never touches the shared polygons frame.
    location_mapper[name] = extract_parent_location(polygons_gdf.copy(), inside)

Building Location Mapper: 100%|██████████| 1170/1170 [00:03<00:00, 386.14it/s]


In [52]:
# Turn the mapper into a two-column table, save it as CSV without the index,
# and show the first rows.
mapper_rows = list(location_mapper.items())
location_mapper_df = pd.DataFrame(mapper_rows, columns=['location', 'mapping'])
location_mapper_df.to_csv(
    '../data/processed/locations/location-mapper.csv',
    index=False,
)
display(location_mapper_df.head())

Unnamed: 0,location,mapping
0,Beauport,Beauport
1,Deschambault,Portneuf
2,Hochelaga,Mercier-Hochelaga-Maisonneuve
3,Stoneham,La Jacques-Cartier
4,Trois-Rivières,Trois-Rivières


In [53]:
# Also persist the mapper dict itself as a pickle file.
mapper_pickle_path = '../data/processed/locations/location-mapper.pkl'
with open(mapper_pickle_path, 'wb') as pickle_file:
    pickle.dump(location_mapper, pickle_file)