# Location Mapping
The raw data contains over a thousand different locations, a lot of them with very low occurrences. This can considerably reduce the quality of our model. We must filter our locations carefully without reducing the size of our dataset. To do so, we will associate every location to its closest population center.
\
\
This notebook goes through all location values and maps them to their closest population center. Population centers are defined as either:
\
\
Regions
- Quebec Administrative Regions (Estrie, Outaouais, Montérégie etc.)

In [151]:
import numpy as np
import pandas as pd

from geopy.distance import geodesic
from geopy.location import Location
from geopy.geocoders import Nominatim
from tqdm import tqdm

from IPython.display import display

## Useful Methods & Resources

In [137]:
geolocator = Nominatim(user_agent='housing-qc')

In [138]:
def find_coordinates(locations):
    location_dict = []
    unknown_locations = []

    for location in tqdm(locations):
        try:
            geocode = geolocator.geocode(location + ', QC')
        except:
            geocode = None
        
        if geocode is None:
            unknown_locations.append(location)
            print(location + ' is an unknown location')
        else:
            location_dict.append({'Name': location, 'Latitude': geocode.latitude, 'Longitude': geocode.longitude})

    return pd.DataFrame(location_dict), unknown_locations

In [139]:
# Dictionnary for unknown locations
unknown_locations_dict = {}
unknown_locations_dict["L'Ile Des Soeurs"] = "Ile des soeurs"
unknown_locations_dict["St-Joseph-De-Ham-Sud"] = "Ham-Sud"
unknown_locations_dict["St-Mathieu-De-Laprairie"] = "Saint-Mathieu"
unknown_locations_dict["St-Denis-sur-Mer"] = "Saint-Denis"
unknown_locations_dict["St-Isidore-De-Laprairie"] = "Saint-Isidore"
unknown_locations_dict["St-Stanislas-De-Champlain"] = "Saint-Denis"
unknown_locations_dict["St-Sebastien-De-Frontenac"] = "Saint-Sebastien"
unknown_locations_dict["St-Simon-De-Rimouski"] = "Saint-Simon"
unknown_locations_dict["Ste-Francoise-De-Lotbiniere"] = "Saint-Francoise"
unknown_locations_dict["St-Guillaume-D'Upton"] = "Saint-Guillaume"
unknown_locations_dict["St-Adelphe-De-Champlain"] = "Saint-Adelphe"

In [140]:
def fix_unknown_location(unknown_location: str):
    if unknown_location in unknown_locations_dict.keys():
        return unknown_locations_dict[unknown_location]
    elif "ND-" in unknown_location:
        return unknown_location.replace("ND-", "Notre-Dame-")
    elif "JC" in unknown_location:
        return unknown_location.replace("JC", "Jacques-Cartier")
    elif "St-" in unknown_location:
        return unknown_location.replace("St-", "Saint-")
    elif "Ste-" in unknown_location:
        return unknown_location.replace("Ste-", "Sainte-")
    elif "Sts-" in unknown_location:
        return unknown_location.replace("Sts-", "Saints-")
    else:
        return unknown_location

## Raw Locations

### Get Locations from Raw Listings

In [141]:
home_df = pd.read_csv('../data/raw/home_listings.csv')
condo_df = pd.read_csv('../data/raw/condo_listings.csv')
raw_df = pd.concat([home_df, condo_df], axis=0, ignore_index=True)

raw_locations = list(raw_df['location'].value_counts().to_dict().keys())
print('Raw Locations: ' + str(raw_locations[0]) + ', ' + str(raw_locations[1]) + ', ' + str(raw_locations[2]) + ' etc.')

Raw Locations: Gatineau, Trois-Rivières, Beauport etc.


### Get Raw Locations Coordinates

In [142]:
raw_location_coordinates, unknown_locations = find_coordinates(raw_locations)

 10%|▉         | 113/1167 [00:57<12:15,  1.43it/s]

Ste-Catherine-de-la-JC is an unknown location


 17%|█▋        | 195/1167 [01:38<08:33,  1.89it/s]

ND-De-L'Ile-Perrot is an unknown location


 26%|██▋       | 307/1167 [02:34<07:25,  1.93it/s]

St-Mathieu-De-Laprairie is an unknown location


 32%|███▏      | 369/1167 [03:05<06:51,  1.94it/s]

St-Isidore-De-Laprairie is an unknown location


 42%|████▏     | 489/1167 [04:06<05:47,  1.95it/s]

St-Magloire-De-Bellechasse is an unknown location


 42%|████▏     | 494/1167 [04:08<05:34,  2.01it/s]

Sts-Anges is an unknown location


 45%|████▍     | 522/1167 [04:23<05:42,  1.88it/s]

ND-De-La-Merci is an unknown location


 52%|█████▏    | 609/1167 [05:06<04:55,  1.89it/s]

St-Donat-De-Rimouski is an unknown location


 57%|█████▋    | 671/1167 [05:37<04:38,  1.78it/s]

ND-De-La-Salette is an unknown location


 58%|█████▊    | 673/1167 [05:38<04:18,  1.91it/s]

St-Stanislas-De-Champlain is an unknown location


 60%|█████▉    | 700/1167 [05:52<04:09,  1.87it/s]

St-Sebastien-De-Frontenac is an unknown location


 60%|██████    | 703/1167 [05:53<03:34,  2.16it/s]

ND-De-Montauban is an unknown location


 66%|██████▌   | 765/1167 [06:24<03:36,  1.85it/s]

St-Marcel-De-L'Islet is an unknown location


 70%|███████   | 817/1167 [06:51<04:07,  1.41it/s]

St-Joseph-De-Ham-Sud is an unknown location


 71%|███████   | 829/1167 [07:00<07:45,  1.38s/it]

ND-Du-Lac is an unknown location


 76%|███████▌  | 883/1167 [07:27<02:43,  1.73it/s]

ND-De-La-Paix is an unknown location


 77%|███████▋  | 893/1167 [07:32<02:20,  1.94it/s]

Ste-Francoise-De-Lotbiniere is an unknown location


 77%|███████▋  | 896/1167 [07:33<02:17,  1.97it/s]

St-Simon-De-Rimouski is an unknown location


 77%|███████▋  | 904/1167 [07:41<06:07,  1.40s/it]

ND-De-Ham is an unknown location


 86%|████████▌ | 1005/1167 [08:32<01:16,  2.13it/s]

St-Guillaume-D'Upton is an unknown location


 88%|████████▊ | 1030/1167 [08:44<01:11,  1.93it/s]

ND-De-Stanbridge is an unknown location


 90%|█████████ | 1051/1167 [08:55<01:00,  1.93it/s]

St-Adelphe-De-Champlain is an unknown location


 93%|█████████▎| 1089/1167 [09:14<00:41,  1.89it/s]

St-Severin-De-Beauce is an unknown location


 95%|█████████▌| 1113/1167 [09:26<00:25,  2.14it/s]

St-Denis-sur-Mer is an unknown location


 99%|█████████▉| 1158/1167 [09:48<00:04,  1.90it/s]

ND-De-Bonsecours is an unknown location


100%|██████████| 1167/1167 [09:53<00:00,  1.97it/s]


### Format Unknown Locations

In [143]:
fixed_locations = list(map(fix_unknown_location, unknown_locations))
unknown_locations_mapper = {fixed_locations[i]: unknown_locations[i] for i in range(len(unknown_locations))}

raw_location_coordinates2, unknown_locations2 = find_coordinates(fixed_locations)
if len(unknown_locations2) == 0:
    raw_location_coordinates2 = { unknown_locations_mapper.get(k, k): v for k, v in raw_location_coordinates2.items() }
    raw_location_coordinates.update(raw_location_coordinates2)

100%|██████████| 25/25 [00:12<00:00,  2.01it/s]


### Save Final Raw Locations Coordinates

In [145]:
raw_location_coordinates.to_csv('../data/processed/raw_location_coordinates.csv')
raw_location_coordinates.head()

Unnamed: 0,Name,Latitude,Longitude
0,Ste-Catherine-de-la-Jacques-Cartier,46.844381,-71.615023
1,Notre-Dame-De-L'Ile-Perrot,45.351663,-73.902969
2,Saint-Mathieu,45.312563,-73.518448
3,Saint-Isidore,46.585058,-71.090469
4,Saint-Magloire-De-Bellechasse,46.592524,-70.440777


## Regions

In [146]:
regions_df = pd.read_csv('../data/references/qc-administrative-regions.csv')
regions = dict(zip(regions_df['City'], regions_df['Region']))

display(regions_df.head())

Unnamed: 0,City,Region
0,Rimouski,Bas-Saint-Laurent
1,Rivière-du-Loup,Bas-Saint-Laurent
2,Matane,Bas-Saint-Laurent
3,Alma,Saguenay-Lac-Saint-Jean
4,Saguenay,Saguenay-Lac-Saint-Jean


### Get Region City Coordinates

In [148]:
region_city_coordinates, unknown_cities = find_coordinates(regions.keys())

region_city_coordinates.to_csv('../data/processed/region_city_coordinates.csv')
region_city_coordinates.head()

100%|██████████| 40/40 [00:19<00:00,  2.02it/s]


Unnamed: 0,Name,Latitude,Longitude
0,Rimouski,48.450155,-68.529968
1,Rivière-du-Loup,47.835816,-69.536802
2,Matane,48.846877,-67.52955
3,Alma,48.548887,-71.651459
4,Saguenay,48.405959,-71.069183


In [None]:
def find_closest_location(latitude, longitude, locations: pd.DataFrame):
    distances = []

    for _, location in locations.iterrows():
        distances.append(geodesic((latitude, longitude), (location['Latitude'], location['Longitude'])))

    return locations.iloc[np.argmin(distances)]['Name']

In [None]:
def build_location_mapper(locations: pd.DataFrame, reference_locations: pd.DataFrame):
    location_mapper = {}
    
    for _, location in locations.iterrows:
        closest_location = find_closest_location(location['Latitude'], location['Longitude'], reference_locations)
        location_mapper[location] = closest_location
        
    return location_mapper