In [12]:
import os
import re

import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

In [13]:
# Census metropolitan areas kept for the analysis. The strings are matched
# verbatim against the CMANAME column of the CMA boundary layer, so accents,
# spacing and the bilingual Ottawa label must stay exactly as written.
valid_CMA = [
    "St. John's", "Halifax", "Québec", "Montréal",
    "Ottawa - Gatineau (Ontario part / partie de l'Ontario)",
    "Toronto", "Kitchener - Cambridge - Waterloo", "Hamilton", "London",
    "Winnipeg", "Regina", "Saskatoon", "Edmonton", "Calgary",
    "Vancouver", "Victoria",
]

# Census subdivision names that are replaced by the single combined
# 'Island of Montreal' geography; matched verbatim against CSDNAME.
valid_mont = [
    "Montréal", "Montréal-Est", "Montréal-Ouest",
    "Westmount", "Côte-Saint-Luc", "Mont-Royal", "Hampstead",
    "Dorval", "Pointe-Claire", "Dollard-Des Ormeaux", "Kirkland",
    "Beaconsfield", "Baie-D'Urfé", "Sainte-Anne-de-Bellevue", "Senneville",
]

Filter for the CMAs that we are interested in, and replace the Toronto CMA with the Greater Toronto Area region

In [39]:
# Load the CMA boundary layer; it is filtered on its CMANAME column in the next cell.
gdf_cma = gpd.read_file('../data/boundaries/cen_cma/')

In [None]:
# Restrict the CMA layer to the study list and renumber the rows.
is_study_cma = gdf_cma['CMANAME'].isin(valid_CMA)
gdf_regions = gdf_cma.loc[is_study_cma].reset_index(drop=True)

# The Greater Toronto Area boundary stands in for the Toronto CMA:
# reproject it to the CMA layer's CRS, label it, append it, then drop Toronto.
gdf_gta = gpd.read_file('../data/boundaries/greater-toronto-area.gpkg').to_crs(gdf_regions.crs)
gdf_gta['CMANAME'] = 'Greater Toronto Area'
gdf_regions = pd.concat([gdf_regions, gdf_gta], ignore_index=True)
gdf_regions = gdf_regions.loc[gdf_regions['CMANAME'].ne('Toronto')]

# Generic column name (rows are no longer all CMAs) and only the attributes needed downstream.
gdf_regions = (
    gdf_regions
    .rename(columns={'CMANAME': 'REGION_NAME'})
    .drop(columns=['DGUID', 'DGUIDP', 'CMATYPE', 'LANDAREA', 'PRUID'])
)
gdf_regions.to_file('../data/boundaries/valid_regions.gpkg', driver='GPKG')

Filter for the CSDs that fall within a CMA/region (selected with a spatial `intersects` join against the region boundaries), then replace all the CSDs on the Island of Montreal with a single combined geography.

In [115]:
# Load the census subdivision (CSD) boundary layer.
gdf_csd = gpd.read_file('../data/boundaries/cen_csd/')
# Reload the regions layer from the GeoPackage written by the region-filtering cell,
# so this section can run without that cell's in-memory result.
gdf_regions = gpd.read_file('../data/boundaries/valid_regions.gpkg')

In [116]:
# Keep every CSD whose geometry intersects at least one study region.
# NOTE(review): 'intersects' also matches CSDs that only touch a region's
# boundary from the outside — confirm this is the intended selection.
csd_region_matches = gpd.sjoin(gdf_csd, gdf_regions, how='inner', predicate='intersects')

# sjoin returns one row per (CSD, region) pair and keeps the left frame's index,
# so a CSD intersecting two regions shows up twice. Keep the first match only,
# otherwise the same CSD is written to the output file more than once.
csd_region_matches = csd_region_matches.loc[~csd_region_matches.index.duplicated(keep='first')]

# Drop the region attributes added by the join; only the CSD identity and shape are needed.
gdf_csd = csd_region_matches[csd_region_matches.columns.intersection([
    'CSDUID', 'CSDNAME', 'geometry'  # Keep only these columns
])]
gdf_csd.to_file('../data/boundaries/valid_csds.gpkg', driver='GPKG')

In [117]:
# Replace the individual CSDs on the Island of Montreal with one combined geography.
ISLAND_NAME = 'Island of Montreal'

# 1. Spatially identify the CSDs that lie inside the island boundary.
montreal_boundary = gpd.read_file('../data/boundaries/ile-de-montreal.gpkg').to_crs(gdf_csd.crs)
csd_on_island = gpd.sjoin(
    gdf_csd,
    # Join against the geometry column only, so attribute columns on the boundary
    # layer cannot collide with (and suffix-rename) CSDUID / CSDNAME.
    montreal_boundary[[montreal_boundary.geometry.name]],
    how='inner',
    predicate='within'  # Or 'intersects' if some border cases exist
)
montreal_csds = csd_on_island['CSDNAME'].unique().tolist()     # names, kept for inspection
montreal_csd_uids = csd_on_island['CSDUID'].unique().tolist()  # IDs, used by the filter below

# 2. Remove a CSD only when BOTH its name is on the Montreal list AND its ID was
#    confirmed spatially. The ID test must compare CSDUID with CSDUIDs: comparing
#    it with the name list never matches, which leaves every Montreal CSD in place.
gdf_csd = gpd.read_file('../data/boundaries/valid_csds.gpkg')
to_remove = gdf_csd['CSDNAME'].isin(valid_mont) & gdf_csd['CSDUID'].isin(montreal_csd_uids)
# This cell overwrites the file it reads, so a re-run would find the combined row
# already present; drop it here so it is not appended a second time.
to_remove |= gdf_csd['CSDNAME'].eq(ISLAND_NAME)
gdf_csd = gdf_csd[~to_remove]

# 3. Append the combined island geometry as a single row.
# NOTE(review): only the first feature of the boundary layer is used — confirm
# the file holds exactly one (multi)polygon.
montreal_geom = montreal_boundary.geometry.iloc[0]
new_row = gpd.GeoDataFrame({
    'CSDNAME': [ISLAND_NAME],
    'geometry': [montreal_geom]
}, crs=gdf_csd.crs)

# Give the new row every remaining column of the CSD table (e.g. CSDUID) as an
# explicit null so both frames share the same schema before concatenation.
for col in gdf_csd.columns:
    if col not in new_row:
        new_row[col] = None

final_gdf = pd.concat([gdf_csd, new_row], ignore_index=True)
final_gdf.to_file('../data/boundaries/valid_csds.gpkg', driver='GPKG')