In [10]:
import os

import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

from shapely.geometry import MultiPolygon, Polygon
from shapely.validation import make_valid

In [2]:
# Census years: one every five years, 1951 through 2021 inclusive.
CEN_YEARS = list(range(1951, 2022, 5))

# Years of the FED and ONT-ED boundary files processed below.
FED_YEARS = [
    1952, 1966, 1976, 1987,
    1996, 1999, 2003, 2013,
]
ONTED_YEARS = [
    1962, 1966, 1975, 1987,
    1996, 2005, 2015,
]

In [3]:
# GTA boundary: the geometry of the first feature in the GTA GeoPackage.
mpoly_GTA = gpd.read_file(
    '../data/geo/regions/GTA_2013_OrthoTile_Index/gta.gpkg'
).geometry.iloc[0]

Identify the FEDs and ONT-EDs from each redistricting period that have at least 50% of their area within the GTA.

In [36]:
# For each FED year, keep the districts with at least half of their area
# inside the GTA polygon and save them as a GeoPackage.
for year in tqdm(FED_YEARS):
    gdf_fed_all = gpd.read_file(f'../data/geo/{year}_fed/fed_{year}/fed_{year}.shp')

    # Share of each district's area inside the GTA, computed on the whole
    # GeoSeries at once instead of row by row. A zero-area district yields NaN
    # here (and is dropped by the filter) rather than raising ZeroDivisionError.
    # NOTE(review): mpoly_GTA is a bare geometry with no CRS attached; this
    # assumes it shares the shapefile's CRS — confirm.
    gdf_fed_all['gta_overlap'] = (
        gdf_fed_all.geometry.intersection(mpoly_GTA).area / gdf_fed_all.geometry.area
    )

    gdf_fed_gta = gdf_fed_all[gdf_fed_all.gta_overlap >= 0.5]
    gdf_fed_gta.to_file(f'../data/geo/{year}_fed/fed_gta_{year}.gpkg', driver="GPKG")

100%|█████████████████████████████████████████████| 8/8 [00:12<00:00,  1.54s/it]


In [35]:
def extract_valid_geometry(geom):
    """
    Repair a geometry and, if the repair yields a collection, keep only its
    polygonal members.

    Args:
        geom: A shapely geometry.

    Returns:
        - The output of ``make_valid(geom)`` unchanged when it is not a
          GeometryCollection.
        - Otherwise, the single Polygon/MultiPolygon member if there is exactly
          one, a MultiPolygon built from all polygonal members if there are
          several, or None if the collection has no polygonal member.
    """
    geom = make_valid(geom)  # Ensure it's valid
    if geom.geom_type != "GeometryCollection":
        return geom  # Not a collection: return the repaired geometry as is

    # Keep only the polygonal members (drops lines/points produced by the repair).
    parts = [g for g in geom.geoms if isinstance(g, (Polygon, MultiPolygon))]
    if not parts:
        return None
    if len(parts) == 1:
        return parts[0]

    # MultiPolygon() does not accept MultiPolygon members (it raises rather
    # than flattening), so expand them into their constituent Polygons first.
    polygons = []
    for part in parts:
        if isinstance(part, MultiPolygon):
            polygons.extend(part.geoms)
        else:
            polygons.append(part)
    return MultiPolygon(polygons)

In [44]:
# For each ONT-ED year: standardize the column names, repair the geometries,
# keep the districts with at least half of their area inside the GTA polygon,
# and save them as a GeoPackage.
for year in tqdm(ONTED_YEARS):
    gdf_onted_all = gpd.read_file(f'../data/geo/{year}_ont-ed/ont-ed_{year}.geojson')

    # Source column -> standardized column; the file schema differs by year.
    # Columns mapped to themselves are kept without renaming.
    if year <= 1987:
        column_map = {'id': 'onted_id', 'RIDINGNAME': 'geoname'}
    elif year == 1996:
        column_map = {
            'FED_NUM': 'fed_id',
            'FED_NAME': 'FED_NAME',
            'RIDINGNO': 'onted_id',
            'RIDINGNAME': 'geoname',
        }
    elif year == 2005:
        column_map = {'RIDINGNO': 'onted_id', 'RIDINGNAME': 'geoname'}
    elif year == 2015:
        column_map = {'ED_ID': 'onted_id', 'RIDINGNAME': 'geoname'}
    else:
        # Fail loudly instead of silently writing a file with unstandardized columns.
        raise ValueError(f'No ONT-ED column mapping defined for year {year}')

    gdf_onted_all = gdf_onted_all[[*column_map, 'geometry']].rename(columns=column_map)

    # extract_valid_geometry already runs make_valid, so one pass is enough.
    gdf_onted_all['geometry'] = gdf_onted_all['geometry'].apply(extract_valid_geometry)

    # Share of each district's area inside the GTA, computed on the whole
    # GeoSeries at once. Rows whose geometry is missing (extract_valid_geometry
    # returned None) or has zero area get NaN and are dropped by the filter.
    gdf_onted_all['gta_overlap'] = (
        gdf_onted_all.geometry.intersection(mpoly_GTA).area / gdf_onted_all.geometry.area
    )
    gdf_onted_gta = gdf_onted_all[gdf_onted_all.gta_overlap >= 0.5]

    gdf_onted_gta.to_file(f'../data/geo/{year}_ont-ed/ont-ed_gta_{year}.gpkg', driver='GPKG')

100%|█████████████████████████████████████████████| 7/7 [00:01<00:00,  5.53it/s]


Associate CTs with the relevant FEDs/ONT-EDs, and save only the relevant columns and the CTs that fall within the GTA.

In [8]:
# Per-census-year lookup table. The outer keys are the same years as CEN_YEARS;
# each inner dict maps a standardized variable name to a list of source codes
# for that year. An empty list means no source code is listed for that
# variable in that year.
# NOTE(review): the codes are presumably column names in the census tract
# tables, and multi-code lists are presumably meant to be summed — confirm
# against the code that consumes this dict.
# NOTE(review): the number of codes under 'num_imm_new' varies by year (five
# for 1961, one for 2001, two elsewhere), so the "new immigrant" window may
# not be comparable across years — confirm this is intended.
YEAR_CODES = {
    1951: {
        'num_pop_tot': ['pop__tot1951ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1956: {
        'num_pop_tot': ['pop__tot1956ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1961: {
        'num_pop_tot': ['pop__tot1961ttd'],
        'num_imm_tot': ['imb__tot1961ttd'],
        'num_imm_new': ['impi19611961tt1', 'impi19601961tt1', 'impi195819591961tt1', 'impi195619571961tt1', 'impi195119551961tt1'],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1966: {
        'num_pop_tot': ['pop__tot1966ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1971: {
        'num_pop_tot': ['pop__tot1971ttd'],
        'num_imm_tot': ['imb__tot1971ttd'],
        'num_imm_new': [],
        'avg_hou_inc': ['ihat_avg1971ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1971tt1', 'lnh_1resoffifr__1971tt1'],
    }, 
    1976: {
        'num_pop_tot': ['pop__tot1976ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1981: {
        'num_pop_tot': ['pop__tot1981ttd'],
        'num_imm_tot': ['imag_tot1981ttd'],
        'num_imm_new': ['impi197819811981tt1', 'impi197019771981tt1'],
        'avg_hou_inc': ['ihat_avg1981ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1981tt1', 'lnh_1resoffifr__1981tt1'],
    }, 
    1986: {
        'num_pop_tot': ['pop__tot1986ttd'],
        'num_imm_tot': ['imb__tot1986ttd'],
        'num_imm_new': ['impi198319861986tt1', 'impi197819821986tt1'],
        'avg_hou_inc': ['ihat_avg1986ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1986tt1', 'lnh_1resoffifr__1986tt1'],
    }, 
    1991: {
        'num_pop_tot': ['pop__tot1991ttd'],
        'num_imm_tot': ['imd__tot1991ttd'],
        'num_imm_new': ['impi198819911991tt1', 'impi198119871991tt1'],
        'avg_hou_inc': ['ihat_avg1991ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1991tt1', 'lnh_1resoffifr__1991tt1'],
    }, 
    1996: {
        'num_pop_tot': ['pop__tot1996ttd'],
        'num_imm_tot': ['imb__tot1996ttd'],
        'num_imm_new': ['impi199119961996tt1', 'impi198119901996tt1'],
        'avg_hou_inc': ['ihat_avg1996ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1996tt1', 'lnh_1resoffifr__1996tt1', 'lnh_mresenfr1996tt1'],
    }, 
    2001: {
        'num_pop_tot': ['pop__tot2001ttd'],
        'num_imm_tot': ['imb__tot2001ttd'],
        'num_imm_new': ['impi199620012001tt1'],
        'avg_hou_inc': ['ihat_avg2001ttn'],
        'num_not_vm_tot': ['vminnvis2001tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2001tt1', 'lnh_1resoffifr__2001tt1', 'lnh_mresenfr2001tt1'],
    }, 
    2006: {
        'num_pop_tot': ['pop__tot2006ttd'],
        'num_imm_tot': ['imb__tot2006ttd'],
        'num_imm_new': ['impi200120062006tt1', 'impi199620002006tt1'],
        'avg_hou_inc': ['ihat_avg2006ttn'],
        'num_not_vm_tot': ['vminnvis2006tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2006tt1', 'lnh_1resoffifr__2006tt1', 'lnh_mresenfr2006tt1'],
    }, 
    2011: {
        'num_pop_tot': ['pop__tot2011ttd'],
        'num_imm_tot': ['imb__tot2011ttd'],
        'num_imm_new': ['impi200620112011tt1', 'impi200120052011tt1'],
        'avg_hou_inc': ['ihat_avg2011ttn'],
        'num_not_vm_tot': ['vminnvis2011tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2011tt1', 'lnh_1resoffifr__2011tt1', 'lnh_mresenfr2011tt1'],
    }, 
    2016: {
        'num_pop_tot': ['pop__tot2016ttd'],
        'num_imm_tot': ['imb__tot2016ttd'],
        'num_imm_new': ['impi201120162016tt1', 'impi200620102016tt1'],
        'avg_hou_inc': ['ihat_avg2016ttn'],
        'num_not_vm_tot': ['vminnvis2016tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2016tt1', 'lnh_1resoffifr__2016tt1', 'lnh_mresenfr2016tt1'],
    }, 
    2021: {
        'num_pop_tot': ['pop__tot2021ttd'],
        'num_imm_tot': ['imb__tot2021ttd'],
        'num_imm_new': ['impi201620212021tt1', 'impi201120152021tt1'],
        'avg_hou_inc': ['ihat_avg2021ttn'],
        'num_not_vm_tot': ['vminnvis2021tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2021tt1', 'lnh_1resoffifr__2021tt1', 'lnh_mresenfr2021tt1'],
    },
}

In [52]:
# For each census year, save the CTs that overlap the GTA polygon at all.
for year in tqdm(CEN_YEARS):
    gdf_geo_all = gpd.read_file(f"../data/geo/{year}_ct/ct_{year}.zip")

    # Share of each tract's area inside the GTA, computed on the whole GeoSeries
    # at once instead of row by row. A zero-area tract yields NaN here (and is
    # dropped by the filter) rather than raising ZeroDivisionError.
    # NOTE(review): mpoly_GTA is a bare geometry with no CRS attached; this
    # assumes it shares the tract file's CRS — confirm.
    gdf_geo_all['gta_overlap'] = (
        gdf_geo_all.geometry.intersection(mpoly_GTA).area / gdf_geo_all.geometry.area
    )

    gdf_geo_gta = gdf_geo_all[gdf_geo_all.gta_overlap > 0]
    gdf_geo_gta.to_file(f"../data/geo/{year}_ct/ct_gta_{year}.gpkg", driver='GPKG')

100%|███████████████████████████████████████████| 15/15 [00:11<00:00,  1.31it/s]


In [39]:
def get_census_year(year):
    """
    Map a year to the census year used for it.

    Rules:
    - year < 1961: always 1961.
    - 1961 <= year <= 1980: step back to the 10-year cycle anchored at 1961
      (i.e. 1961 or 1971).
    - year >= 1981: step back to the 5-year cycle (1981, 1986, 1991, ...).

    Examples:
    1955 -> 1961
    1965 -> 1961
    1975 -> 1971
    1980 -> 1971
    1981 -> 1981
    1987 -> 1986
    2003 -> 2001
    """
    if year < 1961:
        return 1961

    # Pick the cycle length and an anchor year that lies on that cycle.
    cycle, anchor = (10, 1961) if year <= 1980 else (5, 1951)
    years_past_census = (year - anchor) % cycle
    return year - years_past_census

def compute_ct_overlap(gdf_ed_gta, gdf_ct_gta):
    """
    Compute the proportion of each electoral district that is covered by census tracts,
    excluding any overlap with Lake Ontario.

    The inputs are not modified: the result is built on a copy of ``gdf_ed_gta``.

    Args:
        gdf_ed_gta (GeoDataFrame): Electoral districts with 'geometry' column
        gdf_ct_gta (GeoDataFrame): Census tracts with 'geometry' column

    Returns:
        GeoDataFrame: Copy of the electoral districts with a new 'ct_overlap'
                     column: the summed share of each district's land area
                     (district minus lake) intersected by census tracts,
                     capped at 1.0. Districts with zero land area keep 0.0.
    """
    # Work on a copy so the caller's frame does not pick up helper columns.
    gdf_ed_gta = gdf_ed_gta.copy()

    # Load Lake Ontario geometry and bring it into the districts' CRS
    lake_gdf = gpd.read_file('../data/geo/regions/can_lake_ontario.gpkg')
    if lake_gdf.crs != gdf_ed_gta.crs:
        lake_gdf = lake_gdf.to_crs(gdf_ed_gta.crs)
    # Only the first feature of the lake file is used; validate it once here
    # rather than once per district.
    lake_ontario = make_valid(lake_gdf.geometry.iloc[0])

    # Areas are only comparable when tracts and districts share a CRS.
    if (
        gdf_ct_gta.crs is not None
        and gdf_ed_gta.crs is not None
        and gdf_ct_gta.crs != gdf_ed_gta.crs
    ):
        gdf_ct_gta = gdf_ct_gta.to_crs(gdf_ed_gta.crs)

    def _land_part(geom):
        """Return the repaired district geometry with the lake removed."""
        valid = make_valid(geom)
        if valid.intersects(lake_ontario):
            return valid.difference(lake_ontario)
        return valid

    # Initialize ct_overlap column and the helper land-only geometry
    gdf_ed_gta['ct_overlap'] = 0.0
    gdf_ed_gta['land_geometry'] = gdf_ed_gta['geometry'].apply(_land_part)

    # Find candidate (tract, district) pairs via the spatial index
    pairs = gpd.sjoin(gdf_ct_gta, gdf_ed_gta, how="inner", predicate="intersects")

    # Accumulate, per district, the share of its land area covered by each tract
    for ct_idx, ed_idx in zip(pairs.index, pairs['index_right']):
        ed_geom = gdf_ed_gta.loc[ed_idx, 'land_geometry']
        ed_area = ed_geom.area
        if ed_area == 0:
            # District lies entirely in the lake (or is degenerate): no land to cover.
            continue

        ct_geom = gdf_ct_gta.loc[ct_idx, 'geometry']
        intersection_prop = ct_geom.intersection(ed_geom).area / ed_area

        if intersection_prop > 0:
            gdf_ed_gta.loc[ed_idx, 'ct_overlap'] += intersection_prop

    # Overlapping tracts or rounding can push the sum past 1; cap it.
    gdf_ed_gta['ct_overlap'] = gdf_ed_gta['ct_overlap'].clip(upper=1.0)
    return gdf_ed_gta.drop(columns=['land_geometry'])

In [40]:
def _save_ed_with_ct_overlap(ed_path_stem, year):
    """
    Add the 'ct_overlap' column to one GTA electoral-district file and save it.

    Reads ``{ed_path_stem}.gpkg``, computes the census-tract coverage against
    the GTA tract file of the census year returned by ``get_census_year(year)``,
    and writes the result to ``{ed_path_stem}_ct.gpkg``.

    Args:
        ed_path_stem (str): Path of the district GeoPackage without '.gpkg'.
        year (int): Redistricting year of the district file.
    """
    gdf_ed_gta = gpd.read_file(f'{ed_path_stem}.gpkg')
    cen_year = get_census_year(year)
    gdf_ct_gta = gpd.read_file(f"../data/geo/{cen_year}_ct/ct_gta_{cen_year}.gpkg")

    gdf_ed_gta = compute_ct_overlap(gdf_ed_gta, gdf_ct_gta)

    gdf_ed_gta.to_file(f'{ed_path_stem}_ct.gpkg', driver='GPKG')


for year in tqdm(ONTED_YEARS):
    _save_ed_with_ct_overlap(f'../data/geo/{year}_ont-ed/ont-ed_gta_{year}', year)

for year in tqdm(FED_YEARS):
    _save_ed_with_ct_overlap(f'../data/geo/{year}_fed/fed_gta_{year}', year)

100%|██████████| 7/7 [00:04<00:00,  1.45it/s]
100%|██████████| 8/8 [00:05<00:00,  1.36it/s]


In [None]:
# TODO: For each year and each riding with >75% CT coverage, compute each CT's overlap with the riding and the overlap-proportionate census statistics.

# TODO: Save the final values for each riding as a separate file.