In [2]:
import os

import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

from shapely.geometry import MultiPolygon, Polygon
from shapely.validation import make_valid

In [3]:
CEN_YEARS = [
    1951, 1956, 1961, 1966, 1971, 1976, 1981, 1986, 1991, 1996, 
    2001, 2006, 2011, 2016, 2021
]

FED_YEARS = [1952, 1966, 1976, 1987, 1996, 1999, 2003, 2013]
ONTED_YEARS = [1962, 1966, 1975, 1987, 1996, 2005, 2015]

FELXN_YEARS = [
    1962, 1963, 1965, 1968, 1972, 1974, 1979, 1980, 1984, 1988, 
    1993, 1997, 2000, 2004, 2006, 2008, 2011, 2015, 2019, 2021
]
ONTELXN_YEARS = [
    1963, 1967, 1971, 1975, 1977, 1981, 1985, 1987, 1990, 1995, 
    1999, 2003, 2007, 2011, 2014, 2018, 2022
]

In [4]:
YEAR_CODES = {
    1951: {
        'num_pop_tot': ['pop__tot1951ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1956: {
        'num_pop_tot': ['pop__tot1956ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1961: {
        'num_pop_tot': ['pop__tot1961ttd'],
        'num_imm_tot': ['imb__tot1961ttd'],
        'num_imm_new': ['impi19611961tt1', 'impi19601961tt1', 'impi195819591961tt1', 'impi195619571961tt1', 'impi195119551961tt1'],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1966: {
        'num_pop_tot': ['pop__tot1966ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1971: {
        'num_pop_tot': ['pop__tot1971ttd'],
        'num_imm_tot': ['imb__tot1971ttd'],
        'num_imm_new': [],
        'avg_hou_inc': ['ihat_avg1971ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1971tt1', 'lnh_1resoffifr__1971tt1'],
    }, 
    1976: {
        'num_pop_tot': ['pop__tot1976ttd'],
        'num_imm_tot': [],
        'num_imm_new': [],
        'avg_hou_inc': [],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': [],
    }, 
    1981: {
        'num_pop_tot': ['pop__tot1981ttd'],
        'num_imm_tot': ['imag_tot1981ttd'],
        'num_imm_new': ['impi197819811981tt1', 'impi197019771981tt1'],
        'avg_hou_inc': ['ihat_avg1981ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1981tt1', 'lnh_1resoffifr__1981tt1'],
    }, 
    1986: {
        'num_pop_tot': ['pop__tot1986ttd'],
        'num_imm_tot': ['imb__tot1986ttd'],
        'num_imm_new': ['impi198319861986tt1', 'impi197819821986tt1'],
        'avg_hou_inc': ['ihat_avg1986ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1986tt1', 'lnh_1resoffifr__1986tt1'],
    }, 
    1991: {
        'num_pop_tot': ['pop__tot1991ttd'],
        'num_imm_tot': ['imd__tot1991ttd'],
        'num_imm_new': ['impi198819911991tt1', 'impi198119871991tt1'],
        'avg_hou_inc': ['ihat_avg1991ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1991tt1', 'lnh_1resoffifr__1991tt1'],
    }, 
    1996: {
        'num_pop_tot': ['pop__tot1996ttd'],
        'num_imm_tot': ['imb__tot1996ttd'],
        'num_imm_new': ['impi199119961996tt1', 'impi198119901996tt1'],
        'avg_hou_inc': ['ihat_avg1996ttn'],
        'num_not_vm_tot': [],
        'num_enfr_home_tot': ['lnh_1resoffien__1996tt1', 'lnh_1resoffifr__1996tt1', 'lnh_mresenfr1996tt1'],
    }, 
    2001: {
        'num_pop_tot': ['pop__tot2001ttd'],
        'num_imm_tot': ['imb__tot2001ttd'],
        'num_imm_new': ['impi199620012001tt1'],
        'avg_hou_inc': ['ihat_avg2001ttn'],
        'num_not_vm_tot': ['vminnvis2001tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2001tt1', 'lnh_1resoffifr__2001tt1', 'lnh_mresenfr2001tt1'],
    }, 
    2006: {
        'num_pop_tot': ['pop__tot2006ttd'],
        'num_imm_tot': ['imb__tot2006ttd'],
        'num_imm_new': ['impi200120062006tt1', 'impi199620002006tt1'],
        'avg_hou_inc': ['ihat_avg2006ttn'],
        'num_not_vm_tot': ['vminnvis2006tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2006tt1', 'lnh_1resoffifr__2006tt1', 'lnh_mresenfr2006tt1'],
    }, 
    2011: {
        'num_pop_tot': ['pop__tot2011ttd'],
        'num_imm_tot': ['imb__tot2011ttd'],
        'num_imm_new': ['impi200620112011tt1', 'impi200120052011tt1'],
        'avg_hou_inc': ['ihat_avg2011ttn'],
        'num_not_vm_tot': ['vminnvis2011tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2011tt1', 'lnh_1resoffifr__2011tt1', 'lnh_mresenfr2011tt1'],
    }, 
    2016: {
        'num_pop_tot': ['pop__tot2016ttd'],
        'num_imm_tot': ['imb__tot2016ttd'],
        'num_imm_new': ['impi201120162016tt1', 'impi200620102016tt1'],
        'avg_hou_inc': ['ihat_avg2016ttn'],
        'num_not_vm_tot': ['vminnvis2016tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2016tt1', 'lnh_1resoffifr__2016tt1', 'lnh_mresenfr2016tt1'],
    }, 
    2021: {
        'num_pop_tot': ['pop__tot2021ttd'],
        'num_imm_tot': ['imb__tot2021ttd'],
        'num_imm_new': ['impi201620212021tt1', 'impi201120152021tt1'],
        'avg_hou_inc': ['ihat_avg2021ttn'],
        'num_not_vm_tot': ['vminnvis2021tt1'],
        'num_enfr_home_tot': ['lnh_1resoffien__2021tt1', 'lnh_1resoffifr__2021tt1', 'lnh_mresenfr2021tt1'],
    },
}

In [9]:
def get_census_year(year):
    """
    Returns the appropriate census year based on different rules for different periods:
    - Before 1961: return 1961
    - 1961-1980: round down to decade + 1 (1961, 1971)
    - After 1981: round down to nearest 5 + 1 (1981, 1986, 1991, etc.)
    
    Examples:
    1955 -> 1961
    1965 -> 1961
    1975 -> 1971
    1980 -> 1971
    1987 -> 1986
    2003 -> 2001
    """
    if year < 1961:
        return 1961
    elif year <= 1980:
        return year - ((year - 1961) % 10)
    else:
        return year - ((year - 1951) % 5)

def get_ed_year(year, is_ontario=True):
    """
    Returns the most recent electoral district year prior to the input year.
    
    Args:
        year (int): The year to look up
        is_ontario (bool): If True, use Ontario electoral districts years,
                       if False, use Federal electoral district years
    
    Returns:
        int: The most recent electoral district year

    Examples:
        get_ed_year(1962, is_ontario=True) -> 1962
        get_ed_year(1962, is_ontario=False) -> 1952
        get_ed_year(2003, is_ontario=True) -> 1996
        get_ed_year(2003, is_ontario=False) -> 2003
    """
    years = ONTED_YEARS if is_ontario else FED_YEARS
    valid_years = [y for y in years if y <= year]
    if not valid_years:
        return years[0]  # Return earliest year if input year is before all valid years
    return max(valid_years)

Create approximations for each of the census variables under consideration, for each election year. We want this at the level of electoral districts and approximated from census tracts.

In [72]:
def add_census_values_to_gdf(gdf_full, gdf_small, cen_year, cen_var):
    cols = YEAR_CODES[cen_year][cen_var]

    if len(cols) == 0:  
        gdf_small[cen_var] = np.nan
    elif len(cols) > 1:
        gdf_small.loc[:, cen_var] = gdf_full.loc[:, cols].sum(axis=1)
    elif cen_var == 'num_not_vm_tot':
        gdf_small['num_vm_tot'] = gdf_full[YEAR_CODES[cen_year]['num_pop_tot']] - gdf_full[cols]
    else:
        gdf_small.loc[:, cen_var] = gdf_full.loc[:, cols]

In [9]:
def compute_ed_stats(gdf_ed_gta, gdf_ct_gta, df_ct_cen, cen_year):
    gdf_ct_gta['geosid'] = gdf_ct_gta['geosid'].astype(str)
    df_ct_cen['geosid'] = df_ct_cen['geosid'].astype(str)
    df_ct_cen['geosid'] = df_ct_cen['geosid'].apply(lambda x: x[:-2] + '.00' if x.endswith('.0') else x)

    gdf_full = pd.merge(
        gdf_ct_gta,
        df_ct_cen,
        on='geosid',
        how='left'
    )

    gdf_small = gdf_full[['geosid', 'geometry']].copy()
    add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_pop_tot')
    add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_imm_tot')
    add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_imm_new')
    add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'avg_hou_inc')
    add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_not_vm_tot')
    add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_enfr_home_tot')

    # Find intersections between census tracts and electoral districts
    pairs = gpd.sjoin(gdf_small, gdf_ed_gta, how="inner", predicate="intersects")

    # Calculate area weights for overlapping geometries
    pairs['overlap_area'] = pairs.apply(
        lambda row: row['geometry'].intersection(
            gdf_ed_gta.loc[row['index_right'], 'geometry']
        ).area / row['geometry'].area,
        axis=1
    )

    # List of columns to aggregate (excluding geosid, geometry)
    value_columns = [col for col in gdf_small.columns 
                    if col not in ['geosid', 'geometry']]

    # Initialize result dataframe with electoral district geometries
    result = gdf_ed_gta.copy()

    # Calculate weighted sums for each value column
    for col in value_columns:
        weighted_values = pairs[col] * pairs['overlap_area']
        result[col] = weighted_values.groupby(pairs['index_right']).sum()

    return result

In [13]:
for year in tqdm(ONTELXN_YEARS):
    ed_year = get_ed_year(year, is_ontario=True)
    cen_year = get_census_year(year)

    gdf_onted_gta = gpd.read_file(f'../data/geo/{ed_year}_ont-ed/ont-ed_gta_{ed_year}.gpkg')
    gdf_ct_gta = gpd.read_file(f"../data/geo/{cen_year}_ct/ct_gta_{cen_year}.gpkg")
    df_ct_cen = pd.read_csv(f"../data/census/{cen_year}_ct_wide/census_wide_{cen_year}_ct.csv")

    gdf_onted_stats = compute_ed_stats(gdf_onted_gta, gdf_ct_gta, df_ct_cen, cen_year)
    gdf_onted_stats = gdf_onted_stats[gdf_onted_stats.ct_overlap >= 0.75]

    gdf_onted_stats.to_file(f'../data/elections/{year}_ont-elxn/ont-ed_stats_{year}.gpkg')
    break

  0%|          | 0/17 [00:00<?, ?it/s]


In [14]:
gdf_onted_stats

Unnamed: 0,onted_id,geoname,gta_overlap,ct_overlap,geometry,num_pop_tot,num_imm_tot,num_imm_new,avg_hou_inc,num_not_vm_tot,num_enfr_home_tot
2,47,Oshawa,0.938015,0.942732,"POLYGON ((-79.00852 43.92252, -79.00590 43.916...",87646.071295,20168.760068,10089.54682,0.0,0.0,0.0
5,80,Armourdale,1.0,1.0,"MULTIPOLYGON (((-79.42007 43.79799, -79.40971 ...",48250.761317,14375.229932,5495.182498,0.0,0.0,0.0
6,81,Beaches,0.988158,0.973151,"MULTIPOLYGON (((-79.30174 43.65935, -79.30451 ...",44725.625741,13891.616143,6383.908222,0.0,0.0,0.0
7,82,Bellwoods,1.0,0.996049,"MULTIPOLYGON (((-79.40612 43.62767, -79.41194 ...",46666.793579,27030.49641,18997.879358,0.0,0.0,0.0
8,83,Bracondale,1.0,0.979014,"MULTIPOLYGON (((-79.42590 43.68204, -79.42435 ...",46276.90774,25291.339,16956.172822,0.0,0.0,0.0
9,84,Don Mills,1.0,1.0,"MULTIPOLYGON (((-79.35612 43.73468, -79.32654 ...",48429.646636,14196.137467,6782.225136,0.0,0.0,0.0
10,85,Dovercourt,1.0,0.992673,"MULTIPOLYGON (((-79.41530 43.62624, -79.42487 ...",63060.833265,31026.527007,21292.635907,0.0,0.0,0.0
11,86,Downsview,1.0,1.0,"MULTIPOLYGON (((-79.44580 43.79241, -79.44514 ...",79902.937461,27763.82011,14362.190419,0.0,0.0,0.0
12,87,Eglinton,1.0,1.0,"MULTIPOLYGON (((-79.37568 43.70692, -79.37174 ...",75037.341218,24191.608548,10029.437537,0.0,0.0,0.0
13,88,Etobicoke,1.0,1.0,"MULTIPOLYGON (((-79.51026 43.68419, -79.51410 ...",48084.283075,11052.966195,6132.479916,0.0,0.0,0.0


In [77]:
cen_year = 1961
gdf_ct_gta = gpd.read_file(f"../data/geo/{cen_year}_ct/ct_gta_{cen_year}.gpkg")
# gdf_ct_gta

df_ct_cen = pd.read_csv(f"../data/census/{cen_year}_ct_wide/census_wide_{cen_year}_ct.csv")
# df_ct_cen

ed_year = 1962
gdf_ed_gta = gpd.read_file(f'../data/geo/{ed_year}_ont-ed/ont-ed_gta_{ed_year}.gpkg')

# Check data types
# print("gdf_ct_gta geosid type:", gdf_ct_gta['geosid'].dtype)
# print("df_ct_cen geosid type:", df_ct_cen['geosid'].dtype)

# Convert geosid to string in both dataframes
gdf_ct_gta['geosid'] = gdf_ct_gta['geosid'].astype(str)
df_ct_cen['geosid'] = df_ct_cen['geosid'].astype(str)
df_ct_cen['geosid'] = df_ct_cen['geosid'].apply(lambda x: x[:-2] + '.00' if x.endswith('.0') else x)

# Merge dataframes
gdf_full = pd.merge(
    gdf_ct_gta,
    df_ct_cen,
    on='geosid',
    how='left'
)
gdf_small = gdf_full[['geosid', 'geometry']].copy()
add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_pop_tot')
add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_imm_tot')
add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_imm_new')
add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'avg_hou_inc')
add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_not_vm_tot')
add_census_values_to_gdf(gdf_full, gdf_small, cen_year, 'num_enfr_home_tot')

# Check for any missing values after merge
# print("\nMissing values after merge:")
# print(gdf_new.isnull().sum())

pairs = gpd.sjoin(gdf_small, gdf_ed_gta, how="inner", predicate="intersects")
pairs

Unnamed: 0,geosid,geoname_left,geometry,num_pop_tot,num_imm_tot,num_imm_new,avg_hou_inc,num_not_vm_tot,num_enfr_home_tot,index_right,onted_id,geoname_right,gta_overlap,ct_overlap
0,5370006.00,0006.00,"MULTIPOLYGON (((-79.87703 43.27047, -79.87077 ...",6264.0,2734.0,1635.0,,,,0,20,Halton,0.995934,0.518402
1,5370014.00,0014.00,"MULTIPOLYGON (((-79.85549 43.27675, -79.85614 ...",8362.0,2810.0,1547.0,,,,0,20,Halton,0.995934,0.518402
2,5370022.00,0022.00,"MULTIPOLYGON (((-79.83015 43.27174, -79.83076 ...",4336.0,1497.0,724.0,,,,0,20,Halton,0.995934,0.518402
3,5370029.00,0029.00,"MULTIPOLYGON (((-79.82399 43.27262, -79.82400 ...",3349.0,1362.0,632.0,,,,0,20,Halton,0.995934,0.518402
4,5370062.00,0062.00,"MULTIPOLYGON (((-79.78236 43.26213, -79.78236 ...",3486.0,707.0,309.0,,,,0,20,Halton,0.995934,0.518402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,5350400.00,0400.00,"POLYGON ((-79.54350 43.58109, -79.54364 43.580...",6881.0,1615.0,803.0,,,,3,53,Peel,0.986201,0.240567
360,5350580.00,0 .00,"POLYGON ((-78.99950 43.82550, -79.01244 43.855...",2536.0,615.0,373.0,,,,4,79,York North,0.873533,0.273449
360,5350580.00,0 .00,"POLYGON ((-78.99950 43.82550, -79.01244 43.855...",2536.0,615.0,373.0,,,,25,100,Scarborough East,1.000000,0.996097
360,5350580.00,0 .00,"POLYGON ((-78.99950 43.82550, -79.01244 43.855...",2536.0,615.0,373.0,,,,2,47,Oshawa,0.938015,0.942732


In each of the data frames for the elections which contain census approximations for each electoral District, add the vote share for that riding in the given year for the three major parties as well as an other category.