In [3]:
import os
import re

import numpy as np
from scipy import stats
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

In [4]:
from constants import FELXN_YEARS, ONTELXN_YEARS

In [11]:
# Overall results of each federal election, keyed by election year.
# Each value maps a party tag ('cons1', 'cons2', 'lib', 'ndp') to the number that
# is written to the `full_pct` column of the top-5 immigrant-riding results table
# for rows whose region is 'federal'.
# 'cons2' is 0.0 for every year except 1993, 1997 and 2000.
# NOTE(review): presumably these are national popular-vote percentages, and
# 'cons2' is the second conservative party that ran in 1993-2000 — confirm
# against the official results before relying on them.
FELXN_RESULTS = {
    1962: {'cons1': 37.2, 'cons2': 0.0, 'lib': 37.0, 'ndp': 13.6},
    1963: {'cons1': 32.8, 'cons2': 0.0, 'lib': 41.5, 'ndp': 13.2},
    1965: {'cons1': 32.4, 'cons2': 0.0, 'lib': 40.2, 'ndp': 17.9},
    1968: {'cons1': 31.4, 'cons2': 0.0, 'lib': 45.4, 'ndp': 17.0},
    1972: {'cons1': 35.0, 'cons2': 0.0, 'lib': 38.4, 'ndp': 17.8},
    1974: {'cons1': 35.5, 'cons2': 0.0, 'lib': 43.2, 'ndp': 15.4},
    1979: {'cons1': 35.9, 'cons2': 0.0, 'lib': 40.1, 'ndp': 17.9},
    1980: {'cons1': 32.5, 'cons2': 0.0, 'lib': 44.3, 'ndp': 19.8},
    1984: {'cons1': 50.0, 'cons2': 0.0, 'lib': 28.0, 'ndp': 18.8},
    1988: {'cons1': 43.0, 'cons2': 0.0, 'lib': 32.0, 'ndp': 20.4},
    1993: {'cons1': 16.0, 'cons2': 18.7, 'lib': 41.3, 'ndp': 6.9},
    1997: {'cons1': 18.8, 'cons2': 19.4, 'lib': 38.5, 'ndp': 11.0},
    2000: {'cons1': 12.2, 'cons2': 25.5, 'lib': 40.8, 'ndp': 8.5},
    2004: {'cons1': 29.6, 'cons2': 0.0, 'lib': 36.7, 'ndp': 15.7},
    2006: {'cons1': 36.3, 'cons2': 0.0, 'lib': 30.2, 'ndp': 17.5},
    2008: {'cons1': 37.7, 'cons2': 0.0, 'lib': 26.3, 'ndp': 18.2},
    2011: {'cons1': 39.6, 'cons2': 0.0, 'lib': 18.9, 'ndp': 30.6},
    2015: {'cons1': 31.9, 'cons2': 0.0, 'lib': 39.5, 'ndp': 19.7},
    2019: {'cons1': 34.4, 'cons2': 0.0, 'lib': 33.1, 'ndp': 15.9},
    2021: {'cons1': 33.7, 'cons2': 0.0, 'lib': 32.6, 'ndp': 17.8},
    2025: {'cons1': 41.3, 'cons2': 0.0, 'lib': 43.7, 'ndp': 6.3},
}

# Overall results of each Ontario election, keyed by election year.
# Each value maps a party tag ('cons1', 'cons2', 'lib', 'ndp') to the number that
# is written to the `full_pct` column of the top-5 immigrant-riding results table
# for rows whose region is 'ontario'.
# 'cons2' is 0.0 for every year in this table; the key is kept so both result
# tables can be indexed with the same party tags.
# NOTE(review): presumably these are province-wide popular-vote percentages —
# confirm against the official results before relying on them.
ONTELXN_RESULTS = {
    1963: {'cons1': 48.9, 'cons2': 0.0, 'lib': 35.1, 'ndp': 15.5},
    1967: {'cons1': 42.3, 'cons2': 0.0, 'lib': 31.7, 'ndp': 25.9},
    1971: {'cons1': 44.5, 'cons2': 0.0, 'lib': 27.8, 'ndp': 27.1},
    1975: {'cons1': 36.1, 'cons2': 0.0, 'lib': 34.3, 'ndp': 28.9},
    1977: {'cons1': 39.7, 'cons2': 0.0, 'lib': 31.4, 'ndp': 28.0},
    1981: {'cons1': 44.4, 'cons2': 0.0, 'lib': 33.7, 'ndp': 21.2},
    1985: {'cons1': 37.0, 'cons2': 0.0, 'lib': 37.9, 'ndp': 23.8},  
    1987: {'cons1': 24.7, 'cons2': 0.0, 'lib': 47.3, 'ndp': 25.7},
    1990: {'cons1': 23.5, 'cons2': 0.0, 'lib': 32.4, 'ndp': 37.6},
    1995: {'cons1': 44.8, 'cons2': 0.0, 'lib': 31.1, 'ndp': 20.6},
    1999: {'cons1': 45.1, 'cons2': 0.0, 'lib': 39.9, 'ndp': 12.6},
    2003: {'cons1': 34.7, 'cons2': 0.0, 'lib': 46.4, 'ndp': 14.7},
    2007: {'cons1': 31.6, 'cons2': 0.0, 'lib': 42.3, 'ndp': 16.8},
    2011: {'cons1': 35.4, 'cons2': 0.0, 'lib': 37.6, 'ndp': 22.6},
    2014: {'cons1': 31.2, 'cons2': 0.0, 'lib': 38.7, 'ndp': 23.7},
    2018: {'cons1': 40.5, 'cons2': 0.0, 'lib': 19.6, 'ndp': 33.6},
    2022: {'cons1': 40.8, 'cons2': 0.0, 'lib': 23.9, 'ndp': 23.7},
    2025: {'cons1': 43.0, 'cons2': 0.0, 'lib': 30.0, 'ndp': 18.6}
}

Compute the Pearson correlation between each party's vote share and each census variable, separately for every election.

In [6]:
def compute_corr(df, census_var, party_var):
    """Return the Pearson correlation between two columns of ``df``, rounded to 4 decimals.

    Rows in which either column is missing are ignored, so a single missing
    value does not turn the whole correlation into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both columns.
    census_var : str
        Name of the census column.
    party_var : str
        Name of the party vote-share column.

    Returns
    -------
    float
        The correlation coefficient, or ``np.nan`` when it is undefined:
        fewer than two complete pairs (this covers an all-null column) or a
        column that is constant over the complete pairs.
    """
    census = df[census_var]
    party = df[party_var]

    # Keep only the rows where both values are present (pairwise deletion).
    complete = census.notna() & party.notna()
    census = census[complete]
    party = party[complete]

    # pearsonr raises on fewer than two observations.
    if len(census) < 2:
        return np.nan
    # A constant column has zero variance, so the correlation is undefined;
    # returning early also avoids scipy's ConstantInputWarning.
    if census.nunique() < 2 or party.nunique() < 2:
        return np.nan

    return np.round(stats.pearsonr(census, party).statistic, 4)

In [7]:
# Output schema: one row per (election year, region) with the correlation
# between `pct_imm` and each party's vote share.
# The equivalent `avg_hou_inc` correlations (corr_avg_hou_inc_<party>) are
# currently disabled and therefore absent from the schema.
cols = ['year', 'region']
cols += ['corr_pct_imm_cons1', 'corr_pct_imm_cons2', 'corr_pct_imm_lib', 'corr_pct_imm_ndp']

# Vote-share columns, in the same order as the corr_* columns above.
party_pct_cols = ('cons1_pct', 'cons2_pct', 'lib_pct', 'ndp_pct')

rows = []

# Provincial (Ontario) elections.
for year in ONTELXN_YEARS:
    gdf_onted = gpd.read_file(f'../data/elections/{year}_ont-elxn/ont-ed_stats_{year}.gpkg')
    # Geometry is irrelevant for correlations; work on a plain DataFrame.
    df_onted_stats = pd.DataFrame(gdf_onted.drop(columns='geometry'))
    corrs_row = [year, 'ontario']
    corrs_row.extend(
        compute_corr(df_onted_stats, 'pct_imm', pct_col) for pct_col in party_pct_cols
    )
    rows.append(corrs_row)

# Federal elections.
for year in FELXN_YEARS:
    gdf_fed = gpd.read_file(f'../data/elections/{year}_felxn/fed_stats_{year}.gpkg')
    df_fed_stats = pd.DataFrame(gdf_fed.drop(columns='geometry'))
    corrs_row = [year, 'federal']
    corrs_row.extend(
        compute_corr(df_fed_stats, 'pct_imm', pct_col) for pct_col in party_pct_cols
    )
    rows.append(corrs_row)

# Assemble the table and persist it.
corrs = pd.DataFrame.from_records(rows, columns=cols)
corrs.to_csv('../data/elections/stats/ed_corrs.csv', index=False)

For every election and every party, compute: (1) the mean vote share across the 5 ridings with the highest immigrant share (`pct_imm`), (2) the party's vote share across the GTA, and (3) the party's overall vote share in the general election.

In [8]:
def get_top_5_imm_vote(df, party_pct_var):
    """Average ``party_pct_var`` over the 5 rows of ``df`` with the largest ``pct_imm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``pct_imm`` column and the ``party_pct_var`` column.
    party_pct_var : str
        Name of the column to average.

    Returns
    -------
    float
        Mean of ``party_pct_var`` over the selected rows (all rows if ``df``
        has fewer than 5).
    """
    highest_imm_rows = df.nlargest(5, 'pct_imm')
    return highest_imm_rows[party_pct_var].mean()

def get_gta_vote(df, party_votes_var, num_gta_votes):
    """Express the column total of ``party_votes_var`` as a percentage of ``num_gta_votes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the ``party_votes_var`` column.
        NOTE(review): presumably one row per GTA riding — confirm with the
        data files.
    party_votes_var : str
        Name of the vote-count column to total.
    num_gta_votes : number
        Denominator; the callers pass the sum of all vote-count columns of ``df``.

    Returns
    -------
    float
        ``sum(df[party_votes_var]) / num_gta_votes * 100``.
    """
    party_total = df[party_votes_var].sum()
    fraction = party_total / num_gta_votes
    return fraction * 100

In [12]:
# Output schema: one row per (election year, region, party).
cols = ['year', 'region', 'party', 'top_5_imm_pct', 'gta_pct', 'full_pct']
rows = []

# (party tag, vote-share column, vote-count column) for every reported party.
# 'oth' is intentionally not reported as its own party.
party_vars = [
    ('cons1', 'cons1_pct', 'cons1_votes'),
    ('cons2', 'cons2_pct', 'cons2_votes'),
    ('lib', 'lib_pct', 'lib_votes'),
    ('ndp', 'ndp_pct', 'ndp_votes'),
]
# Vote-count columns that make up the denominator; 'oth_votes' counts toward
# the total even though no row is emitted for it.
party_vote_cols = [votes_col for _tag, _pct_col, votes_col in party_vars]
party_vote_cols.append('oth_votes')

# Provincial (Ontario) elections.
for year in tqdm(ONTELXN_YEARS):
    gdf_onted = gpd.read_file(f'../data/elections/{year}_ont-elxn/ont-ed_stats_{year}.gpkg')
    df_onted_stats = pd.DataFrame(gdf_onted.drop(columns='geometry'))
    # Total votes across all vote-count columns and all rows of the file.
    num_gta_votes = df_onted_stats[party_vote_cols].to_numpy().sum()

    for party_tag, party_pct_var, party_votes_var in party_vars:
        top_5_imm_pct = get_top_5_imm_vote(df_onted_stats, party_pct_var)
        gta_pct = get_gta_vote(df_onted_stats, party_votes_var, num_gta_votes)
        row = [year, 'ontario', party_tag, top_5_imm_pct, gta_pct, ONTELXN_RESULTS[year][party_tag]]
        rows.append(row)

# Federal elections.
for year in tqdm(FELXN_YEARS):
    gdf_fed = gpd.read_file(f'../data/elections/{year}_felxn/fed_stats_{year}.gpkg')
    df_fed_stats = pd.DataFrame(gdf_fed.drop(columns='geometry'))
    num_gta_votes = df_fed_stats[party_vote_cols].to_numpy().sum()

    for party_tag, party_pct_var, party_votes_var in party_vars:
        top_5_imm_pct = get_top_5_imm_vote(df_fed_stats, party_pct_var)
        gta_pct = get_gta_vote(df_fed_stats, party_votes_var, num_gta_votes)
        row = [year, 'federal', party_tag, top_5_imm_pct, gta_pct, FELXN_RESULTS[year][party_tag]]
        rows.append(row)

# Assemble the table and persist it.
df_res = pd.DataFrame.from_records(rows, columns=cols)
df_res.to_csv('../data/elections/stats/ed_top_5_imm_results.csv', index=False)

100%|██████████| 18/18 [00:00<00:00, 56.31it/s]
100%|██████████| 21/21 [00:00<00:00, 49.51it/s]
