In [7]:
import os
import re

import numpy as np
from scipy import stats
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Election years processed with the 'federal' region label
# (inputs live under ../data/elections/{year}_felxn/).
FELXN_YEARS = [
    1962, 1963, 1965, 1968, 1972,
    1974, 1979, 1980, 1984, 1988,
    1993, 1997, 2000, 2004, 2006,
    2008, 2011, 2015, 2019, 2021,
]
# Election years processed with the 'ontario' region label
# (inputs live under ../data/elections/{year}_ont-elxn/).
ONTELXN_YEARS = [
    1963, 1967, 1971, 1975, 1977,
    1981, 1985, 1987, 1990, 1995,
    1999, 2003, 2007, 2011, 2014,
    2018, 2022,
]

Compute the Pearson correlation between each party's vote share and each census variable for every election.

In [64]:
def compute_corr(df, census_var, party_var):
    """Pearson correlation between a census column and a party vote-share column.

    Rows in which either value is missing are dropped pairwise before the
    correlation is computed, so a handful of missing observations does not
    turn the whole result into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both columns, one row per observation.
    census_var : str
        Name of the census column.
    party_var : str
        Name of the party vote-share column.

    Returns
    -------
    float
        Correlation coefficient rounded to 4 decimals, or ``np.nan`` when
        fewer than two complete (census, party) pairs exist. That includes
        the case where either column is entirely null.
    """
    # Keep only rows where both values are present (pairwise deletion).
    complete = df[census_var].notna() & df[party_var].notna()

    # pearsonr needs at least two observations to be defined.
    if complete.sum() < 2:
        return np.nan

    census_vals = df.loc[complete, census_var]
    party_vals = df.loc[complete, party_var]
    return np.round(stats.pearsonr(census_vals, party_vals).statistic, 4)

In [68]:
# Output schema: one row per (year, region) with the correlation of every
# census variable against every party's vote share.
cols = [
    'year', 'region',
    'corr_pct_imm_cons1', 'corr_pct_imm_cons2', 'corr_pct_imm_lib', 'corr_pct_imm_ndp',
    'corr_avg_hou_inc_cons1', 'corr_avg_hou_inc_cons2', 'corr_avg_hou_inc_lib', 'corr_avg_hou_inc_ndp',
]

# (census column, party column) pairs, generated in the same order as the
# correlation columns listed in `cols`.
corr_pairs = [
    (census_var, party_var)
    for census_var in ('pct_imm', 'avg_hou_inc')
    for party_var in ('cons1_pct', 'cons2_pct', 'lib_pct', 'ndp_pct')
]

rows = []

# Provincial (Ontario) elections
for year in ONTELXN_YEARS:
    onted_gdf = gpd.read_file(f'../data/elections/{year}_ont-elxn/ont-ed_stats_{year}.gpkg')
    # Geometry is not needed for the statistics; keep a plain DataFrame.
    df_onted_stats = pd.DataFrame(onted_gdf.drop(columns='geometry'))
    corrs_row = [year, 'ontario']
    corrs_row.extend(
        compute_corr(df_onted_stats, census_var, party_var)
        for census_var, party_var in corr_pairs
    )
    rows.append(corrs_row)

# Federal elections
for year in FELXN_YEARS:
    fed_gdf = gpd.read_file(f'../data/elections/{year}_felxn/fed_stats_{year}.gpkg')
    df_fed_stats = pd.DataFrame(fed_gdf.drop(columns='geometry'))
    corrs_row = [year, 'federal']
    corrs_row.extend(
        compute_corr(df_fed_stats, census_var, party_var)
        for census_var, party_var in corr_pairs
    )
    rows.append(corrs_row)

# Assemble all rows and persist them for later use.
corrs = pd.DataFrame.from_records(data=rows, columns=cols)
corrs.to_csv('../data/elections/stats/ed_corrs.csv', index=False)

Compute the trend line for the correlation between party vote shares and census variables over time

Compute the mean vote share for each party in the five ridings with the highest immigrant share for every election, and compute the trend line over time. Also test whether the mean of the top five is significantly different from the mean of all ridings.

In [95]:
def compute_sig_diff_top_imm_ridings(df, party_var, n_resamples=10000, random_state=0):
    """Compare a party's vote share in the top-5 immigrant ridings with all ridings.

    The five rows with the largest ``pct_imm`` are selected, and the mean of
    ``party_var`` over those rows is compared with its mean over every row
    using a two-sample permutation test on the difference of means. Missing
    vote shares are dropped from both samples before testing. Note that the
    top-five rows are also part of the all-rows sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``pct_imm`` column and the ``party_var`` column.
    party_var : str
        Name of the party vote-share column to analyse.
    n_resamples : int, optional
        Number of random permutations passed to
        ``scipy.stats.permutation_test`` (default 10000).
    random_state : int or None, optional
        Seed for the permutation test so repeated runs give the same p-value.
        Pass ``None`` for non-deterministic resampling.

    Returns
    -------
    tuple of float
        ``(top_5_mean, mean_difference, p_value)``, each rounded to 4
        decimals, or ``(nan, nan, nan)`` when none of the top-5 ridings has a
        value for ``party_var`` (which includes an entirely null column).
    """
    # Select the top-5 ridings once and discard missing vote shares; NaNs
    # would otherwise propagate through np.mean and poison the statistic.
    top_5 = df.nlargest(5, 'pct_imm')[party_var].dropna()
    full_data = df[party_var].dropna()

    # Nothing to compare if the party has no data in the top-5 ridings.
    if top_5.empty:
        return np.nan, np.nan, np.nan

    def mean_diff(x, y, axis):
        # Vectorized statistic: lets scipy evaluate all resamples in one call.
        return np.mean(x, axis=axis) - np.mean(y, axis=axis)

    result = stats.permutation_test(
        (top_5.to_numpy(dtype=float), full_data.to_numpy(dtype=float)),
        statistic=mean_diff,
        permutation_type='independent',
        vectorized=True,
        n_resamples=n_resamples,
        random_state=random_state,
    )

    return np.round(top_5.mean(), 4), np.round(result.statistic, 4), np.round(result.pvalue, 4)

In [96]:
# Output schema: one row per (year, region); for every party the mean vote
# share in the top immigrant ridings, its difference from the overall mean,
# and the permutation-test p-value.
cols = [
    'year', 'region',
    'mean_top_vote_cons1', 'diff_vote_full_mean_cons1', 'p_val_cons1',
    'mean_top_vote_cons2', 'diff_vote_full_mean_cons2', 'p_val_cons2',
    'mean_top_vote_lib', 'diff_vote_full_mean_lib', 'p_val_lib',
    'mean_top_vote_ndp', 'diff_vote_full_mean_ndp', 'p_val_ndp',
]

# Party vote-share columns, in the same order as the column triples in `cols`.
party_vars = ['cons1_pct', 'cons2_pct', 'lib_pct', 'ndp_pct']

rows = []

# Provincial (Ontario) elections
for year in tqdm(ONTELXN_YEARS):
    onted_gdf = gpd.read_file(f'../data/elections/{year}_ont-elxn/ont-ed_stats_{year}.gpkg')
    # Geometry is not needed for the statistics; keep a plain DataFrame.
    df_onted_stats = pd.DataFrame(onted_gdf.drop(columns='geometry'))

    # Each call returns a 3-tuple; flatten the tuples into one row.
    sig_diff_row = [year, 'ontario'] + [
        value
        for party_var in party_vars
        for value in compute_sig_diff_top_imm_ridings(df_onted_stats, party_var)
    ]
    rows.append(sig_diff_row)

# Federal elections
for year in tqdm(FELXN_YEARS):
    fed_gdf = gpd.read_file(f'../data/elections/{year}_felxn/fed_stats_{year}.gpkg')
    df_fed_stats = pd.DataFrame(fed_gdf.drop(columns='geometry'))

    sig_diff_row = [year, 'federal'] + [
        value
        for party_var in party_vars
        for value in compute_sig_diff_top_imm_ridings(df_fed_stats, party_var)
    ]
    rows.append(sig_diff_row)

# Assemble all rows and persist them for later use.
corrs = pd.DataFrame.from_records(data=rows, columns=cols)
corrs.to_csv('../data/elections/stats/ed_sig_diff.csv', index=False)

100%|██████████| 17/17 [00:09<00:00,  1.80it/s]
100%|██████████| 20/20 [00:10<00:00,  1.83it/s]
