In [7]:
import os
import re

import numpy as np
from scipy import stats
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Election years, one row per decade for easier scanning.

# Years iterated for the 'federal' rows of the correlation table.
FELXN_YEARS = [
    1962, 1963, 1965, 1968,
    1972, 1974, 1979,
    1980, 1984, 1988,
    1993, 1997,
    2000, 2004, 2006, 2008,
    2011, 2015, 2019,
    2021,
]

# Years iterated for the 'ontario' (provincial) rows of the correlation table.
ONTELXN_YEARS = [
    1963, 1967,
    1971, 1975, 1977,
    1981, 1985, 1987,
    1990, 1995, 1999,
    2003, 2007,
    2011, 2014, 2018,
    2022,
]

Compute the Pearson correlation between each party's vote share and each census variable, for every election.

In [64]:
def compute_corr(df, census_var, party_var):
    """Pearson correlation between a census column and a party vote-share column.

    Rows where either value is missing are dropped before correlating
    (pairwise deletion), so one incomplete row does not turn the whole
    result into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both columns.
    census_var : str
        Name of the census-variable column.
    party_var : str
        Name of the party vote-share column.

    Returns
    -------
    float
        The Pearson r rounded to 4 decimals, or NaN when fewer than two
        complete (census, party) pairs are available. This includes the case
        where either column is entirely null.

    Raises
    ------
    KeyError
        If either column is absent from ``df``.
    """
    census = df[census_var]
    party = df[party_var]

    # Keep only rows where both values are present; pearsonr has no NaN
    # handling of its own and would otherwise propagate NaN.
    complete = census.notna() & party.notna()

    # pearsonr needs at least two observations; treat anything less as
    # "no correlation available" instead of raising.
    if complete.sum() < 2:
        return np.nan

    result = stats.pearsonr(census[complete], party[complete])
    return np.round(result.statistic, 4)

In [68]:
# Census variables and party prefixes to correlate. Every (census, party)
# pair produces one output column named corr_<census>_<party>, computed from
# the census column and the '<party>_pct' vote-share column.
CORR_CENSUS_VARS = ['pct_imm', 'avg_hou_inc']
CORR_PARTY_PREFIXES = ['cons1', 'cons2', 'lib', 'ndp']

# Column names are generated from the same lists that drive the computation,
# so the header can never drift out of order relative to the row values.
cols = ['year', 'region'] + [
    f'corr_{census_var}_{party}'
    for census_var in CORR_CENSUS_VARS
    for party in CORR_PARTY_PREFIXES
]


def load_district_stats(path):
    """Read a GeoPackage stats file and return it as a plain DataFrame without the geometry column."""
    return pd.DataFrame(gpd.read_file(path).drop(columns='geometry'))


def build_corr_row(df_stats, year, region):
    """Return one output row: year, region, then the correlations in the same order as `cols`."""
    return [year, region] + [
        compute_corr(df_stats, census_var, f'{party}_pct')
        for census_var in CORR_CENSUS_VARS
        for party in CORR_PARTY_PREFIXES
    ]


rows = []

# Provincial (Ontario) elections.
for year in ONTELXN_YEARS:
    df_onted_stats = load_district_stats(f'../data/elections/{year}_ont-elxn/ont-ed_stats_{year}.gpkg')
    corrs_row = build_corr_row(df_onted_stats, year, 'ontario')
    rows.append(corrs_row)

# Federal elections.
for year in FELXN_YEARS:
    df_fed_stats = load_district_stats(f'../data/elections/{year}_felxn/fed_stats_{year}.gpkg')
    corrs_row = build_corr_row(df_fed_stats, year, 'federal')
    rows.append(corrs_row)

# One row per (election year, region); written out for downstream use.
corrs = pd.DataFrame.from_records(data=rows, columns=cols)
corrs.to_csv('../data/elections/stats/ed_corrs.csv', index=False)

Compute the trend line for the correlation between party vote shares and census variables over time

Compute the mean vote share for each party in the five ridings with the highest share of immigrants for every election, and compute the trend line over time. Also test whether the mean of the top five is significantly different from the mean of all ridings.

PROVINCIAL
1963 Observed Mean Difference: 7.6250 P-value: 0.0514
1967 Observed Mean Difference: 4.0569 P-value: 0.3262
1971 Observed Mean Difference: 2.3465 P-value: 0.6563
1975 Observed Mean Difference: -0.9808 P-value: 0.7623
1977 Observed Mean Difference: -1.6831 P-value: 0.6887
1981 Observed Mean Difference: 7.1868 P-value: 0.0772
1985 Observed Mean Difference: 8.3299 P-value: 0.1136
1987 Observed Mean Difference: 4.1662 P-value: 0.3092
1990 Observed Mean Difference: 6.1203 P-value: 0.0826
1995 Observed Mean Difference: 12.4192 P-value: 0.0068
1999 Observed Mean Difference: 16.6116 P-value: 0.0018
2003 Observed Mean Difference: 13.4524 P-value: 0.0014
2007 Observed Mean Difference: 12.0792 P-value: 0.0028
2011 Observed Mean Difference: 2.4634 P-value: 0.5129
2014 Observed Mean Difference: 1.8083 P-value: 0.6465
2018 Observed Mean Difference: 1.8613 P-value: 0.5473
2022 Observed Mean Difference: 3.0197 P-value: 0.3664
FEDERAL
1962 Observed Mean Difference: 3.1978 P-value: 0.2692
196