In [1]:
import pymc3 as pm
from entity import State, National

print("Running on PyMC3 v{}".format(pm.__version__))

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


Running on PyMC3 v3.9.3


### Load Election Data

In [2]:
from poll_data import get_polls, past_state_election_results
from entity import State, National

state_polls, national_polls = get_polls(2020, refresh=False)
df_past_elections = past_state_election_results()

# load poll data into state and national objects
states = [State(abbr, df, df_past_elections.loc[abbr]) for abbr, df in state_polls.items()]
national = National(national_polls)

Using pre-downloaded polls


### Model Fitting

In [3]:
import numpy as np

election_model = pm.Model()

# Model inspired from http://www.stat.columbia.edu/~gelman/research/published/election15Feb.pdf
with election_model:
    for state in states:
        # prior for each state
        # TODO: compute variance for each state from the past elections
        # Choosing 0.041 for now from 
        p0 = pm.Normal("p_%s" % state.abbr, mu=state.get_dem_share(2016), sigma=0.041)
        
        p_dems = np.array([p.p_dem for p in state.polls])
        months_to_election = np.array([p.months_from_election for p in state.polls])
        poll_sizes = np.array([p.size for p in state.polls])

        if len(poll_sizes) == 0:
            sigma = np.sqrt(1.0 * p0 * (1-p0) / poll_sizes + 0.0002 * months_to_election)
        else:
            sigma = 0.05
            
        Y_obs = pm.Normal("Y_obs_%s" % state.abbr, mu=p0, sigma=sigma, observed=p_dems)

    trace = pm.sample(5000, init='adapt_diag')

Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [p_WY, p_WI, p_WV, p_DC, p_WA, p_VA, p_VT, p_UT, p_TX, p_TN, p_SD, p_SC, p_RI, p_PA, p_OR, p_OK, p_OH, p_ND, p_NC, p_NY, p_NM, p_NJ, p_NH, p_NV, p_NE, p_MT, p_MO, p_MS, p_MN, p_MI, p_MA, p_MD, p_ME, p_LA, p_KY, p_KS, p_IA, p_IN, p_IL, p_ID, p_HI, p_GA, p_FL, p_DE, p_CT, p_CO, p_CA, p_AR, p_AZ, p_AK, p_AL]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 58 seconds.


In [4]:
for state in states:
    samples = trace.get_values('p_%s' % state.abbr)
    print(
        "%s Prior: %.3f  Polls: %.3f  P_win: %.2f" % (
            state.abbr, state.get_dem_share(2016), 
            samples.mean(),
            (samples > 0.5).mean()
        )
    )

AL Prior: 0.356  Polls: 0.383  P_win: 0.00
AK Prior: 0.416  Polls: 0.449  P_win: 0.03
AZ Prior: 0.481  Polls: 0.515  P_win: 0.99
AR Prior: 0.357  Polls: 0.383  P_win: 0.00
CA Prior: 0.661  Polls: 0.657  P_win: 1.00
CO Prior: 0.527  Polls: 0.543  P_win: 0.95
CT Prior: 0.571  Polls: 0.571  P_win: 0.96
DE Prior: 0.560  Polls: 0.559  P_win: 0.93
FL Prior: 0.494  Polls: 0.514  P_win: 1.00
GA Prior: 0.473  Polls: 0.496  P_win: 0.31
HI Prior: 0.674  Polls: 0.673  P_win: 1.00
ID Prior: 0.317  Polls: 0.316  P_win: 0.00
IL Prior: 0.590  Polls: 0.590  P_win: 0.99
IN Prior: 0.399  Polls: 0.425  P_win: 0.00
IA Prior: 0.449  Polls: 0.495  P_win: 0.28
KS Prior: 0.389  Polls: 0.428  P_win: 0.00
KY Prior: 0.343  Polls: 0.398  P_win: 0.00
LA Prior: 0.398  Polls: 0.398  P_win: 0.00
ME Prior: 0.516  Polls: 0.561  P_win: 1.00
MD Prior: 0.640  Polls: 0.649  P_win: 1.00
MA Prior: 0.647  Polls: 0.680  P_win: 1.00
MI Prior: 0.499  Polls: 0.538  P_win: 1.00
MN Prior: 0.508  Polls: 0.539  P_win: 1.00
MS Prior: 0

In [12]:
ff = []
for state in states:
    samples = trace.get_values('p_%s' % state.abbr)
    ff.append({
        "abbr": state.abbr, 
        "prior": state.get_dem_share(2016), 
        "posterior": samples.mean(),
        "p_win": (samples > 0.5).mean()
    })
df_results = pd.DataFrame(ff)

In [5]:
import pandas as pd
ff = []
for state in states:
    samples = trace.get_values('p_%s' % state.abbr)
    ff.append({"state": state.abbr, "value": (samples > 0.5).mean()})
state_df = pd.DataFrame(ff)
state_df['value'] = 1 - state_df['value']


In [7]:
import plotly.express as px  # Be sure to import express
fig = px.choropleth(
    state_df,  # Input Pandas DataFrame
    locations="state",  # DataFrame column with locations
    color="value",  # DataFrame column with color values
    hover_name="state", # DataFrame column hover info
    color_continuous_scale = 'Bluered',
    color_continuous_midpoint = 0.5,
    locationmode = 'USA-states') # Set to plot as US States
fig.update_layout(
    title_text = 'State Rankings', # Create a Title
    geo_scope='usa',  # Plot only the USA instead of globe
)
fig.show()

### States to be flipped in favor of Biden

In [16]:
df_results[(df_results.prior < 0.5) & (df_results.posterior >= 0.5)].abbr.values

array(['AZ', 'FL', 'MI', 'NC', 'OH', 'PA', 'WI'], dtype=object)