## Libraries

In [25]:
import pandas as pd
import numpy as np
import scipy.stats as stat

from math import sqrt
from mlgear.utils import show, display_columns
from surveyweights import normalize_weights


def margin_of_error(n=None, sd=None, p=None, type='proportion', interval_size=0.95):
    """Return the margin of error (z-score times standard error) of an estimate.

    Args:
        n: Sample size.
        sd: Sample standard deviation; read only when ``type='continuous'``.
        p: Sample proportion as a fraction; read only when
            ``type='proportion'``.
        type: ``'proportion'`` (standard error ``sqrt(p * (1 - p)) / sqrt(n)``)
            or ``'continuous'`` (standard error ``sd / sqrt(n)``).
        interval_size: Confidence level; must be one of 0.8, 0.85, 0.9, 0.95
            or 0.99.

    Returns:
        The half-width of the interval, in the same units as ``p`` or ``sd``.

    Raises:
        ValueError: If ``interval_size`` or ``type`` is not a supported value.
    """
    z_lookup = {0.8: 1.28, 0.85: 1.44, 0.9: 1.65, 0.95: 1.96, 0.99: 2.58}
    if interval_size not in z_lookup:
        # The keys are floats, so they must be stringified before joining;
        # joining them directly raises TypeError and hides this message.
        valid_sizes = ', '.join(str(size) for size in z_lookup)
        raise ValueError('{} not a valid `interval_size` - must be {}'.format(interval_size,
                                                                              valid_sizes))
    if type == 'proportion':
        se = sqrt(p * (1 - p)) / sqrt(n)
    elif type == 'continuous':
        se = sd / sqrt(n)
    else:
        raise ValueError('{} not a valid `type` - must be proportion or continuous'.format(type))

    z = z_lookup[interval_size]
    return se * z


def print_pct(pct, digits=0):
    """Format the fraction `pct` as a percentage string.

    The value is multiplied by 100 and rounded to `digits` decimals. A result
    that rounds to 100 or more is shown as '>99.9...%' and one that rounds to
    0 or less as '<0.0...1%', so neither certainty nor impossibility is ever
    printed. With `digits=0` those two caps are the fixed strings '>99.0%'
    and '<0.1%'.
    """
    scaled = np.round(pct * 100, digits)
    if scaled >= 100:
        if digits == 0:
            return '>99.0%'
        return '>99.' + '9' * (digits - 1) + '9%'
    if scaled <= 0:
        if digits == 0:
            return '<0.1%'
        return '<0.' + '0' * (digits - 1) + '1%'
    return '{}%'.format(scaled)


def calc_result(biden_vote, trump_vote, n, interval=0.8):
    """Simulate the Biden-minus-Trump margin from topline poll shares.

    Each candidate's share is modelled as a normal distribution. Its mean is
    the polled share plus a quarter of `undecided` (half of everything not
    going to either candidate). Its `interval`-level half-width is the sampling
    margin of error plus 40% of `undecided` plus a fixed general polling
    error. The two distributions are sampled independently.

    Args:
        biden_vote: Biden share in percentage points (0-100).
        trump_vote: Trump share in percentage points (0-100).
        n: Sample size passed to `margin_of_error`.
        interval: Confidence level used for the margin of error and for
            converting each half-width into a standard deviation; must be a
            level `margin_of_error` accepts.

    Returns:
        A dict with keys 'mean' (Biden mean minus Trump mean, points), 'low'
        and 'high' (20th and 80th percentiles of the simulated margin,
        points), 'n', 'raw_moe' (sum of both sampling margins of error,
        points), 'margin' (average of both half-widths, points), 'sigma'
        (average of both standard deviations, as a fraction rather than
        points) and 'chance_pass' (fraction of simulations in which Biden
        still leads after `BIDEN_BIAS` is applied).
    """
    GENERAL_POLLING_ERROR = 2
    BIDEN_BIAS = -4 # Electoral college edge
    N_SIMS = 100000

    biden_moe = margin_of_error(n=n, p=biden_vote/100, interval_size=interval)
    trump_moe = margin_of_error(n=n, p=trump_vote/100, interval_size=interval)
    undecided = (100 - biden_vote - trump_vote) / 2

    biden_mean = biden_vote + undecided * 0.25
    biden_raw_moe = biden_moe * 100
    biden_allocate_undecided = undecided * 0.4
    biden_margin = biden_raw_moe + biden_allocate_undecided + GENERAL_POLLING_ERROR

    trump_mean = trump_vote + undecided * 0.25
    trump_raw_moe = trump_moe * 100
    trump_allocate_undecided = undecided * 0.4
    trump_margin = trump_raw_moe + trump_allocate_undecided + GENERAL_POLLING_ERROR

    # z-score whose central mass equals `interval`; dividing a half-width by
    # it yields the standard deviation of the matching normal distribution.
    cdf_value = 0.5 + 0.5 * interval
    normed_sigma = stat.norm.ppf(cdf_value)

    biden_sigma = biden_margin / 100 / normed_sigma
    biden_sims = np.random.normal(biden_mean / 100, biden_sigma, N_SIMS)

    trump_sigma = trump_margin / 100 / normed_sigma
    trump_sims = np.random.normal(trump_mean / 100, trump_sigma, N_SIMS)

    # Vectorized comparison: same element-wise arithmetic as looping over the
    # zipped samples, without 100,000 Python-level iterations.
    biden_ahead = biden_sims + BIDEN_BIAS / 100 > trump_sims
    chance_pass = np.sum(biden_ahead) / N_SIMS

    # NOTE(review): these percentiles bracket the central 60% of simulated
    # margins and ignore `interval`, yet print_result labels them "80% CI".
    # Confirm whether [10, 90] (or bounds derived from `interval`) was meant.
    low, high = np.percentile(biden_sims - trump_sims, [20, 80]) * 100

    return {'mean': biden_mean - trump_mean, 'high': high, 'low': low, 'n': n,
            'raw_moe': biden_raw_moe + trump_raw_moe,
            'margin': (biden_margin + trump_margin) / 2,
            'sigma': (biden_sigma + trump_sigma) / 2,
            'chance_pass': chance_pass}


def print_result(mean, high, low, n, raw_moe, margin, sigma, chance_pass):
    """Print a one-line summary of a `calc_result` dict, then a '-' separator.

    Accepts the keys of the `calc_result` return value as keyword arguments.
    `mean`, `high`, `low`, `raw_moe` and `margin` are rounded to one decimal;
    `sigma` is multiplied by 100 before rounding; `chance_pass` is rendered
    with `print_pct`. The interval bounds are put in ascending order and
    clamped to the range -100..100 before printing.
    """
    lower, upper = np.round(high, 1), np.round(low, 1)
    if upper < lower:
        lower, upper = upper, lower
    # Clamp so the printed interval never leaves the possible range of a margin.
    upper = min(upper, 100)
    lower = max(lower, -100)

    template = ('Result Biden +{} (80% CI: {} to {}) (Weighted N={}) (raw_moe={}pts, margin={}pts, '
                'sigma={}pts) (Biden {} likely to win)')
    print(template.format(np.round(mean, 1),
                          lower,
                          upper,
                          n,
                          np.round(raw_moe, 1),
                          np.round(margin, 1),
                          np.round(sigma * 100, 1),
                          print_pct(chance_pass, 1)))
    print('-')

## Load Processed Data

In [26]:
# Load the survey responses CSV. Every missing cell, in any column, is
# replaced with the string 'Not presented'.
survey = pd.read_csv('responses_processed_national_weighted.csv').fillna('Not presented')

## Turnout

In [27]:
# Keep only respondents who answered 'Yes' to `lv_registered`.
survey_ = survey[survey['lv_registered'] == 'Yes']
# Share of each `lv_already_voted` answer (row share times the group's mean
# weight), in percent. Kept as the last expression so the cell displays it.
(
    survey_['lv_already_voted'].value_counts(normalize=True)
    * survey_.groupby('lv_already_voted')['weight'].mean()
    * 100
)
# Author's calibration of the turnout adjustment:
# - this survey says 37% of registered voters have already voted;
# - assuming 52M people have actually voted already...
# - ...out of an assumed 153M registered voters...
# - ...the actual share is assumed to be 34%;
# - hence the 0.918 adjustment factor.

Don't know     0.632210
No            59.418991
Yes           37.171851
dtype: float64

In [28]:
# --- Likely-voter turnout -------------------------------------------------
TURNOUT_MARGIN = 4
TURNOUT_BIAS = 0.918
turnout = (survey['lv_index'] * survey['weight'] * 100).mean()
print('Turnout in poll: {}%'.format(np.round(turnout, 1)))

# Sample size for the margin of error: each respondent's weight is capped
# at 1 before summing.
capped_weights = survey['weight'].apply(lambda w: 1 if w > 1 else w)
weighted_n = int(np.round(capped_weights.sum()))
turnout_moe = margin_of_error(n=weighted_n, p=turnout/100, interval_size=0.8)

# Bias-adjusted point estimate, widened on both sides by the sampling margin
# of error (in points) plus a fixed extra margin.
turnout_estimate = turnout * TURNOUT_BIAS
turnout_low = turnout_estimate - turnout_moe * 100 - TURNOUT_MARGIN
turnout_high = turnout_estimate + turnout_moe * 100 + TURNOUT_MARGIN
print('Estimated voting population: {}% (80% CI: {}% to {}%)'.format(np.round(turnout_estimate, 1),
                                                                     np.round(turnout_low, 1),
                                                                     np.round(turnout_high, 1)))

# --- Registered voters: same procedure applied to `rv_index` --------------
RV_MARGIN = 4
RV_BIAS = 0.918
num_rv = (survey['rv_index'] * survey['weight'] * 100).mean()
rv_moe = margin_of_error(n=weighted_n, p=num_rv/100, interval_size=0.8)
rv_estimate = num_rv * RV_BIAS
rv_low = rv_estimate - rv_moe * 100 - RV_MARGIN
rv_high = rv_estimate + rv_moe * 100 + RV_MARGIN
print('Estimated registered voters: {}% (80% CI: {}% to {}%)'.format(np.round(rv_estimate, 1),
                                                                     np.round(rv_low, 1),
                                                                     np.round(rv_high, 1)))

Turnout in poll: 68.3%
Estimated voting population: 62.7% (80% CI: 57.4% to 68.0%)
Estimated registered voters: 80.4% (80% CI: 75.5% to 85.3%)


## Trump-Biden

In [29]:
# Unweighted relative frequency of each `vote_trump_biden` answer across the
# rows of `survey`; displayed as the cell output.
survey['vote_trump_biden'].value_counts(normalize=True)

Joe Biden, the Democrat         0.642408
Donald Trump, the Republican    0.193797
Do not intend to vote           0.069126
Not decided                     0.050882
Another candidate               0.040341
Unsure                          0.002433
I did not vote                  0.001014
Name: vote_trump_biden, dtype: float64

In [30]:
options = ['Joe Biden, the Democrat', 'Donald Trump, the Republican', 'Another candidate', 'Not decided']
# Keep only respondents whose answer is one of `options`, then re-normalize
# each weight column over that subset.
survey_ = survey.loc[survey['vote_trump_biden'].isin(options)].copy()
for weight_col in ('weight', 'rv_weight', 'lv_weight'):
    survey_[weight_col] = normalize_weights(survey_[weight_col])


def _rescale_to_options(shares, choices):
    """Select the `choices` rows of `shares` and rescale them to sum to 100."""
    return shares[choices] * (100 / shares[choices].sum())


def _weighted_votes(frame, weight_col, choices):
    """Vote shares in percent: row share of each answer times its mean weight."""
    row_share = frame['vote_trump_biden'].value_counts(normalize=True)
    mean_weight = frame.groupby('vote_trump_biden')[weight_col].mean()
    return _rescale_to_options(row_share * mean_weight * 100, choices)


def _capped_weight_n(weights):
    """Sum the weights with each one capped at 1, rounded to an int."""
    return int(np.round(weights.apply(lambda w: 1 if w > 1 else w).sum()))


def _report(vote_shares, sample_size):
    """Print the shares, then the simulated Biden-vs-Trump result line."""
    print(vote_shares)
    print_result(**calc_result(biden_vote=vote_shares['Joe Biden, the Democrat'],
                               trump_vote=vote_shares['Donald Trump, the Republican'],
                               n=sample_size))


print('## NATIONAL UNWEIGHTED ##')
n = len(survey_)
votes = _rescale_to_options(survey_['vote_trump_biden'].value_counts(normalize=True) * 100, options)
_report(votes, n)

print('## NATIONAL WEIGHTED ##')
weighted_n = _capped_weight_n(survey_['weight'])
votes = _weighted_votes(survey_, 'weight', options)
_report(votes, weighted_n)

print('## NATIONAL WEIGHTED + RV ##')
rv_weighted_n = _capped_weight_n(survey_['rv_weight'])
votes = _weighted_votes(survey_, 'rv_weight', options)
_report(votes, rv_weighted_n)

print('## NATIONAL WEIGHTED + LV ##')
lv_weighted_n = _capped_weight_n(survey_['lv_weight'])
votes = _weighted_votes(survey_, 'lv_weight', options)
_report(votes, lv_weighted_n)

## NATIONAL UNWEIGHTED ##
Joe Biden, the Democrat         69.267760
Donald Trump, the Republican    20.896175
Another candidate                4.349727
Not decided                      5.486339
Name: vote_trump_biden, dtype: float64
Result Biden +48.4 (80% CI: 43.9 to 52.8) (Weighted N=4575) (raw_moe=1.6pts, margin=4.8pts, sigma=3.7pts) (Biden >99.9% likely to win)
-
## NATIONAL WEIGHTED ##
Joe Biden, the Democrat         49.905696
Donald Trump, the Republican    40.229647
Another candidate                3.491332
Not decided                      6.373325
dtype: float64
Result Biden +9.7 (80% CI: 4.6 to 14.7) (Weighted N=1899) (raw_moe=2.9pts, margin=5.4pts, sigma=4.2pts) (Biden 82.9% likely to win)
-
## NATIONAL WEIGHTED + RV ##
Joe Biden, the Democrat         51.043475
Donald Trump, the Republican    41.121272
Another candidate                3.581995
Not decided                      4.253258
dtype: float64
Result Biden +9.9 (80% CI: 5.2 to 14.6) (Weighted N=1857) (raw_moe=2.9pts, ma