## Libraries

In [9]:
import pandas as pd
import numpy as np
import scipy.stats as stat

from math import sqrt
from mlgear.utils import show, display_columns
from surveyweights import normalize_weights


def margin_of_error(n=None, sd=None, p=None, type='proportion', interval_size=0.95):
    z_lookup = {0.8: 1.28, 0.85: 1.44, 0.9: 1.65, 0.95: 1.96, 0.99: 2.58}
    if interval_size not in z_lookup.keys():
        raise ValueError('{} not a valid `interval_size` - must be {}'.format(interval_size,
                                                                              ', '.join(list(z_lookup.keys()))))
    if type == 'proportion':
        se = sqrt(p * (1 - p)) / sqrt(n)
    elif type == 'continuous':
        se = sd / sqrt(n)
    else:
        raise ValueError('{} not a valid `type` - must be proportion or continuous')
    
    z = z_lookup[interval_size]
    return se * z


def print_pct(pct, digits=0):
    pct = pct * 100
    pct = np.round(pct, digits)
    if pct >= 100:
        if digits == 0:
            val = '>99.0%'
        else:
            val = '>99.'
            for d in range(digits - 1):
                val += '9'
            val += '9%'
    elif pct <= 0:
        if digits == 0:
            val = '<0.1%'
        else:
            val = '<0.'
            for d in range(digits - 1):
                val += '0'
            val += '1%'
    else:
        val = '{}%'.format(pct)
    return val


def calc_result(dem_vote, rep_vote, n, interval=0.8):
    GENERAL_POLLING_ERROR = 3.0
    TIME_SHIFT_ERROR = 1.0
    N_SIMS = 100000
    
    dem_moe = margin_of_error(n=n, p=dem_vote/100, interval_size=interval)
    rep_moe = margin_of_error(n=n, p=rep_vote/100, interval_size=interval)
    undecided = 100 - dem_vote - rep_vote

    dem_mean = dem_vote + undecided * 0.25
    dem_raw_moe = dem_moe * 100
    dem_allocate_undecided = undecided * 0.4
    dem_margin = dem_raw_moe + dem_allocate_undecided + GENERAL_POLLING_ERROR + TIME_SHIFT_ERROR
    
    rep_mean = rep_vote + undecided * 0.25
    rep_raw_moe = rep_moe * 100
    rep_allocate_undecided = undecided * 0.4
    rep_margin = rep_raw_moe + rep_allocate_undecided + GENERAL_POLLING_ERROR + TIME_SHIFT_ERROR
    
    cdf_value = 0.5 + 0.5 * interval
    normed_sigma = stat.norm.ppf(cdf_value)
    
    dem_sigma = dem_margin / 100 / normed_sigma
    dem_sims = np.random.normal(dem_mean / 100, dem_sigma, N_SIMS)
    
    rep_sigma = rep_margin / 100 / normed_sigma
    rep_sims = np.random.normal(rep_mean / 100, rep_sigma, N_SIMS)
    
    chance_pass = np.sum([sim[0] > sim[1] for sim in zip(dem_sims, rep_sims)]) / N_SIMS
    
    low, high = np.percentile(dem_sims - rep_sims, [20, 80]) * 100
    
    return {'mean': dem_mean - rep_mean, 'high': high, 'low': low, 'n': n,
            'raw_moe': dem_raw_moe + rep_raw_moe,
            'margin': dem_margin + rep_margin,
            'sigma': np.mean([dem_sigma, rep_sigma]),
            'chance_pass': chance_pass}


def print_result(mean, high, low, n, raw_moe, margin, sigma, chance_pass):
    mean = np.round(mean, 1)
    first = np.round(high, 1)
    second = np.round(low, 1)
    sigma = np.round(sigma * 100, 1)
    raw_moe = np.round(raw_moe, 1)
    margin = np.round(margin, 1)
    chance_pass = print_pct(chance_pass, 1)
    if second < first:
        _ = first
        first = second
        second = _
    if second > 100:
        second = 100
    if first < -100:
        first = -100
    print(('Result Dems +{} (80% CI: {} to {}) (Weighted N={}) (raw_moe={}pts, margin={}pts, '
           'sigma={}pts) (Dems {} likely to win)').format(mean,
                                                          first,
                                                          second,
                                                          n,
                                                          raw_moe,
                                                          margin,
                                                          sigma,
                                                          chance_pass))
    print('-')

## Load Processed Data

In [10]:
survey = pd.read_csv('repsonses_processed_national_weighted.csv').fillna('Not presented')

## Generic Congressional Ballot

In [11]:
survey['vote_rep'].value_counts(normalize=True)

A Democratic candidate    0.587379
A Republican candidate    0.189927
Not decided               0.104976
Do not intend to vote     0.076659
Another candidate         0.018608
Unsure                    0.014361
I did not vote            0.008091
Name: vote_rep, dtype: float64

In [12]:
options = ['A Democratic candidate', 'A Republican candidate', 'Another candidate', 'Not decided']
survey_ = survey.loc[survey['vote_rep'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
    
print('## NATIONAL UNWEIGHTED ##')
n = len(survey_)
votes = survey_['vote_rep'].value_counts(normalize=True) * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result(**calc_result(dem_vote=votes['A Democratic candidate'],
                           rep_vote=votes['A Republican candidate'],
                           n=n))
    
print('## NATIONAL WEIGHTED ##')
weighted_n = int(np.round(survey_['weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_rep'].value_counts(normalize=True) * survey_.groupby('vote_rep')['weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result(**calc_result(dem_vote=votes['A Democratic candidate'],
                           rep_vote=votes['A Republican candidate'],
                           n=weighted_n))

print('## NATIONAL WEIGHTED + RV ##')
rv_weighted_n = int(np.round(survey_['rv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_rep'].value_counts(normalize=True) * survey.groupby('vote_rep')['rv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result(**calc_result(dem_vote=votes['A Democratic candidate'],
                           rep_vote=votes['A Republican candidate'],
                           n=rv_weighted_n))

print('## NATIONAL WEIGHTED + LV ##')
lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_rep'].value_counts(normalize=True) * survey.groupby('vote_rep')['lv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result(**calc_result(dem_vote=votes['A Democratic candidate'],
                           rep_vote=votes['A Republican candidate'],
                           n=lv_weighted_n))

## NATIONAL UNWEIGHTED ##
A Democratic candidate    65.199820
A Republican candidate    21.082173
Another candidate          2.065559
Not decided               11.652447
Name: vote_rep, dtype: float64
Result Dems +44.1 (80% CI: 34.5 to 53.7) (Weighted N=4454) (raw_moe=1.7pts, margin=20.7pts, sigma=8.1pts) (Dems >99.9% likely to win)
-
## NATIONAL WEIGHTED ##
A Democratic candidate    45.628474
A Republican candidate    40.001964
Another candidate          2.428476
Not decided               11.941085
dtype: float64
Result Dems +5.6 (80% CI: -4.8 to 16.1) (Weighted N=1845) (raw_moe=2.9pts, margin=22.4pts, sigma=8.8pts) (Dems 67.4% likely to win)
-
## NATIONAL WEIGHTED + RV ##
A Democratic candidate    47.744633
A Republican candidate    41.962123
Another candidate          2.392239
Not decided                7.901004
dtype: float64
Result Dems +5.8 (80% CI: -3.1 to 14.7) (Weighted N=1799) (raw_moe=3.0pts, margin=19.2pts, sigma=7.5pts) (Dems 70.6% likely to win)
-
## NATIONAL WEIGHTED + L