## Libraries

In [33]:
import pandas as pd
import numpy as np
import scipy.stats as stat

from math import sqrt
from mlgear.utils import show, display_columns
from surveyweights import normalize_weights, run_weighting_iteration


def margin_of_error(n=None, sd=None, p=None, type='proportion', interval_size=0.95):
    z_lookup = {0.8: 1.28, 0.85: 1.44, 0.9: 1.65, 0.95: 1.96, 0.99: 2.58}
    if interval_size not in z_lookup.keys():
        raise ValueError('{} not a valid `interval_size` - must be {}'.format(interval_size,
                                                                              ', '.join(list(z_lookup.keys()))))
    if type == 'proportion':
        se = sqrt(p * (1 - p)) / sqrt(n)
    elif type == 'continuous':
        se = sd / sqrt(n)
    else:
        raise ValueError('{} not a valid `type` - must be proportion or continuous')
    
    z = z_lookup[interval_size]
    return se * z


def print_pct(pct, digits=0):
    pct = pct * 100
    pct = np.round(pct, digits)
    if pct >= 100:
        if digits == 0:
            val = '>99.0%'
        else:
            val = '>99.'
            for d in range(digits - 1):
                val += '9'
            val += '9%'
    elif pct <= 0:
        if digits == 0:
            val = '<0.1%'
        else:
            val = '<0.'
            for d in range(digits - 1):
                val += '0'
            val += '1%'
    else:
        val = '{}%'.format(pct)
    return val


def calc_result(biden_vote, trump_vote, n, interval=0.8):
    GENERAL_POLLING_ERROR = 7.5
    TIME_SHIFT_ERROR = 1.0
    N_SIMS = 100000
    
    biden_moe = margin_of_error(n=n, p=biden_vote/100, interval_size=interval)
    trump_moe = margin_of_error(n=n, p=trump_vote/100, interval_size=interval)
    undecided = (100 - biden_vote - trump_vote) / 2

    biden_mean = biden_vote + undecided * 0.25
    biden_raw_moe = biden_moe * 100
    biden_allocate_undecided = undecided * 0.4
    biden_margin = biden_raw_moe + biden_allocate_undecided + GENERAL_POLLING_ERROR + TIME_SHIFT_ERROR
    
    trump_mean = trump_vote + undecided * 0.25
    trump_raw_moe = trump_moe * 100
    trump_allocate_undecided = undecided * 0.4
    trump_margin = trump_raw_moe + trump_allocate_undecided + GENERAL_POLLING_ERROR + TIME_SHIFT_ERROR
    
    cdf_value = 0.5 + 0.5 * interval
    normed_sigma = stat.norm.ppf(cdf_value)
    
    biden_sigma = biden_margin / 100 / normed_sigma
    biden_sims = np.random.normal(biden_mean / 100, biden_sigma, N_SIMS)
    
    trump_sigma = trump_margin / 100 / normed_sigma
    trump_sims = np.random.normal(trump_mean / 100, trump_sigma, N_SIMS)
    
    chance_pass = np.sum([sim[0] > sim[1] for sim in zip(biden_sims, trump_sims)]) / N_SIMS
    
    low, high = np.percentile(biden_sims - trump_sims, [20, 80]) * 100
    
    return {'mean': biden_mean - trump_mean, 'high': high, 'low': low, 'n': n,
            'raw_moe': biden_raw_moe + trump_raw_moe,
            'margin': (biden_margin + trump_margin) / 2,
            'sigma': (biden_sigma + trump_sigma) / 2,
            'chance_pass': chance_pass}


def print_result(mean, high, low, n, raw_moe, margin, sigma, chance_pass):
    mean = np.round(mean, 1)
    first = np.round(high, 1)
    second = np.round(low, 1)
    sigma = np.round(sigma * 100, 1)
    raw_moe = np.round(raw_moe, 1)
    margin = np.round(margin, 1)
    chance_pass = print_pct(chance_pass, 1)
    if second < first:
        _ = first
        first = second
        second = _
    if second > 100:
        second = 100
    if first < -100:
        first = -100
    print(('Result Biden {} (80% CI: {} to {}) (Weighted N={}) (raw_moe={}pts, margin={}pts, '
           'sigma={}pts) (Biden {} likely to win)').format(mean,
                                                           first,
                                                           second,
                                                           n,
                                                           raw_moe,
                                                           margin,
                                                           sigma,
                                                           chance_pass))
    print('-')
    



def calc_result_sen(dem_vote, rep_vote, n, interval=0.8):
    GENERAL_POLLING_ERROR = 5.0
    TIME_SHIFT_ERROR = 0.5
    N_SIMS = 100000
    
    dem_moe = margin_of_error(n=n, p=dem_vote/100, interval_size=interval)
    rep_moe = margin_of_error(n=n, p=rep_vote/100, interval_size=interval)
    undecided = 100 - dem_vote - rep_vote

    dem_mean = dem_vote + undecided * 0.25
    dem_raw_moe = dem_moe * 100
    dem_allocate_undecided = undecided * 0.4
    dem_margin = dem_raw_moe + dem_allocate_undecided + GENERAL_POLLING_ERROR + TIME_SHIFT_ERROR
    
    rep_mean = rep_vote + undecided * 0.25
    rep_raw_moe = rep_moe * 100
    rep_allocate_undecided = undecided * 0.4
    rep_margin = rep_raw_moe + rep_allocate_undecided + GENERAL_POLLING_ERROR + TIME_SHIFT_ERROR
    
    cdf_value = 0.5 + 0.5 * interval
    normed_sigma = stat.norm.ppf(cdf_value)
    
    dem_sigma = dem_margin / 100 / normed_sigma
    dem_sims = np.random.normal(dem_mean / 100, dem_sigma, N_SIMS)
    
    rep_sigma = rep_margin / 100 / normed_sigma
    rep_sims = np.random.normal(rep_mean / 100, rep_sigma, N_SIMS)
    
    chance_pass = np.sum([sim[0] > sim[1] for sim in zip(dem_sims, rep_sims)]) / N_SIMS
    
    low, high = np.percentile(dem_sims - rep_sims, [20, 80]) * 100
    
    return {'mean': dem_mean - rep_mean, 'high': high, 'low': low, 'n': n,
            'raw_moe': dem_raw_moe + rep_raw_moe,
            'margin': (dem_margin + rep_margin) / 2,
            'sigma': (dem_sigma + rep_sigma) / 2,
            'chance_pass': chance_pass}


def print_result_sen(mean, high, low, n, raw_moe, margin, sigma, chance_pass):
    mean = np.round(mean, 1)
    first = np.round(high, 1)
    second = np.round(low, 1)
    sigma = np.round(sigma * 100, 1)
    raw_moe = np.round(raw_moe, 1)
    margin = np.round(margin, 1)
    chance_pass = print_pct(chance_pass, 1)
    if second < first:
        _ = first
        first = second
        second = _
    if second > 100:
        second = 100
    if first < -100:
        first = -100
    print(('Result Hegar (D) {} (80% CI: {} to {}) (Weighted N={}) (raw_moe={}pts, margin={}pts, '
           'sigma={}pts) (Hegar {} likely to win)').format(mean,
                                                           first,
                                                           second,
                                                           n,
                                                           raw_moe,
                                                           margin,
                                                           sigma,
                                                           chance_pass))
    print('-')

## Load Processed Data

In [34]:
survey = pd.read_csv('responses_processed_national_weighted.csv').fillna('Not presented')

## State Presidential Models

In [39]:
POTUS_CENSUS = {'Alabama': {'Hillary Clinton': 0.3436, 'Donald Trump': 0.6208},
                'Alaska': {'Hillary Clinton': 0.3655, 'Donald Trump': 0.5128},
                'Arizona': {'Hillary Clinton': 0.4513, 'Donald Trump': 0.4867},
                'Arkansas': {'Hillary Clinton': 0.3365, 'Donald Trump': 0.6057},
                'California': {'Hillary Clinton': 0.6173, 'Donald Trump': 0.3162},
                'Colorado': {'Hillary Clinton': 0.4816, 'Donald Trump': 0.4325},
                'Connecticut': {'Hillary Clinton': 0.5457, 'Donald Trump': 0.4093},
                'Delaware': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461}, # RFH
                'Washington DC': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Florida': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Georgia': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Hawaii': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Idaho': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Illinois': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Indiana': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Iowa': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Kansas': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Kentucky': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Louisiana': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Maine': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Maryland': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Massachusetts': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Michigan': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Minnesota': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Mosourri': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Montana': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Nebraska': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Nevada': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'New Hampshire': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'New Jersey': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'New Mexico': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'New York': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'North Carolina': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'North Dakota': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Ohio': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Oklahoma': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Oregon': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Pennsylvania': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Rhode Island': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'South Carolina': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'South Dakota': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Tennessee': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Texas': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Utah': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Vermont': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Virginia': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Washington': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'West Virginia': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Wisconsin': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461},
                'Wyoming': {'Hillary Clinton': 0.482, 'Donald Trump': 0.461 }}

for state in POTUS_CENSUS.keys():
    print('## {} ##'.format(state.upper()))
    state_survey = survey.copy()
    potus_census = {'vote2016': POTUS_CENSUS[state].copy()}
    potus_census['vote2016']['Other'] = 1 - potus_census['vote2016']['Hillary Clinton'] - potus_census['vote2016']['Donald Trump']
    output = run_weighting_iteration(state_survey, census=potus_census, weigh_on=['vote2016'], verbose=0)
    potus_weights = output['weights']['vote2016']
    potus_weights = state_survey['vote2016'].astype(str).replace(potus_weights)
    state_survey['weight'] = normalize_weights(state_survey['weight'] * potus_weights)
    state_survey['lv_weight'] = normalize_weights(state_survey['weight'] * state_survey['lv_index'])

    options = ['Donald Trump', 'Hillary Clinton', 'Other']
    survey_ = state_survey.loc[state_survey['vote2016'].isin(options)].copy()
    survey_['weight'] = normalize_weights(survey_['weight'])
    survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
    survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
    lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
    votes = survey_['vote2016'].value_counts(normalize=True) * survey_.groupby('vote2016')['lv_weight'].mean() * 100
    votes = votes[options] * (100 / votes[options].sum())
    raw_result = potus_census['vote2016']['Hillary Clinton'] - potus_census['vote2016']['Donald Trump']
    print('Raw result: {}'.format(np.round(raw_result * 100, 1)))
    print(votes)
    print('-')
    print_result(**calc_result(biden_vote=votes['Hillary Clinton'],
                               trump_vote=votes['Donald Trump'],
                               n=lv_weighted_n))

    options = ['Joe Biden, the Democrat', 'Donald Trump, the Republican', 'Another candidate', 'Not decided']
    survey_ = state_survey.loc[state_survey['vote_trump_biden'].isin(options)].copy()
    survey_['weight'] = normalize_weights(survey_['weight'])
    survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])

    votes = survey_['vote_trump_biden'].value_counts(normalize=True) * survey_.groupby('vote_trump_biden')['lv_weight'].mean() * 100
    votes = votes[options] * (100 / votes[options].sum())
    print(votes)
    print('-')
    print_result(**calc_result(biden_vote=votes['Joe Biden, the Democrat'],
                               trump_vote=votes['Donald Trump, the Republican'],
                               n=lv_weighted_n))
    print('-')

## ALABAMA ##
Raw result: -27.7
Donald Trump       62.432885
Hillary Clinton    33.965672
Other               3.601443
dtype: float64
-
Result Biden -28.5 (80% CI: -38.6 to -18.3) (Weighted N=1238) (raw_moe=3.5pts, margin=11.0pts, sigma=8.6pts) (Biden 0.9% likely to win)
-
Joe Biden, the Democrat         41.552825
Donald Trump, the Republican    51.899543
Another candidate                3.073792
Not decided                      3.473839
dtype: float64
-
Result Biden -10.3 (80% CI: -21.1 to 0.5) (Weighted N=1238) (raw_moe=3.6pts, margin=11.6pts, sigma=9.1pts) (Biden 21.0% likely to win)
-
-
## ALASKA ##
Raw result: -14.7
Donald Trump       51.564426
Hillary Clinton    36.125587
Other              12.309987
dtype: float64
-
Result Biden -15.4 (80% CI: -27.3 to -3.7) (Weighted N=1294) (raw_moe=3.5pts, margin=12.7pts, sigma=9.9pts) (Biden 13.5% likely to win)
-
Joe Biden, the Democrat         45.286516
Donald Trump, the Republican    46.216785
Another candidate                4.823058
Not

Result Biden 1.3 (80% CI: -9.3 to 11.9) (Weighted N=1329) (raw_moe=3.5pts, margin=11.4pts, sigma=8.9pts) (Biden 54.1% likely to win)
-
Joe Biden, the Democrat         51.374217
Donald Trump, the Republican    41.473600
Another candidate                3.577676
Not decided                      3.574506
dtype: float64
-
Result Biden 9.9 (80% CI: -0.8 to 20.7) (Weighted N=1329) (raw_moe=3.5pts, margin=11.7pts, sigma=9.1pts) (Biden 78.1% likely to win)
-
-
## IOWA ##
Raw result: 2.1
Donald Trump       46.466485
Hillary Clinton    47.754170
Other               5.779345
dtype: float64
-
Result Biden 1.3 (80% CI: -9.3 to 12.0) (Weighted N=1329) (raw_moe=3.5pts, margin=11.4pts, sigma=8.9pts) (Biden 54.1% likely to win)
-
Joe Biden, the Democrat         51.374217
Donald Trump, the Republican    41.473600
Another candidate                3.577676
Not decided                      3.574506
dtype: float64
-
Result Biden 9.9 (80% CI: -1.0 to 20.8) (Weighted N=1329) (raw_moe=3.5pts, margin=11.7pts, s

Raw result: 2.1
Donald Trump       46.466485
Hillary Clinton    47.754170
Other               5.779345
dtype: float64
-
Result Biden 1.3 (80% CI: -9.3 to 11.8) (Weighted N=1329) (raw_moe=3.5pts, margin=11.4pts, sigma=8.9pts) (Biden 54.0% likely to win)
-
Joe Biden, the Democrat         51.374217
Donald Trump, the Republican    41.473600
Another candidate                3.577676
Not decided                      3.574506
dtype: float64
-
Result Biden 9.9 (80% CI: -1.0 to 20.6) (Weighted N=1329) (raw_moe=3.5pts, margin=11.7pts, sigma=9.1pts) (Biden 77.9% likely to win)
-
-
## NEW MEXICO ##
Raw result: 2.1
Donald Trump       46.466485
Hillary Clinton    47.754170
Other               5.779345
dtype: float64
-
Result Biden 1.3 (80% CI: -9.3 to 11.9) (Weighted N=1329) (raw_moe=3.5pts, margin=11.4pts, sigma=8.9pts) (Biden 54.3% likely to win)
-
Joe Biden, the Democrat         51.374217
Donald Trump, the Republican    41.473600
Another candidate                3.577676
Not decided              

Raw result: 2.1
Donald Trump       46.466485
Hillary Clinton    47.754170
Other               5.779345
dtype: float64
-
Result Biden 1.3 (80% CI: -9.4 to 11.8) (Weighted N=1329) (raw_moe=3.5pts, margin=11.4pts, sigma=8.9pts) (Biden 53.9% likely to win)
-
Joe Biden, the Democrat         51.374217
Donald Trump, the Republican    41.473600
Another candidate                3.577676
Not decided                      3.574506
dtype: float64
-
Result Biden 9.9 (80% CI: -0.8 to 20.8) (Weighted N=1329) (raw_moe=3.5pts, margin=11.7pts, sigma=9.1pts) (Biden 78.2% likely to win)
-
-
## VIRGINIA ##
Raw result: 2.1
Donald Trump       46.466485
Hillary Clinton    47.754170
Other               5.779345
dtype: float64
-
Result Biden 1.3 (80% CI: -9.3 to 11.9) (Weighted N=1329) (raw_moe=3.5pts, margin=11.4pts, sigma=8.9pts) (Biden 54.0% likely to win)
-
Joe Biden, the Democrat         51.374217
Donald Trump, the Republican    41.473600
Another candidate                3.577676
Not decided                

## Senate Models

In [36]:
options = ['A Democratic candidate', 'A Republican candidate', 'Another candidate', 'Not decided']
survey_ = tx_national_survey.loc[survey['vote_senate'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
    
print('## NATIONAL TX-WEIGHTED ##')
weighted_n = int(np.round(survey_['weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_senate'].value_counts(normalize=True) * survey_.groupby('vote_senate')['weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_sen(**calc_result_sen(dem_vote=votes['A Democratic candidate'],
                                   rep_vote=votes['A Republican candidate'],
                                   n=weighted_n))

print('## NATIONAL TX-WEIGHTED + RV ##')
rv_weighted_n = int(np.round(survey_['rv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_senate'].value_counts(normalize=True) * survey_.groupby('vote_senate')['rv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_sen(**calc_result_sen(dem_vote=votes['A Democratic candidate'],
                                   rep_vote=votes['A Republican candidate'],
                                   n=rv_weighted_n))

print('## NATIONAL TX-WEIGHTED + LV ##')
lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_senate'].value_counts(normalize=True) * survey_.groupby('vote_senate')['lv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_sen(**calc_result_sen(dem_vote=votes['A Democratic candidate'],
                                   rep_vote=votes['A Republican candidate'],
                                   n=lv_weighted_n))
    
print('## TEXAS TX-WEIGHTED ##')
survey_ = tx_state_survey.loc[tx_state_survey['vote_senate'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
weighted_n = int(np.round(survey_['weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_senate'].value_counts(normalize=True) * survey_.groupby('vote_senate')['weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_sen(**calc_result_sen(dem_vote=votes['A Democratic candidate'],
                                   rep_vote=votes['A Republican candidate'],
                                   n=weighted_n))

print('## TEXAS TX-WEIGHTED + RV ##')
rv_weighted_n = int(np.round(survey_['rv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_senate'].value_counts(normalize=True) * survey_.groupby('vote_senate')['rv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_sen(**calc_result_sen(dem_vote=votes['A Democratic candidate'],
                                   rep_vote=votes['A Republican candidate'],
                                   n=rv_weighted_n))

print('## TEXAS TX-WEIGHTED + LV ##')
lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_senate'].value_counts(normalize=True) * survey_.groupby('vote_senate')['lv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_sen(**calc_result_sen(dem_vote=votes['A Democratic candidate'],
                                   rep_vote=votes['A Republican candidate'],
                                   n=lv_weighted_n))

NameError: name 'tx_national_survey' is not defined