## Libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stat

from math import sqrt
from mlgear.utils import show, display_columns
from surveyweights import normalize_weights


def margin_of_error(n=None, sd=None, p=None, type='proportion', interval_size=0.95):
    z_lookup = {0.8: 1.28, 0.85: 1.44, 0.9: 1.65, 0.95: 1.96, 0.99: 2.58}
    if interval_size not in z_lookup.keys():
        raise ValueError('{} not a valid `interval_size` - must be {}'.format(interval_size,
                                                                              ', '.join(list(z_lookup.keys()))))
    if type == 'proportion':
        se = sqrt(p * (1 - p)) / sqrt(n)
    elif type == 'continuous':
        se = sd / sqrt(n)
    else:
        raise ValueError('{} not a valid `type` - must be proportion or continuous')
    
    z = z_lookup[interval_size]
    return se * z


def print_pct(pct, digits=0):
    pct = pct * 100
    pct = np.round(pct, digits)
    if pct >= 100:
        if digits == 0:
            val = '>99.0%'
        else:
            val = '>99.'
            for d in range(digits - 1):
                val += '9'
            val += '9%'
    elif pct <= 0:
        if digits == 0:
            val = '<0.1%'
        else:
            val = '<0.'
            for d in range(digits - 1):
                val += '0'
            val += '1%'
    else:
        val = '{}%'.format(pct)
    return val


def calc_result(for_vote, against_vote, n, interval=0.8):
    GENERAL_POLLING_ERROR = 4.0
    ACQUIESENCE_BIAS = -5.0
    N_SIMS = 10000000
    
    for_moe = margin_of_error(n=n, p=for_vote/100, interval_size=interval)
    against_moe = margin_of_error(n=n, p=against_vote/100, interval_size=interval)
    undecided = 100 - for_vote - against_vote
    mean = for_vote + undecided * 0.25 + ACQUIESENCE_BIAS
    raw_moe = for_moe * 100 + against_moe * 100
    
    allocate_undecided = undecided * 0.4
    margin = raw_moe + allocate_undecided + GENERAL_POLLING_ERROR
    
    cdf_value = 0.5 + 0.5 * interval
    normed_sigma = stat.norm.ppf(cdf_value)
    sigma = margin / 100 / normed_sigma
    
    sims = np.random.normal(mean / 100, sigma, N_SIMS)
    chance_pass = np.sum([s > 0.5 for s in sims]) / N_SIMS
    low, high = np.percentile(sims, [20, 80]) * 100
    
    return {'mean': mean, 'high': high, 'low': low, 'n': n,
            'raw_moe': raw_moe, 'margin': margin, 'sigma': sigma, 'chance_pass': chance_pass}


def print_result(mean, high, low, n, raw_moe, margin, sigma, chance_pass):
    mean = np.round(mean, 1)
    first = np.round(high, 1)
    second = np.round(low, 1)
    sigma = np.round(sigma * 100, 1)
    raw_moe = np.round(raw_moe, 1)
    margin = np.round(margin, 1)
    chance_pass = print_pct(chance_pass, 1)
    if second < first:
        _ = first
        first = second
        second = _
    if second > 100:
        second = 100
    if first < 0:
        first = 0
    print(('Result {} (80% CI: {} to {}) (N={}) (raw_moe={}pts, margin={}pts, '
           'sigma={}pts) ({} likely to pass)').format(mean,
                                                      first,
                                                      second,
                                                      n,
                                                      raw_moe,
                                                      margin,
                                                      sigma,
                                                      chance_pass))
    print(('{} (80% CI: {} to {}) ({})').format(mean,
                                                first,
                                                second,
                                                chance_pass))
    print('-')
    

def calc_result_tb(biden_vote, trump_vote, n, interval=0.8):
    GENERAL_POLLING_ERROR = 2.0
    N_SIMS = 100000
    
    biden_moe = margin_of_error(n=n, p=biden_vote/100, interval_size=interval)
    trump_moe = margin_of_error(n=n, p=trump_vote/100, interval_size=interval)
    undecided = (100 - biden_vote - trump_vote) / 2

    biden_mean = biden_vote + undecided * 0.25
    biden_raw_moe = biden_moe * 100
    biden_allocate_undecided = undecided * 0.4
    biden_margin = biden_raw_moe + biden_allocate_undecided + GENERAL_POLLING_ERROR
    
    trump_mean = trump_vote + undecided * 0.25
    trump_raw_moe = trump_moe * 100
    trump_allocate_undecided = undecided * 0.4
    trump_margin = trump_raw_moe + trump_allocate_undecided + GENERAL_POLLING_ERROR
    
    cdf_value = 0.5 + 0.5 * interval
    normed_sigma = stat.norm.ppf(cdf_value)
    
    biden_sigma = biden_margin / 100 / normed_sigma
    biden_sims = np.random.normal(biden_mean / 100, biden_sigma, N_SIMS)
    
    trump_sigma = trump_margin / 100 / normed_sigma
    trump_sims = np.random.normal(trump_mean / 100, trump_sigma, N_SIMS)
    
    chance_pass = np.sum([sim[0] > sim[1] for sim in zip(biden_sims, trump_sims)]) / N_SIMS
    
    low, high = np.percentile(biden_sims - trump_sims, [20, 80]) * 100
    
    return {'mean': biden_mean - trump_mean, 'high': high, 'low': low, 'n': n,
            'raw_moe': biden_raw_moe + trump_raw_moe,
            'margin': biden_margin + trump_margin,
            'sigma': np.mean([biden_sigma, trump_sigma]),
            'chance_pass': chance_pass}


def print_result_tb(mean, high, low, n, raw_moe, margin, sigma, chance_pass):
    mean = np.round(mean, 1)
    first = np.round(high, 1)
    second = np.round(low, 1)
    sigma = np.round(sigma * 100, 1)
    raw_moe = np.round(raw_moe, 1)
    margin = np.round(margin, 1)
    chance_pass = print_pct(chance_pass, 1)
    if second < first:
        _ = first
        first = second
        second = _
    if second > 100:
        second = 100
    if first < -100:
        first = -100
    print(('Result Biden {} (80% CI: {} to {}) (Weighted N={}) (raw_moe={}pts, margin={}pts, '
           'sigma={}pts) (Biden {} likely to win)').format(mean,
                                                           first,
                                                           second,
                                                           n,
                                                           raw_moe,
                                                           margin,
                                                           sigma,
                                                           chance_pass))
    print('-')

## Load Processed Data

In [2]:
ca_national_survey = pd.read_csv('responses_processed_ca_weighted.csv').fillna('Not presented')
ca_state_survey = pd.read_csv('responses_processed_ca_state_ca_weighted.csv').fillna('Not presented')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## California Trump-Clinton

In [3]:
options = ['Donald Trump', 'Hillary Clinton', 'Other']
survey_ = ca_national_survey.loc[ca_national_survey['vote2016'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
survey_['vote2016'].value_counts(normalize=True) * survey_.groupby('vote2016')['weight'].mean() * 100

Donald Trump       31.62
Hillary Clinton    61.73
Other               6.65
dtype: float64

In [4]:
options = ['Donald Trump', 'Hillary Clinton', 'Other']
survey_ = ca_state_survey.loc[ca_state_survey['vote2016'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
survey_['vote2016'].value_counts(normalize=True) * survey_.groupby('vote2016')['weight'].mean() * 100

Donald Trump       30.907333
Hillary Clinton    62.624379
Other               6.468288
dtype: float64

## California Trump-Biden

In [5]:
options = ['Joe Biden, the Democrat', 'Donald Trump, the Republican', 'Another candidate', 'Not decided']
survey_ = ca_national_survey.loc[ca_national_survey['vote_trump_biden'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])

print('## NATIONAL CA-WEIGHTED + LV ##')
lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_trump_biden'].value_counts(normalize=True) * survey_.groupby('vote_trump_biden')['lv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_tb(**calc_result_tb(biden_vote=votes['Joe Biden, the Democrat'],
                                 trump_vote=votes['Donald Trump, the Republican'],
                                 n=lv_weighted_n))

survey_ = ca_state_survey.loc[ca_state_survey['vote_trump_biden'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])
print('## CALIFORNIA CA-WEIGHTED + LV ##')
lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_trump_biden'].value_counts(normalize=True) * survey_.groupby('vote_trump_biden')['lv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result_tb(**calc_result_tb(biden_vote=votes['Joe Biden, the Democrat'],
                                 trump_vote=votes['Donald Trump, the Republican'],
                                 n=lv_weighted_n))

## NATIONAL CA-WEIGHTED + LV ##
Joe Biden, the Democrat         60.703037
Donald Trump, the Republican    31.804225
Another candidate                3.838530
Not decided                      3.654208
dtype: float64
Result Biden 28.9 (80% CI: 24.4 to 33.4) (Weighted N=1904) (raw_moe=2.8pts, margin=9.8pts, sigma=3.8pts) (Biden >99.9% likely to win)
-
## CALIFORNIA CA-WEIGHTED + LV ##
Joe Biden, the Democrat         67.684460
Donald Trump, the Republican    23.068013
Another candidate                4.240733
Not decided                      5.006795
dtype: float64
Result Biden 44.6 (80% CI: 38.3 to 50.9) (Weighted N=366) (raw_moe=5.9pts, margin=13.6pts, sigma=5.3pts) (Biden >99.9% likely to win)
-


## California Propositions

In [6]:
# Comprehension issues (~15% raw support for a 100% income tax) suggest an acquiesence bias of ~5pts

options = ['Vote for / Support', 'Vote against / Oppose', 'Don’t know / Undecided']

survey_ = ca_national_survey.loc[ca_national_survey['vote_measure_100pct_income_tax'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])

print('## vote_measure_100pct_income_tax NATIONAL CA-WEIGHTED + LV ##')
lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_measure_100pct_income_tax'].value_counts(normalize=True) * survey_.groupby('vote_measure_100pct_income_tax')['lv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result(**calc_result(for_vote=votes['Vote for / Support'],
                           against_vote=votes['Vote against / Oppose'],
                           n=lv_weighted_n))
    
survey_ = ca_state_survey.loc[ca_state_survey['vote_measure_100pct_income_tax'].isin(options)].copy()
survey_['weight'] = normalize_weights(survey_['weight'])
survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])

print('## vote_measure_100pct_income_tax CALI CA-WEIGHTED + LV ##')
lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
votes = survey_['vote_measure_100pct_income_tax'].value_counts(normalize=True) * survey_.groupby('vote_measure_100pct_income_tax')['lv_weight'].mean() * 100
votes = votes[options] * (100 / votes[options].sum())
print(votes)
print_result(**calc_result(for_vote=votes['Vote for / Support'],
                           against_vote=votes['Vote against / Oppose'],
                           n=lv_weighted_n))
print('-')

## vote_measure_100pct_income_tax NATIONAL CA-WEIGHTED + LV ##
Vote for / Support        15.467064
Vote against / Oppose     72.365818
Don’t know / Undecided    12.167119
dtype: float64
Result 13.5 (80% CI: 6.2 to 20.8) (N=2027) (raw_moe=2.3pts, margin=11.2pts, sigma=8.7pts) (<0.1% likely to pass)
13.5 (80% CI: 6.2 to 20.8) (<0.1%)
-
## vote_measure_100pct_income_tax CALI CA-WEIGHTED + LV ##
Vote for / Support        13.895341
Vote against / Oppose     70.498738
Don’t know / Undecided    15.605922
dtype: float64
Result 12.8 (80% CI: 2.6 to 23.0) (N=387) (raw_moe=5.2pts, margin=15.5pts, sigma=12.1pts) (0.1% likely to pass)
12.8 (80% CI: 2.6 to 23.0) (0.1%)
-
-


In [8]:
ca_measures = sorted([c for c in ca_national_survey.columns if 'vote_measure_ca_' in c and 'meta' not in c])

for measure in ca_measures:        
    survey_ = ca_national_survey.loc[ca_national_survey[measure].isin(options)].copy()
    survey_['weight'] = normalize_weights(survey_['weight'])
    survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
    survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])

    print('## {} NATIONAL CA-WEIGHTED + LV ##'.format(measure))
    lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
    votes = survey_[measure].value_counts(normalize=True) * survey_.groupby(measure)['lv_weight'].mean() * 100
    votes = votes[options] * (100 / votes[options].sum())
    print(votes)
    print('Raw N: {}'.format(len(survey_)))
    print_result(**calc_result(for_vote=votes['Vote for / Support'],
                               against_vote=votes['Vote against / Oppose'],
                               n=lv_weighted_n))
    
    survey_ = ca_state_survey.loc[ca_state_survey[measure].isin(options)].copy()
    survey_['weight'] = normalize_weights(survey_['weight'])
    survey_['rv_weight'] = normalize_weights(survey_['rv_weight'])
    survey_['lv_weight'] = normalize_weights(survey_['lv_weight'])

    print('## {} CALI CA-WEIGHTED + LV ##'.format(measure))
    lv_weighted_n = int(np.round(survey_['lv_weight'].apply(lambda w: 1 if w > 1 else w).sum()))
    votes = survey_[measure].value_counts(normalize=True) * survey_.groupby(measure)['lv_weight'].mean() * 100
    votes = votes[options] * (100 / votes[options].sum())
    print(votes)
    print('(Cali) Raw N: {}'.format(len(survey_)))
    print_result(**calc_result(for_vote=votes['Vote for / Support'],
                               against_vote=votes['Vote against / Oppose'],
                               n=lv_weighted_n))
    print('-')

## vote_measure_ca_prop14_stem_cell NATIONAL CA-WEIGHTED + LV ##
Vote for / Support        55.999621
Vote against / Oppose     25.651585
Don’t know / Undecided    18.348795
dtype: float64
Raw N: 1673
Result 55.6 (80% CI: 45.2 to 66.0) (N=693) (raw_moe=4.5pts, margin=15.9pts, sigma=12.4pts) (67.4% likely to pass)
55.6 (80% CI: 45.2 to 66.0) (67.4%)
-
## vote_measure_ca_prop14_stem_cell CALI CA-WEIGHTED + LV ##
Vote for / Support        52.307280
Vote against / Oppose     32.201597
Don’t know / Undecided    15.491123
dtype: float64
(Cali) Raw N: 885
Result 51.2 (80% CI: 40.0 to 62.3) (N=333) (raw_moe=6.8pts, margin=17.0pts, sigma=13.2pts) (53.6% likely to pass)
51.2 (80% CI: 40.0 to 62.3) (53.6%)
-
-
## vote_measure_ca_prop15_property_tax NATIONAL CA-WEIGHTED + LV ##
Vote for / Support        57.559648
Vote against / Oppose     26.645505
Don’t know / Undecided    15.794847
dtype: float64
Raw N: 1663
Result 56.5 (80% CI: 46.7 to 66.4) (N=657) (raw_moe=4.7pts, margin=15.0pts, sigma=11.7pts