<h1>Process Poll Data</h1>

Process respondent-level data from the Monmouth University March 2020 national poll.

In [46]:
import os 
import numpy as np
import pandas as pd 

<h2>Rename Columns</h2>

In [47]:
# Load the raw poll archive; the .tab file is tab-delimited.
raw_poll_path = "../data/national_march_2020/MUP213_NATL_archive.tab"
nat_2020 = pd.read_csv(raw_poll_path, sep="\t")

In [48]:
# Map the poll's question codes to descriptive column names. Columns whose
# code is not listed here keep their original name.
question_labels = {
    'Q3': 'top_household_concern',
    'QD1': 'registered_vote',
    'Q11': 'likely_to_vote',
    'Q12': 'vote_choice',
    'Q12B': 'vote_choice_undecided',
    'Q13': 'approve_trump',
    'Q14': 'approve_biden',
    'Q16': 'elec_enthusiasm',
    'Q19': 'focused_imp_issues',
    'QD2': 'party',
    'QD2A': 'party_unaffiliated',
    'QD3': 'political_leaning',
    'QD5': 'age',
    'QD5A': 'age_bin',
    'QD7': 'latino',
    'QD8': 'race',
    'QD10': 'gender',
}
nat_2020 = nat_2020.rename(columns=question_labels)

In [49]:
# Discard every column whose name still contains a capital 'Q', i.e. the
# question columns that were not given a descriptive name above.
leftover_question_cols = [col for col in nat_2020.columns if 'Q' in col]
nat_2020 = nat_2020.drop(columns=leftover_question_cols)

<h2>Recode Variables</h2>

<h3>Vote Choice</h3>

In [50]:
# Respondents whose first answer was code 6, 8 or 9 are assigned their answer
# to the follow-up question instead; everyone else keeps the first answer.
# Done column-wise rather than with a row-by-row apply + scalar np.where,
# which is slow and returns 0-d arrays instead of plain floats.
initial_vote = nat_2020['vote_choice']
vote_choice_recoded = initial_vote.mask(
    initial_vote.isin([6.0, 8.0, 9.0]),
    nat_2020['vote_choice_undecided'],
)

# Fold code 9 into code 3 so both share a single value.
nat_2020['vote_choice_recoded'] = vote_choice_recoded.mask(
    vote_choice_recoded.isin([3.0, 9.0]), 3.0
)

<h3>Party Leaning</h3>

In [51]:
# Respondents whose party answer was code 4 or 9 are assigned their answer to
# the follow-up (unaffiliated) question; everyone else keeps the first answer.
# Done column-wise rather than with a row-by-row apply + scalar np.where,
# which is slow and returns 0-d arrays instead of plain floats.
initial_party = nat_2020['party']
party_recoded = initial_party.mask(
    initial_party.isin([4.0, 9.0]),
    nat_2020['party_unaffiliated'],
)

# Fold code 9 into code 3 so both share a single value.
nat_2020['party_recoded'] = party_recoded.mask(
    party_recoded.isin([3.0, 9.0]), 3.0
)

<h3>Age</h3>

Convert the raw ages into bins so that every respondent has a binned age.

In [52]:
# Bin the exact ages: 18-34 -> 1.0, 35-54 -> 2.0, 55 and over -> 3.0.
# Ages that are missing or below 18 stay NaN instead of being silently
# labelled as the oldest group.
age_years = nat_2020['age']
binned_from_years = pd.Series(
    np.select(
        [age_years.between(18, 34), age_years.between(35, 54), age_years > 54],
        [1.0, 2.0, 3.0],
        default=np.nan,
    ),
    index=nat_2020.index,
)

# Prefer the respondent's self-reported bin and fall back to the bin derived
# from the exact age. The reported bin is kept as-is: previously it was pushed
# through the year thresholds as well, which turned every bin code outside
# 18-54 into 3.0.
# NOTE(review): assumes age_bin already uses the same 1/2/3 coding as above --
# confirm against the poll codebook.
nat_2020['age_recoded'] = nat_2020['age_bin'].fillna(binned_from_years)

<h3>Approval Ratings</h3>

In [54]:
# Collapse each rating column into a boolean flag: True where the response
# code equals 1, False for every other code (including missing values).
for approval_col in ('approve_trump', 'approve_biden'):
    nat_2020[approval_col] = nat_2020[approval_col].eq(1)

0      1
1      1
2      1
3      2
4      9
      ..
846    2
847    2
848    1
849    2
850    1
Name: approve_trump, Length: 851, dtype: int64

In [55]:
# Display the approve_biden column for a quick visual check.
# NOTE(review): the saved output below shows int64 response codes, so it
# appears to predate the boolean conversion above -- re-run to refresh it.
nat_2020['approve_biden']

0      5
1      3
2      3
3      1
4      1
      ..
846    2
847    2
848    1
849    1
850    5
Name: approve_biden, Length: 851, dtype: int64

<h3>Race</h3>

Overwrite the respondent's race with the Hispanic/Latino category if they identify as Hispanic/Latino.

In [53]:
# Boolean flag: True where the latino response code equals 1.
nat_2020['latino'] = nat_2020['latino'].eq(1)

In [33]:
def recode_race(s):
    """Return the recoded race value for one respondent row.

    Parameters
    ----------
    s : mapping-like row (e.g. a pandas Series) exposing 'latino' and 'race'.

    Returns
    -------
    4 when the latino flag equals 1 or race is already 4; 9 when race is
    5 or 9; otherwise the original race value unchanged.
    """
    # Code 4 is presumably the Hispanic/Latino category -- confirm with codebook.
    if s['latino'] == 1 or s['race'] == 4:
        return 4

    race_code = s['race']
    if race_code == 5 or race_code == 9:
        return 9
    return race_code

In [34]:
# Run recode_race on every respondent row and store the result.
nat_2020['race_recoded'] = nat_2020.apply(recode_race, axis='columns')

<h3>Gender</h3>

In [37]:
# Boolean flag: True where the gender response code equals 1.
nat_2020['male'] = nat_2020['gender'].eq(1)

<h3>Save</h3>

In [38]:
# Write the cleaned frame to disk without the row index.
nat_2020.to_csv(path_or_buf="../data/nat_2020_cleaned.csv", index=False)

<h2>Add Dummy Variables</h2>

In [39]:
# Columns to one-hot encode. Each name appears exactly once: a repeated name
# would select the column twice and produce duplicated dummy columns.
categoricals = ['approve_trump', 'top_household_concern', 'registered_vote',
                'likely_to_vote', 'vote_choice', 'vote_choice_undecided', 'approve_biden',
                'elec_enthusiasm', 'focused_imp_issues',
                'political_leaning', 'race_recoded', 'party_recoded', 'age_recoded']

In [40]:
# Cast every column named in `categoricals` to the pandas category dtype.
nat_2020 = nat_2020.astype({col: 'category' for col in categoricals})

# One-hot encode those columns into a separate frame of indicator columns.
dummies = pd.get_dummies(nat_2020.loc[:, categoricals])

In [41]:
# Keep only the columns that were not one-hot encoded.
was_encoded = nat_2020.columns.isin(categoricals)
nat_2020 = nat_2020.loc[:, ~was_encoded]

In [42]:
# Append the indicator columns next to the remaining columns (aligned on index).
nat_2020 = pd.concat([nat_2020, dummies], axis='columns')

In [43]:
# Keep only respondents with a usable recoded vote choice: not missing and
# not code 3.0 or 7.0. The three conditions must ALL hold, so they are joined
# with `&`; joined with `|` the mask is always True (no value equals both 3.0
# and 7.0) and nothing is filtered out.
# Cast to float first so missing values are detected however the recode step
# stored them.
recoded_vote = nat_2020['vote_choice_recoded'].astype(float)
has_usable_vote = (
    recoded_vote.notna()
    & (recoded_vote != 3.0)
    & (recoded_vote != 7.0)
)
nat_2020 = nat_2020[has_usable_vote]






<h3>Save</h3>

In [44]:
# Write the filtered, one-hot encoded frame to disk without the row index.
nat_2020.to_csv(path_or_buf="../data/nat_2020_cleaned_with_dummies.csv", index=False)