<h1>Process Poll Data</h1>

Clean and recode the responses from the Monmouth University national poll (March 2020), then write out the cleaned data set and a train/test split.

In [7]:
import os 
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split

<h2>Rename Columns</h2>

In [8]:
# Load the raw poll archive; the .tab file is tab-delimited.
nat_2020 = pd.read_csv(
    "../data/national_march_2020/MUP213_NATL_archive.tab",
    delimiter = "\t",
)

In [9]:
# Give the survey question codes descriptive names. Columns not listed here
# keep their original name.
nat_2020 = nat_2020.rename(
    mapper = {
        # opinion / voting questions
        'Q3': 'top_household_concern',
        'Q11': 'likely_to_vote',
        'Q12': 'vote_choice',
        'Q12B': 'vote_choice_undecided',
        'Q13': 'approve_trump',
        'Q14': 'approve_biden',
        'Q15': 'optimistic',
        'Q16': 'elec_enthusiasm',
        'Q17': 'economic_situation',
        'Q19': 'focused_imp_issues',
        # demographic questions
        'QD1': 'registered_vote',
        'QD2': 'party',
        'QD2A': 'party_unaffiliated',
        'QD3': 'political_leaning',
        'QD4': 'education',
        'QD5': 'age',
        'QD5A': 'age_bin',
        'QD7': 'latino',
        'QD8': 'race',
        'QD10': 'gender',
        'QD11': 'state',
    },
    axis = 'columns',
)

In [10]:
# Discard every column whose name still contains a 'Q'. None of the descriptive
# names assigned by the rename contains one, so only columns that were not
# renamed can match.
unrenamed_columns = [name for name in nat_2020.columns if 'Q' in name]
nat_2020 = nat_2020.drop(columns = unrenamed_columns)

In [11]:
# Quick look at the state column (numeric codes stored as floats).
nat_2020.loc[:, 'state']

0      24.0
1      34.0
2      37.0
3      37.0
4      13.0
       ... 
846    26.0
847    24.0
848    12.0
849    22.0
850    48.0
Name: state, Length: 851, dtype: float64

<h2>Recode Variables</h2>

<h3>Vote Choice</h3>

In [12]:
# Build the final vote choice in two steps:
#   1. respondents whose first answer is one of codes 6-9 get the answer to the
#      follow-up probe question instead; everyone else keeps the first answer;
#   2. code 9 is folded into code 3 so that both form a single category.
# This uses vectorized pandas operations instead of a row-wise apply with a
# scalar np.where: same values, but no Python-level loop over the rows and no
# 0-d arrays being produced for every row.
first_vote_answer = nat_2020['vote_choice']
vote_needs_probe = first_vote_answer.isin([6.0, 7.0, 8.0, 9.0])
nat_2020['vote_choice_recoded'] = (
    first_vote_answer
    .mask(vote_needs_probe, nat_2020['vote_choice_undecided'])
    .replace(9.0, 3.0)
    .astype(float)   # the original recode always produced floats
)

<h3>Party Leaning</h3>

In [13]:
# Build the final party identification in two steps:
#   1. respondents whose party answer is code 4 or 9 get the answer to the
#      follow-up probe question instead; everyone else keeps the party answer;
#   2. code 9 is folded into code 3 so that both form a single category.
# This uses vectorized pandas operations instead of a row-wise apply with a
# scalar np.where: same values, but no Python-level loop over the rows.
first_party_answer = nat_2020['party']
party_needs_probe = first_party_answer.isin([4.0, 9.0])
nat_2020['party_recoded'] = (
    first_party_answer
    .mask(party_needs_probe, nat_2020['party_unaffiliated'])
    .replace(9.0, 3.0)
    .astype(float)   # the original recode always produced floats
)

<h3>Age</h3>

Combine the exact age and the fallback age-bracket answer into a single binned age variable.

In [14]:
# Put every respondent into one of three age groups:
#   1.0 = 18-34, 2.0 = 35-54, 3.0 = everything else (older, under 18, missing).

# Group derived from the exact age in years (both bounds inclusive).
exact_age = nat_2020['age']
age_group_from_years = pd.Series(
    np.select(
        [exact_age.between(18, 34), exact_age.between(35, 54)],
        [1.0, 2.0],
        default = 3.0,
    ),
    index = nat_2020.index,
)

# An age_bin answer is already a bracket code, not an age in years. The earlier
# version pushed it through the 18-34 / 35-54 thresholds as well, which sent
# every respondent with an age_bin answer to group 3.0. Keep bracket codes 1 and
# 2 as they are; any other non-missing code still goes to 3.0 as before.
# NOTE(review): assumes age_bin is coded 1 = 18-34, 2 = 35-54, 3 = 55+ --
# confirm against the poll's codebook.
reported_bracket = nat_2020['age_bin']
age_group_from_bracket = reported_bracket.where(reported_bracket.isin([1.0, 2.0]), 3.0)

# As before, the bracket answer takes priority whenever it is present.
nat_2020['age_recoded'] = age_group_from_bracket.where(
    reported_bracket.notna(), age_group_from_years
)

<h3>Approval Ratings</h3>

In [15]:
# Collapse both approval questions to booleans: True only when the response
# code is 1 (approve); any other code, or a missing response, becomes False.
for approval_column in ('approve_trump', 'approve_biden'):
    nat_2020[approval_column] = nat_2020[approval_column].eq(1)

<h3>Race</h3>

Respondents who identify as Hispanic/Latino are assigned race code 4, overriding their answer to the race question.

In [16]:
# Boolean flag: True only when the latino question was answered with code 1.
nat_2020['latino'] = nat_2020['latino'].eq(1)

In [17]:
def recode_race(s):
    """Collapse one respondent's race and latino answers into a single code.

    Parameters
    ----------
    s : row-like object supporting ``s['latino']`` and ``s['race']``.

    Returns
    -------
    4 when the latino flag equals 1 or the race code is 4;
    9 when the race code is 5 or 9;
    otherwise the race code unchanged.
    """
    # The latino flag wins over whatever race was reported.
    if s['latino'] == 1 or s['race'] == 4:
        return 4

    race_code = s['race']
    # Codes 5 and 9 are merged into the single code 9.
    if race_code in (5, 9):
        return 9
    return race_code

In [18]:
# Run recode_race on every row to merge the latino flag into the race code and
# to pool the remaining "other" codes.
nat_2020['race_recoded'] = nat_2020.apply(recode_race, axis = 'columns')

<h3>Gender</h3>

In [19]:
# Boolean flag: True when the gender code is 1 (man), False for every other
# code and for missing answers.
nat_2020['male'] = nat_2020['gender'].eq(1)

<h3>Registered to Vote</h3>

In [20]:
# Boolean flag: True when the registration question was answered with code 1
# (registered), False for every other code and for missing answers.
nat_2020['registered_vote'] = nat_2020['registered_vote'].eq(1)

<h3>Focused on Important Issues</h3>

In [21]:
# Boolean flag: True when the respondent answered code 1 (thinks Trump is
# focused on important issues), False for every other code and missing answers.
nat_2020['focused_imp_issues'] = nat_2020['focused_imp_issues'].eq(1)

<h3>Combine Concerns</h3>

In [22]:
# Merge similar household-concern categories: code 7 is folded into 6 and
# code 11 into 10. A single vectorized replace does both merges at once instead
# of two element-wise apply passes built on a scalar np.where.
nat_2020['top_household_concern'] = (
    nat_2020['top_household_concern']
    .replace({7.0: 6.0, 11.0: 10.0})
    .astype(float)   # the original recode always produced floats
)

<h3>Education</h3>

In [23]:
# Collapse the detailed education codes into three levels (plus 9 for unknown).
def recode_education(s):
    """Map a raw education code to a coarse education level.

    Parameters
    ----------
    s : numeric education code (may be NaN).

    Returns
    -------
    1 for codes up to 3, 2 for codes 4 through 6, 3 for codes 7 and above,
    and 9 for anything else (for example a missing value).
    """
    if s <= 3:
        return 1
    # The lower bound is inclusive: with the previous strict `s > 4`, code 4
    # matched no branch and was silently reported as 9 (unknown).
    elif 4 <= s <= 6:
        return 2
    # NOTE(review): if this question uses 9 as a refusal code, it also
    # satisfies `s >= 7` and ends up in level 3 -- confirm that is intended.
    elif s >= 7:
        return 3
    else:
        return 9

In [24]:
# Translate each raw education code into its coarse level, element by element.
nat_2020['education_recoded'] = nat_2020['education'].map(recode_education)

<h3>Political Leaning</h3>

In [25]:
# combine into liberal (1), conservative (2), moderate (3), or other (4)
def recode_political_leaning(s):
    """Map a raw political-leaning code to a coarse group.

    Parameters
    ----------
    s : numeric political-leaning code (may be NaN).

    Returns
    -------
    1 (liberal) for codes up to 2, 2 (conservative) for codes 4 and 5,
    3 (moderate) for code 3, and 4 (other) for everything else, including
    code 9 and missing values.

    NOTE(review): assumes the usual five-point scale (1-2 liberal, 3 moderate,
    4-5 conservative) with 9 meaning "don't know" -- confirm against the
    poll's codebook.
    """
    if s <= 2:
        return 1
    # Previously this branch tested `s == 4 or s == 9`, which counted code 9
    # as conservative and left code 5 to fall through to "other".
    elif s == 4 or s == 5:
        return 2
    elif s == 3:
        return 3
    else:
        return 4

In [26]:
# Replace the raw leaning codes with their coarse group, element by element.
# The column is overwritten in place, so running this cell a second time would
# recode the already-recoded values; run it once per fresh load.
nat_2020['political_leaning'] = nat_2020['political_leaning'].map(recode_political_leaning)

<h3>Optimistic</h3>

In [27]:
# Boolean flag: True for response codes 1 and 2 (the optimistic answers --
# presumably about the election; confirm the question wording in the codebook),
# False for higher codes and for missing answers.
nat_2020['optimistic'] = nat_2020['optimistic'].le(2)

<h2>Add Dummy Variables</h2>

In [28]:
# Columns that get one-hot encoded below. Each name must appear only once:
# 'focused_imp_issues' used to be listed twice, which made get_dummies emit its
# indicator columns twice under identical names.
# TODO(review): the duplicate may have been meant to be a different column --
# check whether another variable should be encoded as well.
categoricals = ['top_household_concern', 'registered_vote',
                'likely_to_vote', 'vote_choice', 'vote_choice_undecided',
                'elec_enthusiasm', 'focused_imp_issues',
                'political_leaning', 'race_recoded', 'party_recoded', 'age_recoded']

In [29]:
# Cast every listed column to pandas' categorical dtype in one call ...
nat_2020 = nat_2020.astype({name: 'category' for name in categoricals})

# ... then one-hot encode them all together.
dummies = pd.get_dummies(nat_2020[categoricals])

In [30]:
# Keep only the columns that were not one-hot encoded.
nat_2020 = nat_2020.loc[:, ~nat_2020.columns.isin(categoricals)]

In [31]:
# Put the indicator columns next to the remaining columns (aligned on the row index).
nat_2020 = pd.concat([nat_2020, dummies], axis = 'columns')

In [32]:
# Keep only respondents whose recoded vote choice is present and is neither
# code 3.0 nor code 7.0; all other rows are dropped.
recoded_vote = nat_2020['vote_choice_recoded']
nat_2020 = nat_2020[recoded_vote.notna() & ~recoded_vote.isin([3.0, 7.0])]






<h2>Drop Extra Columns</h2>

In [33]:
# Raw columns that have been replaced by a recoded or boolean counterpart.
to_drop = [
    'latino',
    'age',
    'education',
    'race',
    'age_bin',
    'gender',
]
nat_2020 = nat_2020.drop(to_drop, axis = 'columns')

In [34]:
# Persist the cleaned data set without the row index.
nat_2020.to_csv(
    "../data/nat_2020_cleaned.csv",
    index = False,
)

<h2>Train Test Split</h2>

In [35]:
# Shuffle and split the cleaned rows 80/20 with a fixed seed so the split is
# reproducible. Both parts keep every column of nat_2020.
X_train, X_test = train_test_split(
    nat_2020,
    test_size = 0.2,
    random_state = 42,
    shuffle = True,
)

In [36]:
# Write each part of the split to its own CSV file, without the row index.
for split_frame, split_path in (
    (X_train, '../data/nat_2020_train.csv'),
    (X_test, '../data/nat_2020_test.csv'),
):
    split_frame.to_csv(split_path, index = False)