In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels as sm
from statsmodels.regression.linear_model import OLS
from sklearn.metrics import mean_squared_error as mse

%matplotlib inline

In [2]:
# Location of the raw survey export, relative to the notebook's working directory.
ANES_CSV_PATH = 'data/anes_pilot_2020ets_csv.csv'
anes = pd.read_csv(ANES_CSV_PATH)

In [3]:
# Work on an independent copy so the frame loaded into `anes` stays untouched
# (DataFrame.copy() is deep by default).
data = anes.copy()

In [4]:
# Snapshot of `data` taken before any recoding is applied.
sanity = data.copy(deep=True)

# Recode 9 (missing) as NaN

In [5]:
# Columns in which the value 9 is recoded to NaN.
data_9 = [
    'follow', 'votemail2', 'voterid1', 'voterid2', 'vote16', 'hopeful', 'worried', 'irritated', 'talk3',
    'primaryvote', 'vote20jb', 'vote20bs', 'richpoor2', 'lcd', 'lcr', 'freemkt1a',
    'freemkt1b', 'govsize1a', 'govsize1b', 'ineqinc1a', 'antirelig1',
    'antirelig2', 'racist1', 'racist2', 'immignum', 'wall7', 'pathway',
    'return', 'open', 'affact', 'hlthcare1', 'hlthcare2', 'covid1', 'covid2',
    'covid_elect', 'abort1', 'abort_imp', 'abort2', 'gayadopt', 'transmilit1a',
    'harass', 'diversity7', 'experts', 'science', 'exphelp', 'excessive',
    'compro1', 'compro2', 'pcorrect', 'selfcensor', 'childrear1', 'childrear2',
    'childrear3', 'rural2', 'rural3', 'rural4', 'mis_covid1', 'conf_covid1',
    'mis_covid2', 'conf_covid2', 'impeach1', 'impeach2', 'pid1r', 'pid2r',
    'pidstr', 'pidlean', 'pidstr1', 'pidstr2', 'pidstr3', 'rr1', 'rr2',
    'rr3', 'rr4', 'relig2b', 'att1', 'att2', 'att3', 'fundmt1', 'fundmt2',
    'impact1', 'impact2', 'impact3', 'impact4', 'impact5', 'impact7', 'impact8',
    'impact9', 'whitejob', 'femid1a', 'femid1b', 'femid2_fem', 'knowtran1',
    'knowtran2_1', 'knowtran2_2', 'knowtran2_3', 'knowtran2_4', 'knowtran2_5',
    'knowtran2_6', 'knowgay1', 'knowgay2_1', 'knowgay2_2', 'knowgay2_3', 'knowgay2_4',
    'knowgay2_5', 'knowgay2_6', 'stress1', 'stress2', 'stress3', 'stress4',
    'depress1', 'depress2', 'depress3', 'depress4', 'depress5', 'callout1',
    'speech1', 'callout2', 'speech2', 'divmoral', 'forgive3', 'moralsup1',
    'duepro3', 'moralcert3', 'forgive5', 'intelsup1', 'moralcert1', 'duepro4',
    'moralcert6', 'forgive4', 'moralcert2', 'duepro1', 'callout3a', 'callout3b',
    'moralsup2', 'duepro2', 'intelsup2', 'duepro5', 'moralcert4', 'forgive6',
    'moralcert5', 'link_bla', 'link_his', 'link_asi', 'link_whi', 'link_hpi', 'link_nat',
    'selfmon1', 'selfmon2', 'selfmon3', 'selfmon4', 'selfmon5', 'selfmon6', 'home_cps',
    'viol3_dems', 'viol3_reps', 'viol1', 'viol2a', 'viol2b', 'exptravel_ever',
    'exphomesch', 'expconvert', 'expholiday', 'exparrest', 'exppubasst', 'expfight',
    'expavoid', 'callout_social', 'callout_person', 'expknowimmig', 'expbuyus',
    'expretire', 'expknowpris', 'marital1', 'home_ownership', 'vote20turnoutjb', 'pid7',
    'race7',
]
     
     
     
     
     

In [6]:
# Recode the value 9 to NaN in every column listed in data_9.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_9:
    data[column] = data[column].replace(9, np.nan)

In [7]:
# Sanity check: 'selfmon2' is listed in data_9, so 9 should no longer appear
# among the counted values (value_counts() leaves NaN out by default).
data['selfmon2'].value_counts()

7.0    1498
5.0     718
4.0     352
3.0     285
2.0     135
1.0      90
Name: selfmon2, dtype: int64

In [8]:
# Sanity check: 'expavoid' is listed in data_9, so 9 should no longer appear
# among the counted values.
data['expavoid'].value_counts()

2.0    1811
1.0    1266
Name: expavoid, dtype: int64

# Recode 88 (NA) as NaN


In [9]:
# Columns in which the value 88 is recoded to NaN.
data_88 = [
    'votemail1a', 'covidpres7', 'dtleader1', 'jbleader1',
    'bsleader1', 'billtax1', 'guarinc1', 'regulate1', 'strpres1a',
    'leastgrp', 'tol_rally', 'tol_pres', 'tol_teach', 'freecol1',
    'loans1', 'forgive1a', 'forgive1b', 'forgive2a', 'forgive2b',
]





In [10]:
# Recode the value 88 to NaN in every column listed in data_88.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_88:
    data[column] = data[column].replace(88, np.nan)


In [11]:
# Sanity check: 'dtleader1' is listed in data_88, so 88 should no longer appear
# among the counted values.
data['dtleader1'].value_counts()

5.0    575
1.0    385
2.0    223
3.0    168
4.0    147
Name: dtleader1, dtype: int64

# Recode 77 (NA) as NaN

In [12]:
# Columns in which the value 77 is recoded to NaN.
data_77 = [
    'votemail1b', 'healthcarepres7', 'dtleader2', 'jbleader2',
    'bsleader2', 'billtax2', 'guarinc2', 'freemkt2', 'freemkt3',
    'govsize2', 'govsize3', 'regulate2', 'strpres1b', 'transmilit1b',
    'freecol2', 'loans2',
]








In [13]:
# Recode the value 77 to NaN in every column listed in data_77.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_77:
    data[column] = data[column].replace(77, np.nan)


In [14]:
# Sanity check: 'dtleader2' is listed in data_77, so 77 should no longer appear
# among the counted values.
data['dtleader2'].value_counts()

5.0    617
1.0    376
2.0    212
3.0    192
4.0    185
Name: dtleader2, dtype: int64

# Recode 8 (missing) as NaN

In [15]:
# Columns in which the value 8 is recoded to NaN.
data_8 = [
    'turnout16a', 'turnout16a1', 'freemkt1a', 'freemkt1b',
    'govsize1a', 'govsize1b', 'antirelig1', 'antirelig2', 'racist1',
    'racist2', 'transmilit1a', 'pcorrect', 'selfcensor', 'relig2a',
    'femid1a', 'callout1', 'speech1', 'callout2', 'speech2', 'divmoral',
    'forgive3', 'moralsup1', 'duepro3', 'moralcert3', 'forgive5', 'intelsup1',
    'moralcert1', 'duepro4', 'moralcert6', 'forgive4', 'moralcert2', 'duepro1',
    'callout3a', 'callout3b', 'moralsup2', 'duepro2', 'intelsup2', 'duepro5',
    'moralcert4', 'forgive6', 'moralcert5', 'home_anes', 'viol2a',
]

In [16]:
# Recode the value 8 to NaN in every column listed in data_8.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_8:
    data[column] = data[column].replace(8, np.nan)

In [17]:
# Sanity check: 'selfcensor' is listed in data_8 (and data_9), so neither 8 nor 9
# should appear among the counted values.
data['selfcensor'].value_counts()

2.0    485
3.0    470
1.0    272
4.0    147
5.0    123
Name: selfcensor, dtype: int64

# Recode 7 (NA) as NaN

In [18]:
# Columns in which the value 7 is recoded to NaN.
data_7 = [
    'turnout16b', 'relig2b', 'femid1b', 'stress1', 'stress2', 'stress3',
    'stress4', 'depress1', 'depress2', 'depress3', 'depress4', 'depress5',
    'link_bla', 'link_his', 'link_asi', 'link_whi', 'link_hpi', 'link_nat',
    'selfmon1', 'selfmon2', 'selfmon3', 'selfmon4', 'selfmon5', 'selfmon6',
    'home_cps', 'viol2b',
]

In [19]:
# Recode the value 7 to NaN in every column listed in data_7.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_7:
    data[column] = data[column].replace(7, np.nan)

In [20]:
# Sanity check: 'stress1' is listed in data_7 (and data_9), so neither 7 nor 9
# should appear among the counted values.
data['stress1'].value_counts()

3.0    531
4.0    366
2.0    304
5.0    208
1.0    172
Name: stress1, dtype: int64

# Recode 99 (missing) as NaN

In [21]:
# Columns in which the value 99 is recoded to NaN.
data_99 = [
    'apppres7', 'frnpres7', 'immpres7', 'econpres7', 'healthcarepres7',
    'dtcares', 'dtdignif', 'dtauth', 'dtdiv', 'dtknow', 'jbleader1',
    'jbleader2', 'jbcares', 'jbdignif', 'jbhonest', 'jbauth', 'jbdiv',
    'jbknow', 'bsleader1', 'bsleader2', 'bscares', 'bsdignif', 'bshonest',
    'bsauth', 'bsdiv', 'bsknow', 'freemkt2', 'freemkt3', 'govsize2', 'govsize3',
    'forgive1a', 'forgive1b', 'forgive2b', 'finworry', 'confecon', 'taxecon',
    'regulate2', 'strpres1b', 'tol_rally', 'tol_pres', 'tol_teach',
    'transmilit1b', 'freecol2', 'forgive2a',
]






In [22]:
# Recode the value 99 to NaN in every column listed in data_99.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_99:
    data[column] = data[column].replace(99, np.nan)

In [23]:
# Sanity check: 'bscares' is listed in data_99, so 99 should no longer appear
# among the counted values.
data['bscares'].value_counts()

5.0    774
1.0    674
2.0    657
3.0    588
4.0    380
Name: bscares, dtype: int64

# Recode 66 as NaN

In [24]:
# Columns in which the value 66 is recoded to NaN.
data_66 = [
    'freemkt2', 'freemkt3', 'govsize2', 'govsize3', 'pid2r',
    'facebook1', 'twitter1', 'forgive1a', 'forgive1b', 'forgive2a',
    'forgive2b',
]


#freemkt1&2 has 99 

In [25]:
# Recode the value 66 to NaN in every column listed in data_66.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_66:
    data[column] = data[column].replace(66, np.nan)
    

In [26]:
# Sanity check: 'forgive2b' is listed in data_66, data_88 and data_99, so none of
# 66, 88 or 99 should appear among the counted values.
data['forgive2b'].value_counts()

4.0    214
3.0    129
5.0     97
7.0     87
6.0     83
2.0     69
1.0     63
Name: forgive2b, dtype: int64

In [27]:
# Sanity check: 'govsize2' is listed in data_66, data_77 and data_99, so none of
# 66, 77 or 99 should appear among the counted values.
data['govsize2'].value_counts()

4.0    184
5.0    123
7.0    123
6.0    100
3.0     87
1.0     73
2.0     62
Name: govsize2, dtype: int64

# Recode 6 as NaN

In [28]:
# Columns in which the value 6 is recoded to NaN.
data_6 = [
    'pidstr', 'pidlean', 'pidstr1', 'pidstr2', 'pidstr3', 'relig2a',
    'att2', 'att3', 'fundmt1', 'fundmt2', 'whitejob', 'femid2_fem', 'femid2_anti',
    'femid2_nei', 'callout3a', 'callout3b', 'viol3_dems', 'viol3_reps', 'viol3_both',
]

In [29]:
# Recode the value 6 to NaN in every column listed in data_6.
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on data[column] is chained assignment, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for column in data_6:
    data[column] = data[column].replace(6, np.nan)

In [30]:
# Sanity check: 'viol3_reps' is listed in data_6 (and data_9), so neither 6 nor 9
# should appear among the counted values.
data['viol3_reps'].value_counts()

2.0    341
4.0    339
3.0    292
1.0    250
5.0    244
Name: viol3_reps, dtype: int64

In [31]:
# Sanity check: 'pidstr' is listed in data_6 (and data_9), so neither 6 nor 9
# should appear among the counted values.
data['pidstr'].value_counts()

1.0    1426
2.0     717
Name: pidstr, dtype: int64

In [32]:
# Sanity check: 'fundmt2' is listed in data_6 (and data_9), so neither 6 nor 9
# should appear among the counted values.
data['fundmt2'].value_counts()

1.0    220
3.0    199
2.0    197
5.0     74
4.0     74
Name: fundmt2, dtype: int64

# Dropping irrelevant columns 

In [33]:
# Labels removed from `data`; each label is listed exactly once.
# DataFrame.drop raises KeyError for a label that is not present (the default
# errors='raise'), so a misspelt name fails loudly. For the same reason this
# cell cannot be re-run against an already-trimmed frame.
columns_to_drop = [
    'votemail1b', 'V1', 'StartDate', 'EndDate', '_v1', 'RecordedDate',
    'ResponseId', 'qmetadata_Browser', 'qmetadata_Version', '_v2',
    'qmetadata_Resolution', 'check', 'relig1_11_TEXT', 'mauga', 'pk_cjus',
    'pk_germ', 'ethnic1', 'ethnic2', 'ethnic3', 'mixed', 'hpi', 'asian', 'namer',
    'black', 'white', 'race1_hpi', 'race1_asian', 'race1_namer', 'race1_black', 'race1_white',
    'hispanicr', 'nonhispanic_white', 'hispanic_white', 'hispanic', 'pid2r', 'pk_index',
    'survexp1', 'survexp2', 'dejavu', 'surv_comp', 'RAND_INC', 'XCONTACT', 'XFORGIVE', 'FEEDBACK_GROUP',
    'INSTRUCTION_GROUP', 'form', 'serious', 'admit', 'geer1896', 'pk_sen', 'pk_spend', 'expshark', 'inc_anes',
    'inc_cps', 'inc_cpsmod', 'marital2', 'whites_1', 'whites_2', 'whites_3',
    'whites_4', 'whites_5', 'whites_6', 'whites_7', 'blacks_1', 'blacks_2', 'blacks_3',
    'blacks_4', 'blacks_5', 'blacks_6', 'blacks_7', 'hisp_1', 'hisp_2', 'hisp_3',
    'hisp_4', 'hisp_5', 'hisp_6', 'hisp_7', 'asians_1', 'asians_2', 'asians_3', 'asians_4',
    'asians_5', 'asians_6', 'asians_7', 'citizen1', 'citizen2',
    'particip_count', 'pk_mauga_correct', 'pk_germ_correct', 'pk_cjus_correct', 'pk_sen_correct',
    'pk_spend_correct', 'pk_cjus_correctb', 'pidstr1_dr', 'pidstr1_ind', 'pidstr2_dr',
    'pidstr2_ind', 'pidstr3_dr', 'pidstr3_ind', 'pid7str',
]

# Rebind instead of mutating in place so the trimmed frame has explicit lineage.
data = data.drop(columns=columns_to_drop)


 
# drop pg 114 until 117 
#drop pidd2R


#USE PID 7 AS DEMO FOR PID
#race7 as demo 

# simple imputing! note: have to drop columns after

In [34]:
# Median-impute the NaNs produced by the recoding steps.
# SimpleImputer is imported in the notebook's first cell.
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
X = data
X_imputed = imp_med.fit_transform(X)

# With the default settings the imputer drops any column that has no observed
# value at all; such columns are the ones whose fitted statistic is NaN. Keep the
# surviving labels so the imputed array can be turned back into a labelled frame.
kept_columns = X.columns[~np.isnan(imp_med.statistics_)]

# Keep the imputed values instead of only printing them. `data` itself is left
# as it was, so NaN checks run on `data` still report the pre-imputation state.
data_imputed = pd.DataFrame(X_imputed, columns=kept_columns, index=X.index)

print(X_imputed)


[[1. 1. 4. ... 1. 2. 1.]
 [2. 1. 5. ... 1. 3. 0.]
 [2. 4. 4. ... 5. 3. 0.]
 ...
 [1. 1. 2. ... 1. 1. 1.]
 [1. 1. 2. ... 1. 3. 1.]
 [2. 1. 2. ... 1. 3. 1.]]


In [35]:
# Boolean frame with True wherever `data` holds NaN; printed as plain text.
check_for_nan = data.isna()
print(check_for_nan)

      follow   reg1  votemail1a  votecount  votemail2  voterid1  voterid2  \
0      False  False       False      False      False     False     False   
1      False  False       False      False      False     False     False   
2      False  False       False      False      False     False     False   
3      False  False       False      False      False     False     False   
4      False  False       False      False      False     False     False   
...      ...    ...         ...        ...        ...       ...       ...   
3075   False  False        True      False      False     False     False   
3076   False  False        True      False      False     False     False   
3077   False  False        True      False      False     False     False   
3078   False  False        True      False      False     False     False   
3079   False  False        True      False      False     False     False   

      turnout16a  turnout16a1  turnout16b  ...  USIPAddress    age  \
0    

In [37]:
# True when at least one cell of `data` is NaN.
data.isna().values.any()

True

In [38]:
# Total number of NaN cells in `data`: per-column counts, then summed.
data.isna().sum().sum()

195060