In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels as sm
from statsmodels.regression.linear_model import OLS
from sklearn.metrics import mean_squared_error as mse

%matplotlib inline

In [2]:
anes = pd.read_csv('data/anes_pilot_2020ets_csv.csv')

In [3]:
data = anes.copy(deep = True)

In [4]:
data.drop(inplace=True, columns=[ 'V1', 'StartDate', 'EndDate', '_v1', 'RecordedDate', 'ResponseId','qmetadata_Browser', 'qmetadata_Version', '_v2', 'qmetadata_Resolution', 'check','relig1_11_TEXT', 'mauga', 'pk_cjus', 'pk_germ', 'ethnic1', 'ethnic2', 'ethnic3'])

In [5]:
data.dropna(inplace=True)

In [6]:
data.head()

Unnamed: 0,follow,reg1,votemail1a,votemail1b,votecount,votemail2,voterid1,voterid2,turnout16a,turnout16a1,...,rr_scale,white,black,namer,asian,hpi,mixed,race7,vidx,vidknow
0,1,1,4,77,4,4,1,2,1,6,...,9,1,0,0,0,0,0,1,2.0,1
1,2,1,5,77,3,4,1,1,1,6,...,16,1,0,0,0,0,0,1,3.0,0
2,2,4,4,77,3,3,1,1,2,6,...,6,0,0,0,0,0,0,5,3.0,0
3,2,1,6,77,3,4,1,1,1,6,...,13,1,0,0,0,0,0,1,3.0,1
4,4,1,1,77,3,2,1,2,1,6,...,4,0,1,0,0,0,0,2,2.0,0


In [7]:
data.drop(data[data['mis_covid1'] == 9].index, inplace=True)

In [8]:
data.drop(data[data['mis_covid2'] == 9].index, inplace=True)

In [9]:
data['mis_covid2'].value_counts(normalize=True)

2    0.795573
1    0.204427
Name: mis_covid2, dtype: float64

In [10]:
data['facebook1'].value_counts()

1     1001
66     621
2      551
3      401
4      229
5      107
7       85
6       77
Name: facebook1, dtype: int64

# target variablr

In [11]:
data.rename(columns={"mis_covid1": 'covid_lab'}, inplace = True)

In [12]:
data.rename(columns={"mis_covid2": 'covid_vax'}, inplace = True)

In [13]:
data.covid_lab.replace((2,1), (0,1), inplace=True)

In [14]:
data.covid_vax.replace((2,1), (0,1), inplace=True)

In [15]:
data['covid_mis_score'] = data['covid_lab'] + data['covid_vax']

In [16]:
data['covid_mis_score'].value_counts()

0    1577
1    1002
2     493
Name: covid_mis_score, dtype: int64

# feature selection: decision tree

In [23]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification


In [24]:
X = data
y = data['covid_mis_score']

In [26]:
rfecv = RFECV(estimator=DecisionTreeClassifier(),
           step=1,
           cv=StratifiedKFold(10),
           scoring='accuracy')

rfecv.fit(X,y)

RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
      estimator=DecisionTreeClassifier(), scoring='accuracy')

In [27]:
RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
      estimator=DecisionTreeClassifier(), scoring='accuracy')

RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
      estimator=DecisionTreeClassifier(), scoring='accuracy')

In [28]:
print("Optimum number of features: %d" % rfecv.n_features_)

Optimum number of features: 1


In [29]:
df_features = pd.DataFrame(columns = ['feature', 'support', 'ranking'])

for i in range(X.shape[1]):
    row = {'feature': i, 'support': rfecv.support_[i], 'ranking': rfecv.ranking_[i]}
    df_features = df_features.append(row, ignore_index=True)
    
df_features.sort_values(by='ranking').head(10)

Unnamed: 0,feature,support,ranking
452,452,True,1
451,451,False,2
450,450,False,3
449,449,False,4
448,448,False,5
447,447,False,6
446,446,False,7
445,445,False,8
444,444,False,9
443,443,False,10
