In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels as sm
from statsmodels.regression.linear_model import OLS
from sklearn.metrics import mean_squared_error as mse

%matplotlib inline

In [2]:
anes = pd.read_csv('data/anes_pilot_2020ets_csv.csv')

In [3]:
data = anes.copy(deep = True)

In [4]:
data.drop(inplace=True, columns=[ 'V1', 'StartDate', 'EndDate', '_v1', 'RecordedDate', 'ResponseId','qmetadata_Browser', 'qmetadata_Version', '_v2', 'qmetadata_Resolution', 'check','relig1_11_TEXT', 'mauga', 'pk_cjus', 'pk_germ', 'ethnic1', 'ethnic2', 'ethnic3'])

In [5]:
data.dropna(inplace=True)

In [6]:
data.head()

Unnamed: 0,follow,reg1,votemail1a,votemail1b,votecount,votemail2,voterid1,voterid2,turnout16a,turnout16a1,...,rr_scale,white,black,namer,asian,hpi,mixed,race7,vidx,vidknow
0,1,1,4,77,4,4,1,2,1,6,...,9,1,0,0,0,0,0,1,2.0,1
1,2,1,5,77,3,4,1,1,1,6,...,16,1,0,0,0,0,0,1,3.0,0
2,2,4,4,77,3,3,1,1,2,6,...,6,0,0,0,0,0,0,5,3.0,0
3,2,1,6,77,3,4,1,1,1,6,...,13,1,0,0,0,0,0,1,3.0,1
4,4,1,1,77,3,2,1,2,1,6,...,4,0,1,0,0,0,0,2,2.0,0


In [7]:
data['votemail1b'].value_counts()

77    1497
1      472
2      284
4      265
7      231
3      177
5       84
6       68
Name: votemail1b, dtype: int64

In [8]:
data['mis_covid1'].value_counts(normalize=True)

2    0.556205
1    0.442495
9    0.001300
Name: mis_covid1, dtype: float64

In [9]:
data.drop(data[data['mis_covid1'] == 9].index, inplace=True)

In [10]:
data.drop(data[data['mis_covid2'] == 9].index, inplace=True)

In [11]:
data['mis_covid2'].value_counts(normalize=True)

2    0.795573
1    0.204427
Name: mis_covid2, dtype: float64

In [12]:
data['facebook1'].value_counts()

1     1001
66     621
2      551
3      401
4      229
5      107
7       85
6       77
Name: facebook1, dtype: int64

# target variable time

In [13]:
#renaming column to be more reflective 
data.rename(columns={"mis_covid1": 'covid_lab'}, inplace = True)

In [14]:
#renaming column to be more reflective 
data.rename(columns={"mis_covid2": 'covid_vax'}, inplace = True)

In [15]:
#changing the values so that a misinformed response is a 1 and an informed response is a 0 
data.covid_lab.replace((2,1), (0,1), inplace=True)

In [16]:
#changing the values so that a misinformed response is a 1 and an informed response is a 0 
data.covid_vax.replace((2,1), (0,1), inplace=True)

In [17]:
#creating a new column - my target variable : a covid misinformation score that combines the two scores from above 
data['covid_mis_score'] = data['covid_lab'] + data['covid_vax']

In [18]:
# looking at the distribution of scores 
data['covid_mis_score'].value_counts()

0    1577
1    1002
2     493
Name: covid_mis_score, dtype: int64

# PCA TIME

In [19]:
X = data.drop(columns=['covid_mis_score'], axis=1).values
y = data['covid_mis_score'].values


In [20]:
from sklearn.preprocessing import StandardScaler

# Standardize the features
X = StandardScaler().fit_transform(X)



In [23]:
from sklearn.decomposition import PCA

# Instantiate PCA
pca = PCA(n_components=.9)

# Fit PCA
principalComponents = pca.fit_transform(X)

In [26]:
df = pd.DataFrame(data = principalComponents)

target = pd.Series(data['covid_mis_score'], name='covid_mis_score')

result_df = pd.concat([df, target], axis=1)
result_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,124,125,126,127,128,129,130,131,132,covid_mis_score
0,-12.877622,1.39203,1.161187,12.227746,9.563137,-8.008106,-5.214783,0.471178,-0.164906,-4.77564,...,1.536519,0.36335,-1.418881,-1.265129,2.670137,-1.091212,0.067607,-2.813018,2.127873,0.0
1,-12.985477,3.871361,4.147229,-0.014748,-0.402384,0.238286,1.071918,0.543284,-1.582759,-3.274836,...,0.312065,0.112042,0.732399,0.038272,0.239438,0.578691,0.230516,-0.142289,0.721962,0.0
2,-14.044918,-5.540263,-4.148771,6.382533,-0.723177,1.239731,-2.389513,-3.89461,-3.598717,-1.137212,...,2.122709,2.211847,-1.97264,-1.466307,0.613691,-1.970127,-0.238571,-1.480236,-1.514887,0.0
3,-13.148392,5.266683,-0.403316,1.497424,3.455635,-2.218039,0.577216,-0.666941,-1.967656,-7.039026,...,1.156364,-0.002293,-0.04916,-0.656953,0.398277,-0.192152,0.04872,-0.807418,0.956621,0.0
4,-12.761139,1.040967,-4.152609,-1.005863,-2.99662,-4.390942,3.869029,0.949048,2.416722,-0.094967,...,0.584633,-0.472453,0.661439,0.553679,-0.040789,1.261413,0.867093,-0.307365,0.700723,0.0


In [None]:
#julian note: make a predictive model, this is something to use for a random forest - the problem is in thee translation

In [24]:
plt.style.use('seaborn-dark')
fig = plt.figure(figsize = (10,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('First Principal Component ', fontsize = 15)
ax.set_ylabel('Second Principal Component ', fontsize = 15)
ax.set_title('Principal Component Analysis (2PCs) for Iris Dataset', fontsize = 20)

targets = [0, 1, 2]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = data['covid_mis_score'] == target
    ax.scatter(result_df.loc[indicesToKeep, 'PC1'], 
               result_df.loc[indicesToKeep, 'PC2'], 
               c = color, 
               s = 50)
ax.legend(targets)
ax.grid()

# notes 11/2
- do a corr thing ?
- feature collection 