This is the initial PCA applied to our subset of ESS 2018 features.

In [1]:
# Import packages
import pandas as pd
import numpy as np

# Standardizing
from sklearn.preprocessing import RobustScaler

# PCA
from sklearn.decomposition import PCA

# Data Prep

In [2]:
# Loads data
df = pd.read_csv("../../data/processed/ESS2018_Cleaned.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,nwspol,netusoft,ppltrst,pplfair,pplhlp,polintr,psppsgva,actrolga,psppipla,cptppola,...,evpdemp,evlvptn,evmar,bthcld,anvcld,alvgptn,acldnmr,aftjbyc,advcyc,plnftr
0,60.0,5.0,2.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0,...,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,4.0,2.0
1,10.0,5.0,7.0,8.0,7.0,2.0,3.0,2.0,3.0,2.0,...,1.0,1.0,1.0,1.0,4.0,4.0,4.0,2.0,2.0,4.0
2,60.0,4.0,5.0,7.0,7.0,4.0,2.0,1.0,3.0,2.0,...,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,2.0
3,45.0,5.0,3.0,9.0,5.0,3.0,2.0,2.0,3.0,1.0,...,1.0,1.0,1.0,1.0,3.0,4.0,4.0,3.0,4.0,1.0
4,30.0,1.0,5.0,8.0,4.0,2.0,1.0,1.0,1.0,3.0,...,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,7.0


In [4]:
# Drop missing values
df = df.dropna()

In [5]:
# Check new length of dataframe
len(df)

25667

In [6]:
# Initialize a standard scaler object
scaler = RobustScaler()

In [7]:
# Scale the data
df = pd.DataFrame(scaler.fit_transform(df),
                  columns=df.columns)

# PCA

In [8]:
df.columns

Index(['nwspol', 'netusoft', 'ppltrst', 'pplfair', 'pplhlp', 'polintr',
       'psppsgva', 'actrolga', 'psppipla', 'cptppola', 'trstprl', 'trstlgl',
       'trstplc', 'trstplt', 'trstprt', 'trstep', 'trstun', 'vote', 'contplt',
       'wrkprty', 'wrkorg', 'badge', 'sgnptit', 'pbldmn', 'bctprd', 'pstplonl',
       'clsprty', 'stflife', 'stfeco', 'stfgov', 'stfdem', 'stfedu', 'stfhlth',
       'gincdif', 'freehms', 'hmsfmlsh', 'hmsacld', 'euftf', 'imsmetn',
       'imdfetn', 'impcntr', 'imbgeco', 'imueclt', 'imwbcnt', 'frprtpl',
       'gvintcz', 'poltran', 'ifredu', 'ifrjob', 'evfredu', 'evfrjob',
       'topinfr', 'btminfr', 'wltdffr', 'recskil', 'recexp', 'recknow',
       'recimg', 'recgndr', 'sofrdst', 'sofrwrk', 'sofrpr', 'sofrprv',
       'ppldsrv', 'jstprev', 'pcmpinj', 'ipcrtiv', 'imprich', 'ipeqopt',
       'ipshabt', 'impsafe', 'impdiff', 'ipfrule', 'ipudrst', 'ipmodst',
       'ipgdtim', 'impfree', 'iphlppl', 'ipsuces', 'ipstrgv', 'ipadvnt',
       'ipbhprp', 'iprspot', 'iply

In [9]:
# Initialize a PCA object
pca = PCA(n_components=5)

In [10]:
# Fit the model
mod = pca.fit(df)

In [11]:
# Get loadings with items
loads = pd.DataFrame(mod.components_.T,
            index=df.columns)

In [12]:
# Finds features for principal components
pc1_keys = loads[0].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]].keys()
pc2_keys = loads[1].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]].keys()
pc3_keys = loads[2].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]].keys()
pc4_keys = loads[3].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]].keys()
pc5_keys = loads[4].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]].keys()

In [13]:
# Unique, important keys
pc_keys = list(set(np.concatenate([np.array(pc1_keys), np.array(pc2_keys), np.array(pc3_keys), np.array(pc4_keys),
                                   np.array(pc5_keys)])))

In [14]:
pc_keys

['sofrpr',
 'ipudrst',
 'btminfr',
 'alvgptn',
 'sofrwrk',
 'gvintcz',
 'nwspol',
 'imsmetn',
 'advcyc',
 'impcntr',
 'anvcld',
 'sofrdst',
 'stfeco',
 'psppsgva',
 'impenv',
 'stfdem',
 'freehms',
 'netusoft',
 'hmsfmlsh',
 'imdfetn',
 'psppipla',
 'ipadvnt',
 'polintr',
 'ipmodst',
 'ifredu',
 'imwbcnt',
 'ipcrtiv',
 'aftjbyc',
 'vote',
 'trstprt',
 'ipbhprp',
 'iplylfr',
 'impsafe',
 'imbgeco',
 'clsprty',
 'imprich',
 'poltran']

In [15]:
# renaming dictionary for key variables
# Sourced from the data protocols pdf
col_names = {'impcntr':"poor_immigrants", 'imueclt':"culture_immigrants", 'ipmodst':"modesty_humble", 'impenv':"environment", 'anvcld':"childfree", 
             'psppsgva':"have_say_in_gov", 'ipbhprp':"behave_properly", 'nwspol':"time_pol_news", 'imprich':"materialism", 'ipcrtiv':'creativity', 
             'poltran':"pol_transparency_exists", 'clsprty':"partisanship", 'polintr':"interest_in_pol", 'sofrwrk':"hard_work_pays", 'netusoft':"internet_use",
             'ifredu':"fair_access_edu", 'euftf':"euro_unification", 'hmsacld':"lgbt_adopt", 'psppipla':"have_say_in_pol", 'stfgov':"nat_gov", 'trstprt':"trust_parties", 
             'btminfr':"unfair_earnings_bot10", 'sofrpr':"care_for_poor", 'freehms':"free_lgbt", 'imbgeco':"immigr_good_for_econ", 'iplylfr':"loyalty",
             'ipadvnt':"adventure", 'imwbcnt':"immigr_make_cntry_bttr", 'ipudrst':"good_2_understand_diff_ppl", 'stfdem':"satis_democracy", "hmsfmlsh":"lgbt_family_ashamed",
             'gvintcz':"gov_cares_4_all", 'imsmetn':"majority_immigrants", 'impsafe':"safety", 'advcyc':"divorce_w_children", 'imdfetn':"minority_immigrants", 
             'aftjbyc':"full_job_&_kids", "stfeco":"satis_econ", "sofrdst":"prefer_eql_wealth_dist", "alvgptn":"approve_living_w_non_spouse"}
loads = loads.rename(col_names, axis = 0)

In [16]:
# PC1's most important features
loads[0].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]]

have_say_in_pol       -0.206865
free_lgbt             -0.203235
satis_democracy       -0.190982
have_say_in_gov       -0.190127
gov_cares_4_all       -0.184516
behave_properly        0.061836
modesty_humble         0.068458
lgbt_family_ashamed    0.101314
safety                 0.102053
interest_in_pol        0.122276
Name: 0, dtype: float64

**Disenfranchised** social conservatism.

In [17]:
# PC2's most important features
loads[1].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]]

interest_in_pol              -0.070456
partisanship                 -0.024925
care_for_poor                -0.024890
fair_access_edu              -0.022131
vote                         -0.021383
good_2_understand_diff_ppl    0.027443
immigr_good_for_econ          0.027835
modesty_humble                0.037191
behave_properly               0.045237
time_pol_news                 0.984730
Name: 1, dtype: float64

**News** Consumption.

In [18]:
# PC3's most important features
loads[2].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]]

pol_transparency_exists   -0.188393
satis_democracy           -0.187484
gov_cares_4_all           -0.180356
trust_parties             -0.176674
satis_econ                -0.158345
poor_immigrants            0.173940
divorce_w_children         0.241305
childfree                  0.254454
free_lgbt                  0.256991
internet_use               0.284202
Name: 2, dtype: float64

Anti-institutional (distrust), **social liberalism**, & pro immigration.

In [19]:
# PC4's most important features
loads[3].sort_values()[[1,2,3,4,5,-5, -4,-3,-2,-1]]

safety                       -0.396606
modesty_humble               -0.356817
good_2_understand_diff_ppl   -0.303244
environment                  -0.286135
loyalty                      -0.264193
prefer_eql_wealth_dist        0.054164
bot_10_pay_fair               0.061644
time_pol_news                 0.077200
hard_work_pays                0.094891
care_for_poor                 0.096588
Name: 3, dtype: float64

**Individualist**.

In [27]:
# PC5's most important features
loads[4].sort_values()[[1,2,3,4,5,-5,-4,-3,-2,-1]]

poor_immigrants          -0.179538
minority_immigrants      -0.175134
majority_immigrants      -0.138372
immigr_make_cntry_bttr   -0.137553
immigr_good_for_econ     -0.129662
adventure                 0.196311
fair_access_edu           0.201639
creativity                0.257315
internet_use              0.333211
materialism               0.509015
Name: 4, dtype: float64

**Materialist**/Protectionist.

In [28]:
# Get loadings with items
pd.DataFrame(mod.components_.T,
            index=df.columns).loc[pc_keys].rename(col_names, 
                                                  axis = 0).sort_values(by = 0, 
                                                                        ascending = False).style.background_gradient(cmap='seismic')

Unnamed: 0,0,1,2,3,4,5
childfree,-0.182636,-0.012951,0.254454,0.023125,0.048798,0.375919
full_job_&_kids,-0.10756,0.005856,0.147688,0.019525,0.051042,0.373623
divorce_w_children,-0.132754,-0.01452,0.241305,0.05355,0.071139,0.349307
behave_properly,0.061836,0.045237,-0.15767,-0.406896,-0.043991,0.189844
approve_living_w_non_spouse,-0.066608,-0.003938,0.109858,-0.001489,0.03603,0.146327
safety,0.102053,0.015623,-0.106137,-0.396606,0.043275,0.121084
modesty_humble,0.068458,0.037191,0.000603,-0.356817,-0.24548,0.111824
fair_access_edu,-0.150221,-0.022131,-0.003947,-0.048234,0.201639,0.111722
interest_in_pol,0.122276,-0.070456,-0.021656,0.009429,0.052216,0.105471
free_lgbt,-0.203235,0.017626,0.256991,-0.058762,-0.056924,0.104334


In [23]:
# % of variance explained by each component
pd.DataFrame(np.round(mod.explained_variance_ratio_, 2)*100,
            index=["PC1", "PC2", "PC3", "PC4", "PC5"]).T

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
0,13.0,8.0,6.0,6.0,4.0,3.0


37% of variance explained by the 6 above categories.