# Results Notebook Stats 170b
### Raymond Nguyen, Sherman Lu

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from tqdm import tqdm_notebook
from pprint import pprint
from scipy.stats import pearsonr
from scipy import sparse

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline
sns.set_style('darkgrid')

In [2]:
scores = pd.read_csv('data/cleaned_scores_final.csv')
lda_features = pd.read_csv('data/lda_results.csv')
empath_scores = pd.read_csv('data/empathScores.csv')
liwc_mfd_nrc = pd.read_csv('data/LIWC_MFD_NRC_features.csv').drop('Unnamed: 0', axis=1)

In [3]:
liwc_mfd_nrc.head()

Unnamed: 0,mtkid,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function.,...,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,negative_nrc,positive_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,A1VBZWAHLWQMXM,30097,50.94,89.27,16.41,18.74,30097,17.15,83.29,49.67,...,0.026148,0.024852,0.023955,0.028739,0.017476,0.063858,0.044355,0.025317,0.018141,0.035152
1,A3CXK1KSRGU27V,22,93.26,32.48,31.94,94.75,22,4.55,77.27,36.36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0
2,A2AI4HOWRCG7KZ,34401,64.15,74.85,17.97,43.34,34401,16.57,82.43,49.25,...,0.021157,0.028451,0.01517,0.020866,0.019849,0.041587,0.046847,0.017582,0.016187,0.03499
3,A3VKJDDU7MMUVD,6599,76.07,74.42,29.28,64.27,6599,15.31,80.1,44.43,...,0.025307,0.029701,0.014851,0.019397,0.028035,0.040006,0.054099,0.01576,0.013942,0.03546
4,A1HJAT4A0FVIOZ,7793,85.17,68.43,44.95,71.11,7793,16.08,80.39,46.26,...,0.015398,0.022071,0.009496,0.01373,0.023611,0.02823,0.039523,0.012319,0.011549,0.026562


In [4]:
scores.head()

Unnamed: 0,mtkid,Harm,Fairness,Loyalty,Authority,Purity,ICS_score,Political Involvement,Political Leaning
0,A3L0DCUXI7X3A9,18,17,12,20,19,6,3,conservative
1,A3I40B0FATY8VH,21,24,24,15,3,8,7,conservative
2,A2MCRVU8I9VNHG,17,22,11,16,22,8,4,conservative
3,A1PJUYJ7W2LKKQ,24,24,10,17,9,-4,5,slightly
4,A1KZ21TSAYUHO4,25,29,16,19,29,4,2,slightly


In [5]:
lda_features.head()

Unnamed: 0,Document_No,Dominant_Topic,Dominant_Topic_Perc_Contrib,Dominant_Topic_Keywords,Text,mtkid,Topic_List
0,0,229.0,0.181,"people, make, time, man, call, good, talk, lif...",what if we found out that theres no such thi...,8298177,"[229, 233, 241, 92, 5, 130, 108, 279, 188, 228..."
1,1,367.0,0.2918,"bragging_right, cars_battle, dream_mustang, bu...",what you should be saying is the american peo...,A0467121226LDESR9HWNF,"[367, 108, 47, 188, 159, 229, 82, 92, 233, 213..."
2,2,47.0,0.1854,"good, win, great, watch, year, tonight, play, ...",literally me jj ale s dvomi y im going g...,A102VHF4S7WXQF,"[47, 108, 242, 144, 229, 81, 5, 213, 74, 82]"
3,3,73.0,0.07,"pottery, fucktrump, bacteria, simon, hairbecau...",if i have an ear infection caused by bacteria...,A110ICG2VQUGXS,"[73, 47, 108, 241, 32, 341]"
4,4,373.0,0.1074,"vote, trump, american, people, election, state...",i wish everyone followed this logic in every ...,A11A83GK7BV037,"[373, 108, 229, 5, 89, 213, 121, 228, 294, 0, ..."


In [6]:
empath_scores.head()

Unnamed: 0,mtkid,HarmVirtue,HarmVice,FairnessVirtue,FairnessVice,IngroupVirtue,IngroupVice,AuthorityVirtue,AuthorityVice,PurityVirtue,MoralityGeneral,PurityVice
0,A3L0DCUXI7X3A9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A3I40B0FATY8VH,0.000259,0.004577,0.007773,0.006218,0.004405,0.001727,0.004145,0.002245,0.000777,0.007773,0.000605
2,A2MCRVU8I9VNHG,0.0,0.006309,0.003155,0.003155,0.002366,0.001577,0.005521,0.001577,0.003155,0.009464,0.001577
3,A1PJUYJ7W2LKKQ,0.000134,0.001412,0.001008,0.000739,0.003227,0.000202,0.000134,0.000336,0.000134,0.002554,6.7e-05
4,A1KZ21TSAYUHO4,0.0,0.0029,0.00232,0.0,0.0029,0.0,0.00116,0.0,0.0,0.00174,0.0


In [7]:
empath_scores.columns = ['mtkid'] + ['empath_{}'.format(empath_col) for empath_col in empath_scores.columns if empath_col != 'mtkid']

In [8]:
data = pd.merge(left=scores, right=lda_features, how='inner', left_on='mtkid', right_on='mtkid')
data = pd.merge(left=data, right=empath_scores, how='inner', left_on='mtkid', right_on='mtkid')
data = pd.merge(left=data, right=liwc_mfd_nrc, how='inner', left_on='mtkid', right_on='mtkid')
data.columns = [col.lower() for col in data.columns]
data.head()

Unnamed: 0,mtkid,harm,fairness,loyalty,authority,purity,ics_score,political involvement,political leaning,document_no,...,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,negative_nrc,positive_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,A3L0DCUXI7X3A9,18,17,12,20,19,6,3,conservative,289,...,0.0,0.0,0.0,0.025,0.0,0.0,0.05,0.0,0.0,0.0
1,A3I40B0FATY8VH,21,24,24,15,3,8,7,conservative,275,...,0.029543,0.027316,0.019822,0.028891,0.021397,0.056696,0.043934,0.022972,0.020256,0.032693
2,A2MCRVU8I9VNHG,17,22,11,16,22,8,4,conservative,171,...,0.016545,0.018491,0.008759,0.021898,0.020925,0.040389,0.044769,0.020438,0.009732,0.038443
3,A1PJUYJ7W2LKKQ,24,24,10,17,9,-4,5,slightly,80,...,0.022523,0.031532,0.008234,0.02354,0.021796,0.033324,0.047903,0.012787,0.016517,0.033517
4,A1KZ21TSAYUHO4,25,29,16,19,29,4,2,slightly,64,...,0.020221,0.048455,0.004578,0.016406,0.04235,0.011064,0.06715,0.006105,0.017169,0.038153


In [9]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 0 to 396
Data columns (total 138 columns):
mtkid                          object
harm                           int64
fairness                       int64
loyalty                        int64
authority                      int64
purity                         int64
ics_score                      int64
political involvement          int64
political leaning              object
document_no                    int64
dominant_topic                 float64
dominant_topic_perc_contrib    float64
dominant_topic_keywords        object
text                           object
topic_list                     object
empath_harmvirtue              float64
empath_harmvice                float64
empath_fairnessvirtue          float64
empath_fairnessvice            float64
empath_ingroupvirtue           float64
empath_ingroupvice             float64
empath_authorityvirtue         float64
empath_authorityvice           float64
empath_purityvirtu

# Baseline Model

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn import metrics

In [11]:
def print_regression_errors(y_test, y_pred):
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2 = metrics.explained_variance_score(y_test, y_pred)

    errorsDict = {"MAE": mae, 
                  "MSE": mse, 
                  "RMSE": rmse}
                  #"R2": r2} # NOT ADJUSTED R2!! CAREFUL

    for k, v in sorted(errorsDict.items()):
        print("{}: {}".format(k, v))

In [12]:
MFT_CATEGORIES = ['harm', 'fairness', 'loyalty', 'authority', 'purity']

X = data['text']
#y = data[MFT_CATEGORIES[0]]

In [13]:
vect = CountVectorizer(stop_words='english', ngram_range=(1,1), min_df=0.1)
X_dtm = vect.fit_transform(X)

In [14]:
X_dtm

<397x5539 sparse matrix of type '<class 'numpy.int64'>'
	with 513427 stored elements in Compressed Sparse Row format>

X_train_dtm, X_test_dtm, y_train, y_test = train_test_split(X_dtm, y, test_size=0.2, random_state=0)

X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [15]:
def nb_model(X_dtm, y, mft_category, alpha=1.24):
    X_train_dtm, X_test_dtm, y_train, y_test = train_test_split(X_dtm, y, test_size=0.2, random_state=0)
    
    nb = MultinomialNB(alpha=alpha)
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(X_test_dtm)
    print('MFT: {}'.format(mft_category))
    print_regression_errors(y_test, y_pred)
    print()
    return nb
    
def rf_model(X_dtm, y, mft_category, n_estimators=10):
    X_train_dtm, X_test_dtm, y_train, y_test = train_test_split(X_dtm, y, test_size=0.2, random_state=0)
    
    rf = RandomForestRegressor()
    rf.fit(X_train_dtm, y_train)
    y_pred = rf.predict(X_test_dtm)
    print('MFT: {}'.format(mft_category))
    print_regression_errors(y_test, y_pred)
    print()
    return rf

def nb_modelCV(X_dtm, y, mft_category, alpha=1.24, cv=10):    
    nb = MultinomialNB(alpha=alpha)
    y_pred = cross_val_predict(estimator=nb, X=X_dtm, y=y, cv=cv)
    print('MFT: {}'.format(mft_category))
    print_regression_errors(y, y_pred)
    print()
    
    nb.fit(X_dtm, y)
    return nb
    
def rf_modelCV(X_dtm, y, mft_category, n_estimators=50, cv=10):    
    rf = RandomForestRegressor(n_estimators=n_estimators)
    
    y_pred = cross_val_predict(estimator=rf, X=X_dtm, y=y, cv=cv)
    print('MFT: {}'.format(mft_category))
    print_regression_errors(y, y_pred)
    print()
    
    rf.fit(X_dtm, y)
    return rf

def lr_model(X_dtm, y, mft_category, alpha=0.2):
    X_train_dtm, X_test_dtm, y_train, y_test = train_test_split(X_dtm, y, test_size=0.2, random_state=0)
    
    lr = Ridge(alpha=alpha)
    lr.fit(X_train_dtm, y_train)
    y_pred = lr.predict(X_test_dtm)
    print('MFT: {}'.format(mft_category))
    print_regression_errors(y_test, y_pred)
    print()
    return lr

def lr_modelCV(X_dtm, y, mft_category, cv=10, alpha=0.2):    
    lr = Ridge(alpha=alpha)
    
    y_pred = cross_val_predict(estimator=lr, X=X_dtm, y=y, cv=cv)
    print('MFT: {}'.format(mft_category))
    print_regression_errors(y, y_pred)
    print()
    
    lr.fit(X_dtm, y)
    return lr

In [16]:
nb_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    nb = nb_model(X_dtm, y, mft)
    nb_predicted['predicted_{}'.format(mft)] = nb.predict(X_dtm)

MFT: harm
MAE: 6.2125
MSE: 63.3875
RMSE: 7.961626718202757

MFT: fairness
MAE: 5.55
MSE: 54.05
RMSE: 7.351870510285121

MFT: loyalty
MAE: 7.0375
MSE: 76.6125
RMSE: 8.752856676537094

MFT: authority
MAE: 7.3375
MSE: 76.0625
RMSE: 8.721381771256205

MFT: purity
MAE: 7.8375
MSE: 98.5625
RMSE: 9.92786482583239



In [17]:
nb_predicted_df = pd.DataFrame(nb_predicted)
print('Correlations between Ground Truth and Predicted Values')

for mft in MFT_CATEGORIES:
    print('Pearson\'s Correlation for {}: {}'.format(mft, round(pearsonr(nb_predicted_df['predicted_{}'.format(mft)], data[mft])[0], 3)))

Correlations between Ground Truth and Predicted Values
Pearson's Correlation for harm: 0.406
Pearson's Correlation for fairness: 0.367
Pearson's Correlation for loyalty: 0.497
Pearson's Correlation for authority: 0.564
Pearson's Correlation for purity: 0.6


In [18]:
lr_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    lr = lr_modelCV(X_dtm, y, mft)
    lr_predicted['predicted_{}'.format(mft)] = lr.predict(X_dtm)

lr_predicted_df = pd.DataFrame(lr_predicted)
print('Correlations between Ground Truth and Predicted Values')

for mft in MFT_CATEGORIES:
    print('Pearson\'s Correlation for {}: {}'.format(mft, round(pearsonr(lr_predicted_df['predicted_{}'.format(mft)], data[mft])[0], 3)))

MFT: harm
MAE: 4.504250504772163
MSE: 37.34875053772829
RMSE: 6.111362412566309

MFT: fairness
MAE: 4.075988024422034
MSE: 29.962642526809503
RMSE: 5.4738142576095425

MFT: loyalty
MAE: 5.673519534890994
MSE: 58.79482015793076
RMSE: 7.667778045687731

MFT: authority
MAE: 5.4978613984517795
MSE: 54.47187099245286
RMSE: 7.380506147443606

MFT: purity
MAE: 7.232522951239394
MSE: 81.15531497751523
RMSE: 9.008624477550123

Correlations between Ground Truth and Predicted Values
Pearson's Correlation for harm: 0.736
Pearson's Correlation for fairness: 0.713
Pearson's Correlation for loyalty: 0.697
Pearson's Correlation for authority: 0.698
Pearson's Correlation for purity: 0.716


In [19]:
for mft in MFT_CATEGORIES:
    y = data[mft]
    rf_model(X_dtm, y, mft)

MFT: harm
MAE: 4.58875
MSE: 40.009125
RMSE: 6.325276673790642

MFT: fairness
MAE: 4.2625
MSE: 29.28275
RMSE: 5.411353804733156

MFT: loyalty
MAE: 5.0375
MSE: 37.507999999999996
RMSE: 6.124377519389215

MFT: authority
MAE: 5.404999999999999
MSE: 43.79625
RMSE: 6.617873525536734

MFT: purity
MAE: 7.428750000000001
MSE: 74.10512500000002
RMSE: 8.608433365020607



# LIWC Features Only

In [20]:
liwc_col = liwc_mfd_nrc.iloc[:, 1: 82].columns
mfd_col = liwc_mfd_nrc.iloc[:, 93: 103].columns
nrc_col = liwc_mfd_nrc.iloc[:, 104:].columns

In [21]:
liwc = liwc_mfd_nrc.loc[:, ['mtkid'] + liwc_col.tolist()]
mfd = liwc_mfd_nrc.loc[:, ['mtkid'] + mfd_col.tolist()]
nrc = liwc_mfd_nrc.loc[:, ['mtkid'] + nrc_col.tolist()]

In [22]:
liwc.head()

Unnamed: 0,mtkid,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function.,...,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler
0,A1VBZWAHLWQMXM,30097,50.94,89.27,16.41,18.74,30097,17.15,83.29,49.67,...,0.28,0.89,0.46,0.19,3.06,1.46,1.17,0.2,0.36,0.02
1,A3CXK1KSRGU27V,22,93.26,32.48,31.94,94.75,22,4.55,77.27,36.36,...,4.55,0.0,0.0,0.0,4.55,0.0,4.55,0.0,0.0,0.0
2,A2AI4HOWRCG7KZ,34401,64.15,74.85,17.97,43.34,34401,16.57,82.43,49.25,...,0.31,1.03,0.23,0.19,1.99,0.55,0.76,0.39,0.35,0.01
3,A3VKJDDU7MMUVD,6599,76.07,74.42,29.28,64.27,6599,15.31,80.1,44.43,...,0.2,0.59,0.47,0.27,3.7,1.55,1.33,0.27,0.18,0.02
4,A1HJAT4A0FVIOZ,7793,85.17,68.43,44.95,71.11,7793,16.08,80.39,46.26,...,0.35,1.1,0.76,0.18,1.69,0.27,1.04,0.24,0.14,0.03


In [23]:
data = pd.merge(left=scores, right=liwc, how='inner', left_on='mtkid', right_on='mtkid')
data.columns = [col.lower() for col in data.columns]

X = data[[col.lower() for col in liwc.columns]].drop('mtkid', axis=1)

lr_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    lr = lr_modelCV(X, y, mft)
    lr_predicted['predicted_{}'.format(mft)] = lr.predict(X)

MFT: harm
MAE: 4.872295782146655
MSE: 43.58577049641811
RMSE: 6.601952021668902

MFT: fairness
MAE: 4.295522972296607
MSE: 34.3541970361271
RMSE: 5.861245348569457

MFT: loyalty
MAE: 5.888976635712993
MSE: 62.505407320052875
RMSE: 7.906036131972385

MFT: authority
MAE: 6.1073414405844355
MSE: 68.6710737715259
RMSE: 8.286801178472059

MFT: purity
MAE: 7.7654284957670505
MSE: 104.94908585392741
RMSE: 10.244466108779287



In [24]:
lr_predicted_df = pd.DataFrame(lr_predicted)
print('Correlations between Ground Truth and Predicted Values')

for mft in MFT_CATEGORIES:
    print('Pearson\'s Correlation for {}: {}'.format(mft, round(pearsonr(lr_predicted_df['predicted_{}'.format(mft)], data[mft])[0], 3)))

Correlations between Ground Truth and Predicted Values
Pearson's Correlation for harm: 0.465
Pearson's Correlation for fairness: 0.468
Pearson's Correlation for loyalty: 0.495
Pearson's Correlation for authority: 0.513
Pearson's Correlation for purity: 0.534


# MFD Features Only

In [25]:
mfd.head()

Unnamed: 0,mtkid,HarmVirtue,HarmVice,FairnessVirtue,FairnessVice,IngroupVirtue,IngroupVice,AuthorityVirtue,AuthorityVice,PurityVirtue,PurityVice
0,A1VBZWAHLWQMXM,0.09,0.16,0.04,0.01,0.16,0.08,0.16,0.05,0.06,0.02
1,A3CXK1KSRGU27V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A2AI4HOWRCG7KZ,0.1,0.1,0.07,0.06,0.08,0.03,0.19,0.04,0.03,0.02
3,A3VKJDDU7MMUVD,0.15,0.27,0.02,0.0,0.03,0.0,0.23,0.02,0.03,0.03
4,A1HJAT4A0FVIOZ,0.08,0.12,0.05,0.03,0.15,0.01,0.06,0.0,0.06,0.03


In [26]:
data = pd.merge(left=scores, right=mfd, how='inner', left_on='mtkid', right_on='mtkid')
data.columns = [col.lower() for col in data.columns]

X = data[[col.lower() for col in mfd.columns]].drop('mtkid', axis=1)

lr_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    lr = lr_modelCV(X, y, mft)
    lr_predicted['predicted_{}'.format(mft)] = lr.predict(X)

MFT: harm
MAE: 4.049243917841415
MSE: 26.8443085552902
RMSE: 5.181149346939364

MFT: fairness
MAE: 3.657089979057372
MSE: 21.452559553439304
RMSE: 4.631690787761992

MFT: loyalty
MAE: 5.088443976163424
MSE: 40.45522595498009
RMSE: 6.360442276680144

MFT: authority
MAE: 5.396064576849269
MSE: 47.100256000205235
RMSE: 6.862962625587089

MFT: purity
MAE: 7.164877596186073
MSE: 76.98073878289634
RMSE: 8.773866809046986



In [27]:
lr_predicted_df = pd.DataFrame(lr_predicted)
print('Correlations between Ground Truth and Predicted Values')

for mft in MFT_CATEGORIES:
    print('Pearson\'s Correlation for {}: {}'.format(mft, round(pearsonr(lr_predicted_df['predicted_{}'.format(mft)], data[mft])[0], 3)))

Correlations between Ground Truth and Predicted Values
Pearson's Correlation for harm: 0.214
Pearson's Correlation for fairness: 0.174
Pearson's Correlation for loyalty: 0.156
Pearson's Correlation for authority: 0.138
Pearson's Correlation for purity: 0.194


# NRC features only

In [28]:
data.head()

Unnamed: 0,mtkid,harm,fairness,loyalty,authority,purity,ics_score,political involvement,political leaning,harmvirtue,harmvice,fairnessvirtue,fairnessvice,ingroupvirtue,ingroupvice,authorityvirtue,authorityvice,purityvirtue,purityvice
0,A3L0DCUXI7X3A9,18,17,12,20,19,6,3,conservative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A3I40B0FATY8VH,21,24,24,15,3,8,7,conservative,0.05,0.15,0.07,0.01,0.12,0.04,0.21,0.09,0.02,0.01
2,A2MCRVU8I9VNHG,17,22,11,16,22,8,4,conservative,0.0,0.0,0.1,0.05,0.1,0.0,0.19,0.05,0.05,0.0
3,A1PJUYJ7W2LKKQ,24,24,10,17,9,-4,5,slightly,0.01,0.07,0.02,0.0,0.08,0.0,0.07,0.0,0.02,0.0
4,A1KZ21TSAYUHO4,25,29,16,19,29,4,2,slightly,0.0,0.0,0.0,0.0,0.04,0.0,0.08,0.0,0.04,0.0


In [29]:
data = pd.merge(left=scores, right=nrc, how='inner', left_on='mtkid', right_on='mtkid')
data.columns = [col.lower() for col in data.columns]

X = data[[col.lower() for col in nrc.columns]].drop('mtkid', axis=1)

lr_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    lr = lr_modelCV(X, y, mft)
    lr_predicted['predicted_{}'.format(mft)] = lr.predict(X)

MFT: harm
MAE: 4.048589998528807
MSE: 26.663069704958257
RMSE: 5.163629508878252

MFT: fairness
MAE: 3.6317073111003317
MSE: 20.754241820477503
RMSE: 4.555682366065209

MFT: loyalty
MAE: 4.940105323356371
MSE: 36.424523241764426
RMSE: 6.035273253280619

MFT: authority
MAE: 5.169387558593466
MSE: 40.669631846382494
RMSE: 6.377274640971839

MFT: purity
MAE: 6.785745050961183
MSE: 63.93762338545393
RMSE: 7.996100511215071



In [30]:
lr_predicted_df = pd.DataFrame(lr_predicted)
print('Correlations between Ground Truth and Predicted Values')

for mft in MFT_CATEGORIES:
    print('Pearson\'s Correlation for {}: {}'.format(mft, round(pearsonr(lr_predicted_df['predicted_{}'.format(mft)], data[mft])[0], 3)))

Correlations between Ground Truth and Predicted Values
Pearson's Correlation for harm: 0.121
Pearson's Correlation for fairness: 0.135
Pearson's Correlation for loyalty: 0.167
Pearson's Correlation for authority: 0.153
Pearson's Correlation for purity: 0.208


# Empath Only

In [31]:
data = pd.merge(left=scores, right=empath_scores, how='inner', left_on='mtkid', right_on='mtkid')
data.columns = [col.lower() for col in data.columns]

X = data[[col.lower() for col in empath_scores.columns]].drop('mtkid', axis=1)

lr_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    lr = lr_modelCV(X, y, mft)
    lr_predicted['predicted_{}'.format(mft)] = lr.predict(X)

MFT: harm
MAE: 4.002852872208138
MSE: 26.106110967768437
RMSE: 5.109413955412934

MFT: fairness
MAE: 3.6321211723949065
MSE: 20.661223130103544
RMSE: 4.545461817032846

MFT: loyalty
MAE: 4.964582563123653
MSE: 36.6484194472675
RMSE: 6.0537938061407

MFT: authority
MAE: 5.161899382219757
MSE: 40.77923236987019
RMSE: 6.385861912840755

MFT: purity
MAE: 6.855342536297931
MSE: 65.1155223506246
RMSE: 8.069418959914314



In [32]:
lr_predicted_df = pd.DataFrame(lr_predicted)
print('Correlations between Ground Truth and Predicted Values')

for mft in MFT_CATEGORIES:
    print('Pearson\'s Correlation for {}: {}'.format(mft, round(pearsonr(lr_predicted_df['predicted_{}'.format(mft)], data[mft])[0], 3)))

Correlations between Ground Truth and Predicted Values
Pearson's Correlation for harm: 0.103
Pearson's Correlation for fairness: 0.13
Pearson's Correlation for loyalty: 0.107
Pearson's Correlation for authority: 0.131
Pearson's Correlation for purity: 0.139


# Final Model w/ All Features

In [33]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.decomposition import PCA

In [34]:
data.columns

Index(['mtkid', 'harm', 'fairness', 'loyalty', 'authority', 'purity',
       'ics_score', 'political involvement', 'political leaning',
       'empath_harmvirtue', 'empath_harmvice', 'empath_fairnessvirtue',
       'empath_fairnessvice', 'empath_ingroupvirtue', 'empath_ingroupvice',
       'empath_authorityvirtue', 'empath_authorityvice', 'empath_purityvirtue',
       'empath_moralitygeneral', 'empath_purityvice'],
      dtype='object')

In [35]:
unwanted_columns = ['document_no', 'dominant_topic_perc_contrib', 'text', 'dominant_topic_keywords', 'mtkid', 'political leaning', 'political involvement']

In [36]:
data = pd.merge(left=scores, right=lda_features, how='inner', left_on='mtkid', right_on='mtkid')
data = pd.merge(left=data, right=empath_scores, how='inner', left_on='mtkid', right_on='mtkid')
data = pd.merge(left=data, right=liwc_mfd_nrc, how='inner', left_on='mtkid', right_on='mtkid')
data.columns = [col.lower() for col in data.columns]
data.head()

Unnamed: 0,mtkid,harm,fairness,loyalty,authority,purity,ics_score,political involvement,political leaning,document_no,...,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,negative_nrc,positive_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,A3L0DCUXI7X3A9,18,17,12,20,19,6,3,conservative,289,...,0.0,0.0,0.0,0.025,0.0,0.0,0.05,0.0,0.0,0.0
1,A3I40B0FATY8VH,21,24,24,15,3,8,7,conservative,275,...,0.029543,0.027316,0.019822,0.028891,0.021397,0.056696,0.043934,0.022972,0.020256,0.032693
2,A2MCRVU8I9VNHG,17,22,11,16,22,8,4,conservative,171,...,0.016545,0.018491,0.008759,0.021898,0.020925,0.040389,0.044769,0.020438,0.009732,0.038443
3,A1PJUYJ7W2LKKQ,24,24,10,17,9,-4,5,slightly,80,...,0.022523,0.031532,0.008234,0.02354,0.021796,0.033324,0.047903,0.012787,0.016517,0.033517
4,A1KZ21TSAYUHO4,25,29,16,19,29,4,2,slightly,64,...,0.020221,0.048455,0.004578,0.016406,0.04235,0.011064,0.06715,0.006105,0.017169,0.038153


In [37]:
data['topic_list'] = data['topic_list'].map(lambda x: x.strip('[]').split(','))
data = data.assign(**pd.get_dummies(data['topic_list'].apply(lambda x:pd.Series(x)).stack().reset_index(level=1, drop=True), prefix='topic').sum(level=0))
data.head()

Unnamed: 0,mtkid,harm,fairness,loyalty,authority,purity,ics_score,political involvement,political leaning,document_no,...,topic_83,topic_86,topic_88,topic_89,topic_90,topic_91,topic_92,topic_93,topic_94,topic_97
0,A3L0DCUXI7X3A9,18,17,12,20,19,6,3,conservative,289,...,0,0,0,0,0,0,0,0,0,0
1,A3I40B0FATY8VH,21,24,24,15,3,8,7,conservative,275,...,0,0,0,0,0,0,0,0,0,0
2,A2MCRVU8I9VNHG,17,22,11,16,22,8,4,conservative,171,...,0,0,0,0,0,0,0,0,0,0
3,A1PJUYJ7W2LKKQ,24,24,10,17,9,-4,5,slightly,80,...,0,0,0,0,0,0,0,0,0,0
4,A1KZ21TSAYUHO4,25,29,16,19,29,4,2,slightly,64,...,0,0,0,0,0,0,0,0,0,0


In [39]:
X = pd.DataFrame(X_dtm.toarray(), columns=vect.get_feature_names())

In [40]:
features = data.drop(['topic_list'] + unwanted_columns + MFT_CATEGORIES, axis=1)
X_final = np.concatenate((X, features), axis=1)

mm_scale = MinMaxScaler()
X_final = mm_scale.fit_transform(X_final)

#X_final = pd.concat(objs=[X, features], axis=1)
X_final = np.concatenate((X.values, features), axis=1)

In [41]:
X_final.shape

(397, 6088)

In [42]:
X_final_sparse = sparse.csr_matrix(X_final)

In [43]:
nb_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    nb = nb_model(X_final_sparse, y, mft)
    nb_predicted['predicted_{}'.format(mft)] = nb.predict(X_final_sparse)

MFT: harm
MAE: 4.7
MSE: 45.375
RMSE: 6.73609679265374

MFT: fairness
MAE: 4.3
MSE: 34.5
RMSE: 5.873670062235365

MFT: loyalty
MAE: 5.8
MSE: 48.925
RMSE: 6.994640805645419

MFT: authority
MAE: 5.2125
MSE: 46.6875
RMSE: 6.832825184358224

MFT: purity
MAE: 9.1875
MSE: 124.7125
RMSE: 11.167475095114384



In [45]:
for mft in MFT_CATEGORIES:
    y = data[mft]
    rf_model(X_final_sparse, y, mft)

MFT: harm
MAE: 4.614999999999999
MSE: 37.56875
RMSE: 6.129335200492791

MFT: fairness
MAE: 3.4387499999999998
MSE: 20.647125
RMSE: 4.543910760567377

MFT: loyalty
MAE: 4.66375
MSE: 34.037625000000006
RMSE: 5.834177319896954

MFT: authority
MAE: 4.9325
MSE: 37.67325000000001
RMSE: 6.1378538594528305

MFT: purity
MAE: 6.904999999999999
MSE: 67.03325
RMSE: 8.187383586958656



In [46]:
lr_predicted = {}
for mft in MFT_CATEGORIES:
    y = data[mft]
    lr = lr_modelCV(X_final_sparse, y, mft)
    lr_predicted['predicted_{}'.format(mft)] = lr.predict(X_final_sparse)

MFT: harm
MAE: 4.341950898573629
MSE: 31.69512701544791
RMSE: 5.629842539134457

MFT: fairness
MAE: 3.642141246709483
MSE: 21.108311077169446
RMSE: 4.5943782035406535

MFT: loyalty
MAE: 5.055461906569151
MSE: 41.16457198339236
RMSE: 6.415962280390398

MFT: authority
MAE: 4.863098316248031
MSE: 39.65109537845172
RMSE: 6.29691157460955

MFT: purity
MAE: 6.389123238812633
MSE: 60.62536947908918
RMSE: 7.786229477679757



In [50]:
lr_predicted_df = pd.DataFrame(lr_predicted)
print('Correlations between Ground Truth and Predicted Values')

for mft in MFT_CATEGORIES:
    print('Pearson\'s Correlation for {}: {}'.format(mft, round(pearsonr(lr_predicted_df['predicted_{}'.format(mft)], data[mft])[0], 3)))

Correlations between Ground Truth and Predicted Values
Pearson's Correlation for harm: 0.991
Pearson's Correlation for fairness: 0.989
Pearson's Correlation for loyalty: 0.988
Pearson's Correlation for authority: 0.989
Pearson's Correlation for purity: 0.992
