# Create ROC plots with all traits per classifier

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Big Five

In [None]:
def importdata(name, results_dir='/home/sophia/ma_py/Big5-NLP/results/'):
    """Load the stored mean ROC curves and AUC values of the Big Five traits.

    For every trait two files are read from ``results_dir``:
    ``<name>_<trait>_meantprs.npy`` and ``<name>_<trait>_auc.npy``.

    Parameters
    ----------
    name : str
        File-name prefix identifying the run; it may include a
        sub-directory component (e.g. ``'augmented/...'``).
    results_dir : str or path-like, optional
        Directory holding the ``.npy`` result files. Defaults to the
        location that used to be hard-coded in this function.

    Returns
    -------
    tuple
        ``(o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)``:
        the five loaded curves, their element-wise mean ``x``, the five
        loaded AUC values and the mean of those AUC values ``x_auc``.
    """
    # Order matters: it defines the order of the returned values (O-C-E-A-N).
    traits = ('openness', 'conscientiousness', 'extraversion',
              'agreeableness', 'neuroticism')
    base = Path(results_dir)

    def _load(trait, suffix):
        # np.load accepts pathlib.Path objects as well as strings.
        return np.load(base / (name + '_' + trait + suffix))

    curves = [_load(trait, '_meantprs.npy') for trait in traits]
    aucs = [_load(trait, '_auc.npy') for trait in traits]

    # Mean curve across traits (element-wise) and mean of all AUC values.
    x = np.mean(np.stack(curves), axis=0)
    x_auc = np.mean(np.stack(aucs))

    return tuple(curves) + (x,) + tuple(aucs) + (x_auc,)

In [None]:
def rocplot(name, o,c,e,a,n,x,o_auc, c_auc, e_auc, a_auc, n_auc, x_auc):
    """Plot the ROC curves of all five traits and their mean for one run.

    Each curve is drawn against a fixed 101-point false-positive-rate grid,
    so every curve argument is expected to hold 101 values. The figure is
    saved as ``<name>_alltraits_roc_plot.png`` in the ``roc_alltraits``
    results folder and then shown.
    """
    out_dir = Path('/home/sophia/ma_py/Big5-NLP/results/roc_alltraits/')
    fpr_grid = np.linspace(0, 1, 101)

    # (curve, line format, legend template, AUC) for every single trait
    per_trait = (
        (o, 'b', 'openness (AUC = %0.2f)', o_auc),
        (c, 'g', 'conscientiousness (AUC = %0.2f)', c_auc),
        (e, 'y', 'extraversion (AUC = %0.2f)', e_auc),
        (a, 'c', 'agreeableness (AUC = %0.2f)', a_auc),
        (n, 'm', 'neuroticism (AUC = %0.2f)', n_auc),
    )

    plt.figure(figsize=(7,7))
    for tpr, line_fmt, template, auc_value in per_trait:
        plt.plot(fpr_grid, tpr, line_fmt, label=template % (auc_value))
    # The mean curve is emphasised with a thicker black line.
    plt.plot(fpr_grid, x, 'k', label='mean (AUC = %0.2f)' % (x_auc), linewidth=3)

    # Diagonal reference line
    plt.plot([0, 1], [0, 1],'r--', label='Chance')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    plt.savefig(out_dir / (name + "_" + 'alltraits_roc_plot.png'))
    plt.show()

## Conceptual methodological replication

In [None]:
# Logistic regression
name = 'b5_allfeatures_df_log_binary_anova_PCAFalse_30'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# copy for later use in a different function (rocplot_onetrait)
e_log, e_auc_log = e.copy(), e_auc.copy()

In [None]:
# Multiclass classifier
name = 'b5_allfeatures_df_mcc_binary_anova_PCAFalse_30'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# extraversion curve and AUC, kept for the classifier comparison plot
e_mcc, e_auc_mcc = e.copy(), e_auc.copy()

In [None]:
# Multi-layer perceptron
name = 'b5_allfeatures_df_mlp_binary_anova_PCAFalse_30'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# mean curve and mean AUC, passed to rocplot_nfeat as the 'n=30' entry
x30, x30_auc = x.copy(), x_auc.copy()

# extraversion curve and AUC, kept for the classifier comparison plot
e_mlp, e_auc_mlp = e.copy(), e_auc.copy()

In [None]:
# Support vector machine
name = 'b5_allfeatures_df_svm_binary_anova_PCAFalse_30'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# extraversion curve and AUC, kept for the classifier comparison plot
e_svm, e_auc_svm = e.copy(), e_auc.copy()

In [None]:
# create a plot which shows comparison between classifiers with one trait
def rocplot_onetrait(e_log, e_svm, e_mlp, e_mcc, e_auc_log, e_auc_mlp, e_auc_mcc, e_auc_svm):
    """Plot one trait's ROC curve for four classifiers in a single figure.

    Each curve is drawn against a fixed 101-point false-positive-rate grid,
    so every curve argument is expected to hold 101 values. The figure is
    written to a fixed file name in the ``roc_alltraits`` results folder
    and then shown.
    """
    out_dir = Path('/home/sophia/ma_py/Big5-NLP/results/roc_alltraits/')
    fpr_grid = np.linspace(0, 1, 101)

    # (curve, line format, legend template, AUC) in drawing order
    per_classifier = (
        (e_mcc, 'k', 'MCC (AUC = %0.2f)', e_auc_mcc),
        (e_mlp, 'b', 'MLP (AUC = %0.2f)', e_auc_mlp),
        (e_svm, 'm', 'SVM (AUC = %0.2f)', e_auc_svm),
        (e_log, 'g', 'LR (AUC = %0.2f)', e_auc_log),
    )

    plt.figure(figsize=(7,7))
    for tpr, line_fmt, template, auc_value in per_classifier:
        plt.plot(fpr_grid, tpr, line_fmt, label=template % (auc_value))

    # Diagonal reference line
    plt.plot([0, 1], [0, 1],'r--', label='Chance')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    plt.savefig(out_dir / "b5_allfeatures_df_clfcompare_binary_anova_PCAfalse_roc_plot.png")
    plt.show()

# Compare the four classifiers using the extraversion copies saved above.
rocplot_onetrait(
    e_log=e_log, e_svm=e_svm, e_mlp=e_mlp, e_mcc=e_mcc,
    e_auc_log=e_auc_log, e_auc_mlp=e_auc_mlp,
    e_auc_mcc=e_auc_mcc, e_auc_svm=e_auc_svm,
)

## Experiment 1: Number of features

In [None]:
# create plot for comparing number of features with average from traits
def rocplot_nfeat(x5, x5_auc, x10, x10_auc, x20, x20_auc, x30, x30_auc):
    """Plot the mean ROC curves obtained with 5, 10, 20 and 30 features.

    Each curve is drawn against a fixed 101-point false-positive-rate grid,
    so every curve argument is expected to hold 101 values. The figure is
    written to a fixed file name in the ``roc_alltraits`` results folder
    and then shown.
    """
    out_dir = Path('/home/sophia/ma_py/Big5-NLP/results/roc_alltraits/')
    fpr_grid = np.linspace(0, 1, 101)

    # (curve, line format, legend template, AUC) for the thinner lines
    smaller_sets = (
        (x5, 'y', 'n=5 (AUC = %0.2f)', x5_auc),
        (x10, 'm', 'n=10 (AUC = %0.2f)', x10_auc),
        (x20, 'c', 'n=20 (AUC = %0.2f)', x20_auc),
    )

    plt.figure(figsize=(7,7))
    for tpr, line_fmt, template, auc_value in smaller_sets:
        plt.plot(fpr_grid, tpr, line_fmt, label=template % (auc_value))
    # The n=30 curve is drawn with a thicker black line.
    plt.plot(fpr_grid, x30, 'k', label='n=30 (AUC = %0.2f)' % (x30_auc), linewidth=2)

    # Diagonal reference line
    plt.plot([0, 1], [0, 1],'r--', label='Chance')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    plt.savefig(out_dir / "b5_allfeatures_df_mlp_binary_anova_PCAfalse_nfeatcompare_roc_plot.png")
    plt.show()

In [None]:
# MLP run with suffix _5
name = 'b5_allfeatures_df_mlp_binary_anova_PCAFalse_5'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# mean curve and mean AUC, passed to rocplot_nfeat as the 'n=5' entry
x5, x5_auc = x.copy(), x_auc.copy()

In [None]:
# MLP run with suffix _10
name = 'b5_allfeatures_df_mlp_binary_anova_PCAFalse_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# mean curve and mean AUC, passed to rocplot_nfeat as the 'n=10' entry
x10, x10_auc = x.copy(), x_auc.copy()

In [None]:
# MLP run with suffix _20
name = 'b5_allfeatures_df_mlp_binary_anova_PCAFalse_20'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# mean curve and mean AUC, passed to rocplot_nfeat as the 'n=20' entry
x20, x20_auc = x.copy(), x_auc.copy()

In [None]:
# MLP run with suffix _1
name = 'b5_allfeatures_df_mlp_binary_anova_PCAFalse_1'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# NOTE(review): x1 / x1_auc do not appear to be used by any later cell
# in this notebook (rocplot_nfeat only receives x5..x30) - confirm.
x1, x1_auc = x.copy(), x_auc.copy()

In [None]:
# Compare the mean ROC curves of the 5/10/20/30 feature runs.
rocplot_nfeat(
    x5=x5, x5_auc=x5_auc,
    x10=x10, x10_auc=x10_auc,
    x20=x20, x20_auc=x20_auc,
    x30=x30, x30_auc=x30_auc,
)

## Experiment 2: Include PCA

In [None]:
# Multi-layer perceptron (PCATrue run)
name = 'b5_allfeatures_df_mlp_binary_anova_PCATrue_30'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

## Experiment 3: Different classifiers

In [None]:
# Gradient boost classifier
name = 'b5_allfeatures_df_boost_binary_anova_PCAFalse_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

In [None]:
# Naive Bayes classifier
name = 'b5_allfeatures_df_nb_binary_anova_PCAFalse_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

In [None]:
# Random forest classifier
name = 'b5_allfeatures_df_rfc_binary_anova_PCAFalse_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

In [None]:
# Decision tree classifier
name = 'b5_allfeatures_df_tree_binary_anova_PCAFalse_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

## Experiment 4: Different feature selection

In [None]:
# compare mutual information and forward selection feature selection with multi-layer perceptron
def rocplot_nfeat(x_mipca, x_auc_mipca, x_miaov, x_auc_miaov, x_fspca, x_auc_fspca, x_fsaov, x_auc_fsaov, x10, x10_auc):
    """Plot the mean ROC curves of four feature-selection variants and a baseline.

    NOTE: this definition reuses the name ``rocplot_nfeat`` of the earlier
    feature-count comparison function and replaces it once this cell runs.

    Parameters
    ----------
    x_mipca, x_miaov, x_fspca, x_fsaov : array-like
        Mean curves drawn with the legend entries 'MI+PCA', 'MI+ANOVA',
        'FSFS+PCA' and 'FSFS+ANOVA'.
    x_auc_mipca, x_auc_miaov, x_auc_fspca, x_auc_fsaov : float
        AUC values shown in the corresponding legend entries.
    x10, x10_auc : array-like, float
        Curve and AUC drawn as the dashed 'Baseline' entry.

    Each curve is plotted against a fixed 101-point false-positive-rate
    grid, so every curve argument is expected to hold 101 values. The figure
    is written to a fixed file name in the ``roc_alltraits`` results folder
    and then shown.
    """
    p = Path('/home/sophia/ma_py/Big5-NLP/results/roc_alltraits/')
    mean_fpr = np.linspace(0, 1, 101)
    plt.figure(figsize=(7,7))
    # The legend AUCs now come from this function's own parameters; the
    # previous code referenced undefined names (x_auc_mlppca, x_auc_mlpaov,
    # x_auc_rfcpca, x_auc_rfcaov) and raised NameError.
    plt.plot(mean_fpr, x_mipca, 'b', label='MI+PCA (AUC = %0.2f)' % (x_auc_mipca))
    plt.plot(mean_fpr, x_miaov, 'c', label='MI+ANOVA (AUC = %0.2f)' % (x_auc_miaov))
    plt.plot(mean_fpr, x_fspca, 'm', label='FSFS+PCA (AUC = %0.2f)' % (x_auc_fspca))
    plt.plot(mean_fpr, x_fsaov, 'y', label='FSFS+ANOVA (AUC = %0.2f)' % (x_auc_fsaov))
    plt.plot(mean_fpr, x10, 'k--', label='Baseline (AUC = %0.2f)' % (x10_auc))

    # Diagonal reference line
    plt.plot([0, 1], [0, 1],'r--', label='Chance')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.savefig(Path(p, "b5_allfeatures_df_mlp_rfc_binary_fscompare_roc_plot.png"))
    plt.show()

In [None]:
name = 'b5_allfeatures_df_mlp_binary_mutual_PCAFalse_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# NOTE(review): this is the PCAFalse run, but it is stored as x_mipca,
# which rocplot_nfeat labels 'MI+PCA' - possibly swapped with the
# PCATrue run; confirm.
x_mipca, x_auc_mipca = x.copy(), x_auc.copy()

In [None]:
name = 'b5_allfeatures_df_mlp_binary_mutual_PCATrue_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# NOTE(review): this is the PCATrue run, but it is stored as x_miaov,
# which rocplot_nfeat labels 'MI+ANOVA' - possibly swapped with the
# PCAFalse run; confirm.
x_miaov, x_auc_miaov = x.copy(), x_auc.copy()

In [None]:
name = 'b5_allfeatures_df_mlp_binary_sequential_forward_PCAFalse_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# NOTE(review): this is the PCAFalse run, but it is stored as x_fspca,
# which rocplot_nfeat labels 'FSFS+PCA' - possibly swapped with the
# PCATrue run; confirm.
x_fspca, x_auc_fspca = x.copy(), x_auc.copy()

In [None]:
name = 'b5_allfeatures_df_mlp_binary_sequential_forward_PCATrue_10'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# NOTE(review): this is the PCATrue run, but it is stored as x_fsaov,
# which rocplot_nfeat labels 'FSFS+ANOVA' - possibly swapped with the
# PCAFalse run; confirm.
x_fsaov, x_auc_fsaov = x.copy(), x_auc.copy()

In [None]:
 # Compare the feature-selection variants against the x10 baseline.
 rocplot_nfeat(
     x_mipca=x_mipca, x_auc_mipca=x_auc_mipca,
     x_miaov=x_miaov, x_auc_miaov=x_auc_miaov,
     x_fspca=x_fspca, x_auc_fspca=x_auc_fspca,
     x_fsaov=x_fsaov, x_auc_fsaov=x_auc_fsaov,
     x10=x10, x10_auc=x10_auc,
 )

## Augmented data

In [None]:
# plot for augmented dataset with multi-layer perceptron
# (the run name contains a sub-directory, so both the input files and the
# saved figure live under an 'augmented/' folder)
name = 'augmented/b5_augmented_df_mlp_anova_PCAFalse_30'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)

# MBTI - Data subset

In [None]:
# create plots for the data subset including MBTI labels (multi-layer perceptron)
def importdatambti(name, results_dir='/home/sophia/ma_py/Big5-NLP/results/'):
    """Load the stored mean ROC curves and AUC values of the four MBTI dimensions.

    For every dimension two files are read from ``results_dir``:
    ``<name>_mbti_<dim>_meantprs.npy`` and ``<name>_mbti_<dim>_auc.npy``,
    with ``<dim>`` in ``ei``, ``jp``, ``ns``, ``tf``.

    Parameters
    ----------
    name : str
        File-name prefix identifying the run.
    results_dir : str or path-like, optional
        Directory holding the ``.npy`` result files. Defaults to the
        location that used to be hard-coded in this function.

    Returns
    -------
    tuple
        ``(ei, jp, ns, tf, x, ei_auc, jp_auc, ns_auc, tf_auc, x_auc)``:
        the four loaded curves, their element-wise mean ``x``, the four
        loaded AUC values and their mean along the first axis ``x_auc``.
    """
    # Order matters: it defines the order of the returned values.
    dimensions = ('ei', 'jp', 'ns', 'tf')
    base = Path(results_dir)

    def _load(dim, suffix):
        # np.load accepts pathlib.Path objects as well as strings.
        return np.load(base / (name + '_mbti_' + dim + suffix))

    curves = [_load(dim, '_meantprs.npy') for dim in dimensions]
    aucs = [_load(dim, '_auc.npy') for dim in dimensions]

    # Both means are taken across the stacked dimensions (axis 0).
    x = np.mean(np.stack(curves), axis=0)
    x_auc = np.mean(np.stack(aucs), axis=0)

    return tuple(curves) + (x,) + tuple(aucs) + (x_auc,)

def rocplotmbti(name, ei,jp,ns,tf,x,ei_auc,jp_auc,ns_auc,tf_auc,x_auc):
    """Plot the ROC curves of the four MBTI dimensions and their mean for one run.

    Each curve is drawn against a fixed 101-point false-positive-rate grid,
    so every curve argument is expected to hold 101 values. The figure is
    saved as ``<name>_alltraits_roc_plot.png`` in the ``roc_alltraits``
    results folder and then shown.
    """
    out_dir = Path('/home/sophia/ma_py/Big5-NLP/results/roc_alltraits/')
    fpr_grid = np.linspace(0, 1, 101)

    # (curve, line format, legend template, AUC) for every dimension
    per_dimension = (
        (ei, 'b', 'E/I (AUC = %0.2f)', ei_auc),
        (jp, 'g', 'J/P (AUC = %0.2f)', jp_auc),
        (ns, 'y', 'N/S (AUC = %0.2f)', ns_auc),
        (tf, 'c', 'T/F (AUC = %0.2f)', tf_auc),
    )

    plt.figure(figsize=(7, 7))
    for tpr, line_fmt, template, auc_value in per_dimension:
        plt.plot(fpr_grid, tpr, line_fmt, label=template % (auc_value))
    # The mean curve is emphasised with a thicker black line.
    plt.plot(fpr_grid, x, 'k', label='mean (AUC = %0.2f)' % (x_auc), linewidth=3)

    # Diagonal reference line
    plt.plot([0, 1], [0, 1],'r--', label='Chance')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    plt.savefig(out_dir / (name + "_" + 'alltraits_roc_plot.png'))
    plt.show()

In [None]:
# plot for MBTI labels
name = 'mbti_b5mbtifeatures_df_mlp_mbti_anova_PCAFalse_30'
(ei, jp, ns, tf, x,
 ei_auc, jp_auc, ns_auc, tf_auc, x_auc) = importdatambti(name)
rocplotmbti(name, ei, jp, ns, tf, x, ei_auc, jp_auc, ns_auc, tf_auc, x_auc)

In [None]:
# plot for Big Five labels
name = 'b5_b5mbtifeatures_df_mlp_binary_anova_PCAFalse_30'
(o, c, e, a, n, x,
 o_auc, c_auc, e_auc, a_auc, n_auc, x_auc) = importdata(name)
rocplot(name, o, c, e, a, n, x, o_auc, c_auc, e_auc, a_auc, n_auc, x_auc)