# Classifier for the augmented dataset with adapted cross validation

## Import packages

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from time import time
import datetime
import random
random.seed(32)
import sklearn
# from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, plot_roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression

import numpy as np
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
from pathlib import Path
import pickle

## Import data

In [None]:
# Load the first of the two pickled parts of the augmented dataset (the
# second part, the original authors, is loaded separately further down).
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
filepath = "aug_b5feat_label_new.pkl"
with open(filepath, 'rb') as f:
    new_augdf = pickle.load(f)

# Cast every top-level column group to its target dtype, in this order.
for _group, _dtype in (
    ('trait', 'int16'),
    ('wordngram', 'float32'),
    ('charngram', 'float32'),
    ('x_feat', 'float32'),
    ('lin_feat', 'float32'),
    ('psych', 'float32'),
    ('empath', 'float32'),
    ('post', 'float32'),
    ('subtf', 'float32'),
    ('lda50', 'float64'),
    ('lda100', 'float64'),
):
    new_augdf[_group] = new_augdf[_group].astype(_dtype)

# The 'text' column group is not used as a feature.
new_augdf.drop(['text'], axis=1, level=0, inplace=True)

# Load the second pickled part of the augmented dataset (original authors).
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
filepath = "aug_b5feat_label_original.pkl"
with open(filepath, 'rb') as f:
    ori_augdf = pickle.load(f)

# Cast every top-level column group to its target dtype, in this order.
for _group, _dtype in (
    ('trait', 'int16'),
    ('wordngram', 'float32'),
    ('charngram', 'float32'),
    ('x_feat', 'float32'),
    ('lin_feat', 'float32'),
    ('psych', 'float32'),
    ('empath', 'float32'),
    ('post', 'float32'),
    ('subtf', 'float32'),
    ('lda50', 'float64'),
    ('lda100', 'float64'),
):
    ori_augdf[_group] = ori_augdf[_group].astype(_dtype)

# The 'text' column group is not used as a feature.
ori_augdf.drop(['text'], axis=1, level=0, inplace=True)

# The classifier functions read `df.name` to build output file names.
# (Previously assigned twice, with a no-op `.dtypes` expression in between.)
ori_augdf.name = 'augmented_df'

## Trait

In [None]:
#split dataset in features and target variable depending on which trait to focus on
def trait(df, trait_name):
    """Split a frame into features and the label column of one trait.

    Args:
        df: DataFrame with two-level columns whose level-0 group 'trait'
            holds the label columns 'big5_a', 'big5_o', 'big5_c', 'big5_e'
            and 'big5_n'.
        trait_name: One of 'agreeableness', 'openness', 'conscientiousness',
            'extraversion' or 'neuroticism'.

    Returns:
        Tuple ``(x, y)``: ``x`` is ``df`` without the 'trait' group, ``y`` is
        the label column of the requested trait.

    Raises:
        ValueError: If ``trait_name`` is not one of the five known traits
            (previously this surfaced as an UnboundLocalError on ``y``).
    """
    label_columns = {
        'agreeableness': 'big5_a',
        'openness': 'big5_o',
        'conscientiousness': 'big5_c',
        'extraversion': 'big5_e',
        'neuroticism': 'big5_n',
    }
    try:
        label = label_columns[trait_name]
    except KeyError:
        raise ValueError(
            "Unknown trait %r; expected one of %s"
            % (trait_name, sorted(label_columns))
        ) from None

    x = df.drop(['trait'], axis=1, level=0)
    y = df['trait', label]
    return x, y

### Functions for nested stratified cross validation

In [None]:
# get names of the features
def get_names(x, pipeline):
    """Return the column labels of ``x`` kept by the pipeline's feature selection.

    The indices reported by the 'feature_selection' step refer to the columns
    it actually received, i.e. after any earlier selector (such as the
    'variance_threshold' step) has already dropped columns. The column labels
    are therefore narrowed step by step, instead of applying the final indices
    directly to ``x.columns`` (which mislabels features as soon as an earlier
    step removes a column).

    Args:
        x: DataFrame that was passed to ``pipeline.fit``.
        pipeline: Fitted pipeline exposing ``named_steps`` (in step order)
            with a step named 'feature_selection'.

    Returns:
        Index of the selected column labels.

    Raises:
        KeyError: If the pipeline has no 'feature_selection' step.
    """
    columns = x.columns
    for step_name, step in pipeline.named_steps.items():
        if step_name == 'feature_selection':
            return columns[step.get_support(indices=True)]
        # Steps without get_support (e.g. the scaler) keep all columns.
        if hasattr(step, 'get_support'):
            columns = columns[step.get_support(indices=True)]
    raise KeyError('feature_selection')

def save_predictors(names, predictors_fold1, predictors_fold2, predictors_fold3, predictors_fold4, predictors_fold5, j):
    """Append ``list(names)`` to the predictor list of fold ``j``.

    ``j`` is the 1-based fold number; any value other than 1..5 leaves all
    lists untouched. The five (possibly mutated) lists are returned in order.
    """
    buckets = (predictors_fold1, predictors_fold2, predictors_fold3, predictors_fold4, predictors_fold5)
    for fold_no, bucket in enumerate(buckets, start=1):
        if j == fold_no:
            bucket.append(list(names))
            break
    return buckets

def save_acc_folds(acc, acc_fold1, acc_fold2, acc_fold3, acc_fold4, acc_fold5, j):
    """Append the accuracy ``acc`` to the list of fold ``j`` (1-based).

    Fold numbers outside 1..5 are ignored. Returns the five lists in order.
    """
    per_fold = (acc_fold1, acc_fold2, acc_fold3, acc_fold4, acc_fold5)
    for number, target in zip(range(1, 6), per_fold):
        if j == number:
            target.append(acc)
            break
    return per_fold

def save_auc_folds(auc, auc_fold1, auc_fold2, auc_fold3, auc_fold4, auc_fold5, j):
    """Append the AUC value ``auc`` to the list of fold ``j`` (1-based).

    Fold numbers outside 1..5 are ignored. Returns the five lists in order.
    """
    per_fold = (auc_fold1, auc_fold2, auc_fold3, auc_fold4, auc_fold5)
    for number, target in enumerate(per_fold, start=1):
        if j == number:
            target.append(auc)
            break
    return per_fold

def save_f1score_folds(f1_macro, f1score_fold1, f1score_fold2, f1score_fold3, f1score_fold4, f1score_fold5, j):
    """Append the macro F1 score to the list of fold ``j`` (1-based).

    Fold numbers outside 1..5 are ignored. Returns the five lists in order.
    """
    per_fold = (f1score_fold1, f1score_fold2, f1score_fold3, f1score_fold4, f1score_fold5)
    for number, target in enumerate(per_fold, start=1):
        if j == number:
            target.append(f1_macro)
            break
    return per_fold


def save_params_folds(foldparams, params_fold1, params_fold2, params_fold3, params_fold4, params_fold5, j):
    """Append the hyper-parameter record ``foldparams`` to the list of fold ``j``.

    ``j`` is 1-based; values outside 1..5 are ignored. Returns the five lists
    in order.
    """
    per_fold = (params_fold1, params_fold2, params_fold3, params_fold4, params_fold5)
    for number, target in zip(range(1, 6), per_fold):
        if j == number:
            target.append(foldparams)
            break
    return per_fold


## Classifier

In [None]:
def classify_aug(df, traits, option, fs, dim, n_feat, replication=False, new_augdf=None):
    """Run nested cross-validation for one trait, augmenting only the training folds.

    The rows of ``df`` are split with a 5-fold stratified outer loop. For each
    outer fold, every row of ``new_augdf`` whose index label starts with the
    index label of a training row is added to the training data; validation
    rows are never augmented. A logistic-regression pipeline (variance
    threshold, scaling, ANOVA SelectKBest) is tuned with a 10-fold stratified
    grid search on macro F1 and evaluated on the validation fold.

    Side effects: prints progress, shows a ROC plot and a confusion matrix,
    and writes ``*_auc.npy``, ``*_meantprs.npy``, ``*_roc_plot.png``,
    ``*_cm.png`` and a ``.csv`` summary into the hard-coded results directory.

    Args:
        df: Frame to cross-validate; needs a ``name`` attribute (used in the
            output file names) and the column layout expected by ``trait``.
        traits: Name of the single trait to predict (see ``trait``).
        option: Label used in messages and file names only; the estimator is
            always ``LogisticRegression``.
        fs: Label used in file names only; selection is always
            ``SelectKBest(f_classif)``.
        dim: Used in the file name; when falsy the selected predictor names of
            each fold are recorded. No PCA step is applied in this function.
        n_feat: Number of features kept by ``SelectKBest``.
        replication: Unused; kept for interface compatibility.
        new_augdf: Frame with the augmented rows (same column layout as ``df``).

    Raises:
        ValueError: If ``new_augdf`` is None.
    """
    if new_augdf is None:
        raise ValueError("new_augdf is required: it supplies the augmented training rows")

    n_outer_splits = 5
    fold_keys = ('acc', 'f1score', 'auc', 'params', 'predictors')

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        tstart = time()
        print("Current time: ", str(datetime.datetime.now()))
        print("Classifier: ", option, "\n")
        outputname = "b5_" + df.name + "_" + str(option) + "_" + str(fs) + "_PCA" + str(dim) + "_" + str(n_feat)
        output = {'Traits': traits}

        # Aggregated results plus one list per outer fold for the csv export.
        acc_traits, f1_traits, cm_traits, auc_traits = [], [], [], []
        per_fold = {key: [[] for _ in range(n_outer_splits)] for key in fold_keys}

        print("\nTrait to predict: ", traits, "(", option, ")\n")
        x, y = trait(df, traits)

        # The augmented features/labels do not depend on the fold: split once.
        aug_features, aug_targets = trait(new_augdf, traits)
        aug_labels = list(new_augdf.index)

        # outer loop
        cv_outer = StratifiedKFold(n_splits=n_outer_splits, shuffle=True, random_state=0)

        f1macro_lst, accuracy_lst, tpr_lst, auc_lst = [], [], [], []

        # common FPR grid so the per-fold ROC curves can be averaged
        mean_fpr = np.linspace(0, 1, 101)

        plt.figure(figsize=(7, 7))
        p = Path('/home/sophia/ma_py/Big5-NLP/results/augmented/')

        # begin nested cv (j is the 1-based fold number)
        for j, (train_idx, val_idx) in enumerate(cv_outer.split(x, y), start=1):
            train_data, val_data = x.iloc[train_idx], x.iloc[val_idx]
            train_target, val_target = y.iloc[train_idx], y.iloc[val_idx]

            print("Fold No.", j)

            # Add augmented rows to the training fold only: an augmented row
            # belongs to an original row when its label starts with that
            # row's label. Collect all matches first and concatenate once
            # (DataFrame.append was removed in pandas 2.0 and re-copies the
            # frame on every call).
            matched = [
                label
                for original in train_data.index
                for label in aug_labels
                if label[0:len(original)] == original
            ]
            if matched:
                train_data = pd.concat([train_data, aug_features.loc[matched]])
                train_target = pd.concat([train_target, aug_targets.loc[matched]])

            print("After augmentation length of train and test: ", len(train_data), len(val_data))
            print("After augmentation total users in this fold: ", (len(train_data) + len(val_data)))
            print("\n")

            clf = Pipeline([
                ('variance_threshold', VarianceThreshold()),
                ('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(f_classif, k=n_feat)),
                ('classification', LogisticRegression(class_weight='balanced', n_jobs=-1, max_iter=1000)),
            ])

            # inner loop: hyper-parameter search on the (augmented) training fold
            cv_inner = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
            params = {'classification__solver': ['lbfgs', 'liblinear', 'saga'],
                      'classification__C': [10**exponent for exponent in range(-3, 5)]}
            gd_search = GridSearchCV(clf, params, scoring='f1_macro', n_jobs=-1, cv=cv_inner).fit(train_data, train_target)

            # With the default refit=True the best estimator is already fitted
            # on the whole training fold, so no second fit is needed.
            best_model = gd_search.best_estimator_
            if not dim:
                names = get_names(train_data, best_model)
                per_fold['predictors'][j - 1].append(list(names))

            y_pred = best_model.predict(val_data)
            y_score = best_model.predict_proba(val_data)

            # ROC curve of this fold, interpolated onto the common grid
            fpr, tpr, _ = roc_curve(val_target, y_score[:, 1])
            plt.plot(fpr, tpr, 'b', alpha=0.15)
            tpr = np.interp(mean_fpr, fpr, tpr)
            tpr[0] = 0.0
            tpr_lst.append(tpr)

            # confusion matrix
            cm_traits.append(confusion_matrix(val_target, y_pred))

            # fold metrics
            f1_macro = f1_score(val_target, y_pred, average='macro')
            f1macro_lst.append(f1_macro)
            per_fold['f1score'][j - 1].append(f1_macro)

            acc = accuracy_score(val_target, y_pred)
            accuracy_lst.append(acc)
            per_fold['acc'][j - 1].append(acc)

            auc = roc_auc_score(val_target, y_score[:, 1])
            auc_lst.append(auc)
            per_fold['auc'][j - 1].append(auc)

            per_fold['params'][j - 1].append(gd_search.best_params_)

        # Average the outer-fold results
        auc_avg = np.mean(auc_lst)
        auc_std = np.std(auc_lst)
        auc_traits.append(round(auc_avg, 4))
        print("Average auc score (std): ", auc_avg, auc_std)
        acc_avg = np.mean(accuracy_lst)
        acc_traits.append(round(acc_avg, 4))
        print("Average accuracy: ", acc_avg)
        f1macro_avg = np.mean(f1macro_lst)
        f1_traits.append(round(f1macro_avg, 4))
        print("\n\nAverage f1 macro score: ", f1macro_avg)

        tprs = np.array(tpr_lst)
        mean_tprs = tprs.mean(axis=0)
        std = tprs.std(axis=0)
        tprs_upper = np.minimum(mean_tprs + std, 1)
        tprs_lower = mean_tprs - std

        # save auc and tprs for roc curve plots with comparison
        np.save(Path(p, outputname + "_" + traits + '_auc.npy'), auc_avg)
        np.save(Path(p, outputname + "_" + traits + '_meantprs.npy'), mean_tprs)

        plt.plot(mean_fpr, mean_tprs, 'b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (auc_avg, auc_std))
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)

        title1 = "ROC plot for trait " + traits
        plt.plot([0, 1], [0, 1], 'r--', label='Chance')
        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        # gca() returns the axes holding the curves; plt.axes() would add a new one.
        plt.gca().set_aspect('equal', 'datalim')
        plt.legend(loc="lower right")
        plt.title(title1)
        plt.savefig(Path(p, outputname + "_" + traits + '_roc_plot.png'))
        plt.show()

        print("Number of users in testing: ", np.sum(cm_traits))
        title2 = "Confusion matrix for trait " + traits
        # Confusion matrices are summed over the outer folds (integer counts).
        total_cm = np.sum(cm_traits, axis=0, dtype='int')
        # Draw into an explicitly created axes so no empty extra figure appears.
        _, cm_ax = plt.subplots(figsize=(7, 7))
        disp = ConfusionMatrixDisplay(confusion_matrix=total_cm).plot(cmap=plt.cm.Blues, values_format='d', ax=cm_ax)
        disp.ax_.set_title(title2)
        plt.savefig(Path(p, outputname + "_" + traits + '_cm.png'))
        plt.show()

    print("Total accuracy: ", np.mean(acc_traits), "Total F1 macro: ", np.mean(f1_traits))
    output.update({'acc': acc_traits, 'f1_macro': f1_traits, 'auc': auc_traits})
    exported = ['acc', 'f1score', 'auc', 'params']
    if not dim:
        exported.append('predictors')
    for key in exported:
        for fold_no, values in enumerate(per_fold[key], start=1):
            output['%s_fold%d' % (key, fold_no)] = values
    outputdf = pd.DataFrame(output)
    outputdf.to_csv(Path(p, outputname + "_" + traits + '.csv'), index=False)

    print("Time for entire process: %0.2fs" % (time() - tstart))


## Augmented dataset

In [None]:
# Nested CV for openness with training-fold augmentation.
# NOTE(review): option='mlp' only labels messages and output files; classify_aug
# always fits LogisticRegression - confirm the label is intended.
classify_aug(ori_augdf, 'openness', 'mlp', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Nested CV for conscientiousness with training-fold augmentation (30 ANOVA-selected features).
classify_aug(ori_augdf, 'conscientiousness', 'mlp', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Nested CV for extraversion with training-fold augmentation (30 ANOVA-selected features).
classify_aug(ori_augdf, 'extraversion', 'mlp', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Nested CV for agreeableness with training-fold augmentation (30 ANOVA-selected features).
classify_aug(ori_augdf, 'agreeableness', 'mlp', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Nested CV for neuroticism with training-fold augmentation (30 ANOVA-selected features).
classify_aug(ori_augdf, 'neuroticism', 'mlp', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

## Classify training data to detect over- and underfitting

In [None]:
def classify_train(df, traits, option, fs, dim, n_feat, replication=False, new_augdf=None):
    """Run the augmented nested CV but score each model on its own training fold.

    The procedure mirrors the validation-fold classifier: 5-fold stratified
    outer loop, augmentation of the training fold with rows of ``new_augdf``
    whose index label starts with a training row's label, and a 10-fold
    grid search over a logistic-regression pipeline. All metrics, the ROC
    curve and the confusion matrix are computed on the (augmented) training
    data, so the results can be compared with validation scores to detect
    over- and underfitting.

    Side effects: prints progress, shows a ROC plot and a confusion matrix,
    and writes ``*_auc.npy`` and ``*_meantprs.npy`` into the hard-coded
    results directory. Figures and the csv summary are not written to disk.

    Args:
        df: Frame to cross-validate; needs a ``name`` attribute (used in the
            output file names) and the column layout expected by ``trait``.
        traits: Name of the single trait to predict (see ``trait``).
        option: Label used in messages and file names only; the estimator is
            always ``LogisticRegression``.
        fs: Label used in file names only; selection is always
            ``SelectKBest(f_classif)``.
        dim: Used in the file name; when falsy the selected predictor names of
            each fold are recorded. No PCA step is applied in this function.
        n_feat: Number of features kept by ``SelectKBest``.
        replication: Unused; kept for interface compatibility.
        new_augdf: Frame with the augmented rows (same column layout as ``df``).

    Returns:
        One-row DataFrame with the averaged and per-fold training metrics.

    Raises:
        ValueError: If ``new_augdf`` is None.
    """
    if new_augdf is None:
        raise ValueError("new_augdf is required: it supplies the augmented training rows")

    n_outer_splits = 5
    fold_keys = ('acc', 'f1score', 'auc', 'params', 'predictors')

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        tstart = time()
        print("Current time: ", str(datetime.datetime.now()))
        print("Classifier: ", option, "\n")
        outputname = "b5_" + df.name + "_" + str(option) + "_" + str(fs) + "_PCA" + str(dim) + "_" + str(n_feat)
        output = {'Traits': traits}

        # Aggregated results plus one list per outer fold for the summary frame.
        acc_traits, f1_traits, cm_traits, auc_traits = [], [], [], []
        per_fold = {key: [[] for _ in range(n_outer_splits)] for key in fold_keys}

        print("\nTrait to predict: ", traits, "(", option, ")\n")
        x, y = trait(df, traits)

        # The augmented features/labels do not depend on the fold: split once.
        aug_features, aug_targets = trait(new_augdf, traits)
        aug_labels = list(new_augdf.index)

        # outer loop
        cv_outer = StratifiedKFold(n_splits=n_outer_splits, shuffle=True, random_state=0)

        f1macro_lst, accuracy_lst, tpr_lst, auc_lst = [], [], [], []

        # common FPR grid so the per-fold ROC curves can be averaged
        mean_fpr = np.linspace(0, 1, 101)
        plt.figure(figsize=(7, 7))
        p = Path('/home/sophia/ma_py/Big5-NLP/results/')

        # begin nested cv (j is the 1-based fold number)
        for j, (train_idx, val_idx) in enumerate(cv_outer.split(x, y), start=1):
            train_data, val_data = x.iloc[train_idx], x.iloc[val_idx]
            train_target = y.iloc[train_idx]

            print("Fold No.", j)

            # Add augmented rows to the training fold: an augmented row belongs
            # to an original row when its label starts with that row's label.
            # Collect all matches first and concatenate once (DataFrame.append
            # was removed in pandas 2.0 and re-copies the frame on every call).
            matched = [
                label
                for original in train_data.index
                for label in aug_labels
                if label[0:len(original)] == original
            ]
            if matched:
                train_data = pd.concat([train_data, aug_features.loc[matched]])
                train_target = pd.concat([train_target, aug_targets.loc[matched]])

            print("After augmentation length of train and test: ", len(train_data), len(val_data))
            print("After augmentation total users in this fold: ", (len(train_data) + len(val_data)))
            print("\n")

            clf = Pipeline([
                ('variance_threshold', VarianceThreshold()),
                ('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(f_classif, k=n_feat)),
                ('classification', LogisticRegression(class_weight='balanced', n_jobs=-1, max_iter=1000)),
            ])

            # inner loop: hyper-parameter search on the (augmented) training fold
            cv_inner = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
            params = {'classification__solver': ['lbfgs', 'liblinear', 'saga'],
                      'classification__C': [10**exponent for exponent in range(-3, 5)]}
            gd_search = GridSearchCV(clf, params, scoring='f1_macro', n_jobs=-1, cv=cv_inner).fit(train_data, train_target)

            # With the default refit=True the best estimator is already fitted
            # on the whole training fold, so no second fit is needed.
            best_model = gd_search.best_estimator_
            if not dim:
                names = get_names(train_data, best_model)
                per_fold['predictors'][j - 1].append(list(names))

            # Score the model on the data it was trained on.
            y_pred = best_model.predict(train_data)
            y_score = best_model.predict_proba(train_data)

            # ROC curve of this fold, interpolated onto the common grid
            fpr, tpr, _ = roc_curve(train_target, y_score[:, 1])
            plt.plot(fpr, tpr, 'b', alpha=0.15)
            tpr = np.interp(mean_fpr, fpr, tpr)
            tpr[0] = 0.0
            tpr_lst.append(tpr)

            # confusion matrix
            cm_traits.append(confusion_matrix(train_target, y_pred))

            # fold metrics
            f1_macro = f1_score(train_target, y_pred, average='macro')
            f1macro_lst.append(f1_macro)
            per_fold['f1score'][j - 1].append(f1_macro)

            acc = accuracy_score(train_target, y_pred)
            accuracy_lst.append(acc)
            per_fold['acc'][j - 1].append(acc)

            auc = roc_auc_score(train_target, y_score[:, 1])
            auc_lst.append(auc)
            per_fold['auc'][j - 1].append(auc)

            per_fold['params'][j - 1].append(gd_search.best_params_)

        # Average the outer-fold results
        auc_avg = np.mean(auc_lst)
        auc_std = np.std(auc_lst)
        auc_traits.append(round(auc_avg, 4))
        print("Average auc score (std): ", auc_avg, auc_std)
        acc_avg = np.mean(accuracy_lst)
        acc_traits.append(round(acc_avg, 4))
        print("Average accuracy: ", acc_avg)
        f1macro_avg = np.mean(f1macro_lst)
        f1_traits.append(round(f1macro_avg, 4))
        print("\n\nAverage f1 macro score: ", f1macro_avg)

        tprs = np.array(tpr_lst)
        mean_tprs = tprs.mean(axis=0)
        std = tprs.std(axis=0)
        tprs_upper = np.minimum(mean_tprs + std, 1)
        tprs_lower = mean_tprs - std

        # save auc and tprs for roc plot with comparison
        # (the auc file name previously used the undefined name `trait_name`)
        np.save(Path(p, outputname + "_" + traits + '_auc.npy'), auc_avg)
        np.save(Path(p, outputname + "_" + traits + '_meantprs.npy'), mean_tprs)

        plt.plot(mean_fpr, mean_tprs, 'b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (auc_avg, auc_std))
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)

        title1 = "ROC plot for trait " + traits
        plt.plot([0, 1], [0, 1], 'r--', label='Chance')
        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        # gca() returns the axes holding the curves; plt.axes() would add a new one.
        plt.gca().set_aspect('equal', 'datalim')
        plt.legend(loc="lower right")
        plt.title(title1)
        plt.show()

        print("Number of users in testing: ", np.sum(cm_traits))
        title2 = "Confusion matrix for trait " + traits
        # Confusion matrices are summed over the outer folds (integer counts).
        total_cm = np.sum(cm_traits, axis=0, dtype='int')
        # Draw into an explicitly created axes so no empty extra figure appears.
        _, cm_ax = plt.subplots(figsize=(7, 7))
        disp = ConfusionMatrixDisplay(confusion_matrix=total_cm).plot(cmap=plt.cm.Blues, values_format='d', ax=cm_ax)
        disp.ax_.set_title(title2)
        plt.show()

    print("Total accuracy: ", np.mean(acc_traits), "Total F1 macro: ", np.mean(f1_traits))
    output.update({'acc': acc_traits, 'f1_macro': f1_traits, 'auc': auc_traits})
    exported = ['acc', 'f1score', 'auc', 'params']
    if not dim:
        exported.append('predictors')
    for key in exported:
        for fold_no, values in enumerate(per_fold[key], start=1):
            output['%s_fold%d' % (key, fold_no)] = values
    outputdf = pd.DataFrame(output)

    # Report the runtime before returning (this line used to be unreachable).
    print("Time for entire process: %0.2fs" % (time() - tstart))
    return outputdf


In [None]:
# Training-fold scores for openness; compare with the validation results to spot over-/underfitting.
classify_train(ori_augdf, 'openness', 'log', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Training-fold scores for conscientiousness (30 ANOVA-selected features).
classify_train(ori_augdf, 'conscientiousness', 'log', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Training-fold scores for extraversion (30 ANOVA-selected features).
classify_train(ori_augdf, 'extraversion', 'log', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Training-fold scores for agreeableness (30 ANOVA-selected features).
classify_train(ori_augdf, 'agreeableness', 'log', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

In [None]:
# Training-fold scores for neuroticism (30 ANOVA-selected features).
classify_train(ori_augdf, 'neuroticism', 'log', 'anova', dim=False, n_feat=30, new_augdf=new_augdf)

## Regression on augmented data

In [None]:
def split(df, trait_name):
    """Separate the feature columns of ``df`` from one trait column.

    The feature set is every column outside the level-0 groups 'trait',
    'text' and 'data'; the latter two are skipped silently when absent.

    Args:
        df: DataFrame with two-level columns including a 'trait' group.
        trait_name: Level-1 label of the target column inside 'trait'.

    Returns:
        Tuple ``(x, y)`` of the feature frame and the target column.
    """
    candidates = df.drop(['trait'], axis=1, level=0)
    for unwanted in ('text', 'data'):
        try:
            candidates.drop([unwanted], axis=1, level=0, inplace=True)
        except KeyError:
            # this group is not part of the frame; nothing to remove
            continue

    x = df[candidates.columns.tolist()]
    y = df['trait', trait_name]
    return x, y

def get_classifier(classifier):
    """Create the regressor registered under the key ``classifier``.

    Supported keys: 'linear' (LinearRegression), 'rfc_reg'
    (RandomForestRegressor) and 'boost_reg' (GradientBoostingRegressor).

    Raises:
        ValueError: For any other key (previously None was returned silently,
            which only failed later inside the pipeline).
    """
    if classifier == 'linear':
        return LinearRegression(n_jobs=-1)
    if classifier == 'rfc_reg':
        # Imported here because the notebook's import cell lacks the ensemble models.
        from sklearn.ensemble import RandomForestRegressor
        return RandomForestRegressor(n_jobs=-1)
    if classifier == 'boost_reg':
        from sklearn.ensemble import GradientBoostingRegressor
        return GradientBoostingRegressor(random_state=0)
    raise ValueError(
        "Unknown regressor %r; expected 'linear', 'rfc_reg' or 'boost_reg'" % (classifier,)
    )

    
def get_featureselection(fs, classifier, n_feat):
    """Create the feature-selection step registered under the key ``fs``.

    Args:
        fs: 'anova', 'mutual', 'sequential_forward' or 'sequential_backward'.
        classifier: Regressor key passed to ``get_classifier``; only used by
            the sequential selectors.
        n_feat: Number of features to keep.

    Raises:
        ValueError: For any other ``fs`` (previously None was returned silently).
    """
    # NOTE(review): both univariate scorers are the classification variants
    # although this helper feeds the regression pipeline - confirm whether
    # f_regression / mutual_info_regression were intended.
    if fs == 'anova':
        return SelectKBest(f_classif, k=n_feat)
    if fs == 'mutual':
        # Imported here because the notebook's import cell lacks this scorer.
        from sklearn.feature_selection import mutual_info_classif
        return SelectKBest(mutual_info_classif, k=n_feat)
    if fs in ('sequential_forward', 'sequential_backward'):
        from sklearn.feature_selection import SequentialFeatureSelector
        direction = 'forward' if fs == 'sequential_forward' else 'backward'
        return SequentialFeatureSelector(get_classifier(classifier), n_features_to_select=n_feat, direction=direction, n_jobs=-1)
    raise ValueError(
        "Unknown feature selection %r; expected 'anova', 'mutual', "
        "'sequential_forward' or 'sequential_backward'" % (fs,)
    )


    
def create_pipeline_cv(classifier, fs, dim, n_feat):
    """Assemble the regression pipeline.

    Steps, in order: variance threshold, standard scaling, an optional PCA to
    100 components (only when ``dim`` is truthy), feature selection and the
    final estimator (registered under the step name 'classification').

    Args:
        classifier: Regressor key for ``get_classifier``.
        fs: Feature-selection key for ``get_featureselection``.
        dim: Whether to insert the PCA step.
        n_feat: Number of features the selection step keeps.

    Returns:
        The unfitted ``Pipeline``.
    """
    steps = [
        ('variance_threshold', VarianceThreshold()),
        ('scaler', StandardScaler()),
    ]
    if dim:
        # Imported here because the notebook's import cell does not provide PCA.
        from sklearn.decomposition import PCA
        steps.append(('pca', PCA(n_components=100)))
    steps.append(('feature_selection', get_featureselection(fs, classifier, n_feat)))
    steps.append(('classification', get_classifier(classifier)))
    return Pipeline(steps)


def define_outputname(traits, df, option, fs, dim, n_feat, train=False):
    """Build the base name for result files.

    The name is ``<option>_<fs>_PCA<dim>_<n_feat>``; with ``train=True`` it is
    prefixed with ``train_<df.name>_``.

    Args:
        traits: Unused (the name never depended on the number of traits; the
            former single/multiple-trait branches were identical). Kept for
            interface compatibility.
        df: Object whose ``name`` attribute is used in the training prefix.
        option: Estimator key.
        fs: Feature-selection key.
        dim: PCA flag.
        n_feat: Number of selected features.
        train: Whether the results are training-set scores.

    Returns:
        The output base name as a string.
    """
    base = "_".join([str(option), str(fs), "PCA" + str(dim), str(n_feat)])
    if train:
        return "train_" + df.name + "_" + base
    return base


def linear_plot(val_target, y_pred, outputname, trait_name):
    """Draw a seaborn regression plot of ``y_pred`` (y axis) against ``val_target`` (x axis).

    Points are blue, the fitted line is red and no confidence band is drawn.
    ``outputname`` and ``trait_name`` are currently unused; the figure is not
    saved here.

    Returns:
        Whatever ``sns.regplot`` returns for the drawn plot.
    """
    point_style = {"color": "blue"}
    line_style = {"color": "red"}
    drawn = sns.regplot(x=val_target, y=y_pred, ci=None, scatter_kws=point_style, line_kws=line_style)
    return drawn

def save_coefficients(coefficients, coef_fold1, coef_fold2, coef_fold3, coef_fold4, coef_fold5, j):
    """Append ``list(coefficients)`` to the coefficient list of fold ``j`` (1-based).

    Fold numbers outside 1..5 are ignored. Returns the five lists in order.
    """
    per_fold = (coef_fold1, coef_fold2, coef_fold3, coef_fold4, coef_fold5)
    for number, target in enumerate(per_fold, start=1):
        if j == number:
            target.append(list(coefficients))
            break
    return per_fold

In [None]:
def regress(df, traits, clf_lst, fs, dim, n_feat,newaugdf, train=False):
    """Cross-validated regression of trait scores with training-fold augmentation.

    For every regressor key in ``clf_lst`` and every trait in ``traits`` the
    rows of ``df`` are split with a shuffled 5-fold ``KFold``. Each training
    fold is extended with the rows of ``newaugdf`` whose index label starts
    with the label of a training row; validation rows are never augmented.
    The pipeline from ``create_pipeline_cv`` is fitted per fold and scored
    (R squared via ``score`` and MSE) on the validation fold, or on the
    training fold when ``train`` is true.

    Side effects: prints progress, shows one regression plot per trait, saves
    it as ``*_linearplot.png`` and writes one csv summary per regressor into
    the hard-coded results directory.

    Args:
        df: Frame to cross-validate (column layout as expected by ``split``;
            ``df.name`` is read when ``train`` is true).
        traits: Iterable of trait column names.
        clf_lst: Iterable of regressor keys (see ``get_classifier``).
        fs: Feature-selection key (see ``get_featureselection``).
        dim: Whether the pipeline contains a PCA step; when falsy the selected
            predictor names are recorded.
        n_feat: Number of features to select.
        newaugdf: Frame with the augmented rows. It is now actually used; the
            body previously read the global ``new_augdf`` instead.
        train: Score on the training folds instead of the validation folds;
            also forwarded to ``define_outputname`` so training results get
            their own file names.
    """
    n_outer_splits = 5
    # Defined up front so the csv export works even when ``traits`` is empty.
    p = Path('/home/sophia/ma_py/Big5-NLP/results/')
    aug_labels = list(newaugdf.index)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        print("Current time: ", str(datetime.datetime.now()))
        tstart = time()
        for option in clf_lst:
            print("Classifier: ", option, "\n")
            outputname = define_outputname(traits, df, option, fs, dim, n_feat, train=train)
            output = {'Traits': traits}

            # One list per outer fold; each receives one entry per trait.
            predictors_folds = [[] for _ in range(n_outer_splits)]
            coef_folds = [[] for _ in range(n_outer_splits)]
            rsquared_traits, mse_traits = [], []

            for trait_name in traits:
                print("\nTrait to predict: ", trait_name, "(", option, ")\n")
                x, y = split(df, trait_name)
                # The augmented features/targets do not depend on the fold.
                aug_features, aug_targets = split(newaugdf, trait_name)
                cv_outer = KFold(n_splits=n_outer_splits, shuffle=True, random_state=0)

                rsquared_lst, mse_lst, ytrue_lst, ypred_lst = [], [], [], []

                plt.figure(figsize=(7, 7))
                for j, (train_idx, val_idx) in enumerate(cv_outer.split(x), start=1):
                    train_data, val_data = x.iloc[train_idx], x.iloc[val_idx]
                    train_target, val_target = y.iloc[train_idx], y.iloc[val_idx]

                    # Add augmented rows to the training fold only: an
                    # augmented row belongs to an original row when its label
                    # starts with that row's label. Concatenate once
                    # (DataFrame.append was removed in pandas 2.0).
                    matched = [
                        label
                        for original in train_data.index
                        for label in aug_labels
                        if label[0:len(original)] == original
                    ]
                    if matched:
                        train_data = pd.concat([train_data, aug_features.loc[matched]])
                        train_target = pd.concat([train_target, aug_targets.loc[matched]])

                    print("Fold No. ", j)
                    print("Length of train and test: ", len(train_data), len(val_data))
                    print("Total users in this fold: ", (len(train_data) + len(val_data)))

                    # create and fit the pipeline
                    model = create_pipeline_cv(option, fs, dim, n_feat).fit(train_data, train_target)
                    if not dim:
                        names = get_names(train_data, model)
                        predictors_folds[j - 1].append(list(names))

                    if train:
                        eval_data, eval_target = train_data, train_target
                    else:
                        eval_data, eval_target = val_data, val_target
                    y_pred = model.predict(eval_data)
                    score = model.score(eval_data, eval_target)
                    mse = mean_squared_error(eval_target, y_pred)
                    ytrue_lst.append(eval_target)
                    ypred_lst.append(y_pred)

                    if option == 'linear':
                        coefficients = model.named_steps['classification'].coef_
                        coef_folds[j - 1].append(list(coefficients))
                    rsquared_lst.append(score)
                    mse_lst.append(mse)

                # Average results
                r_avg = np.mean(rsquared_lst)
                rsquared_traits.append(round(r_avg, 4))
                mse_avg = np.mean(mse_lst)
                mse_traits.append(round(mse_avg, 4))
                print("Average score (R squared): ", r_avg, "\nAverage MSE: ", mse_avg)

                all_ytrue = np.concatenate(ytrue_lst)
                all_ypred = np.concatenate(ypred_lst)
                print(len(all_ytrue))
                print(len(all_ypred))
                title = 'Regression plot for trait ' + trait_name
                # linear_plot draws the true scores on x and the predictions on y;
                # the axis labels below follow that (they used to be swapped).
                linear_plot(all_ytrue, all_ypred, outputname, trait_name)
                plt.xlim([1, 101])
                plt.ylim([1, 101])
                plt.xlabel('True scores')
                plt.ylabel('Predicted scores')
                # gca() returns the axes holding the plot; plt.axes() would add a new one.
                plt.gca().set_aspect('equal', 'datalim')
                plt.title(title)
                plt.savefig(Path(p, outputname + "_" + trait_name + '_linearplot.png'))
                plt.show()

            print("Total r squared: ", np.mean(rsquared_traits), "Total MSE: ", np.mean(mse_traits))
            output.update({'rsquared': rsquared_traits, 'MSE': mse_traits})
            if not dim:
                for fold_no, values in enumerate(predictors_folds, start=1):
                    output['predictors_fold%d' % fold_no] = values
            if option == 'linear':
                for fold_no, values in enumerate(coef_folds, start=1):
                    output['coef_fold%d' % fold_no] = values
            outputdf = pd.DataFrame(output)
            outputdf.to_csv(Path(p, outputname + '.csv'), index=False)
            print("Time for entire process: %0.2fs" % (time() - tstart))

            

# Trait column names (level-1 labels under the 'trait' group) passed to regress().
big5_traits = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
# Regressor keys. NOTE(review): get_classifier has no branch for 'mlp_reg',
# and this list is not referenced by the cells shown here.
regres = ['linear', 'rfc_reg', 'boost_reg', 'mlp_reg']

In [None]:
# Linear regression on all five traits, 10 ANOVA-selected features, scored on the validation folds.
regress(ori_augdf, big5_traits, ['linear'], 'anova', dim=False, n_feat=10, newaugdf=new_augdf)