# Personality Prediction Framework (v2.0)

Developed by: R.T.R Jayasekara

## Required Libraries

In [1]:
import pandas
import numpy
import os
import pickle

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [2]:
from tune_sklearn import TuneGridSearchCV

In [3]:
# import nltk
# nltk.download('vader_lexicon')

## Preprocessor

To work with text data, the raw text must first be transformed into a form that machine-learning algorithms can understand and use; this step is called preprocessing.

In [4]:
if __name__ == "__main__":
    status_data = pandas.read_csv("../datasets/mypersonality_final.csv")

    # Column positions at which the four sentiment score columns are inserted
    NEG_INDEX = 2
    POS_INDEX = 3
    NEU_INDEX = 4
    COMP_INDEX = 5

    # Annotate the status with sentiment scores
    # From nltk.sentiment.vader corpus
    # The annotated frame is cached on disk: scoring only runs when the
    # cleaned CSV does not exist yet, otherwise the cached file is loaded.
    if not os.path.isfile("../datasets/mypersonality_cleaned.csv"):
        sid = SentimentIntensityAnalyzer()

        # One score dict per status, with the keys
        #   pos: positive
        #   neg: negative
        #   neu: neutral
        #   compound: aggregated score for the sentence
        scores = [sid.polarity_scores(text) for text in status_data["STATUS"]]

        # Insert the finished columns directly. This replaces creating
        # integer-zero columns and then writing floats into them one cell at
        # a time, which is slow and changes the column dtype on assignment.
        status_data.insert(NEG_INDEX, "sentiNEG", [s["neg"] for s in scores])
        status_data.insert(POS_INDEX, "sentiPOS", [s["pos"] for s in scores])
        status_data.insert(NEU_INDEX, "sentiNEU", [s["neu"] for s in scores])
        status_data.insert(COMP_INDEX, "sentiCOMPOUND",
                           [s["compound"] for s in scores])

        # NOTE(review): to_csv also writes the index, so the reloaded file has
        # one more leading column than the frame built in this branch. Code
        # that renames the first column to "rowID" presumably expects the
        # reloaded layout - confirm before relying on a first run.
        status_data.to_csv("../datasets/mypersonality_cleaned.csv")
    else:
        status_data = pandas.read_csv("../datasets/mypersonality_cleaned.csv")

In [5]:
# Remove every row that has a missing value in any column
status_data = status_data.dropna()

# Columns that are not needed from here on:
#   - the status text, the author id and the date
#   - the columns which give us a score for personality type (s*)
#   - the network columns (brokerage, betweenness, density, transitivity,
#     network size), both plain and N-prefixed
unused_columns = [
    'STATUS', '#AUTHID', 'sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN', 'DATE',
    'BROKERAGE', 'BETWEENNESS', 'NBROKERAGE', 'NBETWEENNESS',
    'DENSITY', 'TRANSITIVITY', 'NETWORKSIZE',
]
status_data = status_data.drop(columns=unused_columns)

In [6]:
# Show the first ten rows of status_data
status_data.head(10)

Unnamed: 0.1,Unnamed: 0,sentiNEG,sentiPOS,sentiNEU,sentiCOMPOUND,cEXT,cNEU,cAGR,cCON,cOPN
0,0,0.0,0.412,0.588,0.4215,n,y,n,n,y
1,1,0.167,0.0,0.833,-0.3412,n,y,n,n,y
2,2,0.195,0.278,0.527,0.628,n,y,n,n,y
3,3,0.0,0.259,0.741,0.4215,n,y,n,n,y
4,4,0.0,0.592,0.408,0.4404,n,y,n,n,y
5,5,0.0,0.0,1.0,0.0,n,y,n,n,y
6,6,0.0,0.515,0.485,0.8916,n,y,n,n,y
7,7,0.0,0.0,1.0,0.0,n,y,n,n,y
8,8,0.188,0.053,0.759,-0.6249,n,y,n,n,y
9,9,0.0,0.323,0.677,0.7351,n,y,n,n,y


In [7]:
# Change the name of the first column to "rowID".
# A fresh list of labels is assigned instead of writing into
# `columns.values`, which would modify the existing Index's array in place.
new_columns = ["rowID", *status_data.columns[1:]]
status_data.columns = new_columns

# Put the columns to be predicted at the end.
# Columns are picked by name, so no column is dropped or misplaced
# regardless of how many columns the frame has.
trait_columns = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
cols = [c for c in status_data.columns if c not in trait_columns]
cols += [c for c in status_data.columns if c in trait_columns]
status_data = status_data[cols]

In [8]:
# Show the first ten rows of status_data
status_data.head(10)

Unnamed: 0,rowID,sentiNEG,sentiPOS,sentiNEU,sentiCOMPOUND,cEXT,cNEU,cAGR,cCON,cOPN
0,0,0.0,0.412,0.588,0.4215,n,y,n,n,y
1,1,0.167,0.0,0.833,-0.3412,n,y,n,n,y
2,2,0.195,0.278,0.527,0.628,n,y,n,n,y
3,3,0.0,0.259,0.741,0.4215,n,y,n,n,y
4,4,0.0,0.592,0.408,0.4404,n,y,n,n,y
5,5,0.0,0.0,1.0,0.0,n,y,n,n,y
6,6,0.0,0.515,0.485,0.8916,n,y,n,n,y
7,7,0.0,0.0,1.0,0.0,n,y,n,n,y
8,8,0.188,0.053,0.759,-0.6249,n,y,n,n,y
9,9,0.0,0.323,0.677,0.7351,n,y,n,n,y


In [9]:
# Encode the yes/no trait labels as integers: 'y' becomes 1, 'n' becomes 0
features = ['cEXT', 'cNEU', 'cOPN', 'cAGR', 'cCON']
label_to_number = {'y': 1.0, 'n': 0.0}
for trait_column in features:
    encoded = status_data[trait_column].map(label_to_number)
    status_data[trait_column] = encoded.astype(int)

# Machine Learning

In [10]:
# Split into training and test data: 50% each (test_size=0.50)
train_data, test_data = train_test_split(status_data, test_size=0.50)

# Layout shared by every per-trait training matrix:
#   column 0     rowID
#   columns 1-4  the four sentiment scores (model inputs)
#   column 5     the trait label
input_columns = ['rowID', 'sentiNEG', 'sentiPOS', 'sentiNEU', 'sentiCOMPOUND']

train_OPN = train_data[input_columns + ['cOPN']].values
train_CON = train_data[input_columns + ['cCON']].values
train_EXT = train_data[input_columns + ['cEXT']].values
train_AGR = train_data[input_columns + ['cAGR']].values
train_NEU = train_data[input_columns + ['cNEU']].values

# Put the input columns first by name, so that test[:, 1:5] always holds the
# same four columns as train_*[:, 1:5], whatever the column order of
# test_data is. The remaining columns follow in their existing order.
other_columns = [c for c in test_data.columns if c not in input_columns]
test = test_data[input_columns + other_columns].values

In [15]:
# Show the cCON column of the test split
test_data.cCON

2205    0
3390    0
2674    0
3305    1
3242    1
       ..
7324    0
3454    0
8652    0
7034    0
8856    0
Name: cCON, Length: 4958, dtype: int32

In [11]:
# Calculate evaluation metrics
def summarize_metrics(tp, tn, fp, fn):
    """Print and return precision, recall, accuracy and F1 score.

    Args:
        tp: number of true positives.
        tn: number of true negatives.
        fp: number of false positives.
        fn: number of false negatives.

    Returns:
        A dict with the keys "precision", "recall", "accuracy" and
        "f1_score". A metric whose denominator is zero is reported as 0.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn
    total = tp + tn + fp + fn

    # Each ratio falls back to 0 instead of dividing by zero
    precision = (tp / predicted_positive) if predicted_positive else 0
    recall = (tp / actual_positive) if actual_positive else 0
    accuracy = ((tp + tn) / total) if total else 0
    if (recall + precision) == 0:
        f1_score = 0
    else:
        f1_score = (2 * recall * precision) / (recall + precision)

    print("Precision:", precision)
    print("Recall:", recall)
    print("Accuracy:", accuracy)
    print("F1 score:", f1_score)

    return {
        "precision": precision,
        "recall": recall,
        "accuracy": accuracy,
        "f1_score": f1_score,
    }

In [12]:
# Build a confusion matrix
def eval_model(rowID, result_df, trait, truth_df=None):
    """Count confusion-matrix cells for one trait, then print the metrics.

    Args:
        rowID: iterable of row identifiers to evaluate.
        result_df: frame of predictions with a 'rowID' column and a column
            named `trait`.
        trait: name of the label column to evaluate, e.g. 'cOPN'.
        truth_df: frame of true labels with the same two columns. Defaults
            to the module-level `test_data`.

    Returns:
        The tuple (tp_count, tn_count, fp_count, fn_count).

    Raises:
        ValueError: if `trait` is not a column of both frames. Previously an
            unrecognised trait was silently evaluated as 'cOPN'.
    """
    if truth_df is None:
        truth_df = test_data

    if trait not in truth_df.columns or trait not in result_df.columns:
        raise ValueError("unknown trait column: %r" % (trait,))

    def positive_by_row(frame):
        # Map each rowID to True when all of its `trait` values are non-zero.
        # Grouping once replaces a full-frame scan for every evaluated row.
        is_positive = frame[trait].astype(int).ne(0)
        return is_positive.groupby(frame['rowID']).all().to_dict()

    actual_by_row = positive_by_row(truth_df)
    predicted_by_row = positive_by_row(result_df)

    tp_count = 0  # true-positive
    tn_count = 0  # true-negative
    fp_count = 0  # false-positive
    fn_count = 0  # false-negative

    for row in rowID:
        # A rowID missing from a frame counts as positive, which is what
        # `.all()` on an empty selection yields.
        actual = actual_by_row.get(row, True)
        predicted = predicted_by_row.get(row, True)

        if actual and predicted:
            tp_count += 1
        elif actual:
            fn_count += 1
        elif predicted:
            fp_count += 1
        else:
            tn_count += 1

    print(tp_count, tn_count, fp_count, fn_count)
    summarize_metrics(tp_count, tn_count, fp_count, fn_count)

    return tp_count, tn_count, fp_count, fn_count

In [13]:
def eval_pipeline(rowID, result_df):
    """Evaluate the predictions in `result_df` for each of the five traits.

    For every trait the trait name is printed and eval_model is called with
    the matching label column. A print('\\n') separates consecutive traits.

    Args:
        rowID: iterable of row identifiers, passed through to eval_model.
        result_df: frame of predictions, passed through to eval_model.
    """
    # (printed label, label column), in the order they are reported
    traits = (
        ('Openness', 'cOPN'),
        ('Conscientiousness', 'cCON'),
        ('Extraversion', 'cEXT'),
        ('Agreeableness', 'cAGR'),
        ('Neuroticism', 'cNEU'),
    )

    for position, (label, column) in enumerate(traits):
        # Separator goes between traits only, not after the last one
        if position:
            print('\n')
        print(label)
        eval_model(rowID, result_df, column)

## Logistic Regression (LR)

In [16]:
# One logistic-regression classifier per personality trait.
# Columns 1-4 of each training matrix are passed as inputs and column 5 as
# the label; fit() returns the fitted estimator, so it can be chained.
model_LR_OPN = LogisticRegression(solver='lbfgs').fit(
    train_OPN[:, 1:5], train_OPN[:, 5])

model_LR_CON = LogisticRegression(solver='lbfgs').fit(
    train_CON[:, 1:5], train_CON[:, 5])

model_LR_EXT = LogisticRegression(solver='lbfgs').fit(
    train_EXT[:, 1:5], train_EXT[:, 5])

model_LR_AGR = LogisticRegression(solver='lbfgs').fit(
    train_AGR[:, 1:5], train_AGR[:, 5])

model_LR_NEU = LogisticRegression(solver='lbfgs').fit(
    train_NEU[:, 1:5], train_NEU[:, 5])

In [17]:
# Predict each trait for the test rows; columns 1-4 of `test` are the inputs
test_inputs_LR = test[:, 1:5]
output_LR_OPN = model_LR_OPN.predict(test_inputs_LR)
output_LR_CON = model_LR_CON.predict(test_inputs_LR)
output_LR_EXT = model_LR_EXT.predict(test_inputs_LR)
output_LR_AGR = model_LR_AGR.predict(test_inputs_LR)
output_LR_NEU = model_LR_NEU.predict(test_inputs_LR)

# rowIDs of the test rows, in the same order as the predictions
rowID_LR = test_data['rowID'].tolist()

# Collect the predictions next to their rowID, one column per trait
result_df_LR = pandas.DataFrame({
    "rowID": rowID_LR,
    "cOPN": list(output_LR_OPN),
    "cCON": list(output_LR_CON),
    "cEXT": list(output_LR_EXT),
    "cAGR": list(output_LR_AGR),
    "cNEU": list(output_LR_NEU),
})

In [21]:
# Show the cOPN predictions of the LR models
result_df_LR.cOPN

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
4953    1.0
4954    1.0
4955    1.0
4956    1.0
4957    1.0
Name: cOPN, Length: 4958, dtype: float64

In [18]:
# Print confusion counts and metrics of the LR predictions for every trait
eval_pipeline(rowID_LR, result_df_LR)

Openness
3691 0 1267 0
Precison: 0.7444534086325131
Recall: 1.0
Accuracy: 0.7444534086325131
F1 score: 0.8535090761937796


Conscientiousness
48 2656 54 2200
Precison: 0.47058823529411764
Recall: 0.021352313167259787
Accuracy: 0.54538120209762
F1 score: 0.04085106382978724


Extraversion
0 2811 0 2147
Precison: 0
Recall: 0.0
Accuracy: 0.5669624848729327
F1 score: 0


Aggreableness
2629 0 2329 0
Precison: 0.5302541347317467
Recall: 1.0
Accuracy: 0.5302541347317467
F1 score: 0.6930275471200738


Neuroticism
0 3107 0 1851
Precison: 0
Recall: 0.0
Accuracy: 0.626663977410246
F1 score: 0


## K-Nearest Neighbors (KNN)

In [16]:
# One k-nearest-neighbours classifier per personality trait, each with
# n_neighbors=250. Columns 1-4 of each training matrix are passed as inputs
# and column 5 as the label.
# NOTE(review): this cell was previously described as using k = square root
# of the number of training examples; 250 does not look like the square root
# of the training-set size - confirm the intended value.
model_KNN_OPN = KNeighborsClassifier(n_neighbors=250).fit(
    train_OPN[:, 1:5], train_OPN[:, 5])

model_KNN_CON = KNeighborsClassifier(n_neighbors=250).fit(
    train_CON[:, 1:5], train_CON[:, 5])

model_KNN_EXT = KNeighborsClassifier(n_neighbors=250).fit(
    train_EXT[:, 1:5], train_EXT[:, 5])

model_KNN_AGR = KNeighborsClassifier(n_neighbors=250).fit(
    train_AGR[:, 1:5], train_AGR[:, 5])

model_KNN_NEU = KNeighborsClassifier(n_neighbors=250).fit(
    train_NEU[:, 1:5], train_NEU[:, 5])

In [17]:
# Predict each trait for the test rows; columns 1-4 of `test` are the inputs
test_inputs_KNN = test[:, 1:5]
output_KNN_OPN = model_KNN_OPN.predict(test_inputs_KNN)
output_KNN_CON = model_KNN_CON.predict(test_inputs_KNN)
output_KNN_EXT = model_KNN_EXT.predict(test_inputs_KNN)
output_KNN_AGR = model_KNN_AGR.predict(test_inputs_KNN)
output_KNN_NEU = model_KNN_NEU.predict(test_inputs_KNN)

# rowIDs of the test rows, in the same order as the predictions
rowID_KNN = test_data['rowID'].tolist()

# Collect the predictions next to their rowID, one column per trait
result_df_KNN = pandas.DataFrame({
    "rowID": rowID_KNN,
    "cOPN": list(output_KNN_OPN),
    "cCON": list(output_KNN_CON),
    "cEXT": list(output_KNN_EXT),
    "cAGR": list(output_KNN_AGR),
    "cNEU": list(output_KNN_NEU),
})

In [18]:
# Print confusion counts and metrics of the KNN predictions for every trait
eval_pipeline(rowID_KNN, result_df_KNN)

Openness
3684 0 1274 0
Precison: 0.7430415490116983
Recall: 1.0
Accuracy: 0.7430415490116983
F1 score: 0.8525804211987966


Conscientiousness
249 2421 280 2008
Precison: 0.4706994328922495
Recall: 0.11032343819229065
Accuracy: 0.5385235982250908
F1 score: 0.17875089734386218


Extraversion
51 2800 71 2036
Precison: 0.4180327868852459
Recall: 0.024436990896023
Accuracy: 0.5750302541347317
F1 score: 0.04617473970122227


Aggreableness
2267 291 2068 332
Precison: 0.522952710495963
Recall: 0.8722585609849942
Accuracy: 0.5159338442920532
F1 score: 0.6538794346697433


Neuroticism
0 3089 0 1869
Precison: 0
Recall: 0.0
Accuracy: 0.6230334812424365
F1 score: 0


## Support Vector Machine (SVM)

In [19]:
# One support-vector classifier per personality trait, created with SVC()
# and no arguments. Columns 1-4 of each training matrix are passed as inputs
# and column 5 as the label.
model_SVM_OPN = SVC().fit(train_OPN[:, 1:5], train_OPN[:, 5])

model_SVM_CON = SVC().fit(train_CON[:, 1:5], train_CON[:, 5])

model_SVM_EXT = SVC().fit(train_EXT[:, 1:5], train_EXT[:, 5])

model_SVM_AGR = SVC().fit(train_AGR[:, 1:5], train_AGR[:, 5])

model_SVM_NEU = SVC().fit(train_NEU[:, 1:5], train_NEU[:, 5])

In [20]:
# Predict each trait for the test rows; columns 1-4 of `test` are the inputs
test_inputs_SVM = test[:, 1:5]
output_SVM_OPN = model_SVM_OPN.predict(test_inputs_SVM)
output_SVM_CON = model_SVM_CON.predict(test_inputs_SVM)
output_SVM_EXT = model_SVM_EXT.predict(test_inputs_SVM)
output_SVM_AGR = model_SVM_AGR.predict(test_inputs_SVM)
output_SVM_NEU = model_SVM_NEU.predict(test_inputs_SVM)

# rowIDs of the test rows, in the same order as the predictions
rowID_SVM = test_data['rowID'].tolist()

# Collect the predictions next to their rowID, one column per trait
result_df_SVM = pandas.DataFrame({
    "rowID": rowID_SVM,
    "cOPN": list(output_SVM_OPN),
    "cCON": list(output_SVM_CON),
    "cEXT": list(output_SVM_EXT),
    "cAGR": list(output_SVM_AGR),
    "cNEU": list(output_SVM_NEU),
})

In [21]:
# Print confusion counts and metrics of the SVM predictions for every trait
eval_pipeline(rowID_SVM, result_df_SVM)

Openness
3684 0 1274 0
Precison: 0.7430415490116983
Recall: 1.0
Accuracy: 0.7430415490116983
F1 score: 0.8525804211987966


Conscientiousness
6 2691 10 2251
Precison: 0.375
Recall: 0.002658396101019052
Accuracy: 0.5439693424768052
F1 score: 0.005279366476022878


Extraversion
0 2871 0 2087
Precison: 0
Recall: 0.0
Accuracy: 0.5790641387656313
F1 score: 0


Aggreableness
2498 97 2262 101
Precison: 0.5247899159663866
Recall: 0.9611388995767602
Accuracy: 0.5233965308592174
F1 score: 0.6788965892104907


Neuroticism
0 3089 0 1869
Precison: 0
Recall: 0.0
Accuracy: 0.6230334812424365
F1 score: 0


## Gaussian Naive Bayes

In [22]:
# One Gaussian Naive Bayes classifier per personality trait.
# Columns 1-4 of each training matrix are passed as inputs and column 5 as
# the label.
model_GNB_OPN = GaussianNB().fit(train_OPN[:, 1:5], train_OPN[:, 5])

model_GNB_CON = GaussianNB().fit(train_CON[:, 1:5], train_CON[:, 5])

model_GNB_EXT = GaussianNB().fit(train_EXT[:, 1:5], train_EXT[:, 5])

model_GNB_AGR = GaussianNB().fit(train_AGR[:, 1:5], train_AGR[:, 5])

model_GNB_NEU = GaussianNB().fit(train_NEU[:, 1:5], train_NEU[:, 5])

In [23]:
# Predict each trait for the test rows; columns 1-4 of `test` are the inputs
test_inputs_GNB = test[:, 1:5]
output_GNB_OPN = model_GNB_OPN.predict(test_inputs_GNB)
output_GNB_CON = model_GNB_CON.predict(test_inputs_GNB)
output_GNB_EXT = model_GNB_EXT.predict(test_inputs_GNB)
output_GNB_AGR = model_GNB_AGR.predict(test_inputs_GNB)
output_GNB_NEU = model_GNB_NEU.predict(test_inputs_GNB)

# rowIDs of the test rows, in the same order as the predictions
rowID_GNB = test_data['rowID'].tolist()

# Collect the predictions next to their rowID, one column per trait
result_df_GNB = pandas.DataFrame({
    "rowID": rowID_GNB,
    "cOPN": list(output_GNB_OPN),
    "cCON": list(output_GNB_CON),
    "cEXT": list(output_GNB_EXT),
    "cAGR": list(output_GNB_AGR),
    "cNEU": list(output_GNB_NEU),
})

In [24]:
# Print confusion counts and metrics of the GNB predictions for every trait
eval_pipeline(rowID_GNB, result_df_GNB)

Openness
3684 0 1274 0
Precison: 0.7430415490116983
Recall: 1.0
Accuracy: 0.7430415490116983
F1 score: 0.8525804211987966


Conscientiousness
454 2138 563 1803
Precison: 0.4464110127826942
Recall: 0.20115197164377493
Accuracy: 0.5227914481645825
F1 score: 0.2773365913255956


Extraversion
401 2286 585 1686
Precison: 0.4066937119675456
Recall: 0.19214183037853377
Accuracy: 0.5419524001613554
F1 score: 0.2609827530100879


Aggreableness
1192 1374 985 1407
Precison: 0.5475424896646761
Recall: 0.458637937668334
Accuracy: 0.5175473981444131
F1 score: 0.4991624790619765


Neuroticism
144 2890 199 1725
Precison: 0.4198250728862974
Recall: 0.07704654895666131
Accuracy: 0.6119402985074627
F1 score: 0.1301989150090416


## Pickling the Models

In [33]:
# Pickling LR Models
# Create the output directory first; open() fails if it does not exist
os.makedirs('pickle_files', exist_ok=True)

# Target file for each fitted LR model
lr_pickle_targets = {
    'pickle_files/ppf_lr_opn.pickle': model_LR_OPN,
    'pickle_files/ppf_lr_con.pickle': model_LR_CON,
    'pickle_files/ppf_lr_ext.pickle': model_LR_EXT,
    'pickle_files/ppf_lr_agr.pickle': model_LR_AGR,
    'pickle_files/ppf_lr_neu.pickle': model_LR_NEU,
}

for pickle_path, fitted_model in lr_pickle_targets.items():
    with open(pickle_path, 'wb') as files:
        pickle.dump(fitted_model, files)

In [25]:
# Pickling KNN Models
# Create the output directory first; open() fails if it does not exist
os.makedirs('pickle_files', exist_ok=True)

# Target file for each fitted KNN model
knn_pickle_targets = {
    'pickle_files/ppf_knn_opn.pickle': model_KNN_OPN,
    'pickle_files/ppf_knn_con.pickle': model_KNN_CON,
    'pickle_files/ppf_knn_ext.pickle': model_KNN_EXT,
    'pickle_files/ppf_knn_agr.pickle': model_KNN_AGR,
    'pickle_files/ppf_knn_neu.pickle': model_KNN_NEU,
}

for pickle_path, fitted_model in knn_pickle_targets.items():
    with open(pickle_path, 'wb') as files:
        pickle.dump(fitted_model, files)

In [26]:
# Pickling SVM Models
# Create the output directory first; open() fails if it does not exist
os.makedirs('pickle_files', exist_ok=True)

# Target file for each fitted SVM model
svm_pickle_targets = {
    'pickle_files/ppf_svm_opn.pickle': model_SVM_OPN,
    'pickle_files/ppf_svm_con.pickle': model_SVM_CON,
    'pickle_files/ppf_svm_ext.pickle': model_SVM_EXT,
    'pickle_files/ppf_svm_agr.pickle': model_SVM_AGR,
    'pickle_files/ppf_svm_neu.pickle': model_SVM_NEU,
}

for pickle_path, fitted_model in svm_pickle_targets.items():
    with open(pickle_path, 'wb') as files:
        pickle.dump(fitted_model, files)

In [27]:
# Pickling GNB Models
# Create the output directory first; open() fails if it does not exist
os.makedirs('pickle_files', exist_ok=True)

# Target file for each fitted GNB model
gnb_pickle_targets = {
    'pickle_files/ppf_gnb_opn.pickle': model_GNB_OPN,
    'pickle_files/ppf_gnb_con.pickle': model_GNB_CON,
    'pickle_files/ppf_gnb_ext.pickle': model_GNB_EXT,
    'pickle_files/ppf_gnb_agr.pickle': model_GNB_AGR,
    'pickle_files/ppf_gnb_neu.pickle': model_GNB_NEU,
}

for pickle_path, fitted_model in gnb_pickle_targets.items():
    with open(pickle_path, 'wb') as files:
        pickle.dump(fitted_model, files)