# Using LIME to explain the Personality Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib as mpl
# Render matplotlib figures at 120 DPI.
mpl.rcParams['figure.dpi'] = 120

# Show all columns and all rows: both pandas display limits are set to None,
# which disables truncation when a frame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the treated dataset and discard the 'Unnamed: 0' column that the CSV
# carries, producing the frame in a single chained expression.
df = (
    pd.read_csv('data_treated/ml_pers_traits.csv', index_col=None)
    .drop(columns=['Unnamed: 0'])
)

# Report the (rows, columns) size, then preview the first two rows.
print(df.shape)
df.head(2)

(137, 63)


Unnamed: 0,Gender,Age,Education_lev,fluidIQ,freq_calls,freq_Tools,freq_Finance,freq_Games,freq_Entertainment,freq_Productivity,freq_Personalization,freq_News...Magazines,freq_Unknown,freq_Photography,freq_Shopping,freq_Communication,freq_Books...Reference,freq_Travel...Local,freq_Music...Audio,freq_Medical,freq_Education,freq_Business,freq_Lifestyle,freq_Transportation,freq_Weather,freq_Sports,freq_Browser,freq_Health...Fitness,freq_Media...Video,freq_Social,freq_Comics,dur_calls,dur_Tools,dur_Finance,dur_Games,dur_Entertainment,dur_Productivity,dur_Personalization,dur_News...Magazines,dur_Unknown,dur_Photography,dur_Shopping,dur_Communication,dur_Books...Reference,dur_Travel...Local,dur_Music...Audio,dur_Medical,dur_Education,dur_Business,dur_Lifestyle,dur_Transportation,dur_Weather,dur_Sports,dur_Browser,dur_Health...Fitness,dur_Media...Video,dur_Social,dur_Comics,Emotional Stability,Extraversion,Openness,Conscientiousness,Agreeableness
0,1,23,4,0.7251,120,414,0,40,51,553,0,8,31,260,52,1930,3,128,113,0,11,1,7,153,0,184,231,176,1,8,0,80.508333,19.989234,0.0,474.149741,66.142812,27.299807,0.0,63.482948,5.594866,22.306456,62.541808,60.222363,102.0,66.64078,19.273367,0.0,35.316076,4.0,60.666667,55.469027,0.0,124.447371,72.502856,125.382997,140.0,0.0,0.0,0,0,0,0,0
1,1,21,4,0.9921,98,239,3,0,21,88,0,4,42,113,23,3430,5,19,4,0,0,7,0,34,0,0,187,0,1,211,0,176.214286,21.201236,77.0,0.0,292.695366,29.823646,0.0,0.0,2.149066,16.753147,90.80438,42.197411,52.0,53.702947,0.0,0.0,0.0,24.285714,0.0,74.587645,0.0,0.0,95.82911,0.0,0.0,82.581954,0.0,1,0,1,1,0


In [3]:
# Summarise the dtype and non-null count of every column in the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 63 columns):
Gender                    137 non-null int64
Age                       137 non-null int64
Education_lev             137 non-null int64
fluidIQ                   137 non-null float64
freq_calls                137 non-null int64
freq_Tools                137 non-null int64
freq_Finance              137 non-null int64
freq_Games                137 non-null int64
freq_Entertainment        137 non-null int64
freq_Productivity         137 non-null int64
freq_Personalization      137 non-null int64
freq_News...Magazines     137 non-null int64
freq_Unknown              137 non-null int64
freq_Photography          137 non-null int64
freq_Shopping             137 non-null int64
freq_Communication        137 non-null int64
freq_Books...Reference    137 non-null int64
freq_Travel...Local       137 non-null int64
freq_Music...Audio        137 non-null int64
freq_Medical              137 non-null

## Machine Learning

Build classifiers

In [4]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

#### Split data into training and test sets

In [5]:
# Seed NumPy's global RNG; the split itself is pinned by random_state below.
np.random.seed(1)

# Targets: the five personality-trait columns.
Y_all_traits = df[['Emotional Stability','Extraversion','Openness','Conscientiousness','Agreeableness']]
# Features: every column except the last five.
X = df.iloc[:, :-5]

# Hold out 25% of the rows for testing.
X_train, X_test, y_train_all_traits, y_test_all_traits = train_test_split(
    X,
    Y_all_traits,
    test_size=0.25,
    random_state=74,
)

In [6]:
# Persist the four split frames as CSV files (features first, then the
# five-trait targets).
split_outputs = (
    (X_train, 'data_splitted/ml_X_train.csv'),
    (X_test, 'data_splitted/ml_X_test.csv'),
    (y_train_all_traits, 'data_splitted/ml_y_all_traits_train.csv'),
    (y_test_all_traits, 'data_splitted/ml_y_all_traits_test.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path)

# Print the shapes in the same order: X train, X test, then the targets
# for all five traits (train, test).
for split_frame, csv_path in split_outputs:
    print(split_frame.shape)

(102, 58)
(35, 58)
(102, 5)
(35, 5)


In [7]:
# Display the held-out feature rows. The test split printed above has 35 rows,
# so requesting 36 shows every one of them.
X_test.head(36)

Unnamed: 0,Gender,Age,Education_lev,fluidIQ,freq_calls,freq_Tools,freq_Finance,freq_Games,freq_Entertainment,freq_Productivity,freq_Personalization,freq_News...Magazines,freq_Unknown,freq_Photography,freq_Shopping,freq_Communication,freq_Books...Reference,freq_Travel...Local,freq_Music...Audio,freq_Medical,freq_Education,freq_Business,freq_Lifestyle,freq_Transportation,freq_Weather,freq_Sports,freq_Browser,freq_Health...Fitness,freq_Media...Video,freq_Social,freq_Comics,dur_calls,dur_Tools,dur_Finance,dur_Games,dur_Entertainment,dur_Productivity,dur_Personalization,dur_News...Magazines,dur_Unknown,dur_Photography,dur_Shopping,dur_Communication,dur_Books...Reference,dur_Travel...Local,dur_Music...Audio,dur_Medical,dur_Education,dur_Business,dur_Lifestyle,dur_Transportation,dur_Weather,dur_Sports,dur_Browser,dur_Health...Fitness,dur_Media...Video,dur_Social,dur_Comics
82,1,21,4,0.7566,28,293,6,199,17,281,0,0,19,385,80,4621,143,66,39,0,33,22,0,105,13,0,727,0,9,1659,0,12.071429,13.04421,37.0,188.938817,276.625,19.024673,0.0,0.0,3.147828,24.344433,50.906274,32.099305,193.623821,51.18379,15.122307,0.0,11.257627,97.419155,0.0,36.78402,6.537799,0.0,97.616783,0.0,1352.464837,134.498118,0.0
120,2,19,4,0.9547,627,1137,359,519,98,256,0,21,763,507,111,6092,43,47,87,35,4,112,10,210,0,0,659,9,17,2080,0,113.188198,3.25892,22.178779,118.901203,89.539916,12.346919,0.0,2.0,2.118835,6.469082,45.3206,19.674302,38.923613,53.46144,10.285218,3.660747,5.970033,24.28884,70.8,37.667244,0.0,0.0,59.398911,12.616721,39.4,101.118181,0.0
52,1,18,4,1.4896,76,1725,8,635,95,1178,26,30,1292,51,39,862,153,162,223,0,356,39,0,27,0,0,755,0,123,334,1,117.947368,2.066492,37.333345,8.428931,9.538364,13.421554,2.872503,51.720024,3.534347,19.108916,54.637637,8.591626,31.962175,29.082392,21.493582,0.0,109.773121,22.023497,0.0,33.150525,0.0,0.0,32.347281,0.0,4.169088,37.442936,397.0
16,2,20,4,0.9763,134,111,7,0,19,374,0,11,25,93,16,1326,69,73,37,0,0,10,4,56,0,0,311,52,4,736,0,35.865672,26.819805,49.74674,0.0,322.704987,22.136856,0.0,18.235017,3.56835,10.120037,72.088419,28.587304,34.047771,52.895711,22.324325,0.0,0.0,16.6,0.0,60.044083,0.0,0.0,72.82007,58.990064,46.206808,67.879375,0.0
72,2,28,5,0.656,32,122,0,5,0,140,0,144,15,80,0,963,2,24,67,0,60,14,0,38,0,0,117,0,1,0,0,258.0,32.608597,0.0,234.75,0.0,20.695836,0.0,15.636136,5.497845,40.722961,0.0,33.757956,0.0,89.256142,33.239869,0.0,244.86227,298.910349,0.0,55.089653,0.0,0.0,171.928596,0.0,12.0,0.0,0.0
109,1,23,4,3.0946,170,27,0,0,49,25,0,146,1,63,2,2482,2,12,2,0,189,0,0,11,0,0,107,10,2,39,0,204.717647,21.123102,0.0,0.0,187.729944,13.2578,0.0,2.0,1.904499,16.034031,0.0,30.194267,0.0,30.0,0.0,0.0,9.877734,0.0,0.0,56.406823,0.0,0.0,43.864539,38.444444,0.0,32.545455,0.0
17,2,21,3,1.5184,20,122,0,22,1,46,0,0,25,151,0,1249,18,2,56,0,0,39,0,63,1,0,333,3,0,333,0,0.0,41.924205,0.0,696.476191,18.0,30.27404,0.0,0.0,4.472518,21.067406,0.0,36.781262,37.885763,151.0,12.765811,0.0,0.0,60.66312,0.0,61.142424,24.0,0.0,73.730654,518.0,0.0,57.594612,0.0
118,2,40,4,1.5039,149,83,0,0,39,39,0,0,7,66,0,620,10,48,3,0,6,7,0,47,27,5,62,0,4,0,0,37.281879,23.011615,0.0,0.0,800.921008,32.655289,0.0,0.0,4.815058,27.452027,0.0,71.842134,552.5,134.367537,34.0,0.0,6.653348,117.571429,0.0,100.360241,19.600035,16.0,156.074908,0.0,0.0,0.0,0.0
105,2,29,5,-0.1423,450,354,0,421,18,184,12,300,327,377,2,2886,47,6,13,0,0,17,62,48,197,0,432,0,28,10,0,59.355556,23.595624,0.0,296.066928,28.0,22.503789,158.220869,69.898383,6.633062,23.165047,0.0,15.457862,35.358142,61.2,30.773397,0.0,0.0,28.515775,40.936523,44.037754,30.090248,0.0,90.723898,0.0,29.704637,0.0,0.0
75,2,23,4,0.711,13,141,0,603,13,115,0,14,23,73,0,919,4,10,21,0,28,1,9,30,0,34,117,22,12,9,0,920.230769,12.446484,0.0,129.521504,324.0,100.31242,0.0,0.0,2.866635,16.545638,0.0,19.867049,0.0,65.5,21.222222,0.0,26.522809,48.0,23.660103,69.447961,0.0,32.876042,53.613867,30.825048,14.166673,62.0,0.0


#### Set up and train the classifiers for LR, SVM and KNN

Logistic Regression

In [8]:
# With the default max_iter=100 the lbfgs solver raised
# "ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.",
# so max_iter is set to 2000 for every model in this cell.

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Use the standalone joblib package and fall back to the vendored copy only on
# old installations that do not ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Setup classifiers architecture for ('Emotional Stability','Extraversion','Openness','Conscientiousness','Agreeableness')
# One independent model per trait, all with identical hyper-parameters.
lr_clf_esta = LogisticRegression(random_state=0, C=0.001,max_iter=2000, solver='lbfgs', multi_class='multinomial')
lr_clf_extr = LogisticRegression(random_state=0, C=0.001,max_iter=2000, solver='lbfgs', multi_class='multinomial')
lr_clf_open = LogisticRegression(random_state=0, C=0.001,max_iter=2000, solver='lbfgs', multi_class='multinomial')
lr_clf_consc = LogisticRegression(random_state=0, C=0.001,max_iter=2000, solver='lbfgs', multi_class='multinomial')
lr_clf_agr = LogisticRegression(random_state=0, C=0.001,max_iter=2000, solver='lbfgs', multi_class='multinomial')

# Train the classifiers and save them.
# The numbered prints are a progress indicator (one tick per fitted trait).
lr_clf_esta = lr_clf_esta.fit(X_train,y_train_all_traits['Emotional Stability'])
joblib.dump(lr_clf_esta, 'classifiers/logistic_regression_emotional_stability.pkl')
print('1...',end=' ')
lr_clf_extr = lr_clf_extr.fit(X_train,y_train_all_traits['Extraversion'])
joblib.dump(lr_clf_extr, 'classifiers/logistic_regression_extraversion.pkl')
print('2...',end=' ')
lr_clf_open = lr_clf_open.fit(X_train,y_train_all_traits['Openness'])
joblib.dump(lr_clf_open, 'classifiers/logistic_regression_openness.pkl')
print('3...',end=' ')
lr_clf_consc = lr_clf_consc.fit(X_train,y_train_all_traits['Conscientiousness'])
joblib.dump(lr_clf_consc, 'classifiers/logistic_regression_conscientiousness.pkl')
print('4...',end=' ')
lr_clf_agr = lr_clf_agr.fit(X_train,y_train_all_traits['Agreeableness'])
joblib.dump(lr_clf_agr, 'classifiers/logistic_regression_agreeableness.pkl')
print('5')

# Test the model: predict every trait on the held-out rows.
# NOTE: the y_pred_* and df_predicted names are reused by the SVM and KNN cells,
# so they always hold the output of whichever model cell ran last.
y_pred_esta = lr_clf_esta.predict(X_test)
y_pred_extr = lr_clf_extr.predict(X_test)
y_pred_open = lr_clf_open.predict(X_test)
y_pred_consc = lr_clf_consc.predict(X_test)
y_pred_agr = lr_clf_agr.predict(X_test)

# Predicted Values
print('Emotional Stability:', y_pred_esta)
print('Extraversion:', y_pred_extr)
print('Openness:', y_pred_open)
print('Conscientiousness:', y_pred_consc)
print('Agreeableness:', y_pred_agr)
print()

# Save predictions. 'instance' keeps the original row label of each test sample;
# the prediction Series use a default 0..n-1 index, which lines up with the
# default index of df_predicted.
df_predicted = pd.DataFrame()
df_predicted['instance'] = X_test.index
df_predicted['Emotional Stability'] = pd.Series(y_pred_esta)
df_predicted['Extraversion'] = pd.Series(y_pred_extr)
df_predicted['Openness'] = pd.Series(y_pred_open)
df_predicted['Conscientiousness'] = pd.Series(y_pred_consc)
df_predicted['Agreeableness'] = pd.Series(y_pred_agr)
df_predicted.to_csv('results/classification_results/lr_predictions.csv')

# Get the accuracy of each trait model on the test split.
lr_clf_accuracy_esta = accuracy_score(y_test_all_traits['Emotional Stability'], y_pred_esta)
lr_clf_accuracy_extr = accuracy_score(y_test_all_traits['Extraversion'], y_pred_extr)
lr_clf_accuracy_open = accuracy_score(y_test_all_traits['Openness'], y_pred_open)
lr_clf_accuracy_consc = accuracy_score(y_test_all_traits['Conscientiousness'], y_pred_consc)
lr_clf_accuracy_agr = accuracy_score(y_test_all_traits['Agreeableness'], y_pred_agr)
print('Accuracy Emotional Stability:', lr_clf_accuracy_esta)
print('Accuracy Extraversion:', lr_clf_accuracy_extr)
print('Accuracy Openness:', lr_clf_accuracy_open)
print('Accuracy Conscientiousness:', lr_clf_accuracy_consc)
print('Accuracy Agreeableness:', lr_clf_accuracy_agr)

# LIME requires class probabilities in case of classification example.
# Each wrapper returns the model's predict_proba output cast to float; these
# callables are what the LIME cells further down pass to the explainer.
lr_clf_proba_esta = lambda x: lr_clf_esta.predict_proba(x).astype(float)
lr_clf_proba_extr = lambda x: lr_clf_extr.predict_proba(x).astype(float)
lr_clf_proba_open = lambda x: lr_clf_open.predict_proba(x).astype(float)
lr_clf_proba_consc = lambda x: lr_clf_consc.predict_proba(x).astype(float)
lr_clf_proba_agr = lambda x: lr_clf_agr.predict_proba(x).astype(float)



1... 



2... 3... 4... 5
Emotional Stability: [0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1]
Extraversion: [0 0 1 1 0 1 1 0 0 0 1 0 0 1 1 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0]
Openness: [1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0]
Conscientiousness: [1 0 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1]
Agreeableness: [1 1 0 0 1 0 1 0 1 1 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0]

Accuracy Emotional Stability: 0.4857142857142857
Accuracy Extraversion: 0.6285714285714286
Accuracy Openness: 0.6857142857142857
Accuracy Conscientiousness: 0.5714285714285714
Accuracy Agreeableness: 0.6285714285714286


Support Vector Machine

In [13]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Use the standalone joblib package and fall back to the vendored copy only on
# old installations that do not ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Setup classifiers architecture for ('Emotional Stability','Extraversion','Openness','Conscientiousness','Agreeableness')
# Polynomial-kernel SVMs; note that the polynomial degree differs per trait.
# probability=True is set because predict_proba is called on these models below.
svm_clf_esta = SVC(kernel='poly', degree=2, gamma='auto', probability=True)
svm_clf_extr = SVC(kernel='poly', degree=3, gamma='auto', probability=True)
svm_clf_open = SVC(kernel='poly', degree=3, gamma='auto', probability=True)
svm_clf_consc = SVC(kernel='poly', degree=4, gamma='auto', probability=True)
svm_clf_agr = SVC(kernel='poly', degree=7, gamma='auto', probability=True)

# Train the classifiers and save them.
# The numbered prints are a progress indicator (one tick per fitted trait).
svm_clf_esta = svm_clf_esta.fit(X_train,y_train_all_traits['Emotional Stability'])
joblib.dump(svm_clf_esta, 'classifiers/svm_emotional_stability.pkl')
print('1...',end=' ')
svm_clf_extr = svm_clf_extr.fit(X_train,y_train_all_traits['Extraversion'])
joblib.dump(svm_clf_extr, 'classifiers/svm_extraversion.pkl')
print('2...',end=' ')
svm_clf_open = svm_clf_open.fit(X_train,y_train_all_traits['Openness'])
joblib.dump(svm_clf_open, 'classifiers/svm_openness.pkl')
print('3...',end=' ')
svm_clf_consc = svm_clf_consc.fit(X_train,y_train_all_traits['Conscientiousness'])
joblib.dump(svm_clf_consc, 'classifiers/svm_conscientiousness.pkl')
print('4...',end=' ')
svm_clf_agr = svm_clf_agr.fit(X_train,y_train_all_traits['Agreeableness'])
joblib.dump(svm_clf_agr, 'classifiers/svm_agreeableness.pkl')
print('5')

# Test the model: predict every trait on the held-out rows.
# NOTE: the y_pred_* and df_predicted names are shared with the other model
# cells, so they always hold the output of whichever model cell ran last.
y_pred_esta = svm_clf_esta.predict(X_test)
y_pred_extr = svm_clf_extr.predict(X_test)
y_pred_open = svm_clf_open.predict(X_test)
y_pred_consc = svm_clf_consc.predict(X_test)
y_pred_agr = svm_clf_agr.predict(X_test)

# Predicted Values
print('Emotional Stability:', y_pred_esta)
print('Extraversion:', y_pred_extr)
print('Openness:', y_pred_open)
print('Conscientiousness:', y_pred_consc)
print('Agreeableness:', y_pred_agr)
print()

# Save predictions. 'instance' keeps the original row label of each test sample;
# the prediction Series use a default 0..n-1 index, which lines up with the
# default index of df_predicted.
df_predicted = pd.DataFrame()
df_predicted['instance'] = X_test.index
df_predicted['Emotional Stability'] = pd.Series(y_pred_esta)
df_predicted['Extraversion'] = pd.Series(y_pred_extr)
df_predicted['Openness'] = pd.Series(y_pred_open)
df_predicted['Conscientiousness'] = pd.Series(y_pred_consc)
df_predicted['Agreeableness'] = pd.Series(y_pred_agr)
df_predicted.to_csv('results/classification_results/svm_predictions.csv')

# Get the accuracy of each trait model on the test split.
svm_clf_accuracy_esta = accuracy_score(y_test_all_traits['Emotional Stability'], y_pred_esta)
svm_clf_accuracy_extr = accuracy_score(y_test_all_traits['Extraversion'], y_pred_extr)
svm_clf_accuracy_open = accuracy_score(y_test_all_traits['Openness'], y_pred_open)
svm_clf_accuracy_consc = accuracy_score(y_test_all_traits['Conscientiousness'], y_pred_consc)
svm_clf_accuracy_agr = accuracy_score(y_test_all_traits['Agreeableness'], y_pred_agr)
print('Accuracy Emotional Stability:', svm_clf_accuracy_esta)
print('Accuracy Extraversion:', svm_clf_accuracy_extr)
print('Accuracy Openness:', svm_clf_accuracy_open)
print('Accuracy Conscientiousness:', svm_clf_accuracy_consc)
print('Accuracy Agreeableness:', svm_clf_accuracy_agr)

# LIME requires class probabilities in case of classification example.
# Each wrapper returns the model's predict_proba output cast to float; these
# callables are what the LIME cells further down pass to the explainer.
svm_clf_proba_esta = lambda x: svm_clf_esta.predict_proba(x).astype(float)
svm_clf_proba_extr = lambda x: svm_clf_extr.predict_proba(x).astype(float)
svm_clf_proba_open = lambda x: svm_clf_open.predict_proba(x).astype(float)
svm_clf_proba_consc = lambda x: svm_clf_consc.predict_proba(x).astype(float)
svm_clf_proba_agr = lambda x: svm_clf_agr.predict_proba(x).astype(float)

1... 2... 3... 4... 5
Emotional Stability: [1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0]
Extraversion: [0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0]
Openness: [0 0 1 0 0 0 0 1 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 1 1 1 1 0 0 0 1 0 0]
Conscientiousness: [1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0]
Agreeableness: [0 0 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 0]

Accuracy Emotional Stability: 0.5142857142857142
Accuracy Extraversion: 0.6
Accuracy Openness: 0.5428571428571428
Accuracy Conscientiousness: 0.45714285714285713
Accuracy Agreeableness: 0.5714285714285714


K-nearest neighbors

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Use the standalone joblib package and fall back to the vendored copy only on
# old installations that do not ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Setup classifiers architecture for ('Emotional Stability','Extraversion','Openness','Conscientiousness','Agreeableness')
# One 3-nearest-neighbours model per trait.
knn_clf_esta = KNeighborsClassifier(n_neighbors=3)
knn_clf_extr = KNeighborsClassifier(n_neighbors=3)
knn_clf_open = KNeighborsClassifier(n_neighbors=3)
knn_clf_consc = KNeighborsClassifier(n_neighbors=3)
knn_clf_agr = KNeighborsClassifier(n_neighbors=3)

# Train the classifiers and save them.
# The numbered prints are a progress indicator (one tick per fitted trait).
knn_clf_esta = knn_clf_esta.fit(X_train,y_train_all_traits['Emotional Stability'])
joblib.dump(knn_clf_esta, 'classifiers/knn_emotional_stability.pkl')
print('1...',end=' ')
knn_clf_extr = knn_clf_extr.fit(X_train,y_train_all_traits['Extraversion'])
joblib.dump(knn_clf_extr, 'classifiers/knn_extraversion.pkl')
print('2...',end=' ')
knn_clf_open = knn_clf_open.fit(X_train,y_train_all_traits['Openness'])
joblib.dump(knn_clf_open, 'classifiers/knn_openness.pkl')
print('3...',end=' ')
knn_clf_consc = knn_clf_consc.fit(X_train,y_train_all_traits['Conscientiousness'])
joblib.dump(knn_clf_consc, 'classifiers/knn_conscientiousness.pkl')
print('4...',end=' ')
knn_clf_agr = knn_clf_agr.fit(X_train,y_train_all_traits['Agreeableness'])
joblib.dump(knn_clf_agr, 'classifiers/knn_agreeableness.pkl')
print('5')

# Test the model: predict every trait on the held-out rows.
# NOTE: the y_pred_* and df_predicted names are shared with the other model
# cells, so they always hold the output of whichever model cell ran last.
y_pred_esta = knn_clf_esta.predict(X_test)
y_pred_extr = knn_clf_extr.predict(X_test)
y_pred_open = knn_clf_open.predict(X_test)
y_pred_consc = knn_clf_consc.predict(X_test)
y_pred_agr = knn_clf_agr.predict(X_test)

# Predicted Values
print('Emotional Stability:', y_pred_esta)
print('Extraversion:', y_pred_extr)
print('Openness:', y_pred_open)
print('Conscientiousness:', y_pred_consc)
print('Agreeableness:', y_pred_agr)
print()

# Save predictions. 'instance' keeps the original row label of each test sample;
# the prediction Series use a default 0..n-1 index, which lines up with the
# default index of df_predicted.
df_predicted = pd.DataFrame()
df_predicted['instance'] = X_test.index
df_predicted['Emotional Stability'] = pd.Series(y_pred_esta)
df_predicted['Extraversion'] = pd.Series(y_pred_extr)
df_predicted['Openness'] = pd.Series(y_pred_open)
df_predicted['Conscientiousness'] = pd.Series(y_pred_consc)
df_predicted['Agreeableness'] = pd.Series(y_pred_agr)
df_predicted.to_csv('results/classification_results/knn_predictions.csv')

# Get the accuracy of each trait model on the test split.
knn_clf_accuracy_esta = accuracy_score(y_test_all_traits['Emotional Stability'], y_pred_esta)
knn_clf_accuracy_extr = accuracy_score(y_test_all_traits['Extraversion'], y_pred_extr)
knn_clf_accuracy_open = accuracy_score(y_test_all_traits['Openness'], y_pred_open)
knn_clf_accuracy_consc = accuracy_score(y_test_all_traits['Conscientiousness'], y_pred_consc)
knn_clf_accuracy_agr = accuracy_score(y_test_all_traits['Agreeableness'], y_pred_agr)
print('Accuracy Emotional Stability:', knn_clf_accuracy_esta)
print('Accuracy Extraversion:', knn_clf_accuracy_extr)
print('Accuracy Openness:', knn_clf_accuracy_open)
print('Accuracy Conscientiousness:', knn_clf_accuracy_consc)
print('Accuracy Agreeableness:', knn_clf_accuracy_agr)

# LIME requires class probabilities in case of classification example.
# Each wrapper returns the model's predict_proba output cast to float; these
# callables are what the LIME cells further down pass to the explainer.
knn_clf_proba_esta = lambda x: knn_clf_esta.predict_proba(x).astype(float)
knn_clf_proba_extr = lambda x: knn_clf_extr.predict_proba(x).astype(float)
knn_clf_proba_open = lambda x: knn_clf_open.predict_proba(x).astype(float)
knn_clf_proba_consc = lambda x: knn_clf_consc.predict_proba(x).astype(float)
knn_clf_proba_agr = lambda x: knn_clf_agr.predict_proba(x).astype(float)

#### Save Classifiers

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Use the standalone joblib package and fall back to the vendored copy only on
# old installations that do not ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Save Logistic Regression Classifiers
# Writes the five fitted logistic-regression models to disk, one pickle per trait.
# NOTE(review): the logistic-regression training cell already dumps these same
# objects to these same paths, so this cell only overwrites those files.
joblib.dump(lr_clf_esta, 'classifiers/logistic_regression_emotional_stability.pkl')
joblib.dump(lr_clf_extr, 'classifiers/logistic_regression_extraversion.pkl') 
joblib.dump(lr_clf_open, 'classifiers/logistic_regression_openness.pkl') 
joblib.dump(lr_clf_consc, 'classifiers/logistic_regression_conscientiousness.pkl') 
joblib.dump(lr_clf_agr, 'classifiers/logistic_regression_agreeableness.pkl') 

# Load Classifier (kept as a usage example; only load pickles from a trusted source)
#clf = joblib.load('classifiers/logistic_regression_emotional_stability.pkl')

In [None]:
# Save Support Vector Classifiers
# Writes the five fitted SVM models to disk, one pickle per trait.
# NOTE(review): the SVM training cell already dumps these same objects to these
# same paths, so this cell only overwrites those files.
joblib.dump(svm_clf_esta, 'classifiers/svm_emotional_stability.pkl')
joblib.dump(svm_clf_extr, 'classifiers/svm_extraversion.pkl') 
joblib.dump(svm_clf_open, 'classifiers/svm_openness.pkl') 
joblib.dump(svm_clf_consc, 'classifiers/svm_conscientiousness.pkl') 
joblib.dump(svm_clf_agr, 'classifiers/svm_agreeableness.pkl') 

In [None]:
# Save KNN Classifiers
# Writes the five fitted KNN models to disk, one pickle per trait.
# NOTE(review): the KNN training cell already dumps these same objects to these
# same paths, so this cell only overwrites those files.
joblib.dump(knn_clf_esta, 'classifiers/knn_emotional_stability.pkl')
joblib.dump(knn_clf_extr, 'classifiers/knn_extraversion.pkl') 
joblib.dump(knn_clf_open, 'classifiers/knn_openness.pkl') 
joblib.dump(knn_clf_consc, 'classifiers/knn_conscientiousness.pkl') 
joblib.dump(knn_clf_agr, 'classifiers/knn_agreeableness.pkl') 

# Explainable AI - LIME

In [None]:
# LIME works for different types of data such as text, images, categorical, numerical, ...
import lime
import lime.lime_tabular

### Set up the explainer (independent of the classifier; it only requires information about the training data)

In [None]:
# Inputs for the LIME explainer, all derived from the training features.
training_data = X_train.values # to 2d numpy
feature_names = X_train.columns.values
# Positions of the categorical columns in X_train: 0 -> Gender, 2 -> Education_lev.
categorical_features = [0,2]
# NOTE(review): this is a list of feature names, whereas LIME's documentation
# describes `categorical_names` as a dict mapping a column index to the list of
# display names for that column's values — confirm whether a dict (or omitting
# the argument) was intended.
categorical_names = ['Gender','Education_lev']

In [None]:
# Create the explainer. It is built from the training data only and is shared
# by every classifier explained later in the notebook.
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=training_data, # convert from pandas DataFrame to numpy 2d array
    mode='classification',
    feature_names=feature_names, # All columns name
    categorical_features=categorical_features, # Index of categorical features (int)
    categorical_names= categorical_names, 
    # class_names=class_names, # classes (0 and 1)
    class_names=['low', 'high'], # display names, listed in the order of classes 0 and 1
    discretize_continuous=True # explanations then use conditions such as 'freq_Shopping > 17.75'
)

### Explain the result for an instance of the testing data in GUI mode

In [None]:
# NumPy view of the test features; also used by the "explain all instances" cells below.
testing_data = X_test.values # to 2d numpy

# Explain an instance
np.random.seed(1)
# Positional row in testing_data (not the DataFrame index label of the sample).
i = 3
exp = explainer.explain_instance(
    testing_data[i],
    lr_clf_proba_extr, # probability function of the logistic-regression Extraversion model
    num_features=10,
    num_samples=10000, # size of the neighborhood to learn the linear model,
    distance_metric='euclidean',
    model_regressor=None # presumably selects LIME's default surrogate regressor — confirm
)
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Class probabilities stored on the explanation for the explained instance.
print(exp.predict_proba)
# The explanation as (condition, weight) pairs.
exp.as_list()

### Explain all instances of the testing data and save the results to a CSV file

Our goal is to get, for each instance of the testing data, the five features that contributed most to the decision, as well as the feature that most positively and the feature that most negatively influenced it. We also record the prediction probabilities. The output is saved to CSV files, one per classifier and trait.

In [None]:
# Explain all instance
def extract_token(explanation_feature):
    """Return the feature name contained in a LIME condition string.

    Handled shapes (tokens are separated by single spaces):

    * ``'freq_Shopping > 17.75'`` / ``'dur_Shopping <= 0.00'`` -> first token
    * ``'24.99 < dur_calls <= 59.81'`` / ``'0.00 < dur_Shopping'`` -> third token
    * ``'Education_lev=3'`` (categorical) -> text before the ``=``

    A leading token is treated as a lower bound only when it parses as a
    number *and* is followed by ``<`` or ``<=``. Feature names that merely
    contain digits (e.g. ``'freq_App2 > 3'``) are therefore returned intact
    instead of being mistaken for a bound.

    Parameters
    ----------
    explanation_feature : str
        Condition text, e.g. the first element of an ``exp.as_list()`` pair.

    Returns
    -------
    str
        The feature name.
    """
    tokens = explanation_feature.split(' ')

    # Categorical conditions are written without spaces: 'Education_lev=3'.
    if '=' in tokens[0]:
        return tokens[0].split('=')[0]

    # Range conditions start with their numeric lower bound: '<number> < name ...'.
    if len(tokens) >= 3 and tokens[1] in ('<', '<='):
        try:
            float(tokens[0])
        except ValueError:
            pass  # first token is not a number, so it is the feature name
        else:
            return tokens[2]

    return tokens[0]
        

def explain_all_instances(df, testing_data, classifier_proba):
    """Explain every test instance with LIME and record the results in ``df``.

    For each row position ``i`` of ``testing_data`` the module-level
    ``explainer`` produces an explanation, and the row(s) of ``df`` whose
    ``'index'`` column equals ``i`` receive:

    * ``low_prob`` / ``high_prob`` - the two entries of ``exp.predict_proba``
    * ``most_contribute_1`` .. ``most_contribute_5`` - feature names of the
      first five entries of ``exp.as_list()``
    * ``most_contribute_to_low`` / ``most_contribute_to_high`` - the first
      listed feature with a negative weight and the first with a
      non-negative weight, respectively

    Parameters
    ----------
    df : pandas.DataFrame
        Results frame with an ``'index'`` column holding row positions of
        ``testing_data``. It is modified in place.
    testing_data : 2-D array-like
        Feature rows to explain; indexed by position.
    classifier_proba : callable
        Probability function handed to ``explainer.explain_instance``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Relies on the module-level ``explainer`` and ``extract_token``.
    Assumes ``exp.as_list()`` is ordered from most to least influential
    feature - confirm against the LIME version in use.
    """
    np.random.seed(1)
    print('Total ' + str(len(testing_data)) + ' [', end = '')

    # Raise this value if no feature with the needed sign (positive or
    # negative) shows up among the returned explanation entries.
    num_features = 20
    top_n = 5  # number of 'most_contribute_<k>' columns to fill

    for instance in range(len(testing_data)):
        print(str(instance), end = ' ')
        exp = explainer.explain_instance(
            testing_data[instance],
            classifier_proba,
            num_features=num_features,
            num_samples=10000, # size of the neighborhood to learn the linear model,
            distance_metric='euclidean',
            model_regressor=None
        )

        # Row selector for this instance, computed once and reused below.
        row = df['index'] == instance

        # Save probabilities
        proba = exp.predict_proba
        df.loc[row, 'low_prob'] = proba[0]
        df.loc[row, 'high_prob'] = proba[1]

        # Example of exp.as_list():
        # [('dur_calls > 130.96', -0.3301713358748256),
        #  ('dur_Shopping <= 0.00', -0.23895139664004045),
        #  ('freq_Shopping <= 0.00', 0.19739928890647732), ...]
        exp_res = exp.as_list()

        # Iterate over what LIME actually returned: it can be shorter than
        # num_features, and indexing by range(num_features) would then fail.
        features = [extract_token(entry[0]) for entry in exp_res]

        for rank, feature in enumerate(features[:top_n], start=1):
            df.loc[row, 'most_contribute_' + str(rank)] = feature

        # First negative-weight and first non-negative-weight feature.
        low_found = False
        high_found = False
        for feature, entry in zip(features, exp_res):
            if low_found and high_found:
                break

            if float(entry[1]) >= 0:
                if not high_found:
                    high_found = True
                    df.loc[row, 'most_contribute_to_high'] = feature
            elif not low_found:
                low_found = True
                df.loc[row, 'most_contribute_to_low'] = feature

    print(']')

    return df

# Quick manual checks of extract_token on three condition shapes; each call
# should yield 'dur_Shopping'. Only the last call's value is shown as cell output.
extract_token('dur_Shopping <= 0.00')
extract_token('0.00 < dur_Shopping')
extract_token('0.00 < dur_Shopping < 10')

### Explain each Trait for Logistic Regression Models

In [None]:
# Run for ('Emotional Stability','Extraversion','Openness','Conscientiousness','Agreeableness')

# (probability function, output CSV) for each logistic-regression trait model.
lr_lime_runs = [
    (lr_clf_proba_esta, 'results/lime_Emotional_stability_Logistic_Regression.csv'),
    (lr_clf_proba_extr, 'results/lime_Extroversion_Logistic_Regression.csv'),
    (lr_clf_proba_open, 'results/lime_Openness_Logistic_Regression.csv'),
    (lr_clf_proba_consc, 'results/lime_Conscientiousness_Logistic_Regression.csv'),
    (lr_clf_proba_agr, 'results/lime_Agreeableness_Logistic_Regression.csv'),
]

for classifier_proba, output_path in lr_lime_runs:
    # Start every trait from a fresh copy of the empty results template.
    df_results = pd.read_csv('results/template/template.csv', index_col=None)
    # The result is deliberately NOT bound to `df`: that name holds the loaded
    # dataset, and overwriting it would break any re-run of the split cell.
    df_results = explain_all_instances(df_results, testing_data, classifier_proba)
    df_results.to_csv(output_path)

### Explain each Trait for SVM Models

In [None]:
# Run for ('Emotional Stability','Extraversion','Openness','Conscientiousness','Agreeableness')

# (probability function, output CSV) for each SVM trait model.
svm_lime_runs = [
    (svm_clf_proba_esta, 'results/lime_Emotional_stability_SVM.csv'),
    (svm_clf_proba_extr, 'results/lime_Extroversion_SVM.csv'),
    (svm_clf_proba_open, 'results/lime_Openness_SVM.csv'),
    (svm_clf_proba_consc, 'results/lime_Conscientiousness_SVM.csv'),
    (svm_clf_proba_agr, 'results/lime_Agreeableness_SVM.csv'),
]

for classifier_proba, output_path in svm_lime_runs:
    # Start every trait from a fresh copy of the empty results template.
    df_results = pd.read_csv('results/template/template.csv', index_col=None)
    # The result is deliberately NOT bound to `df`: that name holds the loaded
    # dataset, and overwriting it would break any re-run of the split cell.
    df_results = explain_all_instances(df_results, testing_data, classifier_proba)
    df_results.to_csv(output_path)

### Explain each Trait for KNN Models

In [None]:
# Run for ('Emotional Stability','Extraversion','Openness','Conscientiousness','Agreeableness')

# (probability function, output CSV) for each KNN trait model.
knn_lime_runs = [
    (knn_clf_proba_esta, 'results/lime_Emotional_Stability_KNN.csv'),
    (knn_clf_proba_extr, 'results/lime_Extroversion_KNN.csv'),
    (knn_clf_proba_open, 'results/lime_Openness_KNN.csv'),
    (knn_clf_proba_consc, 'results/lime_Conscientiousness_KNN.csv'),
    (knn_clf_proba_agr, 'results/lime_Agreeableness_KNN.csv'),
]

for classifier_proba, output_path in knn_lime_runs:
    # Start every trait from a fresh copy of the empty results template.
    df_results = pd.read_csv('results/template/template.csv', index_col=None)
    # The result is deliberately NOT bound to `df`: that name holds the loaded
    # dataset, and overwriting it would break any re-run of the split cell.
    df_results = explain_all_instances(df_results, testing_data, classifier_proba)
    df_results.to_csv(output_path)