# Using LIME to explain Extroversion Trait Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib as mpl
# Render figures at 120 dpi.
mpl.rcParams.update({'figure.dpi': 120})

# Show all columns and rows (no truncation when displaying DataFrames).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Load the treated extraversion dataset and discard the 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was saved).
df = (
    pd.read_csv('data_treated/ml_extroverted_trait.csv', index_col=None)
    .drop(columns=['Unnamed: 0'])
)

# Report the frame size, then preview the first two rows.
print(df.shape)
df.head(2)

(137, 59)


Unnamed: 0,Gender,Age,Education_lev,fluidIQ,freq_calls,freq_Tools,freq_Finance,freq_Games,freq_Entertainment,freq_Productivity,freq_Personalization,freq_News...Magazines,freq_Unknown,freq_Photography,freq_Shopping,freq_Communication,freq_Books...Reference,freq_Travel...Local,freq_Music...Audio,freq_Medical,freq_Education,freq_Business,freq_Lifestyle,freq_Transportation,freq_Weather,freq_Sports,freq_Browser,freq_Health...Fitness,freq_Media...Video,freq_Social,freq_Comics,dur_calls,dur_Tools,dur_Finance,dur_Games,dur_Entertainment,dur_Productivity,dur_Personalization,dur_News...Magazines,dur_Unknown,dur_Photography,dur_Shopping,dur_Communication,dur_Books...Reference,dur_Travel...Local,dur_Music...Audio,dur_Medical,dur_Education,dur_Business,dur_Lifestyle,dur_Transportation,dur_Weather,dur_Sports,dur_Browser,dur_Health...Fitness,dur_Media...Video,dur_Social,dur_Comics,Extraversion
0,1,23,4,0.7251,120,414,0,40,51,553,0,8,31,260,52,1930,3,128,113,0,11,1,7,153,0,184,231,176,1,8,0,80.508333,19.989234,0.0,474.149741,66.142812,27.299807,0.0,63.482948,5.594866,22.306456,62.541808,60.222363,102.0,66.64078,19.273367,0.0,35.316076,4.0,60.666667,55.469027,0.0,124.447371,72.502856,125.382997,140.0,0.0,0.0,0
1,1,21,4,0.9921,98,239,3,0,21,88,0,4,42,113,23,3430,5,19,4,0,0,7,0,34,0,0,187,0,1,211,0,176.214286,21.201236,77.0,0.0,292.695366,29.823646,0.0,0.0,2.149066,16.753147,90.80438,42.197411,52.0,53.702947,0.0,0.0,0.0,24.285714,0.0,74.587645,0.0,0.0,95.82911,0.0,0.0,82.581954,0.0,0


In [3]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 59 columns):
Gender                    137 non-null int64
Age                       137 non-null int64
Education_lev             137 non-null int64
fluidIQ                   137 non-null float64
freq_calls                137 non-null int64
freq_Tools                137 non-null int64
freq_Finance              137 non-null int64
freq_Games                137 non-null int64
freq_Entertainment        137 non-null int64
freq_Productivity         137 non-null int64
freq_Personalization      137 non-null int64
freq_News...Magazines     137 non-null int64
freq_Unknown              137 non-null int64
freq_Photography          137 non-null int64
freq_Shopping             137 non-null int64
freq_Communication        137 non-null int64
freq_Books...Reference    137 non-null int64
freq_Travel...Local       137 non-null int64
freq_Music...Audio        137 non-null int64
freq_Medical              137 non-null

## Machine Learning

Build classifiers

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.ensemble
import sklearn.preprocessing

from sklearn.metrics import accuracy_score

#### Split data into training and test sets

In [5]:
# Seed NumPy's global random number generator.
np.random.seed(1)

# 'Extraversion' is the target; every column except the last one is a feature.
Y = df['Extraversion']
X = df.iloc[:, :-1]

# Hold out 25% of the rows for testing, with a fixed random_state for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [6]:
# Optional (disabled): persist the split to disk.
# X_train.to_csv('data_treated/extroverted/ml_extroverted_trait_X_train.csv')
# X_test.to_csv('data_treated/extroverted/ml_extroverted_trait_X_test.csv')
# y_train.to_csv('data_treated/extroverted/ml_extroverted_trait_y_train.csv')
# y_test.to_csv('data_treated/extroverted/ml_extroverted_trait_y_test.csv')

# Print the (rows, columns) of the training features, then of the test features.
for split in (X_train, X_test):
    print(split.shape)

(102, 58)
(35, 58)


In [7]:
# Display the test features; the test split printed above has 35 rows, so
# asking for 36 shows all of them.
X_test.head(36)

Unnamed: 0,Gender,Age,Education_lev,fluidIQ,freq_calls,freq_Tools,freq_Finance,freq_Games,freq_Entertainment,freq_Productivity,freq_Personalization,freq_News...Magazines,freq_Unknown,freq_Photography,freq_Shopping,freq_Communication,freq_Books...Reference,freq_Travel...Local,freq_Music...Audio,freq_Medical,freq_Education,freq_Business,freq_Lifestyle,freq_Transportation,freq_Weather,freq_Sports,freq_Browser,freq_Health...Fitness,freq_Media...Video,freq_Social,freq_Comics,dur_calls,dur_Tools,dur_Finance,dur_Games,dur_Entertainment,dur_Productivity,dur_Personalization,dur_News...Magazines,dur_Unknown,dur_Photography,dur_Shopping,dur_Communication,dur_Books...Reference,dur_Travel...Local,dur_Music...Audio,dur_Medical,dur_Education,dur_Business,dur_Lifestyle,dur_Transportation,dur_Weather,dur_Sports,dur_Browser,dur_Health...Fitness,dur_Media...Video,dur_Social,dur_Comics
26,2,27,5,0.0514,229,238,0,0,3,82,0,0,1,37,0,1251,0,12,62,0,0,0,0,43,0,0,19,0,0,4,0,146.825328,42.700711,0.0,0.0,30.0,28.470026,0.0,0.0,7.633862,21.162917,0.0,34.858202,0.0,57.988226,9.239297,0.0,0.0,0.0,0.0,59.508968,0.0,0.0,31.257779,0.0,0.0,0.0,0.0
8,2,20,4,0.2676,129,493,84,2,94,228,20,24,240,635,1,4575,25,33,27,0,2,39,4,21,13,0,1494,0,39,365,0,0.0,6.182718,36.319479,207.0,77.823472,18.905723,1.209473,2.0,3.091656,20.498183,6.0,33.95612,29.714029,61.946773,8.087628,0.0,0.0,46.117719,0.0,57.8,2.0,0.0,59.671803,0.0,17.589697,65.207059,0.0
86,2,21,5,0.293,49,157,4,2,9,44,0,11,8,110,0,1268,8,95,74,0,2,18,37,121,0,44,609,3,4,6,0,0.0,29.025738,0.0,0.0,79.2,16.187478,0.0,20.0,4.974101,20.489934,0.0,30.951097,0.0,72.467128,8.454677,0.0,0.0,66.986523,15.640012,54.303762,0.0,19.581971,55.575089,53.0,0.0,0.0,0.0
78,2,21,4,0.8868,217,211,0,9,12,176,0,43,30,228,0,3807,4,173,28,0,1,0,7,0,0,0,674,0,11,6,0,85.253456,14.372734,0.0,10.0,61.056137,26.280924,0.0,160.800479,3.004563,17.170467,0.0,27.379535,0.0,55.669395,13.984558,0.0,34.0,0.0,28.0,0.0,0.0,0.0,87.628893,0.0,27.0,17.0,0.0
43,2,23,4,0.203,147,620,0,65,187,60,0,26,294,56,0,1521,82,25,45,0,0,25,0,4,0,0,251,84,2,126,0,0.0,3.934569,0.0,137.296569,221.915232,19.221176,0.0,77.360212,2.445225,21.473463,0.0,30.637935,23.711547,52.637632,7.186493,0.0,0.0,29.059143,0.0,82.25,0.0,0.0,85.910023,22.755052,109.5,279.483674,0.0
85,2,20,4,0.9317,372,1082,0,603,32,380,24,4,658,439,0,7765,42,184,38,50,0,1,38,0,105,0,590,8,108,3800,0,47.77957,1.897833,0.0,132.862607,93.0,22.68726,1.0,0.0,2.246302,11.02873,0.0,14.72792,15.67488,39.535078,6.170653,1.632245,0.0,12.0,1.465629,0.0,13.337766,0.0,37.531051,0.0,13.05772,25.552532,0.0
22,2,20,4,0.082,209,245,0,243,35,119,0,0,25,194,23,1769,13,48,40,0,0,23,0,134,44,0,411,0,61,303,0,55.69378,11.293672,0.0,74.175377,167.976638,24.546998,0.0,0.0,3.224738,19.05312,34.643067,31.966394,35.291832,51.168501,43.121763,0.0,0.0,41.54785,0.0,50.817453,8.342008,0.0,83.09726,0.0,47.569511,50.790512,0.0
50,1,20,4,0.6562,67,209,0,0,30,1363,0,75,80,967,0,1488,60,59,310,0,0,35,88,124,0,0,604,0,2,66,0,0.0,13.449677,0.0,0.0,125.350356,30.463724,0.0,116.711862,5.317411,4.125053,0.0,37.419877,35.788423,71.004224,10.726118,0.0,0.0,28.552095,85.180867,54.664036,0.0,0.0,70.183677,0.0,0.0,111.228622,0.0
45,2,21,4,0.9461,401,237,0,210,10,54,0,16,25,131,12,1993,69,73,12,0,22,40,34,91,17,0,343,0,15,608,0,157.723192,9.02768,0.0,333.560879,11.0,12.734244,0.0,0.0,4.762249,38.091789,31.638059,29.476331,48.019379,78.009853,0.0,0.0,12.5,33.728449,33.253429,54.921063,23.737513,0.0,67.407558,0.0,50.850164,79.183971,0.0
24,2,20,4,0.4028,202,147,0,161,11,61,0,4,19,51,0,1609,4,161,8,0,0,0,0,0,0,0,128,0,4,375,0,44.430693,18.825832,0.0,350.939619,41.333333,51.470128,0.0,151.0,5.55183,38.478052,0.0,30.637705,0.0,41.717766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.741567,0.0,0.0,26.826113,0.0


#### Setup and train the classifier 

In [9]:
# With the default of 100 iterations lbfgs emitted
# "ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.",
# so max_iter is raised to 2000.

# Configure the classifier and train it on the training split.
lr_clf = LogisticRegression(
    random_state=0,
    C=0.001,
    max_iter=2000,
    solver='lbfgs',
    multi_class='multinomial',
).fit(X_train, y_train)

# Predict the held-out split and measure the accuracy.
y_pred = lr_clf.predict(X_test)
lr_clf_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', lr_clf_accuracy)


# LIME requires class probabilities for classification problems, so expose
# the classifier's probability estimates as a callable.
def lr_clf_proba(x):
    """Return the classifier's class-probability estimates for ``x`` as floats."""
    return lr_clf.predict_proba(x).astype(float)

Accuracy: 0.6285714285714286


#### Save Classifier

In [10]:
# scikit-learn no longer ships a vendored joblib (``sklearn.externals.joblib``
# was deprecated and then removed), so import the standalone package first and
# only fall back to the vendored module on old installations that still have it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save classifier. This stays the last expression of the cell so the list of
# written file names is displayed as the cell output.
joblib.dump(lr_clf, 'classifiers/extraversion_logistic_regression.pkl')

# Load Classifier (same path as the dump above)
#clf = joblib.load('classifiers/extraversion_logistic_regression.pkl')



['classifiers/extraversion_logistic_regression.pkl']

## Explainable AI - LIME

In [None]:
# LIME works for different types of data such as text, images, categorical, numerical, ...
import lime
import lime.lime_tabular

In [None]:
# LIME works on plain 2-D numpy arrays rather than DataFrames.
training_data = X_train.values
testing_data = X_test.values
feature_names = X_train.columns.values

# Look the categorical columns up by name instead of hard-coding their
# positions, so the indices stay correct if the column order changes
# (with the current layout this yields [0, 2]).
categorical_columns = ['Gender', 'Education_lev']
categorical_features = [X_train.columns.get_loc(column) for column in categorical_columns]

# LimeTabularExplainer expects `categorical_names` to be a mapping
# {column index: [name of value 0, name of value 1, ...]}, where the list is
# indexed by the integer code found in the data. A plain list of column names
# is not that mapping and is not used for labelling.
# The labels below are simply the codes themselves, sized to cover the largest
# code seen in either split, so explanations read e.g. 'Gender=2'.
# TODO: replace the code strings with human-readable labels once the meaning of
# each Gender / Education_lev code is confirmed.
categorical_names = {
    column_index: [
        str(code)
        for code in range(
            int(max(training_data[:, column_index].max(),
                    testing_data[:, column_index].max())) + 1
        )
    ]
    for column_index in categorical_features
}

In [None]:
# Create the explainer from the training matrix.
explainer_options = dict(
    training_data=training_data,                # 2-D numpy array of the training features
    mode='classification',
    feature_names=feature_names,                # names of all feature columns
    categorical_features=categorical_features,  # integer indices of the categorical columns
    categorical_names=categorical_names,
    class_names=['low', 'high'],                # display names for classes 0 and 1
    discretize_continuous=True,
)
explainer = lime.lime_tabular.LimeTabularExplainer(**explainer_options)

# Explain a single instance of the test split.
np.random.seed(1)
i = 6
exp = explainer.explain_instance(
    testing_data[i],
    lr_clf_proba,
    num_features=10,
    num_samples=10000,  # size of the neighborhood used to learn the linear model
    distance_metric='euclidean',
    model_regressor=None,
)
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Print the class probabilities stored on the explanation, then display the
# explanation as a list of (condition, weight) pairs.
print(exp.predict_proba)
exp.as_list()

In [None]:
#df_results.set_index('index', inplace=True, drop=True)

In [None]:
# Explain all instances
# Load the results table that explain_all_instances() fills in; rows are
# selected through its 'index' column.
# NOTE(review): presumably 'index' holds the 0-based position of each row in
# the test split — confirm against how results/read.csv was produced.
df_results = pd.read_csv('results/read.csv', index_col=None)

def extract_token(explanation_feature):
    """Return the feature name contained in a LIME explanation condition.

    Parameters
    ----------
    explanation_feature : str
        The textual condition attached to an explanation weight, in one of
        these shapes:

        * ``'freq_Shopping > 17.75'``      -> ``'freq_Shopping'``
        * ``'24.99 < dur_calls <= 59.81'`` -> ``'dur_calls'``
        * ``'Gender=2'`` (categorical)     -> ``'Gender'``

    Returns
    -------
    str
        The feature name. A string matching none of the shapes above is
        returned up to its first ``'='`` (i.e. unchanged when it has none).

    Notes
    -----
    The position of the comparison operators decides where the name sits.
    Looking for digits in the first token instead would misread feature names
    that contain a digit and would fail on single-token categorical
    conditions such as ``'Gender=2'``.
    """
    comparators = ('<', '<=', '>', '>=')
    tokens = explanation_feature.split(' ')

    # Two-sided interval: '<low> <op> <name> <op> <high>'.
    if len(tokens) >= 5 and tokens[1] in comparators and tokens[3] in comparators:
        return tokens[2]

    # One-sided condition: '<name> <op> <threshold>'.
    if len(tokens) >= 3 and tokens[1] in comparators:
        return tokens[0]

    # Categorical condition 'name=value' (or an unrecognised format): keep the
    # text that precedes the first '='.
    return explanation_feature.partition('=')[0]
        

def explain_all_instances(testing_data):
    """Explain every row of ``testing_data`` with LIME and store the outcome in ``df_results``.

    Relies on names defined at notebook level: ``explainer`` (the LIME tabular
    explainer), ``lr_clf_proba`` (the probability function of the classifier)
    and ``df_results`` (the results table, modified in place).

    For each instance position ``n`` the rows of ``df_results`` whose 'index'
    column equals ``n`` receive:

    * ``low_prob`` / ``high_prob``: the two entries of ``exp.predict_proba``.
    * ``most_contribute_1`` .. ``most_contribute_5``: the feature names of the
      first five entries of ``exp.as_list()``.
    * ``most_contribute_to_low``: the first listed feature with a negative weight.
    * ``most_contribute_to_high``: the first listed feature with a weight >= 0.

    A column is left untouched for an instance when the explanation has no
    matching entry (for example fewer than five features, or no negative weight).

    Parameters
    ----------
    testing_data : 2-D array-like
        Instances to explain, one per row, indexed by position.

    Returns
    -------
    None
        Progress is printed and ``df_results`` is updated as a side effect.
    """
    np.random.seed(1)
    print('Total ' + str(len(testing_data)) + ' [', end = '')

    # Raise this value if an explanation does not contain both a positive and
    # a negative contribution.
    num_features = 20
    top_n = 5  # number of most_contribute_<rank> columns to fill

    for instance in range(len(testing_data)):
        print(str(instance), end = ' ')
        exp = explainer.explain_instance(
            testing_data[instance],
            lr_clf_proba,
            num_features=num_features,
            num_samples=10000,  # size of the neighborhood used to learn the linear model
            distance_metric='euclidean',
            model_regressor=None
        )

        # The 'index' column is never written in this loop, so the row
        # selector is computed once per instance and reused for every column.
        row = df_results['index'] == instance

        # Save probabilities
        proba = exp.predict_proba
        df_results.loc[row, 'low_prob'] = proba[0]
        df_results.loc[row, 'high_prob'] = proba[1]

        # Example of exp.as_list():
        # [('dur_calls > 130.96', -0.3301713358748256),
        #  ('dur_Shopping <= 0.00', -0.23895139664004045),
        #  ('freq_Shopping <= 0.00', 0.19739928890647732),
        #  ('dur_Sports <= 0.00', 0.1435479064088512),
        #  ('dur_Lifestyle <= 0.00', -0.10569875594553718)]
        # Iterate over what was actually returned instead of assuming exactly
        # num_features entries, so a shorter explanation cannot raise IndexError.
        exp_res = exp.as_list()[:num_features]
        features = [extract_token(condition) for condition, _ in exp_res]

        for rank, feature in enumerate(features[:top_n], start=1):
            df_results.loc[row, 'most_contribute_' + str(rank)] = feature

        # Record the first feature pushing towards each side, in list order.
        low_found = False
        high_found = False
        for feature, (_, weight) in zip(features, exp_res):
            if low_found and high_found:
                break

            if float(weight) >= 0:
                if not high_found:
                    high_found = True
                    df_results.loc[row, 'most_contribute_to_high'] = feature
            elif not low_found:
                low_found = True
                df_results.loc[row, 'most_contribute_to_low'] = feature

    print(']')
        
# Explain every test instance; this fills df_results in place.
explain_all_instances(testing_data)

# Persist the filled results table. The DataFrame index is written as well
# (to_csv default), so the file starts with an unnamed index column.
df_results.to_csv('results/personality_traits_results_lime_Extroversion_Logistic_Regression.csv')

In [None]:
# Print every predicted label for the test split, then display the first one.
print(y_pred)
y_pred[0]

In [None]:
# Display the feature values of the test row at position 17.
X_test.iloc[17]

# Save Classifier

In [None]:
# Reload the treated dataset from disk and discard the 'Unnamed: 0' column.
df = (
    pd.read_csv('data_treated/ml_extroverted_trait.csv', index_col=None)
    .drop(columns=['Unnamed: 0'])
)

# Emotional Stability	Extraversion	Openness	Conscientiousness	Agreeableness