# Skater

Python Library for Model Interpretation/Explanations

https://github.com/oracle/Skater

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib as mpl

# Render every figure at 120 DPI
mpl.rcParams.update({'figure.dpi': 120})

# Remove pandas' display limits so no column or row is elided when a frame is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline

In [2]:
# Read the treated dataset and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv — verify).
df = (
    pd.read_csv('data_treated/ml_pers_traits.csv', index_col=None)
    .drop(columns=['Unnamed: 0'])
)

# Quick look: dimensions first, then the first two rows
print(df.shape)
df.head(2)

(137, 63)


Unnamed: 0,Gender,Age,Education_lev,fluidIQ,freq_calls,freq_Tools,freq_Finance,freq_Games,freq_Entertainment,freq_Productivity,freq_Personalization,freq_News...Magazines,freq_Unknown,freq_Photography,freq_Shopping,freq_Communication,freq_Books...Reference,freq_Travel...Local,freq_Music...Audio,freq_Medical,freq_Education,freq_Business,freq_Lifestyle,freq_Transportation,freq_Weather,freq_Sports,freq_Browser,freq_Health...Fitness,freq_Media...Video,freq_Social,freq_Comics,dur_calls,dur_Tools,dur_Finance,dur_Games,dur_Entertainment,dur_Productivity,dur_Personalization,dur_News...Magazines,dur_Unknown,dur_Photography,dur_Shopping,dur_Communication,dur_Books...Reference,dur_Travel...Local,dur_Music...Audio,dur_Medical,dur_Education,dur_Business,dur_Lifestyle,dur_Transportation,dur_Weather,dur_Sports,dur_Browser,dur_Health...Fitness,dur_Media...Video,dur_Social,dur_Comics,Emotional Stability,Extraversion,Openness,Conscientiousness,Agreeableness
0,1,23,4,0.7251,120,414,0,40,51,553,0,8,31,260,52,1930,3,128,113,0,11,1,7,153,0,184,231,176,1,8,0,80.508333,19.989234,0.0,474.149741,66.142812,27.299807,0.0,63.482948,5.594866,22.306456,62.541808,60.222363,102.0,66.64078,19.273367,0.0,35.316076,4.0,60.666667,55.469027,0.0,124.447371,72.502856,125.382997,140.0,0.0,0.0,0,0,0,0,0
1,1,21,4,0.9921,98,239,3,0,21,88,0,4,42,113,23,3430,5,19,4,0,0,7,0,34,0,0,187,0,1,211,0,176.214286,21.201236,77.0,0.0,292.695366,29.823646,0.0,0.0,2.149066,16.753147,90.80438,42.197411,52.0,53.702947,0.0,0.0,0.0,24.285714,0.0,74.587645,0.0,0.0,95.82911,0.0,0.0,82.581954,0.0,1,0,1,1,0


In [3]:
# Print each column's dtype and non-null count to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 63 columns):
Gender                    137 non-null int64
Age                       137 non-null int64
Education_lev             137 non-null int64
fluidIQ                   137 non-null float64
freq_calls                137 non-null int64
freq_Tools                137 non-null int64
freq_Finance              137 non-null int64
freq_Games                137 non-null int64
freq_Entertainment        137 non-null int64
freq_Productivity         137 non-null int64
freq_Personalization      137 non-null int64
freq_News...Magazines     137 non-null int64
freq_Unknown              137 non-null int64
freq_Photography          137 non-null int64
freq_Shopping             137 non-null int64
freq_Communication        137 non-null int64
freq_Books...Reference    137 non-null int64
freq_Travel...Local       137 non-null int64
freq_Music...Audio        137 non-null int64
freq_Medical              137 non-null

## Machine Learning

Load the already-trained classifiers and recreate the train/test split using a fixed random_state

In [4]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

#### Split data into training and test sets

In [5]:
np.random.seed(1)

# The five personality traits are the prediction targets. Select them by name and
# treat every remaining column as a feature, instead of assuming the targets are
# the last five columns: a change in column order would otherwise silently leak
# a target into X.
trait_columns = ['Emotional Stability', 'Extraversion', 'Openness', 'Conscientiousness', 'Agreeableness']
Y_all_traits = df[trait_columns]
X = df.drop(columns=trait_columns)

# 75/25 split; the fixed random_state makes the split reproducible across runs
X_train, X_test, y_train_all_traits, y_test_all_traits = train_test_split(
    X, Y_all_traits, test_size=0.25, random_state=74
)

In [6]:
from pathlib import Path

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a fresh checkout may not have it).
split_dir = Path('data_splitted')
split_dir.mkdir(parents=True, exist_ok=True)

# Persist the split so other notebooks can reuse exactly the same rows
X_train.to_csv(split_dir / 'ml_X_train.csv')
X_test.to_csv(split_dir / 'ml_X_test.csv')

y_train_all_traits.to_csv(split_dir / 'ml_y_all_traits_train.csv')
y_test_all_traits.to_csv(split_dir / 'ml_y_all_traits_test.csv')

print(X_train.shape)
print(X_test.shape)
# All five traits targets
print(y_train_all_traits.shape)
print(y_test_all_traits.shape)

(102, 58)
(35, 58)
(102, 5)
(35, 5)


#### Load the classifiers for LR, SVM, KNN, LDA, and RF

In [7]:
# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn.
# Prefer the standalone `joblib` package (a scikit-learn dependency) and fall
# back to the vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [8]:
# (dictionary key, trait name as spelled in the pickle file names), in load order
TRAIT_FILE_STEMS = (
    ('esta', 'emotional_stability'),
    ('extr', 'extraversion'),
    ('open', 'openness'),
    ('consc', 'conscientiousness'),
    ('agr', 'agreeableness'),
)


def load_trait_models(algorithm):
    """Load the five per-trait pickles of one algorithm from ``classifiers/``.

    Parameters
    ----------
    algorithm : str
        File-name prefix of the algorithm, e.g. ``'svm'`` or ``'rf'``.

    Returns
    -------
    dict
        Maps 'esta', 'extr', 'open', 'consc' and 'agr' to the object loaded
        from ``classifiers/<algorithm>_<trait>.pkl``.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only use trusted files.
    models = dict()
    for trait_key, file_stem in TRAIT_FILE_STEMS:
        models[trait_key] = joblib.load('classifiers/{}_{}.pkl'.format(algorithm, file_stem))
    return models


# Logistic Regression
lr_models = load_trait_models('logistic_regression')

# Support Vectors Machine
svm_models = load_trait_models('svm')

# K-nearest neighbors
knn_models = load_trait_models('knn')

# Linear Discriminant Analysis
lda_models = load_trait_models('lda')

# Random Forest
rf_models = load_trait_models('rf')

Logistic Regression

In [9]:
# Predict every trait on the held-out features with the logistic-regression models
y_pred_esta, y_pred_extr, y_pred_open, y_pred_consc, y_pred_agr = [
    lr_models[trait].predict(X_test)
    for trait in ('esta', 'extr', 'open', 'consc', 'agr')
]

# Print the predictions as a sanity check that the models were loaded correctly
for label, predicted in (
    ('Emotional Stability:', y_pred_esta),
    ('Extraversion:', y_pred_extr),
    ('Openness:', y_pred_open),
    ('Conscientiousness:', y_pred_consc),
    ('Agreeableness:', y_pred_agr),
):
    print(label, predicted)
print()


def _lr_proba_for(trait):
    """Build a callable that returns float class probabilities for `trait`.

    The model is looked up in `lr_models` at call time, not when the callable
    is created.
    """
    def proba(x):
        return lr_models[trait].predict_proba(x).astype(float)
    return proba


# Probability estimators, one per trait
lr_proba = dict()
for trait in ('esta', 'extr', 'open', 'consc', 'agr'):
    lr_proba[trait] = _lr_proba_for(trait)

Emotional Stability: [0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1]
Extraversion: [0 0 1 1 0 1 1 0 0 0 1 0 0 1 1 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0]
Openness: [1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0]
Conscientiousness: [1 0 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1]
Agreeableness: [1 1 0 0 1 0 1 0 1 1 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0]



Support Vector Machine

In [10]:
# Probability estimators for the SVM models, one per trait.
# Fix: these previously called lr_models (copy-paste from the logistic-regression
# cell), so "SVM" probabilities were really logistic-regression probabilities.
# NOTE(review): sklearn's SVC only provides predict_proba when it was fitted with
# probability=True — confirm the pickled SVMs were trained that way.
svm_proba = dict()
svm_proba['esta'] = lambda x: svm_models['esta'].predict_proba(x).astype(float)
svm_proba['extr'] = lambda x: svm_models['extr'].predict_proba(x).astype(float)
svm_proba['open'] = lambda x: svm_models['open'].predict_proba(x).astype(float)
svm_proba['consc'] = lambda x: svm_models['consc'].predict_proba(x).astype(float)
svm_proba['agr'] = lambda x: svm_models['agr'].predict_proba(x).astype(float)

K-nearest neighbors

In [11]:
# Probability estimators for the k-nearest-neighbors models, one per trait.
# Fix: these previously called lr_models (copy-paste from the logistic-regression
# cell), so "KNN" probabilities were really logistic-regression probabilities.
knn_proba = dict()
knn_proba['esta'] = lambda x: knn_models['esta'].predict_proba(x).astype(float)
knn_proba['extr'] = lambda x: knn_models['extr'].predict_proba(x).astype(float)
knn_proba['open'] = lambda x: knn_models['open'].predict_proba(x).astype(float)
knn_proba['consc'] = lambda x: knn_models['consc'].predict_proba(x).astype(float)
knn_proba['agr'] = lambda x: knn_models['agr'].predict_proba(x).astype(float)

Linear Discriminant Analysis

In [12]:
# Probability estimators for the linear-discriminant-analysis models, one per trait.
# Fix: these previously called lr_models (copy-paste from the logistic-regression
# cell), so "LDA" probabilities were really logistic-regression probabilities.
lda_proba = dict()
lda_proba['esta'] = lambda x: lda_models['esta'].predict_proba(x).astype(float)
lda_proba['extr'] = lambda x: lda_models['extr'].predict_proba(x).astype(float)
lda_proba['open'] = lambda x: lda_models['open'].predict_proba(x).astype(float)
lda_proba['consc'] = lambda x: lda_models['consc'].predict_proba(x).astype(float)
lda_proba['agr'] = lambda x: lda_models['agr'].predict_proba(x).astype(float)

Random Forest

In [13]:
# Probability estimators for the random-forest models, one per trait.
# Fix: these previously called lr_models (copy-paste from the logistic-regression
# cell), so "RF" probabilities were really logistic-regression probabilities.
rf_proba = dict()
rf_proba['esta'] = lambda x: rf_models['esta'].predict_proba(x).astype(float)
rf_proba['extr'] = lambda x: rf_models['extr'].predict_proba(x).astype(float)
rf_proba['open'] = lambda x: rf_models['open'].predict_proba(x).astype(float)
rf_proba['consc'] = lambda x: rf_models['consc'].predict_proba(x).astype(float)
rf_proba['agr'] = lambda x: rf_models['agr'].predict_proba(x).astype(float)

# Explainable AI

In [14]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

In [15]:
# Disabled: column-wise min-max scaling of the feature frames.
# NOTE(review): as written, X_test would be scaled with its own min/max rather
# than X_train's — confirm that is intended before re-enabling.
#X_train = X_train.apply(lambda x:(x-x.min()) / (x.max()-x.min()))
#X_test = X_test.apply(lambda x:(x-x.min()) / (x.max()-x.min()))

### With probability scores (proba)

In [16]:
# Emotional-stability logistic-regression model, explained via its class probabilities
esta_classifier = lr_models['esta']

# The interpreter works on the test features; the model wrapper gets training rows as examples
interpreter = Interpretation(
    training_data=X_test.values,
    feature_names=X_train.columns.values,
)
model = InMemoryModel(
    esta_classifier.predict_proba,
    examples=X_train.values,
)

#plots = interpreter.feature_importance.plot_feature_importance(model, ascending = False)

In [17]:
# NOTE(review): in the saved output every feature has the same importance
# (0.017241 = 1/58), so the scores do not discriminate between features —
# verify the model/interpreter setup before relying on this ranking.
lr_esta_importances = interpreter.feature_importance.feature_importance(model, ascending=False)
lr_esta_importances

faster runs, do progress_bar=False


[58/58] features ████████████████████ Time elapsed: 2 seconds

dur_Comics                0.017241
freq_Shopping             0.017241
freq_Browser              0.017241
freq_Sports               0.017241
freq_Weather              0.017241
freq_Transportation       0.017241
freq_Lifestyle            0.017241
freq_Business             0.017241
freq_Education            0.017241
freq_Medical              0.017241
freq_Music...Audio        0.017241
freq_Travel...Local       0.017241
freq_Books...Reference    0.017241
freq_Communication        0.017241
freq_Photography          0.017241
dur_Social                0.017241
freq_Unknown              0.017241
freq_News...Magazines     0.017241
freq_Personalization      0.017241
freq_Productivity         0.017241
freq_Entertainment        0.017241
freq_Games                0.017241
freq_Finance              0.017241
freq_Tools                0.017241
freq_calls                0.017241
fluidIQ                   0.017241
Education_lev             0.017241
Age                       0.017241
freq_Health...Fitnes

### Without probability scores

In [18]:
# Same analysis, but wrapping the hard-label `predict` instead of `predict_proba`;
# the classifier's classes_ are passed as the possible output values.
# NOTE(review): the saved output of this cell is a FeatureImportanceError
# ("Importances do not sum to a positive value"), so it does not run to completion.
esta_classifier = lr_models['esta']
interpreter = Interpretation(X_test.values, feature_names=X_test.columns.values)
model_no_proba = InMemoryModel(
    esta_classifier.predict,
    examples=X_train.values,
    unique_values=esta_classifier.classes_,
)
interpreter.feature_importance.feature_importance(model_no_proba, ascending=False)

faster runs, do progress_bar=False


[58/58] features ████████████████████ Time elapsed: 2 seconds

FeatureImportanceError: Something went wrong. Importances do not sum to a positive value
This could be due to:
1) 0 or infinite divisions
2) perturbed values == original values
3) feature is a constant


In [None]:
# Show only the first rows: display.max_rows is None in this notebook, so a bare
# `X_train` would render all 102 rows x 58 columns.
X_train.head()

In [None]:
# NOTE(review): this cell displays X_train again, which the preceding cell already
# shows; it looks like a leftover scratch cell — consider removing it.
X_train