# reloading final model & columns and predict

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# imports
#imports
import os

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

import pickle
from treeinterpreter import treeinterpreter as ti

## Loading model

In [15]:
folder = "models"
path_to_load = f"../../src/{folder}/1.1 - df_train 01-18to12-18"
model_file = os.path.join(path_to_load, "1.1.b.2.f (Model) - Model Optimization recall - class_weight balanced - time sorted - valid score (0.452, 0.978).pickle")

with open(model_file, "rb") as file:
    m = pickle.load(file)
m



ExtraTreesClassifier(bootstrap=False, class_weight={0: 1, 1: 18},
           criterion='gini', max_depth=15, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=55,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

## Loading columns

In [16]:
folder = "features"
path_to_load = f"../../src/{folder}/1.1 - df_train 01-18to12-18"
columns_file = os.path.join(path_to_load, "1.1.b.2.f (COLUMNS) - Model Optimization - class_weight balanced - time sorted - valid score (0.696, 0.999).pickle")

with open(columns_file, "rb") as file:
    columns = pickle.load(file)
print(columns)

['cond_C10', 'es_gte_5', 'TIPO_EXPED', 'preg_15', 'cond_C12', 'preg_32', 'cond_32', 'preg_33', 'preg_31', 'cond_C11', 'preg_34', 'COD_CAUSA_SINI', 'cond_C2', 'cond_C3', 'METRO', 'SEXO_TERC', 'OCUPACION_ASEG', 'FALTANTE', 'cond_C5', 'CONDICION_ROBO_EXP50', 'EXISTE_FRAUDE']


## Loading example dataset

In [17]:
# loading examples
folder = "data"
path_to_load = f"../../src/{folder}/1.1 - df_train 01-18to12-18"
sample_file = os.path.join(path_to_load, "1.1.b.2.f (sample)  - dataset example to predict model.csv")
df_sample = pd.read_csv(sample_file, index_col=0)
df_sample.head()

Unnamed: 0,cond_C10,es_gte_5,TIPO_EXPED,preg_15,cond_C12,preg_32,cond_32,preg_33,preg_31,cond_C11,...,COD_CAUSA_SINI,cond_C2,cond_C3,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,cond_C5,CONDICION_ROBO_EXP50,EXISTE_FRAUDE
71763,-1.0,False,2,-1,-1.0,-1,-1.0,-1,-1,-1.0,...,501.0,-1.0,-1.0,1,1,5.0,1,-1.0,-1.0,False
71764,-1.0,False,3,-1,-1.0,-1,-1.0,-1,-1,-1.0,...,55.0,-1.0,-1.0,1,-1,99999.0,0,-1.0,-1.0,False
71765,-1.0,False,4,-1,-1.0,-1,-1.0,-1,-1,-1.0,...,505.0,-1.0,-1.0,0,-1,5.0,1,-1.0,-1.0,False
71766,-1.0,False,4,-1,-1.0,-1,-1.0,-1,-1,-1.0,...,501.0,-1.0,-1.0,1,-1,5.0,1,-1.0,-1.0,False
71767,-1.0,False,2,-1,-1.0,-1,-1.0,-1,-1,-1.0,...,501.0,-1.0,-1.0,1,1,99999.0,1,-1.0,-1.0,False


In [18]:
df_sample["EXISTE_FRAUDE"].value_counts()

False    50
Name: EXISTE_FRAUDE, dtype: int64

### predict all examples

In [9]:
y_val = df_sample["EXISTE_FRAUDE"]

In [10]:
y_val_pred_prob = m.predict_proba(df_sample[columns].drop(columns="EXISTE_FRAUDE"))[:, -1]
y_val_pred = y_val_pred_prob > 0.5
y_val_f1_bestFI = f1_score(y_val, y_val_pred)
y_val_rocauc_bestFI = roc_auc_score(y_val, y_val_pred_prob)
print("%.3f, %.3f" % (y_val_f1_bestFI, y_val_rocauc_bestFI))

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
confusion_matrix(y_val, y_val_pred)

### predict 1 row example

In [117]:
case = df_sample.drop(columns="EXISTE_FRAUDE").iloc[1:5,:].head(1)
case

Unnamed: 0,cond_C10,es_gte_5,TIPO_EXPED,preg_15,cond_C12,preg_32,cond_32,preg_33,preg_31,cond_C11,preg_34,COD_CAUSA_SINI,cond_C2,cond_C3,METRO,SEXO_TERC,OCUPACION_ASEG,FALTANTE,cond_C5,CONDICION_ROBO_EXP50
71764,-1.0,False,3,-1,-1.0,-1,-1.0,-1,-1,-1.0,-1,55.0,-1.0,-1.0,1,-1,99999.0,0,-1.0,-1.0


In [118]:
y_example_pred = m.predict_proba(case)[0, 1]
y_example_pred, df_sample.head(1)["EXISTE_FRAUDE"].values

(0.005588127023652639, array([False]))

## Adding interpretation 1 case

In [119]:
prediction, bias, contributions = ti.predict(m, case)
prediction, bias

(array([[0.99441187, 0.00558813]]), array([[0.93515329, 0.06484671]]))

In [120]:
def contributions_to_json(columns, contributions):
    json = {}
    idxs = np.argsort(contributions)
    for i in idxs[::-1]:
        json[columns[i]] = contributions[i]
    return json

In [121]:
# sort by absolute error
idxs = np.argsort(contributions[0][:, 1])
[o for o in zip(case.columns[idxs], case.iloc[0][idxs], np.around(contributions[0][idxs, 1], 4))]

In [122]:
contributions_to_json(case.columns, np.around(contributions[0][:, 1], 4))

{'COD_CAUSA_SINI': 0.005,
 'FALTANTE': 0.0013,
 'METRO': 0.0002,
 'SEXO_TERC': -0.0,
 'OCUPACION_ASEG': -0.0001,
 'CONDICION_ROBO_EXP50': -0.0007,
 'cond_C5': -0.0007,
 'preg_34': -0.0011,
 'cond_C2': -0.0012,
 'cond_C3': -0.0017,
 'preg_31': -0.0036,
 'preg_15': -0.0038,
 'cond_C11': -0.0043,
 'cond_C12': -0.0049,
 'preg_33': -0.0055,
 'cond_32': -0.0059,
 'preg_32': -0.006,
 'cond_C10': -0.0066,
 'es_gte_5': -0.0067,
 'TIPO_EXPED': -0.0131}

In [145]:
def calc_prop_impact(prediction, contributions):
    # select the weight to sum (false or true prediction)
    idx_pred = np.argmax(prediction)
    sum_abs = np.abs(contributions[:, idx_pred]).sum()
    prop_imp = (contributions / sum_abs)
    return prop_imp

In [146]:
prop_imp = calc_prop_impact(prediction, contributions[0])

contributions_to_json(case.columns, np.around(prop_imp, 4))

TypeError: unhashable type: 'Index'

In [148]:
idx_pred = np.argmax(prediction)
sum_abs = np.abs(contributions[0][:, idx_pred]).sum()
prop_imp = (contributions / sum_abs)
prop_imp

array([[[ 9.09356866e-02, -9.09356866e-02],
        [ 9.26750968e-02, -9.26750968e-02],
        [ 1.81434695e-01, -1.81434695e-01],
        [ 5.19605681e-02, -5.19605681e-02],
        [ 6.85389065e-02, -6.85389065e-02],
        [ 8.28715590e-02, -8.28715590e-02],
        [ 8.22784684e-02, -8.22784684e-02],
        [ 7.61468496e-02, -7.61468496e-02],
        [ 4.93137579e-02, -4.93137579e-02],
        [ 5.90784157e-02, -5.90784157e-02],
        [ 1.50059062e-02, -1.50059062e-02],
        [-6.92436701e-02,  6.92436701e-02],
        [ 1.64423437e-02, -1.64423437e-02],
        [ 2.30995825e-02, -2.30995825e-02],
        [-2.18204822e-03,  2.18204822e-03],
        [ 4.48288149e-05, -4.48288149e-05],
        [ 1.35556540e-03, -1.35556540e-03],
        [-1.83029002e-02,  1.83029002e-02],
        [ 9.62738693e-03, -9.62738693e-03],
        [ 9.46176488e-03, -9.46176488e-03]]])

In [144]:
type(np.argmax(prediction))

numpy.int64

In [127]:
prediction, bias

(array([[0.99441187, 0.00558813]]), array([[0.93515329, 0.06484671]]))

# Adding shap

In [3]:
import shap

In [52]:
explainer = shap.TreeExplainer(m)
shap_values = explainer.shap_values(df_sample)

In [42]:
shap_values[0].shape, case.shape

((1, 20), (1, 20))

In [46]:
shap_values[0][0]

array([ 0.00583533,  0.00642212,  0.0115467 ,  0.00407391,  0.00345414,
        0.0050868 ,  0.00542241,  0.00393197,  0.0034776 ,  0.00351224,
        0.00139421,  0.00237926,  0.00101618,  0.00107696,  0.00018955,
        0.00137307, -0.00082273, -0.00012657,  0.00059433,  0.00074632])

In [76]:
shap.force_plot(explainer.expected_value[0], shap_values[0][0], df_sample.iloc[0,:].values)

TypeError: Object of type 'bool_' is not JSON serializable

In [75]:
shap_values[0][0]

array([ 0.00583533,  0.00642212,  0.0115467 ,  0.00407391,  0.00345414,
        0.0050868 ,  0.00542241,  0.00393197,  0.0034776 ,  0.00351224,
        0.00139421,  0.00237926,  0.00101618,  0.00107696,  0.00018955,
        0.00137307, -0.00082273, -0.00012657,  0.00059433,  0.00074632,
        0.        ])

In [68]:
shap_values