# SHAP values & model interpretability 

SHAP values explain how much each feature value shifts an individual prediction away from the model's expected (average) prediction.

![](https://unsplash.com/photos/lOcP_QZzitI)

> 
> SHAP (SHapley Additive exPlanations) is a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions (see papers for details and citations).
> 


![](https://commons.wikimedia.org/wiki/File:Anne-nygard-lOcP_QZzitI-unsplash.jpg)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)


# Standard plotly imports
import plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import cufflinks as cf
import plotly.figure_factory as ff
import os


import warnings
warnings.filterwarnings("ignore")

## Create Environment

In [None]:
# Set up the competition environment and its test-set iterator.
# NOTE(review): neither `env` nor `iter_test` is used again in the notebook as
# shown — presumably kept for a submission loop; confirm before removing.
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb

In [None]:
# Record the installed XGBoost version alongside the results.
print("XGBoost version:", xgb.__version__)

In [None]:
print('# File sizes')
start_path = '../input/jane-street-market-prediction'  # dataset directory to measure
# Add up the on-disk size of every file found anywhere below start_path.
total_size = sum(
    os.path.getsize(os.path.join(root, name))
    for root, _subdirs, names in os.walk(start_path)
    for name in names
)
print(f"Directory size: {round(total_size / 1000000, 2)}MB")

In [None]:
%%time
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
features = pd.read_csv('../input/jane-street-market-prediction/features.csv')
example_test = pd.read_csv('../input/jane-street-market-prediction/example_test.csv')
sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')
print ("Data is loaded!")

In [None]:
# Report the shape of each loaded table, one line per table.
for label, frame in (
    ('train', train),
    ('features', features),
    ('example_test', example_test),
    ('sample_prediction_df', sample_prediction_df),
):
    print(f'{label} shape is {frame.shape}')

In [None]:
# Show the first rows of the training table.
train.head()

# Reducing memory usage by 75%

Source: https://www.kaggle.com/sbunzini/reduce-memory-usage-by-75

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    The frame is modified in place (columns are reassigned) and the same
    object is returned. Memory usage before and after, and the percentage
    saved, are printed.

    Rules per column:

    * ``object`` columns are converted to ``category``.
    * Plain NumPy signed/unsigned integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds inclusive).
    * Plain NumPy float columns are cast to ``float16`` or ``float32`` when
      the min and max fit (bounds inclusive). NOTE: ``float16`` keeps only
      about three significant decimal digits, so this trades precision for
      memory.
    * Everything else (bool, datetime, category, pandas extension dtypes) is
      left untouched.

    Columns whose min/max are NaN (empty or all-missing) are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    # Candidate dtypes per NumPy dtype kind, narrowest first. The first
    # candidate whose range contains every value of the column is used.
    candidates = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'object':
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; routing bool
        # or unsigned columns through the float branch would change their
        # meaning (and can even grow them), and other dtypes do not support
        # the numeric range comparison at all.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in candidates:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits = np.iinfo if kind in 'iu' else np.finfo

        for target in candidates[kind]:
            if np.dtype(target).itemsize >= col_type.itemsize:
                # Remaining candidates are no narrower than the current type.
                break
            bounds = limits(target)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # NaN min/max make both comparisons False, so nothing is cast.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(target)
                break

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Guard the percentage against a zero starting size.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction} % ")
    return df

In [None]:
# Shrink the training table's dtypes before any further processing.
train = reduce_memory_usage(train)

### Missing Values Count

In [None]:
# Number of missing (null) entries in each column.
missing_values_count = train.isnull().sum()
print (missing_values_count)
# np.prod replaces np.product, a deprecated alias that NumPy 2.0 removed.
total_cells = np.prod(train.shape)
total_missing = missing_values_count.sum()
print ("% of missing data = ",(total_missing/total_cells) * 100)

# Pre-processing before training

In [None]:
# This cell is adapted from https://www.kaggle.com/jazivxt/the-market-is-reactive
# and https://www.kaggle.com/drcapa/jane-street-market-prediction-starter-xgb

# Keep only the rows that carry a non-zero weight.
train = train[train['weight'] != 0]

# action is 1 when weight * resp is positive, else 0. The sign of the product
# is taken from the signs of the factors rather than from the product itself:
# reduce_memory_usage may have stored both columns as float16, where the
# product of two small values can underflow to 0 and turn a positive label
# into 0.
train['action'] = ((np.sign(train['weight'].values) * np.sign(train['resp'].values)) > 0).astype('int')


# Model inputs are every column whose name contains 'feature'; the target is
# the binary action column.
X_train = train.loc[:, train.columns.str.contains('feature')]
y_train = train.loc[:, 'action']

In [None]:
# Replace missing feature values with the sentinel -999 (the same value is
# passed as `missing=` to the XGBoost classifier).
X_train = X_train.fillna(-999)

In [None]:
# Drop the full training table; only X_train and y_train are used from here on.
del train

## Training
##### To activate GPU usage, simply use tree_method='gpu_hist' (took me an hour to figure out, I wish XGBoost documentation was clearer about that).

In [None]:
# Training setup adapted from https://www.kaggle.com/xhlulu/ieee-fraud-xgboost-with-gpu-fit-in-40s

# Hyper-parameters are collected in one mapping and unpacked into the
# classifier constructor.
xgb_params = {
    'n_estimators': 500,
    'max_depth': 10,
    'learning_rate': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'missing': -999,  # matches the sentinel used when filling X_train
    'random_state': 2020,
    # This is the setting that moves training onto the GPU.
    # NOTE(review): newer XGBoost releases deprecate 'gpu_hist' in favour of
    # tree_method='hist' with device='cuda' — confirm for the installed version.
    'tree_method': 'gpu_hist',
}
clf = xgb.XGBClassifier(**xgb_params)

In [None]:
# Fit the classifier on all remaining rows and time the call.
%time clf.fit(X_train, y_train)

# Loading Shap library & **initjs()**

In [None]:
import shap

# NOTE(review): initjs() presumably loads the JavaScript that the interactive
# force plots need — confirm against the shap documentation.
shap.initjs()

## The next few blocks of code will take some time to run!

Since computing the SHAP values for the entire set takes an *inordinately* long time, we will use only a small sample (of around 10k rows).

In [None]:
%%time
# Build a SHAP TreeExplainer around the fitted XGBoost classifier. The SHAP
# values themselves are computed in the next cell.
explainer = shap.TreeExplainer(clf)

In [None]:
%%time
X_sample = X_train.sample(10000)
shap_values = explainer.shap_values(X_sample)

In [None]:
# Force plot for the first sampled row: its SHAP values and feature values,
# drawn relative to the explainer's expected value.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_sample.iloc[0,:])

In [None]:
# Summary plot of the SHAP values computed for the rows of X_sample.
shap.summary_plot(shap_values, X_sample)

In [None]:
# Sort the feature indexes by their importance in the model
# (sum of SHAP value magnitudes over the rows of X_sample), most important first.
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))

# Make SHAP dependence plots for the 20 most important features.
for i in range(20):
    shap.dependence_plot(top_inds[i], shap_values, X_sample)

# LGB model

In [None]:
# Index the table by the 'feature' column, then transpose it so that each
# feature name becomes a column.
features = features.set_index('feature')
# NOTE(review): `* 1` presumably turns boolean tag flags into 0/1 integers —
# confirm against the dtypes in features.csv.
features = features.T * 1
features.columns

In [None]:
%%time
import lightgbm as lgb
from sklearn import *

k = cluster.KMeans(n_clusters=29, random_state=0).fit(features[['feature_' + str(i) for i in range(130)]])
n = preprocessing.Normalizer()
X_train['k'] = k.predict(n.fit_transform(X_train[['feature_' + str(i) for i in range(130)]].fillna(-999)))

In [None]:
# Columns that must never be fed to the model (targets and identifiers);
# every other column of X_train is kept as a model input.
excluded_columns = {'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp', 'ts_id', 'date', 'action'}
col = [name for name in X_train.columns if name not in excluded_columns]

In [None]:
# LightGBM training settings.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.2,
    'max_depth': -1,
    'random_state': 20,
    'device': 'gpu',
}
# Hold out 30% of the rows as a validation split (seeded).
x1, x2, y1, y2 = model_selection.train_test_split(
    X_train[col],
    y_train,
    test_size=0.3,
    random_state=20,
)

In [None]:
# The split copies (x1, x2, y1, y2) are used from here on; release the originals.
del X_train, y_train

In [None]:
# Train on the (x1, y1) split for up to 450 boosting rounds, evaluating on the
# (x2, y2) split.
# NOTE(review): `verbose_eval` and `early_stopping_rounds` were dropped from
# lgb.train() in LightGBM 4.0 in favour of callbacks — confirm for the
# installed version.
model = lgb.train(
    params,
    train_set=lgb.Dataset(x1, y1),
    num_boost_round=450,
    valid_sets=lgb.Dataset(x2, y2),
    verbose_eval=100,
    early_stopping_rounds=100,
)

In [None]:
%%time
# Build a SHAP TreeExplainer around the trained LightGBM model. The SHAP
# values themselves are computed in the next cell.
explainer = shap.TreeExplainer(model)

In [None]:
%%time
X_sample = x2.sample(10000)
shap_values = explainer.shap_values(X_sample)

In [None]:
# Force plot for the first sampled row, using entry 0 of expected_value and of
# shap_values; no feature values are passed to the plot.
# NOTE(review): assumes the explainer returns one entry per output here —
# confirm for the installed shap version.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:])

In [None]:
# Summary plot of the SHAP values computed for the rows of X_sample.
shap.summary_plot(shap_values, X_sample)

In [None]:
# This cell indexes shap_values as a sequence with one (rows, features) array
# per model output, and plots entry 0.
class_shap_values = shap_values[0]

# Sort the feature indexes by their importance in the model (sum of SHAP value
# magnitudes over the rows of X_sample), most important first. The sum must
# run over that single 2-D array: summing the whole sequence along axis 0 adds
# the outputs together instead of the rows, which ranked the features by the
# first sampled row only.
top_inds = np.argsort(-np.sum(np.abs(class_shap_values), 0))

# Make SHAP dependence plots for the 20 most important features (fewer if the
# data has fewer columns).
for ind in top_inds[:20]:
    shap.dependence_plot(ind, class_shap_values, X_sample)

**Credits to the following notebooks**

[Reduce Memory Usage by 75%](https://www.kaggle.com/sbunzini/reduce-memory-usage-by-75)

[Market Prediction: XGBoost with GPU (Fit in 1min)](https://www.kaggle.com/hamditarek/market-prediction-xgboost-with-gpu-fit-in-1min)

[The market is reactive](https://www.kaggle.com/jazivxt/the-market-is-reactive)