In this notebook we'll explore feature importance using SHAP values. SHAP values are the most mathematically consistent way of getting feature importances, and they work particularly nicely with tree-based models. Unfortunately, calculating SHAP values is an extremely resource-intensive process. However, starting with XGBoost 1.3 it is possible to calculate these values on GPUs, which speeds up the process by a factor of 20X - 50X compared to calculating the same on a CPU. Furthermore, it is also possible to calculate SHAP values for feature interactions. The GPU speedup for those is even more dramatic - it takes a few minutes, as opposed to days or even longer on a CPU.



In [None]:
%matplotlib inline

import xgboost as xgb
# Display the installed XGBoost version (the introduction states GPU SHAP needs 1.3 or newer).
xgb.__version__

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import shap

# load JS visualization code to notebook
shap.initjs()

In [None]:
import os
# Lazily enumerate every file below the Kaggle input directory, then print
# one full path per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the training table (tourney.csv) and the test table (test.csv) from
# the features-only dataset.
train = pd.read_csv('../input/ncaam-2021-features-only/tourney.csv')
test = pd.read_csv('../input/ncaam-2021-features-only/test.csv')

In [None]:
# Target and the season label; the season is used later as the grouping key
# for cross-validation.
y, s = train["result"], train["Season"]

# Feature matrices: identifier columns (and the target / submission ID) removed.
X = train.drop(columns=['Season', 'TeamID1', 'TeamID2', 'result'])
X_test = test.drop(columns=['ID', 'Season', 'TeamID1', 'TeamID2'])

In [None]:
# Out-of-fold prediction buffer (one slot per training row) and the running
# accumulator for averaged test predictions.
n_train_rows = len(X)
train_oof = np.zeros(n_train_rows)
test_preds = 0
train_oof.shape

In [None]:
# Booster configuration handed to xgb.train in the cross-validation loop.
# The last three entries select the GPU training and prediction code paths.
# NOTE(review): XGBoost 2.0+ deprecates gpu_hist / gpu_id / predictor in favour
# of device="cuda" with tree_method="hist" — confirm the target version before upgrading.
xgb_params = dict(
    objective="binary:logistic",
    max_depth=2,
    learning_rate=0.01,
    colsample_bytree=0.8,
    subsample=0.9,
    # reg_alpha=0,
    min_child_weight=30,
    n_jobs=2,
    seed=2001,
    tree_method="gpu_hist",
    gpu_id=0,
    predictor="gpu_predictor",
)

In [None]:
# Shape (rows, columns) of the test feature matrix.
X_test.shape

In [None]:
# Shape (rows, columns) of the training feature matrix.
X.shape

In [None]:
# Preview the first rows of the training features.
X.head()

In [None]:
# Wrap the test features in a DMatrix for prediction with the native xgb API.
# NOTE(review): this rebinds `test` from the raw DataFrame to a DMatrix; the
# submission section re-reads test.csv to get the DataFrame (and its ID column) back.
test = xgb.DMatrix(X_test)

In [None]:
# Fresh out-of-fold buffer and test accumulator for the cross-validation loop
# (same initialisation as the earlier cell; running it again resets both).
train_oof = np.zeros((X.shape[0],))
test_preds = 0
train_oof.shape

In [None]:
NUM_FOLDS = 10
# Folds are grouped by season (`s`), so all games of one season land on the
# same side of each train/validation split.
kf = GroupKFold(n_splits=NUM_FOLDS)

# Reset the accumulators in this cell too: `test_preds += ...` below would
# otherwise stack a second set of fold predictions if the cell is re-run alone.
train_oof = np.zeros((X.shape[0],))
test_preds = 0

# tqdm wraps the split generator directly and is told the fold count, so it
# can render an actual progress bar (a bare generator has no length).
for train_ind, val_ind in tqdm(kf.split(X, y, s), total=NUM_FOLDS):
    train_df, val_df = X.iloc[train_ind], X.iloc[val_ind]
    train_target, val_target = y.iloc[train_ind], y.iloc[val_ind]

    # Separate names for the DMatrix wrappers keep the DataFrames intact.
    dtrain = xgb.DMatrix(train_df, label=train_target)
    dval = xgb.DMatrix(val_df, label=val_target)

    # Fixed budget of 400 boosting rounds; no early stopping is used.
    model = xgb.train(xgb_params, dtrain, 400)
    temp_oof = model.predict(dval)
    temp_test = model.predict(test)  # `test` is the DMatrix built from X_test

    # Store this fold's validation predictions and add its share of the
    # fold-averaged test prediction.
    train_oof[val_ind] = temp_oof
    test_preds += temp_test / NUM_FOLDS

    print(log_loss(val_target, temp_oof))

# NOTE: after the loop `model` is the booster of the final fold only; the
# SHAP cells further down reuse exactly that booster.
        
        
        
        
        

In [None]:
# Log loss of the combined out-of-fold predictions over all training rows.
log_loss(y, train_oof)

In [None]:
# NOTE(review): bare literal with no assignment — presumably the out-of-fold
# log loss recorded from an earlier run; confirm, or move it into a markdown cell.
0.561003662790192

In [None]:
# Persist the out-of-fold and fold-averaged test predictions to the working
# directory (np.save appends the ".npy" extension to these names).
np.save('train_oof', train_oof)
np.save('test_preds', test_preds)

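Next, we calculate the SHAP values for the test set. Note that `model` at this point is the booster trained in the last cross-validation fold, so the SHAP values below describe that single model.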


In [None]:
%%time
# Per-row, per-feature SHAP contributions for the test set. The result has one
# extra trailing column, which the plotting cells slice off with [:, :-1].
shap_preds = model.predict(test, pred_contribs=True)

In [None]:
# Summarize the effects of all the features: SHAP summary plot over the test
# rows, with the trailing (non-feature) column of shap_preds excluded.
shap.summary_plot(shap_preds[:,:-1], X_test)

In [None]:
# Same SHAP values rendered as a bar chart (plot_type="bar") instead of the default summary plot.
shap.summary_plot(shap_preds[:,:-1], X_test, plot_type="bar")

In [None]:
%%time
# SHAP interaction values for the test set; indexed as [row, i, j] by the
# plotting helper, which drops the last row and column of each matrix.
shap_interactions = model.predict(test, pred_interactions=True)

In [None]:
def plot_top_k_interactions(feature_names, shap_interactions, k):
    """Bar-plot the k strongest pairwise SHAP feature interactions.

    Parameters
    ----------
    feature_names : indexable of str
        Feature names, in the same order as the rows/columns of each
        interaction matrix.
    shap_interactions : np.ndarray
        Array indexed as [sample, i, j]. The last row and column of every
        matrix are sliced off before aggregating (the caller passes the
        output of ``predict(..., pred_interactions=True)``).
    k : int
        Maximum number of feature pairs to draw.

    Returns
    -------
    None
        Draws on the current pyplot figure and calls ``plt.show()``.
        With fewer than two features there are no pairs; an empty chart is
        drawn instead of raising ``ValueError``.
    """
    # Mean absolute interaction per (i, j) cell across all samples.
    aggregate_interactions = np.mean(np.abs(shap_interactions[:, :-1, :-1]), axis=0)
    n_rows, n_cols = aggregate_interactions.shape

    # Visit only the strict lower triangle (j < i) so each unordered pair is
    # listed once; the value is doubled to account for the mirrored (j, i)
    # entry, assuming the interaction matrix is symmetric.
    interactions = [
        (feature_names[i] + "-" + feature_names[j], aggregate_interactions[i][j] * 2)
        for i in range(n_rows)
        for j in range(min(i, n_cols))
    ]

    # Strongest pairs first (stable sort keeps the original order for ties).
    interactions.sort(key=lambda pair: pair[1], reverse=True)
    top = interactions[:k]

    # Build the two sequences explicitly: unpacking zip(*...) fails when the
    # list of pairs is empty.
    labels = [name for name, _ in top]
    heights = [value for _, value in top]

    plt.bar(labels, heights)
    plt.ylabel("Mean |SHAP interaction value|")
    plt.title(f"Top {len(top)} SHAP feature interactions")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
    
# Plot the 10 strongest feature-pair interactions on the test set.
plot_top_k_interactions(X_test.columns, shap_interactions, 10)

In [None]:
# Display the test feature matrix.
# NOTE(review): this renders the whole frame; X_test.head() would keep the output compact.
X_test

Now let's make a submission.

In [None]:
# Re-read test.csv: `test` was rebound to an xgb.DMatrix earlier, and the
# submission needs the DataFrame's ID column. Also load the stage-1 sample
# submission file.
test = pd.read_csv('../input/ncaam-2021-features-only/test.csv')
MSampleSubmission = pd.read_csv('../input/ncaam-march-mania-2021/MSampleSubmissionStage1.csv')

In [None]:
# Shape of the fold-averaged test predictions.
test_preds.shape

In [None]:
# The second half of the test rows gets its probability complemented before
# averaging per ID.
# NOTE(review): presumably those rows are the team-order-swapped copies of the
# first half — confirm against how test.csv was built.
idx = test_preds.shape[0] // 2

# Flip on a copy: mutating test_preds in place made this cell non-idempotent
# (running it a second time flipped the second half back).
flipped_preds = test_preds.copy()
flipped_preds[idx:] = 1 - flipped_preds[idx:]

# Average the (up to two) predictions sharing an ID into a single 'Pred' value.
pred = (
    pd.concat([test.ID, pd.Series(flipped_preds, name='Pred')], axis=1)
    .groupby('ID')['Pred']
    .mean()
    .reset_index()
)

# Keep the sample submission's rows and order; replace its placeholder Pred column.
sub = MSampleSubmission.drop(['Pred'], axis=1).merge(pred, on='ID')
sub.to_csv('submission.csv', index=False)
sub.head()