In this notebook we'll explore feature importance using SHAP values. SHAP values are the most mathematically consistent way of obtaining feature importances, and they work particularly nicely with tree-based models. Unfortunately, calculating SHAP values is an extremely resource-intensive process. However, starting with XGBoost 1.3 it is possible to calculate these values on GPUs, which speeds up the process by a factor of 20X - 50X compared to calculating the same values on a CPU. Furthermore, it is also possible to calculate SHAP values for feature interactions. The GPU speedup for those is even more dramatic - it takes a few minutes, as opposed to days or even longer on a CPU.

In [None]:
%matplotlib inline
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import shap

# load JS visualization code to notebook
shap.initjs()
# Display the installed XGBoost version; the GPU SHAP computation described in
# the introduction requires XGBoost 1.3 or newer.
xgb.__version__

In [None]:
import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the dataset file locations are visible in the notebook output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the competition data: the training set, the test set, and the sample
# submission frame whose 'target' column is filled in at the end of the notebook.
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
sub = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Show the (rows, columns) dimensions of the training data.
train.shape

In [None]:
# Feature columns: every column of the test set except the first one.
# NOTE(review): presumably the first column is a row identifier — confirm.
columns = test.columns[1:]
columns


In [None]:
# Extract the labels as a NumPy array so they can be indexed with the
# positional fold indices produced during cross-validation.
target = train['target'].values


Let's apply a label encoder to the categorical features.


In [None]:
# Take the first 19 feature columns and display them.
# NOTE(review): the count 19 is hard-coded, and this variable is not used by
# the visible code below (cat_cols is used instead) — confirm it is still needed.
cat_features = columns[:19]
cat_features

In [None]:
def label_encode(train_df, test_df, column):
    """Add an integer-encoded copy of ``column`` to both data frames.

    A single ``LabelEncoder`` is fitted on the distinct values found in either
    frame, so the same value maps to the same code in train and test and no
    test-only value is unseen at transform time.

    Args:
        train_df: Training frame; gains a ``"<column>_le"`` column in place.
        test_df: Test frame; gains a ``"<column>_le"`` column in place.
        column: Name of the column to encode.

    Returns:
        The name of the newly added encoded column.
    """
    encoded_name = f"{column}_le"

    train_values = train_df[column].unique().tolist()
    test_values = test_df[column].unique().tolist()

    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)

    # Train first, then test — both frames are modified in place.
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])

    return encoded_name

In [None]:
# Split the feature columns by a substring of their name: the 'cat' columns
# are label-encoded below, the 'cont' columns are used as they are.
cat_cols = [col for col in columns if 'cat' in col]
cont_cols = [col for col in columns if 'cont' in col]

In [None]:
# Label-encode every 'cat' column. label_encode adds the "<col>_le" column to
# both train and test in place and returns the name of the new column.
le_cols = [label_encode(train, test, feature) for feature in cat_cols]

In [None]:
# Final model inputs: the label-encoded columns followed by the 'cont' columns.
columns = le_cols + cont_cols

In [None]:
# Hyperparameters for the first XGBoost model.
xgb_params = {
    # Learning task
    'learning_rate': 0.005,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # GPU training and prediction
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'gpu_id': 0,
    # Tree shape
    'max_bin': 623,
    'max_depth': 10,
    # Regularisation
    'alpha': 0.5108154566815425,
    'gamma': 1.9276236172849432,
    'reg_lambda': 11.40999855634382,
    # Column / row subsampling
    'colsample_bytree': 0.705851334291963,
    'subsample': 0.8386116751473301,
    'min_child_weight': 2.5517043283716605,
}

In [None]:
# Wrap the test features in a DMatrix for prediction.
# NOTE: this rebinds `test` from the DataFrame to a DMatrix; the DataFrame is
# re-read from disk later in the notebook before the SHAP plots.
test = xgb.DMatrix(test[columns])

In [None]:
%%time
train_oof = np.zeros((300000,))
test_preds = 0
train_oof.shape
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train, target))):
        #print(f'Fold {f}')
        train_df, val_df = train.iloc[train_ind][columns], train.iloc[val_ind][columns]
        train_target, val_target = target[train_ind], target[val_ind]
        
        train_df = xgb.DMatrix(train_df, label=train_target)
        val_df = xgb.DMatrix(val_df, label=val_target)
        
        model =  xgb.train(xgb_params, train_df, 7000)
        temp_oof = model.predict(val_df)
        temp_test = model.predict(test)

        train_oof[val_ind] = temp_oof
        test_preds += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))
        
print(roc_auc_score(target, train_oof))

In [None]:
# Score recorded from an earlier run.
# NOTE(review): presumably an AUC printed by the cross-validation cell — confirm.
0.895597940122485

In [None]:
# Score recorded from an earlier run.
# NOTE(review): presumably an AUC printed by the cross-validation cell — confirm.
0.8945190953989157

In [None]:
# Persist the out-of-fold and fold-averaged test predictions of the first model.
np.save('xgb_train_oof_0', train_oof)
np.save('xgb_test_preds_0', test_preds)

Next, we calculate the SHAP values for the test set.

In [None]:
%%time
# Per-feature SHAP contributions for every test row. `model` is the booster
# left over from the last cross-validation fold, not an average over folds.
# NOTE(review): the plotting cells drop the last column of this array —
# presumably the bias term XGBoost appends; confirm against the XGBoost docs.
shap_preds = model.predict(test, pred_contribs=True)

Now let's do some plots of these values.



In [None]:
# `test` was rebound to a DMatrix earlier, so re-read both CSV files to get
# DataFrames back, then rebuild the label-encoded columns for plotting.
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')

le_cols = [label_encode(train, test, feature) for feature in cat_cols]
columns = le_cols + cont_cols

In [None]:
# summarize the effects of all the features
# The last column of shap_preds is dropped so that the array has one column
# per entry of `columns`.
shap.summary_plot(shap_preds[:,:-1], test[columns])

In [None]:
# Bar-chart variant of the summary plot for the same SHAP values.
shap.summary_plot(shap_preds[:,:-1], test[columns], plot_type="bar")

Next, we'll calculate SHAP values for feature interactions. There will be 30x30x200,000 + 200,000 numbers that need to be computed.

In [None]:
#%%time
# SHAP interaction values for every test row, computed with the last-fold `model`.
# `test` is a DataFrame again at this point (it was re-read from CSV above), so
# it is wrapped in a fresh DMatrix here.
shap_interactions = model.predict(xgb.DMatrix(test[columns]), pred_interactions=True)

It took 45 minutes to calculate these values. On CPU this would take up to a day to compute.

Now let's take a look at what are the top interactions in this dataset.

In [None]:
def plot_top_k_interactions(feature_names, shap_interactions, k):
    """Plot a bar chart of the ``k`` strongest pairwise feature interactions.

    Args:
        feature_names: Sequence of feature names, indexable by position and
            ordered like the feature axes of ``shap_interactions``.
        shap_interactions: Array indexed as ``[sample, feature, feature]``
            whose final row and column are excluded from the ranking.
            NOTE(review): presumably the bias term XGBoost appends when
            ``pred_interactions=True`` — confirm against the XGBoost docs.
        k: Number of top interactions to display.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    # Get the mean absolute contribution for each feature interaction
    aggregate_interactions = np.mean(np.abs(shap_interactions[:, :-1, :-1]), axis=0)
    num_rows, num_cols = aggregate_interactions.shape

    interactions = []
    for i in range(num_rows):
        # Visit each unordered pair once (j < i): strictly below the diagonal.
        for j in range(min(i, num_cols)):
            # Doubled, presumably to account for the mirrored (j, i) entry of
            # the interaction matrix — TODO confirm.
            interactions.append(
                (feature_names[i] + "-" + feature_names[j], aggregate_interactions[i][j] * 2))

    # sort by magnitude
    interactions.sort(key=lambda pair: pair[1], reverse=True)

    # Slice before splitting into labels/heights so that an empty ranking
    # (fewer than two features) yields an empty chart instead of an unpacking error.
    top_interactions = interactions[:k]
    interaction_features = [name for name, _ in top_interactions]
    interaction_values = [value for _, value in top_interactions]

    plt.bar(interaction_features, interaction_values)
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
    
# Show the 10 strongest pairwise interactions among the model's input columns.
plot_top_k_interactions(columns, shap_interactions, 10)

We'll now try a different set of XGBoost hyperparameters.

In [None]:
%%time
xgb_params=  {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'random_state': 199,
    'tree_method': 'gpu_hist',
    'max_depth': 8,
    'learning_rate': 0.01,
    'min_child_weight': 20,
    'gamma': 0.1,
    'alpha': 0.2,
    'lambda': 9,
    'colsample_bytree': 0.2,
    'subsample': 0.8
}

test = xgb.DMatrix(test[columns])

train_oof_1 = np.zeros((300000,))
test_preds_1 = 0
print(train_oof_1.shape)
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train, target))):
        #print(f'Fold {f}')
        train_df, val_df = train.iloc[train_ind][columns], train.iloc[val_ind][columns]
        train_target, val_target = target[train_ind], target[val_ind]
        
        train_df = xgb.DMatrix(train_df, label=train_target)
        val_df = xgb.DMatrix(val_df, label=val_target)
        
        model =  xgb.train(xgb_params, train_df, 4200)
        temp_oof = model.predict(val_df)
        temp_test = model.predict(test)

        train_oof_1[val_ind] = temp_oof
        test_preds_1 += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))
        
print(roc_auc_score(target, train_oof_1))

np.save('xgb_train_oof_1', train_oof_1)
np.save('xgb_test_preds_1', test_preds_1)

Finally we make the submission.

In [None]:
# Submission from the first model's fold-averaged test predictions.
sub['target'] = test_preds
sub.to_csv('submission_0.csv', index=False)

In [None]:
# Submission from the second model's fold-averaged test predictions.
sub['target'] = test_preds_1
sub.to_csv('submission_1.csv', index=False)

In [None]:
# Weighted blend of the two models: 90% second model, 10% first model.
sub['target'] = 0.9*test_preds_1+0.1*test_preds
sub.to_csv('blend_0.csv', index=False)

In [None]:
# Preview the submission frame; at this point 'target' holds the blended predictions.
sub.head()