In [1]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [2]:
import os
os.environ['CUDA_PATH'] = '/usr/local'

In [3]:
!git clone https://github.com/shap/shap.git

Cloning into 'shap'...
remote: Enumerating objects: 18390, done.[K
remote: Counting objects: 100% (3263/3263), done.[K
remote: Compressing objects: 100% (479/479), done.[K
remote: Total 18390 (delta 3060), reused 2784 (delta 2784), pack-reused 15127 (from 4)[K
Receiving objects: 100% (18390/18390), 283.15 MiB | 39.15 MiB/s, done.
Resolving deltas: 100% (12902/12902), done.
Updating files: 100% (552/552), done.


In [4]:
os.chdir('/content/shap')

In [5]:
# Rerun the install again after the new packages are installed
!pip install --upgrade --force-reinstall --no-cache-dir .

Processing /content/shap
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting numpy (from shap==0.48.1.dev8)
  Downloading numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy (from shap==0.48.1.dev8)
  Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn (from shap==0.48.1.dev8)
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas (from shap==0.48.1.dev8)
  Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.

In [None]:
# Restart and continue the run from here, skipping all rows above

In [1]:
import shap

In [5]:
print(shap.explainers._gpu_tree)
assert hasattr(shap, "_cext_gpu")
shap.__version__

<module 'shap.explainers._gpu_tree' from '/usr/local/lib/python3.11/dist-packages/shap/explainers/_gpu_tree.py'>


'0.48.1.dev8'

In [6]:
import xgboost as xgb
import time
from tqdm import tqdm

In [7]:
import pandas as pd

# Useful if you run this on google colab and downloaded the data into your drive.
# If you run the notebook in other environment remove these lines and change the 'pd.read_csv()' function in this notebook to read from
# where you saved you data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Fraud Data Preprocessing and Model training


In [8]:
# 1. Download train_transaction data from: https://www.kaggle.com/c/ieee-fraud-detection/data?select=train_transaction.csv
# 2. Save it in your google drive
# 3. Change the path in the 'pd.read_csv()' function to where you saved your data
transactions = pd.read_csv('drive/MyDrive/ShapResearch/DataAndNotebooks/Data/train_transaction.csv')

# feature engineering - do one hot encoding for categorical features
transactions['is_visa'] = transactions['card4'] == 'visa'
transactions['is_american_express'] = transactions['card4'] == 'american express'
transactions['is_discover'] = transactions['card4'] == 'discover'
transactions['is_mastercard'] = transactions['card4'] == 'mastercard'

transactions['is_debit'] = transactions['card6'] == 'debit'

transactions['ProductCD_W'] = transactions['ProductCD'] == 'W'
transactions['ProductCD_C'] = transactions['ProductCD'] == 'C'
transactions['ProductCD_R'] = transactions['ProductCD'] == 'R'
transactions['ProductCD_H'] = transactions['ProductCD'] == 'H'
transactions['ProductCD_S'] = transactions['ProductCD'] == 'S'

for i in [2,3,5,6,7,8,9]:
  transactions[f'M{i}'] = (transactions[f'M{i}'] == 'T').astype('int8')

transactions['M4'] = 0
transactions.loc[transactions['M4'] == 'M1', 'M4'] = 1
transactions.loc[transactions['M4'] == 'M2', 'M4'] = 2

transactions['gmail_hotmail_or_yahoo_email'] = transactions['P_emaildomain'].isin(['gmail.com', 'hotmail.com', 'yahoo.com'])
transactions['nan_email'] = transactions['P_emaildomain'].isna()

# Add the enginered features and build the list of features to train on
add = ['is_visa', 'is_american_express', 'is_discover', 'is_mastercard',
       'is_debit',
       'ProductCD_W', 'ProductCD_C', 'ProductCD_R', 'ProductCD_H', 'ProductCD_S',
       'gmail_hotmail_or_yahoo_email', 'nan_email']

for c in add:
  transactions[c] = transactions[c].astype('int8')

remove = ['card4', 'card6', 'ProductCD', 'M1', 'P_emaildomain', 'R_emaildomain', 'TransactionID', 'isFraud', 'TransactionDT']
train_features = [c for c in transactions.columns if c not in remove]


# Split transactions into train and test using the TransactionDT column, the 20% highest values should be the test
# We could use the 'test_transaction.csv' file of the Kaggle competition for our testset but it does not have the target column ('isFraud')
# so we would have no way to validate our model prefromance.
threshold = transactions['TransactionDT'].quantile(0.8)
transactions_train = transactions[transactions['TransactionDT'] <= threshold]
transactions_test = transactions[transactions['TransactionDT'] > threshold]

# Save RAM
del transactions

In [9]:
# Train an XGBoost model

XGB_GPU_PARAMS = {
    "objective": "reg:squarederror",  # Regression task with mean squared error loss
    "eval_metric": "rmse",  # Evaluation metric is root mean squared error
    "max_depth": 6,  # Maximum depth of each tree
    "learning_rate": 0.1,  # Learning rate (step size shrinkage)
    "subsample": 1,  # Subsample ratio of the training instances
    "colsample_bytree": 0.8,  # Subsample ratio of columns when constructing each tree
    "seed": 123,
    # "nthread": 1,
    # 'tree_method': 'gpu_hist',
    'device': 'cuda'
}

def xgboost_model(X_train, y_train, params, num_rounds=100):
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)
    return xgb.train(params, train_dmatrix, num_rounds)

In [10]:
# train XGBoost regressor
gpu_model = xgboost_model(transactions_train[train_features], transactions_train['isFraud'], XGB_GPU_PARAMS, num_rounds=100)

# Quickly evaluate our model - just so we know it was train correctly and produce meaningful predicitons
test_dmatrix = xgb.DMatrix(transactions_test[train_features])
y_pred = gpu_model.predict(test_dmatrix)
y_pred = pd.Series(y_pred, index=transactions_test.index)
# print(f"Accuracy: {accuracy_score(transactions_test['isFraud'], y_pred.round())}, F1 score: {f1_score(transactions_test['isFraud'], y_pred.round())}")

In [11]:
fraud_trainset = transactions_train[train_features]
fraud_testset = transactions_test[train_features]

del transactions_train, transactions_test
print(f"EEEI-CIS train size: {len(fraud_trainset)}  EEEI-CIS test size: {len(fraud_testset)}")

EEEI-CIS train size: 472432  EEEI-CIS test size: 118108


In [12]:
start_time = time.time()
explainer = shap.explainers.GPUTree(gpu_model, fraud_trainset.head(100))
iee_explainer = explainer(fraud_testset, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Background SHAP on 100 rows {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(fraud_trainset) / 100)}")

Background SHAP on 100 rows 10.890917778015137, estimation on all data 51452.215594367975


## Path dependent using the shap package - SLOW

In [13]:
start_time = time.time()
explainer = shap.explainers.GPUTree(gpu_model, feature_perturbation="tree_path_dependent")
iee_explainer = explainer(fraud_testset, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Path Dependent SHAP: {time.time() - start_time}")

Path Dependent SHAP: 4.137226343154907


In [14]:
fraud_testset_head_10000 = fraud_testset.head(10000)
start_time = time.time()
explainer = shap.explainers.GPUTree(gpu_model, feature_perturbation="tree_path_dependent")
iee_explainer = explainer(fraud_testset_head_10000, interactions=True, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Path Dependent SHAP IV on 10,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(fraud_trainset) / 10000)}")

Path Dependent SHAP IV on 10,000 rows: 116.04637503623962, estimation on all data 5482.402397967148


In [15]:
fraud_testset_head_1000 = fraud_testset.head(1000)
start_time = time.time()
explainer = shap.explainers.GPUTree(gpu_model, feature_perturbation="tree_path_dependent")
iee_explainer = explainer(fraud_testset_head_1000, interactions=True, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Path Dependent SHAP IV on 1,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(fraud_trainset) / 1000)}")

Path Dependent SHAP IV on 1,000 rows: 11.663624048233032, estimation on all data 5510.272052268982


In [16]:
del iee_shap_values

## Path dependent using xgboost package - Fast

In [17]:
start_time = time.time()
test_dmatrix = xgb.DMatrix(fraud_testset)
output = gpu_model.predict(test_dmatrix, pred_contribs=True, strict_shape=True)
print(f"Path Dependent SHAP: {time.time() - start_time}")

Path Dependent SHAP: 0.9395408630371094


In [18]:
fraud_sample = fraud_testset.sample(1_000)
start_time = time.time()
test_sample_dmatrix = xgb.DMatrix(fraud_sample)
gpu_model.predict(test_sample_dmatrix, pred_interactions=True, strict_shape=True)
print(f"Path Dependent SHAP IV on 1,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(fraud_testset) / 1000)}")

Path Dependent SHAP IV on 1,000 rows: 0.9889914989471436, estimation on all data 116.80868089103699


In [19]:
fraud_sample = fraud_testset.sample(10_000)
start_time = time.time()
test_sample_dmatrix = xgb.DMatrix(fraud_sample)
gpu_model.predict(test_sample_dmatrix, pred_interactions=True, strict_shape=True)
print(f"Path Dependent SHAP IV on 10,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(fraud_testset) / 10000)}")

Path Dependent SHAP IV on 10,000 rows: 8.892449855804443, estimation on all data 105.02704531393051


# KDD-Cup 1999: Intrusion Detection Dataset

In [20]:
# Step 1: Download from https://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz
# Step 2: Ungzip the file using 7-zip (in windows) or 'gunzip kddcup.data.gz' (in linux)
# Step 3: Save this file in you google drive and load it from here

columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
    "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

# Change the path here to where you save it in your RAM
detection_data = pd.read_csv('drive/MyDrive/ShapResearch/DataAndNotebooks/Data/KDD_CUP_1999/kddcup.data.corrected', names=columns)
detection_data.shape

(4898431, 42)

In [21]:
# Run the one hot encodeing and choose the features to train on
def create_one_hot_encoding(data, feature_name):
    vc = data[feature_name].value_counts()
    print(f"Overall {len(vc)} catagories")
    for catagory, count in tqdm(vc.items()):
        data[feature_name + "_" + catagory] = (data[feature_name] == catagory).astype(int)

create_one_hot_encoding(detection_data, "service")
create_one_hot_encoding(detection_data, "protocol_type")
create_one_hot_encoding(detection_data, "flag")

detection_data['target'] = (detection_data["label"] != "normal.").astype(int)
print(f"Normal: {(detection_data['target'] == 0).sum()} other: {(detection_data['target'] == 1).sum()}")

detc_features_to_drop = ['target', 'label', 'flag', 'service', 'protocol_type', 'service_other']
detection_train_features = [c for c in detection_data.columns if c not in detc_features_to_drop]
print(f"Train features: {len(detection_train_features)} Overall features: {len(detection_data.columns)}")

Overall 70 catagories


70it [00:22,  3.08it/s]


Overall 3 catagories


3it [00:01,  2.99it/s]


Overall 11 catagories


11it [00:03,  2.94it/s]


Normal: 972781 other: 3925650
Train features: 121 Overall features: 127


In [22]:
detection_GPU_model = xgboost_model(
    detection_data[detection_train_features], detection_data['target'],
    XGB_GPU_PARAMS, num_rounds=100
)

In [23]:
# Step 1: Download the gz file from http://kdd.ics.uci.edu/databases/kddcup99/kddcup.testdata.unlabeled.gz
# Step 2: Ungzip the file using 7-zip (in windows) or 'gunzip kddcup.data.gz' (in linux)
# Step 3: Save this file in you google drive and load it from here

columns.remove("label")
unlabeled_data = pd.read_csv('drive/MyDrive/ShapResearch/DataAndNotebooks/Data/KDD_CUP_1999/kddcup.testdata.unlabeled', names=columns)

create_one_hot_encoding(unlabeled_data, "service")
create_one_hot_encoding(unlabeled_data, "protocol_type")
create_one_hot_encoding(unlabeled_data, "flag")

for c in detection_train_features:
    if c not in unlabeled_data:
        unlabeled_data[c] = 0

Overall 70 catagories


70it [00:13,  5.01it/s]


Overall 3 catagories


3it [00:00,  4.89it/s]


Overall 11 catagories


11it [00:02,  4.85it/s]


In [24]:
detection_trainset = detection_data[detection_train_features]
detection_testset = unlabeled_data[detection_train_features]

del detection_data, unlabeled_data
print(f"KDD-Cup train size: {len(detection_trainset)}  KDD-Cup test size: {len(detection_testset)}")

KDD-Cup train size: 4898431  KDD-Cup test size: 2984154


In [25]:
start_time = time.time()
explainer = shap.explainers.GPUTree(detection_GPU_model, detection_trainset.head(100))
iee_explainer = explainer(detection_testset, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Background SHAP on 100 rows {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(detection_trainset) / 100)}")

Background SHAP on 100 rows 159.7638647556305, estimation on all data 7825922.958278348


## Path dependent using the shap package - SLOW

In [26]:
start_time = time.time()
explainer = shap.explainers.GPUTree(detection_GPU_model, feature_perturbation="tree_path_dependent")
iee_explainer = explainer(detection_testset, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Path Dependent SHAP: {time.time() - start_time}")

Path Dependent SHAP: 33.698811054229736


In [27]:
detection_testset_head_1000 = detection_testset.head(1000)
start_time = time.time()
explainer = shap.explainers.GPUTree(detection_GPU_model, feature_perturbation="tree_path_dependent")
iee_explainer = explainer(detection_testset_head_1000, interactions=True, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Path Dependent SHAP IV on 1,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(detection_testset) / 1000)}")

Path Dependent SHAP IV on 1,000 rows: 1.1293003559112549, estimation on all data 3370.022538282394


In [28]:
detection_testset_head_10000 = detection_testset.head(10000)
start_time = time.time()
explainer = shap.explainers.GPUTree(detection_GPU_model, feature_perturbation="tree_path_dependent")
iee_explainer = explainer(detection_testset_head_10000, interactions=True, check_additivity=False)
iee_shap_values = iee_explainer.values
print(f"Path Dependent SHAP IV on 10,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(detection_testset) / 10000)}")

Path Dependent SHAP IV on 10,000 rows: 10.74632453918457, estimation on all data 3206.870372289419


## Path dependent using xgboost package - Fast

In [30]:
start_time = time.time()
test_dmatrix = xgb.DMatrix(detection_testset)
output = detection_GPU_model.predict(test_dmatrix, pred_contribs=True, strict_shape=True)
print(f"Path Dependent SHAP: {time.time() - start_time}")

Path Dependent SHAP: 7.906008958816528


In [31]:
detection_sample = detection_testset.sample(1_000)
start_time = time.time()
test_sample_dmatrix = xgb.DMatrix(detection_sample)
detection_GPU_model.predict(test_sample_dmatrix, pred_interactions=True, strict_shape=True)
print(f"Path Dependent SHAP IV on 1,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(detection_testset) / 1000)}")

Path Dependent SHAP IV on 1,000 rows: 0.09706830978393555, estimation on all data 289.6902636809349


In [32]:
detection_sample = detection_testset.sample(10_000)
start_time = time.time()
test_sample_dmatrix = xgb.DMatrix(detection_sample)
detection_GPU_model.predict(test_sample_dmatrix, pred_interactions=True, strict_shape=True)
print(f"Path Dependent SHAP IV on 10,000 rows: {time.time() - start_time}, estimation on all data {(time.time() - start_time) * (len(detection_testset) / 10000)}")

Path Dependent SHAP IV on 10,000 rows: 0.7676239013671875, estimation on all data 229.07299915709493
