In [1]:
# Install and restart session
!pip install numpy==1.24.0



In [2]:
pip install fasttreeshap

Collecting fasttreeshap
  Downloading fasttreeshap-0.1.6.tar.gz (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.0/287.0 kB[0m [31m934.5 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting slicer==0.0.7 (from fasttreeshap)
  Downloading slicer-0.0.7-py3-none-any.whl.metadata (3.7 kB)
INFO: pip is looking at multiple versions of shap to determine which version is compatible with other requirements. This could take a while.
Collecting shap (from fasttreeshap)
  Downloading shap-0.47.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
  Downloading shap-0.47.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
  Downloading shap-0.47.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
  Downloading shap-0.46.0-cp311-cp311-m

In [3]:
pip install https://github.com/schufa-innovationlab/pltreeshap/archive/main.zip

Collecting https://github.com/schufa-innovationlab/pltreeshap/archive/main.zip
  Downloading https://github.com/schufa-innovationlab/pltreeshap/archive/main.zip
[2K     [32m-[0m [32m20.6 kB[0m [31m320.0 kB/s[0m [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pltreeshap
  Building wheel for pltreeshap (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pltreeshap: filename=pltreeshap-0.1.1-cp311-cp311-linux_x86_64.whl size=725296 sha256=ddc92cff0ca8efabfb4b2c19d3771b5e54c0748f302ce7355bd109d690a8e9e6
  Stored in directory: /tmp/pip-ephem-wheel-cache-wv93rvtq/wheels/b0/cd/4e/b07c48f845743c8a6fec1b2e9b485e29fca8f53bac5b183177
Successfully built pltreeshap
Installing collected packages: pltreeshap
Successfully installed pltreeshap-0.1.1


In [4]:
import pandas as pd
import lightgbm as lgb
import time
from tqdm import tqdm
import fasttreeshap
from pltreeshap import PLTreeExplainer

In [5]:
# Useful if you run this on google colab and downloaded the data into your drive.
# If you run the notebook in other environment remove these lines and change the 'pd.read_csv()' function in this notebook to read from
# where you saved you data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Environment Note
Run this notebook on a high-memory CPU runtime (at least 50 GB RAM).

Some cells may take 5–10 minutes to execute, while the last two cells—especially Background SHAP on the KDD dataset—can take up to an hour.

# Fraud Data Preprocessing and Model training

In [6]:
# 1. Download train_transaction data from: https://www.kaggle.com/c/ieee-fraud-detection/data?select=train_transaction.csv
# 2. Save it in your google drive
# 3. Change the path in the 'pd.read_csv()' function to where you saved your data
transactions = pd.read_csv('drive/MyDrive/ShapResearch/DataAndNotebooks/Data/train_transaction.csv')

# feature engineering - do one hot encoding for categorical features
transactions['is_visa'] = transactions['card4'] == 'visa'
transactions['is_american_express'] = transactions['card4'] == 'american express'
transactions['is_discover'] = transactions['card4'] == 'discover'
transactions['is_mastercard'] = transactions['card4'] == 'mastercard'

transactions['is_debit'] = transactions['card6'] == 'debit'

transactions['ProductCD_W'] = transactions['ProductCD'] == 'W'
transactions['ProductCD_C'] = transactions['ProductCD'] == 'C'
transactions['ProductCD_R'] = transactions['ProductCD'] == 'R'
transactions['ProductCD_H'] = transactions['ProductCD'] == 'H'
transactions['ProductCD_S'] = transactions['ProductCD'] == 'S'

for i in [2,3,5,6,7,8,9]:
  transactions[f'M{i}'] = (transactions[f'M{i}'] == 'T').astype('int8')

transactions['M4'] = 0
transactions.loc[transactions['M4'] == 'M1', 'M4'] = 1
transactions.loc[transactions['M4'] == 'M2', 'M4'] = 2

transactions['gmail_hotmail_or_yahoo_email'] = transactions['P_emaildomain'].isin(['gmail.com', 'hotmail.com', 'yahoo.com'])
transactions['nan_email'] = transactions['P_emaildomain'].isna()

# Add the enginered features and build the list of features to train on
add = ['is_visa', 'is_american_express', 'is_discover', 'is_mastercard',
       'is_debit',
       'ProductCD_W', 'ProductCD_C', 'ProductCD_R', 'ProductCD_H', 'ProductCD_S',
       'gmail_hotmail_or_yahoo_email', 'nan_email']

for c in add:
  transactions[c] = transactions[c].astype('int8')

remove = ['card4', 'card6', 'ProductCD', 'M1', 'P_emaildomain', 'R_emaildomain', 'TransactionID', 'isFraud', 'TransactionDT']
train_features = [c for c in transactions.columns if c not in remove]


# Split transactions into train and test using the TransactionDT column, the 20% highest values should be the test
# We could use the 'test_transaction.csv' file of the Kaggle competition for our testset but it does not have the target column ('isFraud')
# so we would have no way to validate our model prefromance.
threshold = transactions['TransactionDT'].quantile(0.8)
transactions_train = transactions[transactions['TransactionDT'] <= threshold]
transactions_test = transactions[transactions['TransactionDT'] > threshold]

# Save RAM
del transactions

## FastTreeShap

In [7]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor
from copy import copy

# Train random forrest as FastTreeShap doesn't support xgboost
train_sample = transactions_train.sample(1_000)
model = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42, n_jobs=1)
model.fit(train_sample[train_features], train_sample['isFraud'])

# Quickly evaluate our model - just so we know it was train correctly and produce meaningful predicitons
# test_dmatrix = xgb.DMatrix(transactions_test[train_features])
# y_pred = model.predict(test_dmatrix)
y_pred = model.predict(transactions_test[train_features])
# y_pred = pd.Series(y_pred, index=transactions_test.index)
print(f"Accuracy: {accuracy_score(transactions_test['isFraud'], y_pred.round())}, F1 score: {f1_score(transactions_test['isFraud'], y_pred.round())}")

Accuracy: 0.9531530463643445, F1 score: 0.1738091682843064


In [8]:
fraud_trainset = transactions_train[train_features]
fraud_testset = transactions_test[train_features]

In [9]:
start_time = time.time()
shap_explainer = fasttreeshap.TreeExplainer(model, algorithm="v2", n_jobs=1)
shap_values = shap_explainer(fraud_testset, check_additivity=False).values
print("Path Dependent SHAP took " + str(time.time() - start_time))

Path Dependent SHAP took 15.855484962463379


In [10]:
start_time = time.time()
shap_explainer_iv = fasttreeshap.TreeExplainer(model, algorithm="v1", n_jobs=1)
shap_iv_values = shap_explainer_iv(fraud_testset, check_additivity=False, interactions=True).values
print("Path Dependent SHAP IV took " + str(time.time() - start_time))

Path Dependent SHAP IV took 349.72976636886597


In [11]:
del shap_values, shap_iv_values
time.sleep(3)
import gc
gc.collect()

0

## PLTreeShap

In [12]:
from sklearn.metrics import mean_squared_error

# PLTreeShap supports LightGBM and not xgboost

# Convert to LightGBM Dataset format
train_data = lgb.Dataset(transactions_train[train_features], label=transactions_train["isFraud"])
test_data = lgb.Dataset(transactions_test[train_features], label=transactions_test["isFraud"], reference=train_data)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 6,
    'verbosity': -1
}

# Train model
model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=100,
)

# Predict
y_pred = model.predict(transactions_test[train_features])
rmse = mean_squared_error(transactions_test["isFraud"], y_pred)
print(f"Test RMSE: {rmse:.2f}")

Test RMSE: 0.03


In [13]:
start_time = time.time()
explainer = PLTreeExplainer(model)
explainer.aggregate(fraud_trainset)  # precomputes split statistics
print(f"aggregation of Background data took " + str(time.time() - start_time))
explainer.shap_values(fraud_testset)
print("Background SHAP took " + str(time.time() - start_time))

aggregation of Background data took 20.59420394897461
Background SHAP took 245.11277532577515


In [14]:
start_time = time.time()
explainer = PLTreeExplainer(model)
explainer.aggregate(transactions_train[train_features])  # precomputes split statistics
aggregation_time = time.time() - start_time
print(f"aggregation of Background data took " + str(aggregation_time))
iv_computation_start_time = time.time()
explainer.shap_interaction_values(transactions_test[train_features].head(10_000)) # Running on all rows uses too much RAM and crashes the session
iv_computation_time = time.time() - iv_computation_start_time
print(f"Background SHAP IV took on 10,000 rows took {str(time.time() - start_time)}, on all rows will take {aggregation_time + (len(transactions_test) / 10_000) * iv_computation_time}")

aggregation of Background data took 21.19762349128723
Background SHAP IV took on 10,000 rows took 69.9813940525055, on all rows will take 597.3689850535393


In [15]:
# clean up RAM
del fraud_trainset, fraud_testset, train_data, test_data, transactions_train, transactions_test

# KDD-Cup 1999: Intrusion Detection Dataset

In [16]:
# Step 1: Download from https://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz
# Step 2: Ungzip the file using 7-zip (in windows) or 'gunzip kddcup.data.gz' (in linux)
# Step 3: Save this file in you google drive and load it from here

columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
    "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

# Change the path here to where you save it in your RAM
detection_data = pd.read_csv('drive/MyDrive/ShapResearch/DataAndNotebooks/Data/KDD_CUP_1999/kddcup.data.corrected', names=columns)
detection_data.shape

(4898431, 42)

In [17]:
# Run the one hot encodeing and choose the features to train on
def create_one_hot_encoding(data, feature_name):
    vc = data[feature_name].value_counts()
    print(f"Overall {len(vc)} catagories")
    for catagory, count in tqdm(vc.items()):
        data[feature_name + "_" + catagory] = (data[feature_name] == catagory).astype(int)

create_one_hot_encoding(detection_data, "service")
create_one_hot_encoding(detection_data, "protocol_type")
create_one_hot_encoding(detection_data, "flag")

detection_data['target'] = (detection_data["label"] != "normal.").astype(int)
print(f"Normal: {(detection_data['target'] == 0).sum()} other: {(detection_data['target'] == 1).sum()}")

detc_features_to_drop = ['target', 'label', 'flag', 'service', 'protocol_type', 'service_other']
detection_train_features = [c for c in detection_data.columns if c not in detc_features_to_drop]
print(f"Train features: {len(detection_train_features)} Overall features: {len(detection_data.columns)}")

Overall 70 catagories


70it [00:30,  2.33it/s]


Overall 3 catagories


3it [00:01,  2.34it/s]


Overall 11 catagories


11it [00:04,  2.21it/s]


Normal: 972781 other: 3925650
Train features: 121 Overall features: 127


In [18]:
# Step 1: Download the gz file from http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz
# Step 2: Ungzip the file using 7-zip (in windows) or 'gunzip kddcup.data.gz' (in linux)
# Step 3: Save this file in you google drive and load it from here

small_test_data = pd.read_csv('drive/MyDrive/ShapResearch/DataAndNotebooks/Data/KDD_CUP_1999/corrected', names=columns)

create_one_hot_encoding(small_test_data, "service")
create_one_hot_encoding(small_test_data, "protocol_type")
create_one_hot_encoding(small_test_data, "flag")

small_test_data['target'] = (small_test_data["label"] != "normal.").astype(int)
print(f"Normal: {(small_test_data['target'] == 0).sum()} other: {(small_test_data['target'] == 1).sum()}")
for c in detection_train_features:
    if c not in small_test_data:
        small_test_data[c] = 0


Overall 65 catagories


65it [00:01, 36.06it/s]


Overall 3 catagories


3it [00:00, 36.08it/s]


Overall 11 catagories


11it [00:00, 35.12it/s]


Normal: 60593 other: 250436


In [19]:
# Step 1: Download the gz file from http://kdd.ics.uci.edu/databases/kddcup99/kddcup.testdata.unlabeled.gz
# Step 2: Ungzip the file using 7-zip (in windows) or 'gunzip kddcup.data.gz' (in linux)
# Step 3: Save this file in you google drive and load it from here

columns.remove("label")
unlabeled_data = pd.read_csv('drive/MyDrive/ShapResearch/DataAndNotebooks/Data/KDD_CUP_1999/kddcup.testdata.unlabeled', names=columns)

create_one_hot_encoding(unlabeled_data, "service")
create_one_hot_encoding(unlabeled_data, "protocol_type")
create_one_hot_encoding(unlabeled_data, "flag")

for c in detection_train_features:
    if c not in unlabeled_data:
        unlabeled_data[c] = 0

Overall 70 catagories


70it [00:15,  4.45it/s]


Overall 3 catagories


3it [00:00,  4.34it/s]


Overall 11 catagories


11it [00:02,  4.41it/s]


## FastTreeShap

In [20]:
train_sample = detection_data.sample(100_000)
model = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)
model.fit(train_sample[detection_train_features], train_sample['target'])

y_pred = model.predict(small_test_data[detection_train_features])
print(f"Accuracy: {accuracy_score(small_test_data['target'], y_pred.round())}, F1 score: {f1_score(small_test_data['target'], y_pred.round())}")

Accuracy: 0.9221455234077851, F1 score: 0.9492402279840101


In [21]:
start_time = time.time()
shap_explainer = fasttreeshap.TreeExplainer(model, algorithm="v2", n_jobs=1)
shap_values = shap_explainer(unlabeled_data[detection_train_features], check_additivity=False).values
print("Path Dependent SHAP took " + str(time.time() - start_time))

Path Dependent SHAP took 373.2308542728424


In [22]:
big_consumer_data_sample = unlabeled_data[detection_train_features].sample(100_000, random_state=42)
start_time = time.time()
shap_explainer_iv = fasttreeshap.TreeExplainer(model, algorithm="v1", n_jobs=1)
shap_iv_values = shap_explainer_iv(
    big_consumer_data_sample,
    check_additivity=False, interactions=True
).values
iv_running_time = time.time() - start_time
print(f"Path Dependent SHAP IV on 100,000 sample took {iv_running_time} on all consumers will take {iv_running_time * (len(unlabeled_data) / len(big_consumer_data_sample))}")

Path Dependent SHAP IV on 100,000 sample took 444.7480540275574 on all consumers will take 13271.966844185514


In [23]:
del shap_values, shap_iv_values
time.sleep(3)
import gc
gc.collect()

8

## PLTreeShap

In [24]:
# Convert to LightGBM Dataset format
train_data = lgb.Dataset(detection_data[detection_train_features], label=detection_data["target"])
test_data = lgb.Dataset(small_test_data[detection_train_features], label=small_test_data["target"], reference=train_data)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 6,
    'verbosity': -1
}

# Train model
detection_model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=100,
    # early_stopping_rounds=10
)

# Predict
y_pred = detection_model.predict(small_test_data[detection_train_features])
rmse = mean_squared_error(small_test_data["target"], y_pred)
print(f"Test RMSE: {rmse:.2f}")

Test RMSE: 0.07


In [25]:
kdd_trainset = detection_data[detection_train_features]
kdd_testset = unlabeled_data[detection_train_features]

In [26]:
start_time = time.time()
explainer = PLTreeExplainer(detection_model)
explainer.aggregate(kdd_trainset)  # precomputes split statistics
print(f"aggregation of Background data took " + str(time.time() - start_time))
explainer.shap_values(kdd_testset)
print("Background SHAP took " + str(time.time() - start_time))

aggregation of Background data took 165.85053825378418
Background SHAP took 2657.155497074127


In [28]:
kdd_testset_sample = kdd_testset.head(10_000)

start_time = time.time()
explainer = PLTreeExplainer(detection_model)
explainer.aggregate(kdd_trainset)  # precomputes split statistics
aggregation_time = time.time() - start_time
iv_computation_start_time = time.time()
explainer.shap_interaction_values(kdd_testset_sample) # Running on all rows uses too much RAM and crashes the session
iv_computation_time = time.time() - iv_computation_start_time
print(f"Background SHAP IV took on 10,000 rows took {str(time.time() - start_time)}, on all rows will take {aggregation_time + (len(kdd_testset) / len(kdd_testset_sample)) * iv_computation_time}")

Background SHAP IV took on 10,000 rows took 194.699214220047, on all rows will take 6322.659606448077
