# BentoML Demo - IEEE-CIS Fraud Detection

Accept the competition rules on Kaggle before downloading the dataset: https://www.kaggle.com/competitions/ieee-fraud-detection/data

In [None]:
# Set Kaggle credentials for downloading the dataset.
# Credentials are never written into the notebook: values already exported in
# the environment are kept as-is, and anything missing is requested
# interactively (input is not echoed and is not stored in the cell output).
import os
from getpass import getpass

for _credential in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    # An unset or empty variable counts as missing.
    if not os.environ.get(_credential):
        os.environ[_credential] = getpass(f"{_credential}: ")

In [None]:
# Download the competition archive with the Kaggle CLI (uses the Kaggle
# credentials configured in the previous cell).
!kaggle competitions download -c ieee-fraud-detection
# Extract the archive into ./data/; the zip is removed only when unzip
# succeeds, because `rm` is chained with `&&`.
!unzip -d ./data/ ieee-fraud-detection.zip && rm ieee-fraud-detection.zip

In [None]:
import pandas as pd
import numpy as np

# Load the training transactions extracted into ./data/ by the download cell.
train_csv_path = "./data/train_transaction.csv"
data = pd.read_csv(train_csv_path)

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaNs in place: float columns get their median, string columns get
# the placeholder category 'UNKNOWN'.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split, so test rows influence the medians — confirm this is
# acceptable for the demo.
nan_columns = data.columns[data.isna().any()]

float_nan_subset = data[nan_columns].select_dtypes(include='float64')
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# Skip when nothing is left to impute (e.g. the cell is re-run after `data`
# was already cleaned): fitting on a zero-column frame raises ValueError.
if not float_nan_subset.columns.empty:
    data[float_nan_subset.columns] = imputer.fit_transform(float_nan_subset)

obj_nan_subset = data[nan_columns].select_dtypes(include='object')
if not obj_nan_subset.columns.empty:
    data[obj_nan_subset.columns] = obj_nan_subset.fillna('UNKNOWN')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every remaining string (object dtype) column in place.
# Each column gets its own freshly fitted LabelEncoder; the fitted encoders
# are not kept after this cell.
cat_columns = data.select_dtypes(include='object')

# Iterating a DataFrame yields its column labels.
for column_name in cat_columns:
    encoder = LabelEncoder()
    data[column_name] = encoder.fit_transform(data[column_name])

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label from the features: `isFraud` (cast to int) is the target,
# every other column is a feature.
y = data['isFraud'].astype(int)
X = data.drop(columns='isFraud')

# Hold out 30% of the rows for testing; stratifying on the label keeps the
# class ratio the same in both parts, and the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.3, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Copy data to avoid slowdowns due to fragmentation
X_train, X_test = X_train.copy(), X_test.copy()

In [None]:
import xgboost as xgb

# Define model training function
def train_model(num_trees, max_depth):
    """Fit an XGBoost binary classifier on the notebook's training split.

    Reads the module-level ``X_train`` / ``y_train`` for fitting and passes
    ``(X_test, y_test)`` as the evaluation set, scored with the ``aucpr``
    metric.

    Args:
        num_trees: Number of boosting rounds (``n_estimators``).
        max_depth: Maximum depth of each tree.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    hyperparameters = dict(
        n_estimators=num_trees,
        max_depth=max_depth,
        objective='binary:logistic',
        eval_metric='aucpr',
        tree_method='hist',
        use_label_encoder=False,
    )
    classifier = xgb.XGBClassifier(**hyperparameters)
    classifier.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    return classifier

In [None]:
# Small model: 500 boosted trees, each limited to a depth of 3
small_model = train_model(num_trees=500, max_depth=3)

In [None]:
import bentoml
# Store the small classifier in the BentoML model store, exposing
# `predict_proba` as a batchable signature.
bentoml.xgboost.save_model(
    "ieee-fraud-detection-sm",
    small_model,
    signatures={"predict_proba": {"batchable": True}},
)

In [None]:
# Large model: 5000 boosted trees, each allowed to grow to a depth of 12
large_model = train_model(num_trees=5000, max_depth=12)

In [None]:
# Store the large classifier in the BentoML model store, exposing
# `predict_proba` as a batchable signature.
bentoml.xgboost.save_model(
    "ieee-fraud-detection-lg",
    large_model,
    signatures={"predict_proba": {"batchable": True}},
)

In [None]:
# Smoke test: load the latest saved small model as a runner, initialise it
# locally and score the first five held-out rows.
small_bento_model = bentoml.xgboost.get("ieee-fraud-detection-sm:latest")
test_runner = small_bento_model.to_runner()
test_runner.init_local()
test_runner.predict_proba.run(X_test.iloc[:5])

In [None]:
# Score the same five rows with the in-memory small model, for comparison
# with the runner output.
small_model.predict_proba(X_test.iloc[:5])

In [None]:
# Smoke test: load the latest saved large model as a runner, initialise it
# locally and score the first five held-out rows.
large_bento_model = bentoml.xgboost.get("ieee-fraud-detection-lg:latest")
test_runner = large_bento_model.to_runner()
test_runner.init_local()
test_runner.predict_proba.run(X_test.iloc[:5])

In [None]:
# Reduce the runner's per-row probabilities to the index of the
# highest-scoring column for each of the first five held-out rows.
first_rows = X_test.iloc[:5]
result = test_runner.predict_proba.run(first_rows)
np.argmax(result, axis=1)