# Import Libraries

In [12]:
import kagglehub
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import fbeta_score
from tqdm import tqdm

from sklearn.metrics import roc_auc_score, fbeta_score, precision_score, recall_score

# Fixed seed constant; pass it as `random_state` / `seed` wherever a step needs repeatable randomness.
random_seed = 20888160

## Get Data

In [2]:
# Fetch the latest version of the dataset through kagglehub.
path = kagglehub.dataset_download("sgpjesus/bank-account-fraud-dataset-neurips-2022")
print("Path to dataset files:", path)

# `path` is used as the folder holding the dataset files; the base variant is "Base.csv" inside it.
csv_path = f"{path}/Base.csv"

# Load the whole CSV into memory.
df = pd.read_csv(csv_path)

# Number of rows, measured on the label column.
size_val = df['fraud_bool'].size
print(f"Size of dataset - {size_val}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/sgpjesus/bank-account-fraud-dataset-neurips-2022?dataset_version_number=2...


100%|██████████| 532M/532M [00:08<00:00, 63.1MB/s] 

Extracting files...





Path to dataset files: /Users/saimzafar2002-apple.com/.cache/kagglehub/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022/versions/2
Size of dataset - 1000000


In [3]:
# data processing
# Temporal split: months 0 to 5 form the training set, months 6 and 7 the test set.
mask = df["month"] <= 5

# 'month' only defines the split, so it is dropped as a feature.
# 'device_fraud_count' is literally a constant column. get rid of it.
cols_to_drop = ['month', 'device_fraud_count']

# Shuffle each split with a fixed seed so that row order (and everything that
# depends on it downstream) is identical on every Restart & Run All.
full_training_data = (
    df[mask]
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
    .drop(columns=cols_to_drop)
)
full_test_data = (
    df[~mask]
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
    .drop(columns=cols_to_drop)
)

print("Full training data # rows: " + str(full_training_data.shape[0]))
print("Full test data # rows: " + str(full_test_data.shape[0]))
print("Column list: " + str(full_training_data.columns))

Full training data # rows: 794989
Full test data # rows: 205011
Column list: Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w'],
      dtype='object')


In [7]:
# more data processing
def _coerce_numeric_columns(frame):
    """Convert, in place, each column of `frame` whose non-null values all parse as numbers."""
    for name in frame.columns:
        as_numeric = pd.to_numeric(frame[name], errors='coerce')
        # Same non-null count before and after coercion => nothing failed to parse.
        if as_numeric.notna().sum() == frame[name].notna().sum():
            frame[name] = as_numeric


# Separate the label from the features; labels are stored as floats.
y_train_full = full_training_data["fraud_bool"].astype(float)
y_test_full = full_test_data["fraud_bool"].astype(float)
X_train_full = full_training_data.drop(columns="fraud_bool")
X_test_full = full_test_data.drop(columns="fraud_bool")

# make sure all numerical columns are actually stored numerically
_coerce_numeric_columns(X_train_full)
_coerce_numeric_columns(X_test_full)

# Whatever is still non-numeric in the training features is treated as categorical.
categorical_cols = X_train_full.select_dtypes(exclude='number').columns.tolist()
print(f"Categorical columns: {categorical_cols}")

# Columns in which -1 is replaced by NaN below.
# NOTE(review): confirm against the dataset documentation that no other column
# uses a negative sentinel for "missing".
missing_cols = [
    "prev_address_months_count",
    "current_address_months_count",
    "bank_months_count",
    "session_length_in_minutes"
]

# Convert -1 to NaN in both train and test
for frame in (X_train_full, X_test_full):
    frame[missing_cols] = frame[missing_cols].replace(-1, np.nan)

Categorical columns: ['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']


In [8]:
# one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# One-hot encode the categorical columns; all remaining columns pass through unchanged.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category unseen
# during fit encodes as all zeros, the same as the dropped category. Confirm
# that this is acceptable for the test split.
ohe = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[("cat", ohe, categorical_cols)],
    remainder="passthrough",
)
preprocessor.set_output(transform="pandas")

# Fit on the training split only, reuse the fitted encoder on the test split,
# and hand both on as plain numpy arrays.
X_train_full = preprocessor.fit_transform(X_train_full).to_numpy()
X_test_full = preprocessor.transform(X_test_full).to_numpy()

# Sanity check: shapes first, then container types (train before test).
for matrix in (X_train_full, X_test_full):
    print(matrix.shape)

for matrix in (X_train_full, X_test_full):
    print(type(matrix))

(794989, 45)
(205011, 45)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
import random

def sample_hyperparams(n_samples=20, seed=None):
    """Randomly sample XGBoost hyperparameter combinations.

    Parameters
    ----------
    n_samples : int, default 20
        Number of combinations to draw. Zero (or a negative value) yields an
        empty list.
    seed : int or None, default None
        When given, draws come from a dedicated ``np.random.RandomState(seed)``
        so the same seed always returns the same combinations. When ``None``,
        the global ``np.random`` state is used, exactly as before this
        parameter existed.

    Returns
    -------
    list of tuple
        Each tuple is ``(learning_rate, max_depth, min_child_weight,
        subsample, colsample_bytree, scale_pos_weight)``.
    """
    # The np.random module and RandomState expose the same uniform/randint API,
    # so one code path serves both the seeded and the legacy global-state case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    combos = []

    for _ in range(n_samples):

        # learning_rate: log-uniform between 0.03 and 0.2 (base 10)
        lr_exp = rng.uniform(np.log10(0.03), np.log10(0.2))
        learning_rate = 10 ** lr_exp

        # max_depth: integer uniform in [3,8] (upper bound of randint is exclusive)
        max_depth = rng.randint(3, 9)

        # min_child_weight: uniform in [1,10]
        min_child_weight = rng.uniform(1, 10)

        # subsample: uniform in [0.7,1.0]
        subsample = rng.uniform(0.7, 1.0)

        # colsample_bytree: uniform in [0.6,1.0]
        colsample_bytree = rng.uniform(0.6, 1.0)

        # scale_pos_weight: log-uniform between 10 and 200 (base 10)
        spw_exp = rng.uniform(np.log10(10), np.log10(200))
        scale_pos_weight = 10 ** spw_exp

        combos.append((learning_rate, max_depth, min_child_weight, subsample, colsample_bytree, scale_pos_weight))

    return combos

# Seed NumPy's global RNG so the sampled search space is the same on every run.
np.random.seed(random_seed)

n_hypers = 20
hyperparameter_combinations = sample_hyperparams(n_hypers)
print(hyperparameter_combinations)

# threshold candidates: 25 evenly spaced values from 0 to 1 (both endpoints included)
thresholds = np.linspace(0.0, 1.0, 25)

# Cross product: every sampled combination is paired with every threshold,
# the threshold being appended as the last tuple element.
full_hyperparameter_combinations = [
    hyper + (threshold,)
    for hyper in hyperparameter_combinations
    for threshold in thresholds
]

[(0.17451520391697056, 4, 2.614887426109477, 0.9485060857785709, 0.7699750549936667, 69.20168627251141), (0.16905821771012308, 5, 2.656553443137637, 0.7536292501403133, 0.9018455006716423, 149.45525912295338), (0.16642979828658203, 6, 7.137705412276479, 0.741225660594348, 0.6188761821681114, 115.07840695382731), (0.03795041745754935, 4, 9.334959888208514, 0.9993629680507733, 0.6852665325348994, 197.5130497055601), (0.10043096967806842, 3, 6.1966714868861, 0.8443053450075143, 0.7360089157716136, 17.380685266385374), (0.07479153485986761, 8, 2.4941799305416206, 0.7655593670035046, 0.9102216553085859, 22.299158482193267), (0.07012870591758696, 7, 2.019503008339158, 0.9112736518539465, 0.6617652366696908, 59.9162649010584), (0.04279856656857445, 3, 3.467406013913698, 0.8697442933303215, 0.8984373361899485, 20.213830100895496), (0.10824452835945124, 5, 1.5440907833681796, 0.7551187188007809, 0.6054313477718477, 14.714304631234496), (0.087545711064077, 6, 4.003929859573627, 0.898660221152885

In [21]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score
import xgboost as xgb

# train the final model

# Hard-coded hyperparameters, decision threshold and number of boosting rounds.
# NOTE(review): presumably the winners of the hyperparameter search; the
# selection step is not visible here, so confirm where these values came from.
learning_rate = 0.06171184631285087
max_depth = 3
min_child_weight = 9.400860129136118
subsample = 0.7741235778036006
colsample_bytree = 0.7017163522086355
scale_pos_weight = 10.92551522008694
best_threshold = 0.375
best_iteration_int = 540

# Use XGBClassifier for Scikit-Learn compatibility
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases;
# confirm whether it can be dropped for the installed version.
final_model_params = dict(
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_weight=min_child_weight,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    scale_pos_weight=scale_pos_weight,
    objective="binary:logistic",
    eval_metric="aucpr",
    tree_method="hist",
    verbosity=0,
    n_estimators=best_iteration_int,
    use_label_encoder=False,
)
XGB_model = xgb.XGBClassifier(**final_model_params)
XGB_model.fit(X_train_full, y_train_full)

# predict_proba returns [prob_class_0, prob_class_1]; keep the class-1 column.
test_class_probs = XGB_model.predict_proba(X_test_full)
y_test_prob = test_class_probs[:, 1]

# Hard labels: positive when the class-1 probability reaches the chosen threshold.
y_test_pred = (y_test_prob >= best_threshold).astype(int)


# Export to JSON

In [None]:
import json
import math

# Collect the fitted estimator's constructor parameters.
model_params = XGB_model.get_params()
filename = 'XGBoostModelParameters.json'

# json.dump would write non-finite floats as the bare tokens NaN / Infinity,
# which are not valid JSON and are rejected by strict parsers. XGBoost's
# `missing` parameter is documented as defaulting to NaN, so map such values
# to null before writing.
json_safe_params = {
    key: (None if isinstance(value, float) and not math.isfinite(value) else value)
    for key, value in model_params.items()
}

# Save parameters to a JSON file. allow_nan=False makes any remaining
# non-finite value fail loudly instead of producing an invalid file.
with open(filename, 'w', encoding='utf-8') as file:
    json.dump(json_safe_params, file, indent=4, allow_nan=False)
