#1) Download the  data

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kartik2112/fraud-detection")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/fraud-detection


In [2]:
import pandas as pd
import numpy as np

train = pd.read_csv('{0}/fraudTrain.csv'.format(path))
test = pd.read_csv('{0}/fraudTest.csv'.format(path))

#2a) Transform the data

1. Obtain the local time of day

2. Transform date of birth to age



In [3]:
#1) local transaction time

state_timezone_map = {
    'AL': 'America/Chicago',
    'AK': 'America/Anchorage',
    'AZ': 'America/Phoenix',       # No DST
    'AR': 'America/Chicago',
    'CA': 'America/Los_Angeles',
    'CO': 'America/Denver',
    'CT': 'America/New_York',
    'DE': 'America/New_York',
    'FL': 'America/New_York',      # Most of Florida
    'GA': 'America/New_York',
    'HI': 'Pacific/Honolulu',      # No DST
    'ID': 'America/Boise',         # Split between MT and PT
    'IL': 'America/Chicago',
    'IN': 'America/Indiana/Indianapolis',
    'IA': 'America/Chicago',
    'KS': 'America/Chicago',
    'KY': 'America/New_York',
    'LA': 'America/Chicago',
    'ME': 'America/New_York',
    'MD': 'America/New_York',
    'MA': 'America/New_York',
    'MI': 'America/Detroit',
    'MN': 'America/Chicago',
    'MS': 'America/Chicago',
    'MO': 'America/Chicago',
    'MT': 'America/Denver',
    'NE': 'America/Chicago',
    'NV': 'America/Los_Angeles',
    'NH': 'America/New_York',
    'NJ': 'America/New_York',
    'NM': 'America/Denver',
    'NY': 'America/New_York',
    'NC': 'America/New_York',
    'ND': 'America/Chicago',
    'OH': 'America/New_York',
    'OK': 'America/Chicago',
    'OR': 'America/Los_Angeles',
    'PA': 'America/New_York',
    'RI': 'America/New_York',
    'SC': 'America/New_York',
    'SD': 'America/Chicago',
    'TN': 'America/Chicago',
    'TX': 'America/Chicago',
    'UT': 'America/Denver',
    'VT': 'America/New_York',
    'VA': 'America/New_York',
    'WA': 'America/Los_Angeles',
    'WV': 'America/New_York',
    'WI': 'America/Chicago',
    'WY': 'America/Denver'
}

def local_time(df):
    df = df.copy()

    df['timezone'] = df['state'].map(state_timezone_map)
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['trans_date_trans_time'] = df['trans_date_trans_time'].dt.tz_localize('UTC')

    local_dt = df.apply(
        lambda row: row['trans_date_trans_time'].tz_convert(row['timezone']) if pd.notnull(row['timezone']) else pd.NaT,
        axis=1
    )

    time_in_seconds = local_dt.apply(
        lambda dt: dt.hour * 3600 + dt.minute * 60 + dt.second if pd.notnull(dt) else None
    )

    seconds_in_day = 24 * 60 * 60
    seconds_norm = time_in_seconds / seconds_in_day

    #acount for circularity of time of day
    ##i.e. 12:59pm is close to 1pm
    df['time_sin'] = np.sin(2 * np.pi * seconds_norm)
    df['time_cos'] = np.cos(2 * np.pi * seconds_norm)

    return df[['time_sin', 'time_cos']]


train[['time_sin', 'time_cos']] = local_time(train)
test[['time_sin', 'time_cos']] = local_time(test)

In [4]:
#2) Transform date of birth to age in years

def get_age(df):

    today = pd.Timestamp('2020-01-01')
    #could have reference 'today' be transaction time but doesn't matter much because transactions are all within two years

    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = df['dob'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

    return df['age']

train['age'] = get_age(train)
test['age'] = get_age(test)

#2b) Transform the data

###Drop categorical and numerical columns not helpful to fraud prediction. Properly encode categorical columns for XGBoost.

In [5]:
#columns to be removed
no_need = ['cc_num', 'Unnamed: 0','street', 'city',  'dob','job','first',
           'last','trans_num','trans_date_trans_time', 'lat', 'long', 'merch_lat',
           'merch_long', 'unix_time','merchant','state']


sparse_train = train.drop(columns = no_need)
sparse_train['gender'] = sparse_train['gender'].astype('category')  #XGBoost can handle categorical variable
sparse_train['category'] = sparse_train['category'].astype('category')

sparse_test = test.drop(columns = no_need)
sparse_test['gender'] = sparse_test['gender'].astype('category')
sparse_test['category'] = sparse_test['category'].astype('category')

#setup features (X) and the classification (y)
y_train, X_train  = sparse_train['is_fraud'], sparse_train.drop(columns='is_fraud')
y_test, X_test = sparse_test['is_fraud'], sparse_test.drop(columns='is_fraud')

#3) Explore the data

See `fraud_detection.ipynb` for this section.

#4) Alright, time for some modeling!

In [6]:
#Train XGB on transformed data.
#Reweight model to make fraud cases equal to non-fraud

from xgboost import XGBClassifier
from sklearn.metrics import classification_report

#find reweight
counts = y_train.value_counts()
scale_pos_weight = counts.get(0,0)/counts.get(1,0)  #use this to upweight fraud cases


# Train XGBoost classifier
clf_xgb = XGBClassifier(enable_categorical=True, device='cuda',tree_method='hist', eval_metric='logloss', scale_pos_weight=scale_pos_weight)
clf_xgb.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = clf_xgb.predict(X_test)

print(classification_report(y_test, y_pred_xgb))


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.28      0.93      0.43      2145

    accuracy                           0.99    555719
   macro avg       0.64      0.96      0.71    555719
weighted avg       1.00      0.99      0.99    555719



##Try optimizing XGBoost parameters.


1.   Random search
2.   Grid search
3.   Bayes optimization



###A) Focus on recall

In [8]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.3],
    'n_estimators': [100, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 1, 5]
}

#this randomly samples
search = RandomizedSearchCV(
    clf_xgb, param_grid, n_iter=20, scoring='recall', cv=3, verbose=2, n_jobs=-1
)

search.fit(X_train, y_train)
print("Best params:", search.best_params_)

y_pred = search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 5, 'colsample_bytree': 0.8}
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.14      0.97      0.24      2145

    accuracy                           0.98    555719
   macro avg       0.57      0.97      0.62    555719
weighted avg       1.00      0.98      0.99    555719



In [10]:
from sklearn.model_selection import GridSearchCV

#parameter space is smaller because too many options
#included parameters are default XGBoost and RandomizedSearchCV findings
param_grid = {
    'max_depth': [4, 6],
    'learning_rate': [0.05, 0.3],
    'n_estimators': [100, 300],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1],
    'gamma': [0, 5]
}

# This performs exhaustive search over the parameter grid
search = GridSearchCV(
    clf_xgb, param_grid, scoring='recall', cv=3, verbose=2, n_jobs=-1
)

search.fit(X_train, y_train)
print("Best params:", search.best_params_)

# Predict on test set using the best found model
y_pred = search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 64 candidates, totalling 192 fits
Best params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 300, 'subsample': 0.6}
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.14      0.97      0.24      2145

    accuracy                           0.98    555719
   macro avg       0.57      0.97      0.62    555719
weighted avg       1.00      0.98      0.99    555719



In [11]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.5.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.5.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.5.0 scikit-optimize-0.10.2


In [12]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

opt = BayesSearchCV(
    clf_xgb,
    {
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'max_depth': Integer(3, 10),
        'subsample': Real(0.5, 1.0),
        'n_estimators': Integer(100, 500, prior = 'log-uniform'),
        'colsample_bytree': Real(0.5, 1.0),
        'min_child_weight': Integer(1, 10, prior = 'log-uniform'),
        'gamma': Integer(0, 5)
    },
    n_iter=50,
    cv=5,
    scoring='recall',
    random_state=42
)

opt.fit(X_train, y_train)

print("Best params:", opt.best_params_)

# Predict on test set using the best found model
y_pred = opt.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Best params: OrderedDict([('colsample_bytree', 0.641714667121138), ('gamma', 0), ('learning_rate', 0.12307633054170765), ('max_depth', 4), ('min_child_weight', 1), ('n_estimators', 100), ('subsample', 1.0)])
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    553574
           1       0.13      0.97      0.23      2145

    accuracy                           0.97    555719
   macro avg       0.56      0.97      0.61    555719
weighted avg       1.00      0.97      0.98    555719



###B) Consider both precision and recall

In [13]:
param_grid = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.3],
    'n_estimators': [100, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 1, 5]
}

#this randomly samples
#average_precision maximizes AUC of precision-recall curve
search = RandomizedSearchCV(
    clf_xgb, param_grid, n_iter=20, scoring='average_precision', cv=3, verbose=2, n_jobs=-1
)

search.fit(X_train, y_train)
print("Best params:", search.best_params_)

y_pred = search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits




Best params: {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 10, 'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.8}
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.40      0.91      0.56      2145

    accuracy                           0.99    555719
   macro avg       0.70      0.95      0.78    555719
weighted avg       1.00      0.99      1.00    555719



In [15]:
#parameter space is smaller because too many options
#included parameters are default XGBoost and RandomizedSearchCV findings
param_grid = {
    'max_depth': [6,8],
    'learning_rate': [0.05, 0.3],
    'n_estimators': [100, 500],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 10],
    'gamma': [0]
}

# This performs exhaustive search over the parameter grid
search = GridSearchCV(
    clf_xgb, param_grid, scoring='average_precision', cv=3, verbose=2, n_jobs=-1
)

search.fit(X_train, y_train)
print("Best params:", search.best_params_)

# Predict on test set using the best found model
y_pred = search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 64 candidates, totalling 192 fits
Best params: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 8, 'min_child_weight': 10, 'n_estimators': 500, 'subsample': 0.6}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.42      0.90      0.58      2145

    accuracy                           0.99    555719
   macro avg       0.71      0.95      0.79    555719
weighted avg       1.00      0.99      1.00    555719



In [14]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

opt = BayesSearchCV(
    clf_xgb,
    {
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'max_depth': Integer(3, 10),
        'subsample': Real(0.5, 1.0),
        'n_estimators': Integer(100, 500, prior = 'log-uniform'),
        'colsample_bytree': Real(0.5, 1.0),
        'min_child_weight': Integer(1, 10, prior = 'log-uniform'),
        'gamma': Integer(0, 5)
    },
    n_iter=50,
    cv=5,
    scoring='average_precision',
    random_state=42
)

opt.fit(X_train, y_train)

print("Best params:", opt.best_params_)

# Predict on test set using the best found model
y_pred = opt.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Best params: OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.0889011255833442), ('max_depth', 10), ('min_child_weight', 10), ('n_estimators', 100), ('subsample', 1.0)])
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.28      0.93      0.44      2145

    accuracy                           0.99    555719
   macro avg       0.64      0.96      0.72    555719
weighted avg       1.00      0.99      0.99    555719



###Summary of results
We are able to increase recall from 0.93 to 0.97 with optimized parameters for XGBoost. When optimizing based on 'average_precision', we find that precision increases at the slight detriment of recall. The different optimization approaches yield highly similiar results except for optimizing based on 'average_precision' for BayesianSearch.