#1) Download the data

In [1]:
import kagglehub

# Fetch the most recent published version of the Kaggle dataset and
# remember the local directory it was downloaded to.
dataset_handle = "kartik2112/fraud-detection"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/fraud-detection


In [2]:
import pandas as pd
import numpy as np

# Read the train and test splits from the downloaded dataset directory.
train, test = (
    pd.read_csv(f'{path}/{csv_name}')
    for csv_name in ('fraudTrain.csv', 'fraudTest.csv')
)

#2a) Transform the data

1. Obtain the local time of day

2. Transform date of birth to age



In [3]:
#1) local transaction time

# Two-letter US postal code -> IANA time zone name used to convert a
# transaction timestamp to the cardholder's local time.
# Each state is assigned a single zone; states that span several zones
# use one representative zone (see the per-line notes).
# 'DC' is included so District of Columbia rows are not left without a zone.
state_timezone_map = {
    'AL': 'America/Chicago',
    'AK': 'America/Anchorage',
    'AZ': 'America/Phoenix',       # No DST
    'AR': 'America/Chicago',
    'CA': 'America/Los_Angeles',
    'CO': 'America/Denver',
    'CT': 'America/New_York',
    'DE': 'America/New_York',
    'DC': 'America/New_York',      # District of Columbia
    'FL': 'America/New_York',      # Most of Florida
    'GA': 'America/New_York',
    'HI': 'Pacific/Honolulu',      # No DST
    'ID': 'America/Boise',         # Split between MT and PT
    'IL': 'America/Chicago',
    'IN': 'America/Indiana/Indianapolis',
    'IA': 'America/Chicago',
    'KS': 'America/Chicago',
    'KY': 'America/New_York',
    'LA': 'America/Chicago',
    'ME': 'America/New_York',
    'MD': 'America/New_York',
    'MA': 'America/New_York',
    'MI': 'America/Detroit',
    'MN': 'America/Chicago',
    'MS': 'America/Chicago',
    'MO': 'America/Chicago',
    'MT': 'America/Denver',
    'NE': 'America/Chicago',
    'NV': 'America/Los_Angeles',
    'NH': 'America/New_York',
    'NJ': 'America/New_York',
    'NM': 'America/Denver',
    'NY': 'America/New_York',
    'NC': 'America/New_York',
    'ND': 'America/Chicago',
    'OH': 'America/New_York',
    'OK': 'America/Chicago',
    'OR': 'America/Los_Angeles',
    'PA': 'America/New_York',
    'RI': 'America/New_York',
    'SC': 'America/New_York',
    'SD': 'America/Chicago',
    'TN': 'America/Chicago',
    'TX': 'America/Chicago',
    'UT': 'America/Denver',
    'VT': 'America/New_York',
    'VA': 'America/New_York',
    'WA': 'America/Los_Angeles',
    'WV': 'America/New_York',
    'WI': 'America/Chicago',
    'WY': 'America/Denver'
}

def local_time(df, tz_map=None):
    """Encode each transaction's local time of day as a point on the unit circle.

    The timestamp in ``trans_date_trans_time`` is localized as UTC, converted
    to the time zone looked up from the row's ``state``, and the local
    seconds-since-midnight are mapped to ``(sin, cos)`` of the corresponding
    angle so that times just before and just after midnight end up close
    together.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``state`` and ``trans_date_trans_time``.
        The frame is not modified.
    tz_map : dict, optional
        Mapping from ``state`` value to IANA time zone name. Defaults to the
        module-level ``state_timezone_map``.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_sin`` and ``time_cos``, indexed like ``df``. Rows whose
        state is not in ``tz_map`` (or whose timestamp is missing) are NaN.
    """
    if tz_map is None:
        tz_map = state_timezone_map

    timezones = df['state'].map(tz_map)

    # NOTE(review): the raw timestamps are treated as UTC here - confirm
    # against the dataset documentation.
    utc_times = pd.to_datetime(df['trans_date_trans_time']).dt.tz_localize('UTC')

    # Convert one time zone at a time with the vectorised .dt accessor instead
    # of calling tz_convert on every row. Positional boolean masks keep this
    # correct even if the index contains duplicates.
    seconds_of_day = np.full(len(df), np.nan)
    for tz_name in timezones.dropna().unique():
        in_zone = timezones.eq(tz_name).fillna(False).to_numpy(dtype=bool)
        local = utc_times[in_zone].dt.tz_convert(tz_name)
        seconds_of_day[in_zone] = (
            local.dt.hour * 3600 + local.dt.minute * 60 + local.dt.second
        ).to_numpy()

    seconds_in_day = 24 * 60 * 60
    seconds_norm = seconds_of_day / seconds_in_day

    # Account for the circularity of time of day,
    # i.e. 11:59pm is close to 12:00am.
    return pd.DataFrame(
        {
            'time_sin': np.sin(2 * np.pi * seconds_norm),
            'time_cos': np.cos(2 * np.pi * seconds_norm),
        },
        index=df.index,
    )


# Attach the cyclic local-time features to both splits.
for frame in (train, test):
    frame[['time_sin', 'time_cos']] = local_time(frame)

In [4]:
#2) Transform date of birth to age in years

def get_age(df, reference_date='2020-01-01'):
    """Return each cardholder's age in whole years on ``reference_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``dob`` column parseable by ``pandas.to_datetime``.
        The frame is not modified.
    reference_date : str or pandas.Timestamp, optional
        Date on which the age is evaluated. A fixed date is used rather than
        the transaction time because the transactions all fall within two
        years, so the difference matters little.

    Returns
    -------
    pandas.Series
        Series named ``age``, indexed like ``df``.
    """
    today = pd.Timestamp(reference_date)
    dob = pd.to_datetime(df['dob'])

    # True where the birthday falls later in the year than the reference
    # date, i.e. the person has not yet had their birthday that year.
    birthday_pending = (dob.dt.month > today.month) | (
        (dob.dt.month == today.month) & (dob.dt.day > today.day)
    )

    age = today.year - dob.dt.year - birthday_pending.astype(int)
    return age.rename('age')

# Add the derived age column to both splits.
for frame in (train, test):
    frame['age'] = get_age(frame)

#2b) Transform the data
####Use one-hot encoding since not all models can work with categorical data.

In [5]:
# Columns excluded from the feature matrix (identifiers, free text, raw
# timestamps and coordinates).
no_need = [
    'cc_num', 'Unnamed: 0', 'street', 'city', 'dob', 'job', 'first',
    'last', 'trans_num', 'trans_date_trans_time', 'lat', 'long', 'merch_lat',
    'merch_long', 'unix_time', 'merchant', 'state',
]

# Categorical columns that get expanded into indicator columns.
categorical_columns = ['gender', 'category']

# Stack the two trimmed splits under "train"/"test" keys and encode them
# together so both end up with the same set of indicator columns.
combined = pd.get_dummies(
    pd.concat(
        [train.drop(columns=no_need), test.drop(columns=no_need)],
        keys=["train", "test"],
    ),
    columns=categorical_columns,
    drop_first=False,
)

# Recover the individual splits from the outer index level.
sparse_train = combined.xs("train")
sparse_test = combined.xs("test")

# Separate the label (is_fraud) from the features for each split.
X_train = sparse_train.drop(columns='is_fraud')
y_train = sparse_train['is_fraud']
X_test = sparse_test.drop(columns='is_fraud')
y_test = sparse_test['is_fraud']

#3) Explore the data

This section is skipped. See `features/one_shot/one_shot.ipynb` for the details.

#4) Alright, time for some modeling!
1. Catboost
2. Random Forest


In [7]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [8]:
#1) Catboost

from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Baseline CatBoost model with default class weighting.
# Training is requested on GPU device 0 (the code does not fall back to CPU).
catboost_params = dict(
    task_type='GPU',
    devices='0',
    verbose=0,              # keep training silent
    eval_metric='Logloss',
)
clf_cat = CatBoostClassifier(**catboost_params)
clf_cat.fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall.
y_pred_cat = clf_cat.predict(X_test)
report_cat = classification_report(y_test, y_pred_cat)

print(report_cat)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.89      0.75      0.81      2145

    accuracy                           1.00    555719
   macro avg       0.94      0.87      0.90    555719
weighted avg       1.00      1.00      1.00    555719



In [9]:
#1) Catboost with reweights

# Weight the positive (fraud) class by the ratio of class-0 to class-1
# training examples; class 0 keeps weight 1.
counts = y_train.value_counts()
n_class_0 = counts.get(0, 0)
n_class_1 = counts.get(1, 1)
weight_for_class_0, weight_for_class_1 = 1.0, n_class_0 / n_class_1

class_weights = [weight_for_class_0, weight_for_class_1]

# Same CatBoost setup as the baseline, plus the class weights.
weighted_catboost_params = dict(
    task_type='GPU',
    devices='0',
    verbose=0,
    eval_metric='Logloss',
    class_weights=class_weights,
)
clf_cat = CatBoostClassifier(**weighted_catboost_params)
clf_cat.fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall.
y_pred_cat = clf_cat.predict(X_test)
report_cat = classification_report(y_test, y_pred_cat)

print(report_cat)


              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.17      0.97      0.28      2145

    accuracy                           0.98    555719
   macro avg       0.58      0.98      0.64    555719
weighted avg       1.00      0.98      0.99    555719



In [10]:
#2) Random Forest
#no gpu implementation so have to be patient...
from sklearn.ensemble import RandomForestClassifier


# Baseline Random Forest: 100 trees, fixed seed, all CPU cores.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
clf_rf = RandomForestClassifier(**rf_params)
clf_rf.fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall.
y_pred_rf = clf_rf.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf)

print(report_rf)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.92      0.73      0.81      2145

    accuracy                           1.00    555719
   macro avg       0.96      0.86      0.91    555719
weighted avg       1.00      1.00      1.00    555719



In [11]:
#2) Random Forest with reweights

# Weight the positive (fraud) class by the ratio of class-0 to class-1
# training examples; class 0 keeps weight 1.
counts = y_train.value_counts()
n_class_0 = counts.get(0, 0)
n_class_1 = counts.get(1, 1)
weight_for_class_0, weight_for_class_1 = 1.0, n_class_0 / n_class_1

class_weights = {0: weight_for_class_0, 1: weight_for_class_1}

# Same Random Forest setup as the baseline, plus the class weights.
weighted_rf_params = dict(
    class_weight=class_weights,
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
clf_rf = RandomForestClassifier(**weighted_rf_params)
clf_rf.fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall.
y_pred_rf = clf_rf.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf)

print(report_rf)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.93      0.72      0.81      2145

    accuracy                           1.00    555719
   macro avg       0.96      0.86      0.90    555719
weighted avg       1.00      1.00      1.00    555719



The recall of CatBoost with reweights is better than that of XGBoost with reweights (though its precision is poorer). We can get similar results for precision and recall with XGBoost when optimizing the parameters for recall.

Interestingly, Random Forest results are not affected by reweighting. Results are essentially the same in both cases.