In [37]:
import pandas as pd
import numpy as np
import sys

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from geopy.distance import great_circle

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    confusion_matrix,
    classification_report
)

import matplotlib.pyplot as plt

from catboost import CatBoostClassifier

In [3]:
%load_ext autoreload
%autoreload 2

In [23]:
target_col = 'target'
categorical_cols = ['gender', 'merch', 'cat_id', 'one_city', 'us_state', 'jobs']
n_cats = 50

# Import Train dataset
train = pd.read_csv('../train_data/train.csv').drop(columns=['name_1', 'name_2', 'street', 'post_code'])
print(f"Train shape: {train.shape}")

Train shape: (786431, 14)


In [24]:
def add_time_features(df):
    df['transaction_time'] = pd.to_datetime(df['transaction_time'])
    dt = df['transaction_time'].dt
    df['hour'] = dt.hour
    df['year'] = dt.year
    df['month'] = dt.month
    df['day_of_month'] = dt.day
    df['day_of_week'] = dt.dayofweek
    df.drop(columns='transaction_time', inplace=True)
    return df

# Add some simple time features
train = add_time_features(train)
train.head()

Unnamed: 0,merch,cat_id,amount,gender,one_city,us_state,lat,lon,population_city,jobs,merchant_lat,merchant_lon,target,hour,year,month,day_of_month,day_of_week
0,fraud_Cormier LLC,health_fitness,148.04,M,Kent,OR,45.0838,-120.6649,60,Museum education officer,45.042827,-120.709327,0,15,2019,12,27,4
1,"fraud_Brown, Homenick and Lesch",health_fitness,39.4,F,Plantersville,AL,32.6176,-86.9475,1412,Drilling engineer,31.872266,-87.828247,0,23,2019,4,17,2
2,fraud_Ruecker-Mayert,kids_pets,52.96,M,Mount Perry,OH,39.8788,-82.188,1831,Barrister's clerk,40.010874,-81.841249,0,15,2019,9,23,0
3,"fraud_Mante, Luettgen and Hackett",health_fitness,7.66,F,Ratcliff,TX,31.3833,-95.0619,43,"Engineer, production",30.888406,-95.141609,0,16,2019,5,13,0
4,fraud_Luettgen PLC,gas_transport,51.59,F,Blairsden-Graeagle,CA,39.8127,-120.6405,1725,Chartered legal executive (England and Wales),39.376017,-121.311691,0,7,2019,8,18,6


In [25]:
for col in categorical_cols:
    new_col = col + '_cat'

    # Get table of categories
    temp_df = train\
        .groupby(col, dropna=False)[[target_col]]\
        .count()\
        .sort_values(target_col, ascending=False)\
        .reset_index()\
        .set_axis([col, 'count'], axis=1)\
        .reset_index()
    temp_df['index'] = temp_df.apply(lambda x: np.nan if pd.isna(x[col]) else x['index'], axis=1)
    temp_df[new_col] = ['cat_NAN' if pd.isna(x) else 'cat_' + str(x) if x < n_cats else f'cat_{n_cats}+' for x in temp_df['index']]

    train = train.merge(temp_df[[col, new_col]], how='left', on=col)

train.head()

Unnamed: 0,merch,cat_id,amount,gender,one_city,us_state,lat,lon,population_city,jobs,...,year,month,day_of_month,day_of_week,gender_cat,merch_cat,cat_id_cat,one_city_cat,us_state_cat,jobs_cat
0,fraud_Cormier LLC,health_fitness,148.04,M,Kent,OR,45.0838,-120.6649,60,Museum education officer,...,2019,12,27,4,cat_1,cat_1,cat_9,cat_50+,cat_30,cat_50+
1,"fraud_Brown, Homenick and Lesch",health_fitness,39.4,F,Plantersville,AL,32.6176,-86.9475,1412,Drilling engineer,...,2019,4,17,2,cat_0,cat_50+,cat_9,cat_50+,cat_8,cat_50+
2,fraud_Ruecker-Mayert,kids_pets,52.96,M,Mount Perry,OH,39.8788,-82.188,1831,Barrister's clerk,...,2019,9,23,0,cat_1,cat_50+,cat_4,cat_50+,cat_4,cat_50+
3,"fraud_Mante, Luettgen and Hackett",health_fitness,7.66,F,Ratcliff,TX,31.3833,-95.0619,43,"Engineer, production",...,2019,5,13,0,cat_0,cat_50+,cat_9,cat_50+,cat_0,cat_50+
4,fraud_Luettgen PLC,gas_transport,51.59,F,Blairsden-Graeagle,CA,39.8127,-120.6405,1725,Chartered legal executive (England and Wales),...,2019,8,18,6,cat_0,cat_38,cat_0,cat_50+,cat_3,cat_50+


In [26]:
def add_distance_features(df):
    df['distance'] = df.apply(
        lambda x: great_circle(
            (x['lat'], x['lon']), 
            (x['merchant_lat'], x['merchant_lon'])
        ).km,
        axis=1
    )
    return df.drop(columns=['lat', 'lon', 'merchant_lat', 'merchant_lon'])


# Calculate distance between a client and a merchant
train = add_distance_features(train)
train.head()

Unnamed: 0,merch,cat_id,amount,gender,one_city,us_state,population_city,jobs,target,hour,...,month,day_of_month,day_of_week,gender_cat,merch_cat,cat_id_cat,one_city_cat,us_state_cat,jobs_cat,distance
0,fraud_Cormier LLC,health_fitness,148.04,M,Kent,OR,60,Museum education officer,0,15,...,12,27,4,cat_1,cat_1,cat_9,cat_50+,cat_30,cat_50+,5.738663
1,"fraud_Brown, Homenick and Lesch",health_fitness,39.4,F,Plantersville,AL,1412,Drilling engineer,0,23,...,4,17,2,cat_0,cat_50+,cat_9,cat_50+,cat_8,cat_50+,117.172347
2,fraud_Ruecker-Mayert,kids_pets,52.96,M,Mount Perry,OH,1831,Barrister's clerk,0,15,...,9,23,0,cat_1,cat_50+,cat_4,cat_50+,cat_4,cat_50+,33.007314
3,"fraud_Mante, Luettgen and Hackett",health_fitness,7.66,F,Ratcliff,TX,43,"Engineer, production",0,16,...,5,13,0,cat_0,cat_50+,cat_9,cat_50+,cat_0,cat_50+,55.550248
4,fraud_Luettgen PLC,gas_transport,51.59,F,Blairsden-Graeagle,CA,1725,Chartered legal executive (England and Wales),0,7,...,8,18,6,cat_0,cat_38,cat_0,cat_50+,cat_3,cat_50+,75.267367


In [27]:
train.columns

Index(['merch', 'cat_id', 'amount', 'gender', 'one_city', 'us_state',
       'population_city', 'jobs', 'target', 'hour', 'year', 'month',
       'day_of_month', 'day_of_week', 'gender_cat', 'merch_cat', 'cat_id_cat',
       'one_city_cat', 'us_state_cat', 'jobs_cat', 'distance'],
      dtype='object')

In [29]:
cat_cols = [col + '_cat' for col in categorical_cols] + ['hour', 'year', 'month',
       'day_of_month', 'day_of_week']
num_cols = ['amount', 'population_city', 'distance']

In [30]:
cat_cols

['gender_cat',
 'merch_cat',
 'cat_id_cat',
 'one_city_cat',
 'us_state_cat',
 'jobs_cat',
 'hour',
 'year',
 'month',
 'day_of_month',
 'day_of_week']

In [31]:
num_cols

['amount', 'population_city', 'distance']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    train[cat_cols + num_cols], train[target_col], train_size=0.8, random_state=777
)

In [33]:
model = CatBoostClassifier(iterations=100)

In [34]:
model.fit(X_train, y_train, cat_features=cat_cols)

Learning rate set to 0.5
0:	learn: 0.3210446	total: 194ms	remaining: 19.2s
1:	learn: 0.1814655	total: 300ms	remaining: 14.7s
2:	learn: 0.1108278	total: 403ms	remaining: 13s
3:	learn: 0.0718656	total: 542ms	remaining: 13s
4:	learn: 0.0499437	total: 685ms	remaining: 13s
5:	learn: 0.0375698	total: 786ms	remaining: 12.3s
6:	learn: 0.0280514	total: 889ms	remaining: 11.8s
7:	learn: 0.0233914	total: 998ms	remaining: 11.5s
8:	learn: 0.0205856	total: 1.11s	remaining: 11.2s
9:	learn: 0.0186199	total: 1.22s	remaining: 10.9s
10:	learn: 0.0153995	total: 1.34s	remaining: 10.9s
11:	learn: 0.0148536	total: 1.5s	remaining: 11s
12:	learn: 0.0141588	total: 1.63s	remaining: 10.9s
13:	learn: 0.0140661	total: 1.75s	remaining: 10.8s
14:	learn: 0.0137555	total: 1.94s	remaining: 11s
15:	learn: 0.0135592	total: 2.06s	remaining: 10.8s
16:	learn: 0.0135555	total: 2.18s	remaining: 10.6s
17:	learn: 0.0131929	total: 2.3s	remaining: 10.5s
18:	learn: 0.0131886	total: 2.38s	remaining: 10.2s
19:	learn: 0.0130084	total: 

<catboost.core.CatBoostClassifier at 0x1183b3f40>

In [38]:
y_proba = model.predict_proba(X_test)[:, 1]

y_pred_default = (y_proba >= 0.5).astype(int)

# 3. Compute robust metrics
auc_roc = roc_auc_score(y_test, y_proba)
auc_pr = average_precision_score(y_test, y_proba)  # Better for imbalanced data

print(f"ROC-AUC: {auc_roc:.4f}")
print(f"PR-AUC (Average Precision): {auc_pr:.4f}")

# 4. Optional: Find optimal threshold using Precision-Recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# F1-score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5

print(f"Optimal threshold (max F1): {optimal_threshold:.4f}")
print(f"Max F1-score: {f1_scores[optimal_idx]:.4f}")

# 5. Make predictions with optimal threshold
y_pred_opt = (y_proba >= optimal_threshold).astype(int)

# 6. Detailed evaluation
print("\n--- Classification Report (Optimal Threshold) ---")
print(classification_report(y_test, y_pred_opt, target_names=['Class 0', 'Class 1']))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix

ROC-AUC: 0.9757
PR-AUC (Average Precision): 0.7196
Optimal threshold (max F1): 0.3318
Max F1-score: 0.6840

--- Classification Report (Optimal Threshold) ---
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00    156374
     Class 1       0.69      0.68      0.68       913

    accuracy                           1.00    157287
   macro avg       0.84      0.84      0.84    157287
weighted avg       1.00      1.00      1.00    157287


--- Confusion Matrix ---


In [39]:
model.save_model("../models/my_model.cbm")