# Training

In [6]:
import numpy as np
import pandas as pd

from geopy.distance import great_circle
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier, Pool

Helper functions to process data and lists of names for the features of different types

In [7]:
features = ['merch', 'cat_id', 'gender', 'one_city', 'us_state', 'jobs', 'hour', 'year', 'month', 'day_of_month', 'day_of_week', 'amount', 'population_city', 'distance']
cat_cols = ['merch', 'cat_id', 'gender', 'one_city', 'us_state', 'jobs', 'hour', 'year', 'month', 'day_of_month', 'day_of_week']
cont_cols = ['amount', 'population_city', 'distance']

In [8]:
def extract_time_features(df):
    """Extracts time-based features from the transaction_time column."""
     
    df['transaction_time'] = pd.to_datetime(df['transaction_time'])
    df['hour'] = df['transaction_time'].dt.hour
    df['year'] = df['transaction_time'].dt.year
    df['month'] = df['transaction_time'].dt.month
    df['day_of_month'] = df['transaction_time'].dt.day
    df['day_of_week'] = df['transaction_time'].dt.dayofweek
    
    return df

def calculate_distance(df):
    """Calculates the distance between customer and merchant."""

    df['distance'] = df.apply(
        lambda x: great_circle(
            (x['lat'], x['lon']), 
            (x['merchant_lat'], x['merchant_lon'])
        ).km,
        axis=1
    )
    
    return df

def run_preproc(input_df):
    """Runs the preprocessing pipeline."""
    
    df = input_df.copy()
    df = extract_time_features(df)
    df = calculate_distance(df)

    # Impute categorical features with the most frequent value
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
    
    # Impute continuous features with the mean
    cont_imputer = SimpleImputer(strategy='mean')
    df[cont_cols] = cont_imputer.fit_transform(df[cont_cols])
    
    return df[features]

In [4]:
train_data = pd.read_csv('../data/train.csv')

In [9]:
train_data_proc = run_preproc(train_data)
train_label = train_data['target']

First we split training data to train and validation and check the metrics. If they're OK, we'll retrain on the whole dataset

In [10]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_val, y_train, y_val = train_test_split(train_data_proc, train_label, test_size=0.2, random_state=42, stratify=train_label)

In [15]:
pool_train = Pool(data=X_train, label=y_train, cat_features=cat_cols)
pool_val = Pool(data=X_val, label=y_val, cat_features=cat_cols)

We cann see, that the task is imbalanced

In [16]:
y_train.value_counts()

target
0    625541
1      3603
Name: count, dtype: int64

Given the imbalanced nature of the task, we assign weights

In [17]:
minor_class_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
dict_weight = {0: 1, 1: minor_class_weight}

In [18]:
model = CatBoostClassifier(iterations=50, class_weights=dict_weight, eval_metric='PRAUC', one_hot_max_size=50, random_seed=42)

In [19]:
model.fit(pool_train, eval_set=pool_val, plot=True, early_stopping_rounds=50)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.9494423	test: 0.9464673	best: 0.9464673 (0)	total: 420ms	remaining: 20.6s
1:	learn: 0.9671675	test: 0.9663581	best: 0.9663581 (1)	total: 723ms	remaining: 17.4s
2:	learn: 0.9741084	test: 0.9726790	best: 0.9726790 (2)	total: 1.02s	remaining: 16s
3:	learn: 0.9742348	test: 0.9781114	best: 0.9781114 (3)	total: 1.25s	remaining: 14.4s
4:	learn: 0.9805049	test: 0.9817951	best: 0.9817951 (4)	total: 1.48s	remaining: 13.3s
5:	learn: 0.9850306	test: 0.9840122	best: 0.9840122 (5)	total: 1.61s	remaining: 11.8s
6:	learn: 0.9859770	test: 0.9851912	best: 0.9851912 (6)	total: 1.79s	remaining: 11s
7:	learn: 0.9868444	test: 0.9860871	best: 0.9860871 (7)	total: 1.99s	remaining: 10.4s
8:	learn: 0.9884304	test: 0.9877156	best: 0.9877156 (8)	total: 2.21s	remaining: 10.1s
9:	learn: 0.9892365	test: 0.9884323	best: 0.9884323 (9)	total: 2.37s	remaining: 9.47s
10:	learn: 0.9921513	test: 0.9912025	best: 0.9912025 (10)	total: 2.56s	remaining: 9.07s
11:	learn: 0.9923175	test: 0.99

<catboost.core.CatBoostClassifier at 0x1351a0100>

The metrics are descent, so we will proceed with this model. Now we would like to find the best threshold based on `cohen_kappa_score` and `matthews_corrcoef`

In [21]:
from sklearn.metrics import cohen_kappa_score

In [22]:
pred_val = model.predict_proba(X_val)[:, 1]

In [23]:
thresholds = np.arange(0.0, 1.01, 0.01)
best_cohen = -1.0
best_threshold = 0.0

for threshold in thresholds:
    cohen = cohen_kappa_score(y_val, np.where(pred_val > threshold, 1, 0))
    if cohen > best_cohen:
        best_cohen = cohen
        best_threshold = threshold


In [26]:
print(f'Best cohen_cappa_score: {best_cohen}')
print(f'Best threshold: {best_threshold}')

Best cohen_cappa_score: 0.7123620113225572
Best threshold: 0.98


Now we can train the model on the whole train dataset

In [27]:
train_data = pd.read_csv('../data/train.csv')

train_data_proc = run_preproc(train_data)
train_label = train_data['target']

pool_train = Pool(data=train_data_proc, label=train_label, cat_features=cat_cols)

In [29]:
minor_class_weight = train_label.value_counts()[0] / train_label.value_counts()[1]
dict_weight = {0: 1, 1: minor_class_weight}

In [30]:
model = CatBoostClassifier(iterations=50, class_weights=dict_weight, eval_metric='PRAUC', one_hot_max_size=50, random_seed=42)

In [31]:
model.fit(pool_train, eval_set=pool_train, plot=True, early_stopping_rounds=50)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.9508108	test: 0.9508108	best: 0.9508108 (0)	total: 171ms	remaining: 8.36s
1:	learn: 0.9693717	test: 0.9693717	best: 0.9693717 (1)	total: 316ms	remaining: 7.58s
2:	learn: 0.9748894	test: 0.9748894	best: 0.9748894 (2)	total: 462ms	remaining: 7.23s
3:	learn: 0.9778988	test: 0.9778988	best: 0.9778988 (3)	total: 633ms	remaining: 7.28s
4:	learn: 0.9808035	test: 0.9808035	best: 0.9808035 (4)	total: 793ms	remaining: 7.14s
5:	learn: 0.9831287	test: 0.9831287	best: 0.9831287 (5)	total: 979ms	remaining: 7.18s
6:	learn: 0.9876798	test: 0.9876798	best: 0.9876798 (6)	total: 1.18s	remaining: 7.22s
7:	learn: 0.9888230	test: 0.9888230	best: 0.9888230 (7)	total: 1.33s	remaining: 7.01s
8:	learn: 0.9891194	test: 0.9891194	best: 0.9891194 (8)	total: 1.55s	remaining: 7.05s
9:	learn: 0.9905486	test: 0.9905486	best: 0.9905486 (9)	total: 1.73s	remaining: 6.92s
10:	learn: 0.9910163	test: 0.9910163	best: 0.9910163 (10)	total: 1.91s	remaining: 6.77s
11:	learn: 0.9920214	test: 

<catboost.core.CatBoostClassifier at 0x136b4b430>

In [33]:
model.save_model('../models/fraud_detection_model.cbm')