In [32]:
!pip3 install -q kagglehub

from pathlib import Path
import os, json, gc
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [33]:
DATASET_ID = "computingvictor/transactions-fraud-datasets"
DATASET_FOLDER_NAME = "transactions-fraud-datasets"

N_CLIENTS = 300
SEQ_LEN = 10
MAX_WINDOWS_TRAIN = 50_000
MAX_WINDOWS_VAL   = 20_000
MAX_WINDOWS_TEST  = 20_000

BATCH_SIZE = 128
EPOCHS = 15
LR = 1e-3

POS_WEIGHT_CAP = 50.0
SEED = 0
# =====================

np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

device: cpu


In [34]:
import kagglehub

def find_repo_root():
    cur = Path.cwd().resolve()
    for p in [cur] + list(cur.parents):
        if (p / ".git").exists():
            return p
    return cur

def resolve_dataset_dir():
    env = os.getenv("FRAUD_DATA_DIR")
    if env:
        p = Path(env).expanduser().resolve()
        if p.exists():
            return p

    repo_root = find_repo_root()
    local = repo_root / "data" / DATASET_FOLDER_NAME
    if local.exists():
        return local.resolve()

    print("Dataset not found locally - downloading via kagglehub...")
    return Path(kagglehub.dataset_download(DATASET_ID)).resolve()

def parse_amount(x):
    x = str(x).replace("$","").replace(",","").strip()
    try:
        return float(x)
    except:
        return np.nan

In [35]:
# Load Labels + Transactions Subset

dataset_dir = resolve_dataset_dir()
print("Using dataset directory:", dataset_dir)

tx_path = dataset_dir / "transactions_data.csv"
labels_path = dataset_dir / "train_fraud_labels.json"

assert tx_path.exists(), f"Missing {tx_path}"
assert labels_path.exists(), f"Missing {labels_path}"

with open(labels_path, "r") as f:
    labels_raw = json.load(f)

target_map = labels_raw["target"]
labels = pd.DataFrame({
    "transaction_id": list(target_map.keys()),
    "target": [1 if v == "Yes" else 0 for v in target_map.values()]
})
labels["transaction_id"] = labels["transaction_id"].astype(str)
labels["target"] = labels["target"].astype(int)

usecols = ["id","date","client_id","card_id","amount","use_chip","merchant_state","mcc","errors"]

# choose clients from an early slice
first_chunk = pd.read_csv(tx_path, usecols=["client_id"], nrows=200_000)
clients = first_chunk["client_id"].dropna().unique()

chosen_clients = set(np.random.choice(clients, size=min(N_CLIENTS, len(clients)), replace=False))

chunks = []
for chunk in pd.read_csv(tx_path, usecols=usecols, chunksize=200_000):
    keep = chunk[chunk["client_id"].isin(chosen_clients)]
    if len(keep):
        chunks.append(keep)

tx = pd.concat(chunks, ignore_index=True)
tx["id"] = tx["id"].astype(str)

tx = tx.merge(labels, left_on="id", right_on="transaction_id", how="left").drop(columns=["transaction_id"])

print("Rows:", len(tx))
print("Labeled rows:", tx["target"].notna().sum())
tx.head()

Dataset not found locally - downloading via kagglehub...
Using dataset directory: /Users/glennc/.cache/kagglehub/datasets/computingvictor/transactions-fraud-datasets/versions/1
Rows: 3358392
Labeled rows: 2249957


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors,target
0,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,IA,5311,,0.0
1,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,CA,4829,,0.0
2,7475333,2010-01-01 00:07:00,1807,165,$4.81,Swipe Transaction,NY,5942,,0.0
3,7475337,2010-01-01 00:21:00,351,1112,$10.74,Swipe Transaction,NY,5813,,
4,7475344,2010-01-01 00:32:00,646,2093,$73.79,Swipe Transaction,PA,7538,,0.0


In [36]:
# Clean + Basic Feature Columns

tx["date"] = pd.to_datetime(tx["date"], errors="coerce")
tx = tx.dropna(subset=["date"])

tx["amount"] = tx["amount"].apply(parse_amount)
tx = tx.dropna(subset=["amount"])

tx["errors"] = tx["errors"].fillna("None").astype(str)
tx["use_chip"] = tx["use_chip"].fillna("Unknown").astype(str)
tx["merchant_state"] = tx["merchant_state"].fillna("Unknown").astype(str)
tx["mcc"] = tx["mcc"].fillna(-1).astype(int).astype(str)

tx = tx.sort_values(["client_id","date"]).reset_index(drop=True)

tx["hour"] = tx["date"].dt.hour
tx["dayofweek"] = tx["date"].dt.dayofweek

tx.head()

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors,target,hour,dayofweek
0,7477094,2010-01-01 11:58:00,1,4652,15.09,Swipe Transaction,FL,4121,,0.0,11,4
1,7477168,2010-01-01 12:11:00,1,3682,6.01,Swipe Transaction,FL,5813,,0.0,12,4
2,7477216,2010-01-01 12:18:00,1,3682,14.58,Online Transaction,Unknown,4121,,0.0,12,4
3,7477978,2010-01-01 15:09:00,1,4652,14.66,Online Transaction,Unknown,4121,,0.0,15,4
4,7478279,2010-01-01 16:26:00,1,4652,22.77,Swipe Transaction,FL,4121,,,16,4


In [37]:
def time_split(df, client_col="client_id", frac_train=0.70, frac_val=0.15, min_len=20):
    train_idx, val_idx, test_idx = [], [], []
    for cid, g in df.groupby(client_col, sort=False):
        n = len(g)
        if n < min_len:
            continue
        t1 = int(n * frac_train)
        t2 = int(n * (frac_train + frac_val))
        idx = g.index.to_numpy()
        train_idx.append(idx[:t1])
        val_idx.append(idx[t1:t2])
        test_idx.append(idx[t2:])
    return np.concatenate(train_idx), np.concatenate(val_idx), np.concatenate(test_idx)

train_idx, val_idx, test_idx = time_split(tx)

train_df = tx.loc[train_idx].copy()
val_df   = tx.loc[val_idx].copy()
test_df  = tx.loc[test_idx].copy()

print(len(train_df), len(val_df), len(test_df))

2350739 503758 503895


In [38]:
# Encode + Scale

num_cols = ["amount", "hour", "dayofweek"]
cat_cols = ["use_chip", "merchant_state", "mcc", "errors"]

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
scaler = StandardScaler()

Xtr_cat = enc.fit_transform(train_df[cat_cols])
Xva_cat = enc.transform(val_df[cat_cols])
Xte_cat = enc.transform(test_df[cat_cols])

Xtr_num = scaler.fit_transform(train_df[num_cols])
Xva_num = scaler.transform(val_df[num_cols])
Xte_num = scaler.transform(test_df[num_cols])

X_train = np.hstack([Xtr_num, Xtr_cat]).astype(np.float32)
X_val   = np.hstack([Xva_num, Xva_cat]).astype(np.float32)
X_test  = np.hstack([Xte_num, Xte_cat]).astype(np.float32)

y_train = train_df["target"].to_numpy()
y_val   = val_df["target"].to_numpy()
y_test  = test_df["target"].to_numpy()

X_train.shape

(2350739, 7)

In [39]:
def build_sequences_sampled(df_part, X_part, y_part, seq_len=10, max_windows=50_000, client_col="client_id", per_client_cap=50, seed=0):
    X_seqs = np.zeros((max_windows, seq_len, X_part.shape[1]), dtype=np.float32)
    y_seqs = np.zeros((max_windows,), dtype=np.int64)

    k = 0
    start = 0
    rng = np.random.default_rng(seed)

    for cid, g in df_part.groupby(client_col, sort=False):
        n = len(g)
        if n <= seq_len:
            start += n
            continue

        Xg = X_part[start:start+n]
        yg = y_part[start:start+n]

        valid_t = np.where(~np.isnan(yg))[0]
        valid_t = valid_t[valid_t >= seq_len]
        if len(valid_t) == 0:
            start += n
            continue

        take = min(len(valid_t), per_client_cap)
        chosen = rng.choice(valid_t, size=take, replace=False)

        for t in chosen:
            if k >= max_windows:
                return X_seqs[:k], y_seqs[:k]
            X_seqs[k] = Xg[t-seq_len:t]
            y_seqs[k] = int(yg[t])
            k += 1

        start += n

    return X_seqs[:k], y_seqs[:k]


Xtr_seq, ytr_seq = build_sequences_sampled(train_df, X_train, y_train, seq_len=SEQ_LEN, max_windows=MAX_WINDOWS_TRAIN, seed=SEED)
Xva_seq, yva_seq = build_sequences_sampled(val_df,   X_val,   y_val,   seq_len=SEQ_LEN, max_windows=MAX_WINDOWS_VAL, seed=SEED)
Xte_seq, yte_seq = build_sequences_sampled(test_df,  X_test,  y_test,  seq_len=SEQ_LEN, max_windows=MAX_WINDOWS_TEST, seed=SEED)

print("Train windows:", Xtr_seq.shape, "fraud rate:", ytr_seq.mean(), "count:", int(ytr_seq.sum()))
print("Val windows:",   Xva_seq.shape, "fraud rate:", yva_seq.mean(), "count:", int(yva_seq.sum()))
print("Test windows:",  Xte_seq.shape, "fraud rate:", yte_seq.mean(), "count:", int(yte_seq.sum()))

Train windows: (15000, 10, 7) fraud rate: 0.0018 count: 27
Val windows: (15000, 10, 7) fraud rate: 0.001 count: 15
Test windows: (15000, 10, 7) fraud rate: 0.0018 count: 27


In [40]:
def compute_metrics(y_true, y_prob, threshold=0.5):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob)

    y_pred = (y_prob >= threshold).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    fpr = ((y_pred == 1) & (y_true == 0)).sum() / max((y_true == 0).sum(), 1)

    roc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan
    pr  = average_precision_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan
    return {"precision": prec, "recall": rec, "f1": f1, "fpr": fpr, "roc_auc": roc, "pr_auc": pr}

def best_threshold_f1(y_true, y_prob, thresholds=None, min_pred_pos=1):
    """
    Picks threshold that maximizes F1 on validation, but avoids degenerate thresholds
    that predict 0 positives (or fewer than min_pred_pos).

    Uses quantiles of y_prob as candidate thresholds by default (more stable than fixed linspace).
    """
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob)

    if thresholds is None:
        qs = np.linspace(0.01, 0.99, 300)
        thresholds = np.unique(np.quantile(y_prob, qs))

    best_t, best_f1, best_m = 0.5, -1, None
    for t in thresholds:
        y_pred = (y_prob >= t).astype(int)
        if y_pred.sum() < min_pred_pos:
            continue
        m = compute_metrics(y_true, y_prob, threshold=float(t))
        if m["f1"] > best_f1:
            best_f1, best_t, best_m = m["f1"], float(t), m

    if best_m is None:
        best_t, best_m = 0.5, compute_metrics(y_true, y_prob, threshold=0.5)

    return best_t, best_m

In [41]:
# Sklearn Baselines

Xtr_flat = Xtr_seq.reshape(Xtr_seq.shape[0], -1)
Xva_flat = Xva_seq.reshape(Xva_seq.shape[0], -1)
Xte_flat = Xte_seq.reshape(Xte_seq.shape[0], -1)

ytr = ytr_seq.astype(np.int64)
yva = yva_seq.astype(np.int64)
yte = yte_seq.astype(np.int64)

baseline_results = {}

# Logistic Regression
logreg = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)
logreg.fit(Xtr_flat, ytr)

va_prob_lr = logreg.predict_proba(Xva_flat)[:, 1]
best_t_lr, va_metrics_lr = best_threshold_f1(yva, va_prob_lr)
te_prob_lr = logreg.predict_proba(Xte_flat)[:, 1]
te_metrics_lr = compute_metrics(yte, te_prob_lr, threshold=best_t_lr)

baseline_results["LogReg"] = {"best_t": best_t_lr, "val": va_metrics_lr, "test": te_metrics_lr}
print("LogReg:", baseline_results["LogReg"])

# Random Forest
rf = RandomForestClassifier(n_estimators=300, min_samples_leaf=2, class_weight="balanced_subsample", n_jobs=-1, random_state=SEED)
rf.fit(Xtr_flat, ytr)

va_prob_rf = rf.predict_proba(Xva_flat)[:, 1]
best_t_rf, va_metrics_rf = best_threshold_f1(yva, va_prob_rf)
te_prob_rf = rf.predict_proba(Xte_flat)[:, 1]
te_metrics_rf = compute_metrics(yte, te_prob_rf, threshold=best_t_rf)

baseline_results["RandomForest"] = {"best_t": best_t_rf, "val": va_metrics_rf, "test": te_metrics_rf}
print("RandomForest:", baseline_results["RandomForest"])

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


LogReg: {'best_t': 0.9963938021919775, 'val': {'precision': 0.003355704697986577, 'recall': 0.06666666666666667, 'f1': 0.006389776357827476, 'fpr': np.float64(0.01981981981981982), 'roc_auc': np.float64(0.5006161717272828), 'pr_auc': np.float64(0.0012614757612957484)}, 'test': {'precision': 0.0030211480362537764, 'recall': 0.037037037037037035, 'f1': 0.00558659217877095, 'fpr': np.float64(0.022039671408535362), 'roc_auc': np.float64(0.5150802308352566), 'pr_auc': np.float64(0.002514096476983739)}}
RandomForest: {'best_t': 0.019974690937071337, 'val': {'precision': 0.002881844380403458, 'recall': 0.06666666666666667, 'f1': 0.0055248618784530384, 'fpr': np.float64(0.023089756423089757), 'roc_auc': np.float64(0.5373462351240129), 'pr_auc': np.float64(0.0011903203970417913)}, 'test': {'precision': 0.0029069767441860465, 'recall': 0.037037037037037035, 'f1': 0.005390835579514825, 'fpr': np.float64(0.022907900888265546), 'roc_auc': np.float64(0.5216599756104197), 'pr_auc': np.float64(0.00215

In [42]:
# Torch Dataset + Loaders

class SeqDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y).float()
    def __len__(self):
        return len(self.y)
    def __getitem__(self, i):
        return self.X[i], self.y[i]

train_loader = DataLoader(SeqDataset(Xtr_seq, ytr_seq), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(SeqDataset(Xva_seq, yva_seq), batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(SeqDataset(Xte_seq, yte_seq), batch_size=BATCH_SIZE, shuffle=False)

In [43]:
# Model Definitions (CNN, CNN-RNN)

class FraudCNNRNN(nn.Module):
    def __init__(self, num_features, conv_channels=64, kernel_size=3, hidden_size=64, num_layers=1, dropout=0.1):
        super().__init__()
        padding = kernel_size // 2
        self.conv = nn.Sequential(
            nn.Conv1d(num_features, conv_channels, kernel_size, padding=padding),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Conv1d(conv_channels, conv_channels, kernel_size, padding=padding),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        self.rnn = nn.LSTM(
            input_size=conv_channels,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=(dropout if num_layers > 1 else 0.0),
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = x.transpose(1, 2)      # (B,T,F)->(B,F,T)
        x = self.conv(x)           # (B,C,T)
        x = x.transpose(1, 2)      # (B,T,C)
        _, (h_n, _) = self.rnn(x)
        h_last = h_n[-1]
        return self.fc(h_last).squeeze(1)

class FraudCNNBaseline(nn.Module):
    def __init__(self, num_features, conv_channels=64, kernel_size=3, dropout=0.1, pool="max"):
        super().__init__()
        padding = kernel_size // 2
        self.conv = nn.Sequential(
            nn.Conv1d(num_features, conv_channels, kernel_size, padding=padding),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Conv1d(conv_channels, conv_channels, kernel_size, padding=padding),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        assert pool in ("max", "avg")
        self.pool = pool
        self.fc = nn.Linear(conv_channels, 1)

    def forward(self, x):
        x = x.transpose(1, 2)      # (B,T,F)->(B,F,T)
        x = self.conv(x)           # (B,C,T)
        x = x.max(dim=2).values if self.pool == "max" else x.mean(dim=2)  # (B,C)
        return self.fc(x).squeeze(1)

class FraudLSTM(nn.Module):
    def __init__(self, num_features, hidden_size=64, num_layers=1, dropout=0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=(dropout if num_layers > 1 else 0.0),
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)     # h_n: (num_layers, B, hidden)
        h_last = h_n[-1]               # (B, hidden)
        return self.fc(h_last).squeeze(1)


In [44]:
# Shared Torch Train/Eval Functions

def get_pos_weight(y_np, cap=POS_WEIGHT_CAP):
    pos = float(np.sum(y_np))
    neg = float(len(y_np) - pos)
    raw = neg / max(pos, 1.0)
    return torch.tensor([min(raw, cap)], dtype=torch.float32, device=device)

def eval_pr_auc_torch(model, loader):
    model.eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            prob = torch.sigmoid(model(xb)).cpu().numpy()
            all_p.append(prob)
            all_y.append(yb.numpy())
    y_true = np.concatenate(all_y).astype(int)
    y_prob = np.concatenate(all_p)
    return average_precision_score(y_true, y_prob)

def get_probs_from_loader(model, loader):
    model.eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            prob = torch.sigmoid(model(xb)).cpu().numpy()
            all_p.append(prob)
            all_y.append(yb.numpy())
    return np.concatenate(all_y).astype(int), np.concatenate(all_p)

def train_torch_model(model, train_loader, val_loader, epochs=EPOCHS, lr=LR):
    model = model.to(device)
    pos_weight = get_pos_weight(ytr_seq, cap=POS_WEIGHT_CAP)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_pr, best_state = -1, None

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(yb)

        val_pr = eval_pr_auc_torch(model, val_loader)
        print(f"Epoch {epoch:02d} | loss={total_loss/len(train_loader.dataset):.4f} | val_pr_auc={val_pr:.6f}")

        if val_pr > best_pr:
            best_pr = val_pr
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)

    return model

In [45]:
# Train CNN-RNN

cnn_rnn = FraudCNNRNN(num_features=Xtr_seq.shape[2], conv_channels=64, kernel_size=3, hidden_size=64, num_layers=1, dropout=0.1)
cnn_rnn = train_torch_model(cnn_rnn, train_loader, val_loader, epochs=EPOCHS, lr=LR)

Epoch 01 | loss=0.3445 | val_pr_auc=0.006962
Epoch 02 | loss=0.2941 | val_pr_auc=0.002631
Epoch 03 | loss=0.2770 | val_pr_auc=0.004131
Epoch 04 | loss=0.2646 | val_pr_auc=0.002166
Epoch 05 | loss=0.2558 | val_pr_auc=0.003843
Epoch 06 | loss=0.2523 | val_pr_auc=0.007441
Epoch 07 | loss=0.2409 | val_pr_auc=0.003058
Epoch 08 | loss=0.2167 | val_pr_auc=0.002017
Epoch 09 | loss=0.2088 | val_pr_auc=0.003092
Epoch 10 | loss=0.2056 | val_pr_auc=0.004360
Epoch 11 | loss=0.1870 | val_pr_auc=0.002242
Epoch 12 | loss=0.1806 | val_pr_auc=0.002791
Epoch 13 | loss=0.1572 | val_pr_auc=0.001762
Epoch 14 | loss=0.1746 | val_pr_auc=0.002490
Epoch 15 | loss=0.1576 | val_pr_auc=0.001127


In [46]:
# Train CNN Baseline

cnn = FraudCNNBaseline(num_features=Xtr_seq.shape[2], conv_channels=64, kernel_size=3, dropout=0.1, pool="max")
cnn = train_torch_model(cnn, train_loader, val_loader, epochs=EPOCHS, lr=LR)

Epoch 01 | loss=0.3663 | val_pr_auc=0.002111
Epoch 02 | loss=0.2826 | val_pr_auc=0.002334
Epoch 03 | loss=0.2830 | val_pr_auc=0.001084
Epoch 04 | loss=0.2495 | val_pr_auc=0.001129
Epoch 05 | loss=0.2442 | val_pr_auc=0.002141
Epoch 06 | loss=0.2298 | val_pr_auc=0.001918
Epoch 07 | loss=0.2276 | val_pr_auc=0.001700
Epoch 08 | loss=0.2249 | val_pr_auc=0.001010
Epoch 09 | loss=0.2013 | val_pr_auc=0.001912
Epoch 10 | loss=0.1915 | val_pr_auc=0.001745
Epoch 11 | loss=0.1794 | val_pr_auc=0.001749
Epoch 12 | loss=0.1832 | val_pr_auc=0.001196
Epoch 13 | loss=0.1603 | val_pr_auc=0.001146
Epoch 14 | loss=0.1488 | val_pr_auc=0.001228
Epoch 15 | loss=0.1385 | val_pr_auc=0.001010


In [47]:
# Train LSTM

lstm = FraudLSTM(num_features=Xtr_seq.shape[2], hidden_size=64, num_layers=1, dropout=0.1)
lstm = train_torch_model(lstm, train_loader, val_loader, epochs=EPOCHS, lr=LR)

Epoch 01 | loss=0.3608 | val_pr_auc=0.000923
Epoch 02 | loss=0.2892 | val_pr_auc=0.001046
Epoch 03 | loss=0.2899 | val_pr_auc=0.001406
Epoch 04 | loss=0.2806 | val_pr_auc=0.001990
Epoch 05 | loss=0.2785 | val_pr_auc=0.001424
Epoch 06 | loss=0.2818 | val_pr_auc=0.002527
Epoch 07 | loss=0.2723 | val_pr_auc=0.002771
Epoch 08 | loss=0.2710 | val_pr_auc=0.001718
Epoch 09 | loss=0.2676 | val_pr_auc=0.002457
Epoch 10 | loss=0.2681 | val_pr_auc=0.001705
Epoch 11 | loss=0.2570 | val_pr_auc=0.002725
Epoch 12 | loss=0.2626 | val_pr_auc=0.003067
Epoch 13 | loss=0.2561 | val_pr_auc=0.004302
Epoch 14 | loss=0.2580 | val_pr_auc=0.002443
Epoch 15 | loss=0.2520 | val_pr_auc=0.002783


In [48]:
# Evaluate All Models + Build Final Table

rows = []

def add_row(name, split, best_t, metrics):
    rows.append({"model": name, "split": split, "best_threshold_from_val": best_t, **metrics})

# sklearn baselines
for name, info in baseline_results.items():
    add_row(name, "val",  info["best_t"], info["val"])
    add_row(name, "test", info["best_t"], info["test"])

# CNN-RNN
yva_true, yva_prob = get_probs_from_loader(cnn_rnn, val_loader)
best_t, va_m = best_threshold_f1(yva_true, yva_prob)
yte_true, yte_prob = get_probs_from_loader(cnn_rnn, test_loader)
te_m = compute_metrics(yte_true, yte_prob, threshold=best_t)
add_row("CNN-RNN", "val", best_t, va_m)
add_row("CNN-RNN", "test", best_t, te_m)

# CNN
yva_true, yva_prob = get_probs_from_loader(cnn, val_loader)
best_t, va_m = best_threshold_f1(yva_true, yva_prob)
yte_true, yte_prob = get_probs_from_loader(cnn, test_loader)
te_m = compute_metrics(yte_true, yte_prob, threshold=best_t)
add_row("CNN", "val", best_t, va_m)
add_row("CNN", "test", best_t, te_m)

# LSTM
yva_true, yva_prob = get_probs_from_loader(lstm, val_loader)
best_t, va_m = best_threshold_f1(yva_true, yva_prob)
yte_true, yte_prob = get_probs_from_loader(lstm, test_loader)
te_m = compute_metrics(yte_true, yte_prob, threshold=best_t)
add_row("LSTM", "val", best_t, va_m)
add_row("LSTM", "test", best_t, te_m)

df_compare = pd.DataFrame(rows)
df_compare = df_compare[[
    "model","split","best_threshold_from_val",
    "precision","recall","f1","fpr","roc_auc","pr_auc"
]].sort_values(["model","split"])

df_compare

Unnamed: 0,model,split,best_threshold_from_val,precision,recall,f1,fpr,roc_auc,pr_auc
7,CNN,test,0.359137,0.005495,0.037037,0.009569,0.012088,0.622456,0.003026
6,CNN,val,0.359137,0.005,0.066667,0.009302,0.01328,0.647354,0.002334
5,CNN-RNN,test,0.2695,0.006757,0.037037,0.011429,0.009818,0.47345,0.002681
4,CNN-RNN,val,0.2695,0.02,0.2,0.036364,0.00981,0.56605,0.007441
9,LSTM,test,0.257887,0.006601,0.074074,0.012121,0.020103,0.422781,0.002664
8,LSTM,val,0.257887,0.010067,0.2,0.019169,0.019686,0.573133,0.004302
1,LogReg,test,0.996394,0.003021,0.037037,0.005587,0.02204,0.51508,0.002514
0,LogReg,val,0.996394,0.003356,0.066667,0.00639,0.01982,0.500616,0.001261
3,RandomForest,test,0.019975,0.002907,0.037037,0.005391,0.022908,0.52166,0.002157
2,RandomForest,val,0.019975,0.002882,0.066667,0.005525,0.02309,0.537346,0.00119


In [49]:
print("train fraud rate:", ytr_seq.mean(), "count:", int(ytr_seq.sum()), "/", len(ytr_seq))
print("val fraud rate:",   yva_seq.mean(), "count:", int(yva_seq.sum()), "/", len(yva_seq))
print("test fraud rate:",  yte_seq.mean(), "count:", int(yte_seq.sum()), "/", len(yte_seq))

fraud_by_client = tx.groupby("client_id")["target"].sum(min_count=1)
print("clients with >=1 fraud:", int((fraud_by_client > 0).sum()), "/", fraud_by_client.shape[0])

# probability ranges
yva_true_rnn, yva_prob_rnn = get_probs_from_loader(cnn_rnn, val_loader)
print("CNN-RNN val prob min/mean/max:", yva_prob_rnn.min(), yva_prob_rnn.mean(), yva_prob_rnn.max())

yva_true_cnn, yva_prob_cnn = get_probs_from_loader(cnn, val_loader)
print("CNN val prob min/mean/max:", yva_prob_cnn.min(), yva_prob_cnn.mean(), yva_prob_cnn.max())

train fraud rate: 0.0018 count: 27 / 15000
val fraud rate: 0.001 count: 15 / 15000
test fraud rate: 0.0018 count: 27 / 15000
clients with >=1 fraud: 296 / 300
CNN-RNN val prob min/mean/max: 0.007951761 0.051052894 0.4129434
CNN val prob min/mean/max: 0.016837407 0.079984374 0.74003637


In [50]:
# cleanup

del Xtr_seq, Xva_seq, Xte_seq
del ytr_seq, yva_seq, yte_seq
del train_loader, val_loader, test_loader
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()