In [1]:
!pip3 install -q kagglehub

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
from pathlib import Path
import os
import json
import pandas as pd
import kagglehub

DATASET_ID = "computingvictor/transactions-fraud-datasets"
DATASET_FOLDER_NAME = "transactions-fraud-datasets"

def find_repo_root():
    cur = Path.cwd().resolve()
    for p in [cur] + list(cur.parents):
        if (p / ".git").exists():
            return p
    return cur

def resolve_dataset_dir():
    # 1) Environment variable override
    env = os.getenv("FRAUD_DATA_DIR")
    if env:
        p = Path(env).expanduser().resolve()
        if p.exists():
            return p

    # 2) Repo-relative data folder
    repo_root = find_repo_root()
    local = repo_root / "data" / DATASET_FOLDER_NAME
    if local.exists():
        return local.resolve()

    # 3) Auto-download via kagglehub
    print("Dataset not found locally - downloading via kagglehub...")
    return Path(kagglehub.dataset_download(DATASET_ID)).resolve()

dataset_dir = resolve_dataset_dir()
print("Using dataset directory:", dataset_dir)

tx_path = dataset_dir / "transactions_data.csv"
labels_path = dataset_dir / "train_fraud_labels.json"

assert tx_path.exists(), f"Missing {tx_path}"
assert labels_path.exists(), f"Missing {labels_path}"

with open(labels_path, "r") as f:
    labels_raw = json.load(f)

target_map = labels_raw["target"]  # dict: transaction_id -> "Yes"/"No"

labels = pd.DataFrame({
    "transaction_id": list(target_map.keys()),
    "target": [1 if v == "Yes" else 0 for v in target_map.values()]
})

labels["transaction_id"] = labels["transaction_id"].astype(str)
labels["target"] = labels["target"].astype(int)

labels.head()

  from .autonotebook import tqdm as notebook_tqdm


Dataset not found locally - downloading via kagglehub...
Using dataset directory: /Users/glennc/.cache/kagglehub/datasets/computingvictor/transactions-fraud-datasets/versions/1


Unnamed: 0,transaction_id,target
0,10649266,0
1,23410063,0
2,9316588,0
3,12478022,0
4,9558530,0


In [3]:
import numpy as np

usecols = [
    "id", "date", "client_id", "card_id", "amount",
    "use_chip", "merchant_state", "mcc", "errors"
]

# Choose clients from an early slice of the dataset
first_chunk = pd.read_csv(tx_path, usecols=["client_id"], nrows=200_000)
clients = first_chunk["client_id"].dropna().unique()


np.random.seed(0)
N_CLIENTS = 300   
chosen_clients = set(np.random.choice(clients, size=min(N_CLIENTS, len(clients)), replace=False))

chunks = []
for chunk in pd.read_csv(tx_path, usecols=usecols, chunksize=200_000):
    keep = chunk[chunk["client_id"].isin(chosen_clients)]
    if len(keep):
        chunks.append(keep)

tx = pd.concat(chunks, ignore_index=True)
tx["id"] = tx["id"].astype(str)

print("Subset transactions:", tx.shape)
tx.head()

Subset transactions: (3358392, 9)


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors
0,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,IA,5311,
1,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,CA,4829,
2,7475333,2010-01-01 00:07:00,1807,165,$4.81,Swipe Transaction,NY,5942,
3,7475337,2010-01-01 00:21:00,351,1112,$10.74,Swipe Transaction,NY,5813,
4,7475344,2010-01-01 00:32:00,646,2093,$73.79,Swipe Transaction,PA,7538,


In [4]:
tx["id"] = tx["id"].astype(str)

tx = tx.merge(labels, left_on="id", right_on="transaction_id", how="left")
tx = tx.drop(columns=["transaction_id"])

print("Rows:", len(tx))
print("Labeled rows:", tx["target"].notna().sum())
tx.head()

Rows: 3358392
Labeled rows: 2249957


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors,target
0,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,IA,5311,,0.0
1,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,CA,4829,,0.0
2,7475333,2010-01-01 00:07:00,1807,165,$4.81,Swipe Transaction,NY,5942,,0.0
3,7475337,2010-01-01 00:21:00,351,1112,$10.74,Swipe Transaction,NY,5813,,
4,7475344,2010-01-01 00:32:00,646,2093,$73.79,Swipe Transaction,PA,7538,,0.0


In [5]:
tx["date"] = pd.to_datetime(tx["date"], errors="coerce")
tx = tx.dropna(subset=["date"])

def parse_amount(x):
    x = str(x).replace("$","").replace(",","").strip()
    try:
        return float(x)
    except:
        return np.nan

tx["amount"] = tx["amount"].apply(parse_amount)
tx = tx.dropna(subset=["amount"])

# Fill missing categoricals
tx["errors"] = tx["errors"].fillna("None").astype(str)
tx["use_chip"] = tx["use_chip"].fillna("Unknown").astype(str)
tx["merchant_state"] = tx["merchant_state"].fillna("Unknown").astype(str)
tx["mcc"] = tx["mcc"].fillna(-1).astype(int).astype(str)  # treat as categorical

# Sort per client
tx = tx.sort_values(["client_id", "date"]).reset_index(drop=True)

tx.head()

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors,target
0,7477094,2010-01-01 11:58:00,1,4652,15.09,Swipe Transaction,FL,4121,,0.0
1,7477168,2010-01-01 12:11:00,1,3682,6.01,Swipe Transaction,FL,5813,,0.0
2,7477216,2010-01-01 12:18:00,1,3682,14.58,Online Transaction,Unknown,4121,,0.0
3,7477978,2010-01-01 15:09:00,1,4652,14.66,Online Transaction,Unknown,4121,,0.0
4,7478279,2010-01-01 16:26:00,1,4652,22.77,Swipe Transaction,FL,4121,,


In [6]:
def time_split(df, client_col="client_id", frac_train=0.70, frac_val=0.15, min_len=20):
    train_idx, val_idx, test_idx = [], [], []
    for cid, g in df.groupby(client_col, sort=False):
        n = len(g)
        if n < min_len:
            continue
        t1 = int(n * frac_train)
        t2 = int(n * (frac_train + frac_val))
        idx = g.index.to_numpy()
        train_idx.append(idx[:t1])
        val_idx.append(idx[t1:t2])
        test_idx.append(idx[t2:])
    return np.concatenate(train_idx), np.concatenate(val_idx), np.concatenate(test_idx)

train_idx, val_idx, test_idx = time_split(tx)

train_df = tx.loc[train_idx].copy()
val_df   = tx.loc[val_idx].copy()
test_df  = tx.loc[test_idx].copy()

len(train_df), len(val_df), len(test_df)

(2350739, 503758, 503895)

In [7]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

for d in [train_df, val_df, test_df]:
    d["hour"] = d["date"].dt.hour
    d["dayofweek"] = d["date"].dt.dayofweek

num_cols = ["amount", "hour", "dayofweek"]
cat_cols = ["use_chip", "merchant_state", "mcc", "errors"]

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
scaler = StandardScaler()

Xtr_cat = enc.fit_transform(train_df[cat_cols])
Xva_cat = enc.transform(val_df[cat_cols])
Xte_cat = enc.transform(test_df[cat_cols])

Xtr_num = scaler.fit_transform(train_df[num_cols])
Xva_num = scaler.transform(val_df[num_cols])
Xte_num = scaler.transform(test_df[num_cols])

X_train = np.hstack([Xtr_num, Xtr_cat]).astype(np.float32)
X_val   = np.hstack([Xva_num, Xva_cat]).astype(np.float32)
X_test  = np.hstack([Xte_num, Xte_cat]).astype(np.float32)

y_train = train_df["target"].to_numpy()
y_val   = val_df["target"].to_numpy()
y_test  = test_df["target"].to_numpy()

X_train.shape

(2350739, 7)

In [8]:
def build_sequences_sampled(df_part, X_part, y_part, seq_len=10, max_windows=50_000, client_col="client_id"):
    X_seqs = np.zeros((max_windows, seq_len, X_part.shape[1]), dtype=np.float32)
    y_seqs = np.zeros((max_windows,), dtype=np.int64)

    k = 0
    start = 0
    rng = np.random.default_rng(0)

    for cid, g in df_part.groupby(client_col, sort=False):
        n = len(g)
        if n <= seq_len:
            start += n
            continue

        Xg = X_part[start:start+n]
        yg = y_part[start:start+n]

        # indices where label exists and we have enough history
        valid_t = np.where(~np.isnan(yg))[0]
        valid_t = valid_t[valid_t >= seq_len]
        if len(valid_t) == 0:
            start += n
            continue

        # sample up to some per client (keeps balance and speed)
        take = min(len(valid_t), 50)
        chosen = rng.choice(valid_t, size=take, replace=False)

        for t in chosen:
            if k >= max_windows:
                return X_seqs[:k], y_seqs[:k]
            X_seqs[k] = Xg[t-seq_len:t]
            y_seqs[k] = int(yg[t])
            k += 1

        start += n

    return X_seqs[:k], y_seqs[:k]


SEQ_LEN = 10
Xtr_seq, ytr_seq = build_sequences_sampled(train_df, X_train, y_train, seq_len=SEQ_LEN, max_windows=50_000)
Xva_seq, yva_seq = build_sequences_sampled(val_df, X_val, y_val, seq_len=SEQ_LEN, max_windows=20_000)
Xte_seq, yte_seq = build_sequences_sampled(test_df,  X_test,  y_test,  seq_len=SEQ_LEN)

Xtr_seq.shape, ytr_seq.mean()

((15000, 10, 7), np.float64(0.0018))

In [None]:
# Non-Neural Baseline Models (for comparison with LSTM)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score

# Flatten LSTM windows for sklearn baselines: (N, seq_len, feat) -> (N, seq_len*feat)
Xtr_flat = Xtr_seq.reshape(Xtr_seq.shape[0], -1)
Xva_flat = Xva_seq.reshape(Xva_seq.shape[0], -1)
Xte_flat = Xte_seq.reshape(Xte_seq.shape[0], -1)

ytr = ytr_seq.astype(np.int64)
yva = yva_seq.astype(np.int64)
yte = yte_seq.astype(np.int64)

def compute_metrics(y_true, y_prob, threshold=0.5):
    y_pred = (y_prob >= threshold).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    fpr = ((y_pred == 1) & (y_true == 0)).sum() / max((y_true == 0).sum(), 1)

    roc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan
    pr  = average_precision_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan
    return {"precision": prec, "recall": rec, "f1": f1, "fpr": fpr, "roc_auc": roc, "pr_auc": pr}

def best_threshold_f1(y_true, y_prob, thresholds=None):
    if thresholds is None:
        thresholds = np.linspace(0.01, 0.99, 99)
    best_t, best_f1, best_m = 0.5, -1, None
    for t in thresholds:
        m = compute_metrics(y_true, y_prob, threshold=t)
        if m["f1"] > best_f1:
            best_f1, best_t, best_m = m["f1"], t, m
    return best_t, best_m

# --- Train baseline models (store probs so we can compare later) ---
baseline_results = {}

# Logistic Regression baseline
logreg = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)
logreg.fit(Xtr_flat, ytr)

va_prob_lr = logreg.predict_proba(Xva_flat)[:, 1]
best_t_lr, va_metrics_lr = best_threshold_f1(yva, va_prob_lr)
te_prob_lr = logreg.predict_proba(Xte_flat)[:, 1]
te_metrics_lr = compute_metrics(yte, te_prob_lr, threshold=best_t_lr)

baseline_results["LogReg"] = {
    "val_prob": va_prob_lr, "test_prob": te_prob_lr,
    "best_t": best_t_lr,
    "val": va_metrics_lr, "test": te_metrics_lr
}

print("LogReg best threshold (val):", best_t_lr, "| val metrics:", va_metrics_lr)

# Random Forest baseline (optional; can be slower)
rf = RandomForestClassifier(
    n_estimators=300, min_samples_leaf=2,
    class_weight="balanced_subsample",
    n_jobs=-1, random_state=0
)
rf.fit(Xtr_flat, ytr)

va_prob_rf = rf.predict_proba(Xva_flat)[:, 1]
best_t_rf, va_metrics_rf = best_threshold_f1(yva, va_prob_rf)
te_prob_rf = rf.predict_proba(Xte_flat)[:, 1]
te_metrics_rf = compute_metrics(yte, te_prob_rf, threshold=best_t_rf)

baseline_results["RandomForest"] = {
    "val_prob": va_prob_rf, "test_prob": te_prob_rf,
    "best_t": best_t_rf,
    "val": va_metrics_rf, "test": te_metrics_rf
}

print("RF best threshold (val):", best_t_rf, "| val metrics:", va_metrics_rf)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


LogReg best threshold (val): 0.99 | val metrics: {'precision': 0.002398081534772182, 'recall': 0.06666666666666667, 'f1': 0.004629629629629629, 'fpr': np.float64(0.027761094427761094), 'roc_auc': np.float64(0.5006161717272828), 'pr_auc': np.float64(0.0012614757612957484)}
RF best threshold (val): 0.01 | val metrics: {'precision': 0.000945179584120983, 'recall': 0.06666666666666667, 'f1': 0.001863932898415657, 'fpr': np.float64(0.0705372038705372), 'roc_auc': np.float64(0.5373462351240129), 'pr_auc': np.float64(0.0011903203970417913)}


### Hybrid CNN-RNN Architecture

In [29]:
import torch
torch.cuda.empty_cache()
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class SeqDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)         
        self.y = torch.from_numpy(y).float() 

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

batch_size = 128  
train_loader = DataLoader(SeqDataset(Xtr_seq, ytr_seq), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(SeqDataset(Xva_seq, yva_seq), batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(SeqDataset(Xte_seq, yte_seq), batch_size=batch_size, shuffle=False)

import torch
import torch.nn as nn

class FraudCNNRNN(nn.Module):
    """
    CNN over the time dimension (sequence of transactions),
    followed by an RNN (LSTM) to capture longer-range dependencies.

    Input:  x  shape (B, T, F)
    Output: logits shape (B,)
    """
    def __init__(
        self,
        num_features: int,
        conv_channels: int = 64,
        kernel_size: int = 3,
        hidden_size: int = 64,
        num_layers: int = 1,
        dropout: float = 0.1,
    ):
        super().__init__()

        # CNN expects (B, C, T). We'll treat features as channels (C=F).
        # Convolution runs across time T.
        padding = kernel_size // 2  # keeps T same when stride=1

        self.conv = nn.Sequential(
            nn.Conv1d(num_features, conv_channels, kernel_size, padding=padding),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Conv1d(conv_channels, conv_channels, kernel_size, padding=padding),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
        )


        # LSTM will take (B, T, conv_channels)
        self.rnn = nn.LSTM(
            input_size=conv_channels,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=(dropout if num_layers > 1 else 0.0),
        )

        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # x: (B, T, F) -> (B, F, T)
        x = x.transpose(1, 2)

        # CNN across time: (B, F, T) -> (B, C, T)
        x = self.conv(x)

        # back to RNN format: (B, C, T) -> (B, T, C)
        x = x.transpose(1, 2)

        # LSTM: use final hidden state
        _, (h_n, _) = self.rnn(x)
        h_last = h_n[-1]          # (B, hidden_size)

        logits = self.fc(h_last).squeeze(1)  # (B,)
        return logits

In [30]:
class FraudLSTM(nn.Module):
    def __init__(self, num_features, hidden_size=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size=num_features, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1]).squeeze(1)  # logits

In [31]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# model = FraudLSTM(num_features=Xtr_seq.shape[2], hidden_size=64).to(device)
model = FraudCNNRNN(
    num_features=Xtr_seq.shape[2],
    conv_channels=64,
    kernel_size=3,
    hidden_size=64,
    num_layers=1,
    dropout=0.1
).to(device)


pos = ytr_seq.sum()
neg = len(ytr_seq) - pos
raw_pw = neg / max(pos, 1)
# pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32).to(device)
pos_weight = torch.tensor([min(raw_pw, 50.0)], dtype=torch.float32).to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [32]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score

def eval_model(model, loader, threshold=0.5):
    model.eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            probs = torch.sigmoid(model(xb)).cpu().numpy()
            all_p.append(probs)
            all_y.append(yb.numpy())

    y_true = np.concatenate(all_y).astype(int)
    y_prob = np.concatenate(all_p)
    y_pred = (y_prob >= threshold).astype(int)

    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    fpr = ((y_pred == 1) & (y_true == 0)).sum() / max((y_true == 0).sum(), 1)

    roc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan
    pr_auc = average_precision_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan

    return {"precision": prec, "recall": rec, "f1": f1, "fpr": fpr, "roc_auc": roc, "pr_auc": pr_auc}


def eval_pr_auc(model, loader):
    model.eval()
    all_y, all_p = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            probs = torch.sigmoid(model(xb)).cpu().numpy()
            all_p.append(probs)
            all_y.append(yb.numpy())

    y_true = np.concatenate(all_y).astype(int)
    y_prob = np.concatenate(all_p)
    return average_precision_score(y_true, y_prob)

def train_epochs(model, epochs=5):
    best_pr, best_state = -1, None
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(yb)

        val_pr = eval_pr_auc(model, val_loader)
        print(f"Epoch {epoch} | loss={total_loss/len(train_loader.dataset):.4f} | val_pr_auc={val_pr:.6f}")

        if val_pr > best_pr:
            best_pr = val_pr
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    if best_state:
        model.load_state_dict(best_state)

train_epochs(model, epochs=15)

Epoch 1 | loss=0.3329 | val_pr_auc=0.002606
Epoch 2 | loss=0.2851 | val_pr_auc=0.003836
Epoch 3 | loss=0.2611 | val_pr_auc=0.003550
Epoch 4 | loss=0.2683 | val_pr_auc=0.002362
Epoch 5 | loss=0.2546 | val_pr_auc=0.003928
Epoch 6 | loss=0.2447 | val_pr_auc=0.002223
Epoch 7 | loss=0.2350 | val_pr_auc=0.005006
Epoch 8 | loss=0.2181 | val_pr_auc=0.003970
Epoch 9 | loss=0.2167 | val_pr_auc=0.005005
Epoch 10 | loss=0.2044 | val_pr_auc=0.007647
Epoch 11 | loss=0.1975 | val_pr_auc=0.002831
Epoch 12 | loss=0.1823 | val_pr_auc=0.004729
Epoch 13 | loss=0.1735 | val_pr_auc=0.003916
Epoch 14 | loss=0.1807 | val_pr_auc=0.002774
Epoch 15 | loss=0.1658 | val_pr_auc=0.003305


In [33]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score

def get_probs_from_loader(model, loader, device):
    model.eval()
    all_y, all_prob = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            prob = torch.sigmoid(model(xb)).cpu().numpy()
            all_prob.append(prob)
            all_y.append(yb.numpy())
    return np.concatenate(all_y).astype(int), np.concatenate(all_prob)

def compute_metrics(y_true, y_prob, threshold=0.5):
    y_pred = (y_prob >= threshold).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    fpr = ((y_pred == 1) & (y_true == 0)).sum() / max((y_true == 0).sum(), 1)
    roc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan
    pr  = average_precision_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else np.nan
    return {"precision": prec, "recall": rec, "f1": f1, "fpr": fpr, "roc_auc": roc, "pr_auc": pr}

def best_threshold_f1(y_true, y_prob, thresholds=None, min_pred_pos=1):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob)

    if thresholds is None:
        qs = np.linspace(0.01, 0.99, 300)
        thresholds = np.unique(np.quantile(y_prob, qs))

    best_t, best_f1, best_m = 0.5, -1, None
    for t in thresholds:
        y_pred = (y_prob >= t).astype(int)
        if y_pred.sum() < min_pred_pos:
            continue  # skip thresholds that predict nothing positive

        m = compute_metrics(y_true, y_prob, threshold=float(t))
        if m["f1"] > best_f1:
            best_f1, best_t, best_m = m["f1"], float(t), m

    # fallback: if everything got skipped, return 0.5
    if best_m is None:
        best_t, best_m = 0.5, compute_metrics(y_true, y_prob, threshold=0.5)

    return best_t, best_m

# ---- LSTM metrics ----

# yva_true_lstm, yva_prob_lstm = get_probs_from_loader(model, val_loader, device)
# best_t_lstm, va_metrics_lstm = best_threshold_f1(yva_true_lstm, yva_prob_lstm)
yva_true_cnn, yva_prob_cnn = get_probs_from_loader(model, val_loader, device)
best_t_cnn, va_metrics_cnn = best_threshold_f1(yva_true_cnn, yva_prob_cnn)

# yte_true_lstm, yte_prob_lstm = get_probs_from_loader(model, test_loader, device)
# te_metrics_lstm = compute_metrics(yte_true_lstm, yte_prob_lstm, threshold=best_t_lstm)
yte_true_cnn, yte_prob_cnn = get_probs_from_loader(model, test_loader, device)
te_metrics_cnn = compute_metrics(yte_true_cnn, yte_prob_cnn, threshold=best_t_cnn)

# ---- Build comparison table ----
rows = []

def add_row(name, best_t, split, metrics):
    rows.append({
        "model": name,
        "split": split,
        "best_threshold_from_val": best_t,
        **metrics
    })

# baselines (from Cell A)
for name, info in baseline_results.items():
    add_row(name, info["best_t"], "val",  info["val"])
    add_row(name, info["best_t"], "test", info["test"])

# LSTM
# add_row("LSTM", best_t_lstm, "val",  va_metrics_lstm)
# add_row("LSTM", best_t_lstm, "test", te_metrics_lstm)

add_row("CNN-RNN", best_t_cnn, "val",  va_metrics_cnn)
add_row("CNN-RNN", best_t_cnn, "test", te_metrics_cnn)

df_compare = pd.DataFrame(rows)

# nicer ordering
df_compare = df_compare[[
    "model","split","best_threshold_from_val",
    "precision","recall","f1","fpr","roc_auc","pr_auc"
]].sort_values(["model","split"])

df_compare

Unnamed: 0,model,split,best_threshold_from_val,precision,recall,f1,fpr,roc_auc,pr_auc
5,CNN-RNN,test,0.560401,0.019608,0.111111,0.033333,0.010018,0.527183,0.003913
4,CNN-RNN,val,0.560401,0.013333,0.133333,0.024242,0.009877,0.592731,0.007647
1,LogReg,test,0.99,0.004338,0.074074,0.008197,0.030655,0.51508,0.002514
0,LogReg,val,0.99,0.002398,0.066667,0.00463,0.027761,0.500616,0.001261
3,RandomForest,test,0.01,0.002018,0.074074,0.003929,0.066052,0.52166,0.002157
2,RandomForest,val,0.01,0.000945,0.066667,0.001864,0.070537,0.537346,0.00119


In [34]:
print("train fraud rate:", ytr_seq.mean(), "count:", ytr_seq.sum(), "/", len(ytr_seq))
print("val fraud rate:", yva_seq.mean(), "count:", yva_seq.sum(), "/", len(yva_seq))
print("test fraud rate:", yte_seq.mean(), "count:", yte_seq.sum(), "/", len(yte_seq))
# after merging labels into tx
fraud_by_client = tx.groupby("client_id")["target"].sum(min_count=1)
print("clients with >=1 fraud:", (fraud_by_client > 0).sum(), "/", fraud_by_client.shape[0])

train fraud rate: 0.0018 count: 27 / 15000
val fraud rate: 0.001 count: 15 / 15000
test fraud rate: 0.0018 count: 27 / 15000
clients with >=1 fraud: 296 / 300


In [35]:
# 1) Are probabilities all near a constant?
yva_true, yva_prob = get_probs_from_loader(model, val_loader, device)
print("val prob min/mean/max:", yva_prob.min(), yva_prob.mean(), yva_prob.max())

# 2) How many predicted positives at a few thresholds?
for t in [0.01, 0.05, 0.1, 0.2, 0.5]:
    print(t, (yva_prob >= t).mean())

val prob min/mean/max: 0.0053295097 0.06495062 0.6725309
0.01 0.9025333333333333
0.05 0.24466666666666667
0.1 0.1496
0.2 0.0898
0.5 0.021533333333333335


In [None]:
import gc
import torch

# delete big arrays + loaders + model
del Xtr_seq, ytr_seq, Xva_seq, yva_seq, Xte_seq, yte_seq
del train_loader, val_loader, test_loader
del model, optimizer, criterion

gc.collect()

# if on GPU:
if torch.cuda.is_available():
    torch.cuda.empty_cache()