In [18]:
import pandas as pd
import os, json

# Dataset folder. Set the FRAUD_DATA_DIR environment variable to point at your
# own copy; the fallback is the original author's local folder so existing
# runs behave exactly as before.
path = os.environ.get(
    "FRAUD_DATA_DIR",
    "/Users/anissasoungpanya/Desktop/transactions-fraud-datasets",
)
tx_path = os.path.join(path, "transactions_data.csv")
labels_path = os.path.join(path, "train_fraud_labels.json")

# Explicit encoding so the read does not depend on the platform default.
with open(labels_path, "r", encoding="utf-8") as f:
    labels_raw = json.load(f)

# labels_raw looks like: {"target": {"10649266": "No", ...}}
target_map = labels_raw["target"]  # dict: transaction_id -> "Yes"/"No"

# One row per labelled transaction; "Yes" becomes 1 and anything else becomes 0.
# Ids are kept as strings so they can be matched against the transaction ids.
labels = pd.DataFrame({
    "transaction_id": list(target_map.keys()),
    "target": [1 if v == "Yes" else 0 for v in target_map.values()],
}).astype({"transaction_id": str, "target": int})


In [19]:
import numpy as np

# Only these columns are read from the transactions CSV.
usecols = [
    "id", "date", "client_id", "card_id", "amount",
    "use_chip", "merchant_state", "mcc", "errors",
]

# Candidate clients are taken from the first 200k rows of the file only.
first_chunk = pd.read_csv(tx_path, nrows=200_000, usecols=["client_id"])
clients = first_chunk["client_id"].dropna().unique()

# Seed NumPy's global RNG so the same clients are drawn on every run.
np.random.seed(0)
N_CLIENTS = 300
chosen_clients = set(
    np.random.choice(clients, replace=False, size=min(len(clients), N_CLIENTS))
)

# Stream the whole CSV in 200k-row pieces and keep only the chosen clients' rows.
chunks = []
for chunk in pd.read_csv(tx_path, chunksize=200_000, usecols=usecols):
    keep = chunk.loc[chunk["client_id"].isin(chosen_clients)]
    if not keep.empty:
        chunks.append(keep)

tx = pd.concat(chunks, ignore_index=True)
tx["id"] = tx["id"].astype(str)  # ids are handled as strings from here on

print("Subset transactions:", tx.shape)
tx.head()


Subset transactions: (3358392, 9)


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors
0,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,IA,5311,
1,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,CA,4829,
2,7475333,2010-01-01 00:07:00,1807,165,$4.81,Swipe Transaction,NY,5942,
3,7475337,2010-01-01 00:21:00,351,1112,$10.74,Swipe Transaction,NY,5813,
4,7475344,2010-01-01 00:32:00,646,2093,$73.79,Swipe Transaction,PA,7538,


In [20]:
# Transaction ids are compared as strings because the label ids are strings.
tx["id"] = tx["id"].astype(str)

# Drop any "target" column left by an earlier run of this cell so that
# re-running it does not create target_x / target_y. The left join keeps
# transactions without a label (their target is NaN). validate= makes pandas
# raise if a label id appears more than once, which would otherwise silently
# duplicate transaction rows.
tx = (
    tx.drop(columns=["target"], errors="ignore")
      .merge(
          labels,
          left_on="id",
          right_on="transaction_id",
          how="left",
          validate="many_to_one",
      )
      .drop(columns=["transaction_id"])
)

print("Rows:", len(tx))
print("Labeled rows:", tx["target"].notna().sum())
tx.head()


Rows: 3358392
Labeled rows: 2249957


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors,target
0,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,IA,5311,,0.0
1,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,CA,4829,,0.0
2,7475333,2010-01-01 00:07:00,1807,165,$4.81,Swipe Transaction,NY,5942,,0.0
3,7475337,2010-01-01 00:21:00,351,1112,$10.74,Swipe Transaction,NY,5813,,
4,7475344,2010-01-01 00:32:00,646,2093,$73.79,Swipe Transaction,PA,7538,,0.0


In [21]:
# Parse timestamps; values that cannot be parsed become NaT and those rows are dropped.
tx = tx.assign(date=pd.to_datetime(tx["date"], errors="coerce")).dropna(subset=["date"])

def parse_amount(x):
    """Convert a currency-formatted value such as "$1,234.50" to a float.

    The value is stringified, then dollar signs, commas and surrounding
    whitespace are removed before conversion.

    Returns
    -------
    float
        The parsed number, or NaN when the cleaned text is not a valid number.
    """
    cleaned = str(x).replace("$", "").replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        # Only a failed numeric parse is expected here. A bare `except:` would
        # also swallow KeyboardInterrupt / SystemExit raised during a long apply.
        return np.nan

# Amounts are converted with parse_amount; rows whose amount comes back NaN are removed.
tx["amount"] = tx["amount"].map(parse_amount)
tx = tx.dropna(subset=["amount"])

# Missing categoricals get an explicit placeholder and every column is stored as str.
tx = tx.assign(
    errors=tx["errors"].fillna("None").astype(str),
    use_chip=tx["use_chip"].fillna("Unknown").astype(str),
    merchant_state=tx["merchant_state"].fillna("Unknown").astype(str),
    mcc=tx["mcc"].fillna(-1).astype(int).astype(str),  # code treated as a category, not a number
)

# Order rows by client, then by timestamp within each client.
tx = tx.sort_values(by=["client_id", "date"]).reset_index(drop=True)

tx.head()


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_state,mcc,errors,target
0,7477094,2010-01-01 11:58:00,1,4652,15.09,Swipe Transaction,FL,4121,,0.0
1,7477168,2010-01-01 12:11:00,1,3682,6.01,Swipe Transaction,FL,5813,,0.0
2,7477216,2010-01-01 12:18:00,1,3682,14.58,Online Transaction,Unknown,4121,,0.0
3,7477978,2010-01-01 15:09:00,1,4652,14.66,Online Transaction,Unknown,4121,,0.0
4,7478279,2010-01-01 16:26:00,1,4652,22.77,Swipe Transaction,FL,4121,,


In [22]:
def time_split(df, client_col="client_id", frac_train=0.70, frac_val=0.15, min_len=20):
    """Split each client's rows into train / validation / test index arrays.

    Rows are split by their existing order inside each client group: the first
    ``frac_train`` share goes to train, the next ``frac_val`` share to
    validation and the remainder to test. The function does not sort, so ``df``
    must already be in chronological order per client for the split to be
    temporal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``client_col``.
    client_col : str
        Column identifying the client.
    frac_train, frac_val : float
        Fractions of each client's rows assigned to train and validation.
    min_len : int
        Clients with fewer rows than this are left out of all three splits.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_idx, val_idx, test_idx)`` holding index *labels* of ``df``
        (suitable for ``df.loc``). All three are empty when no client
        qualifies.
    """
    train_idx, val_idx, test_idx = [], [], []
    for _, g in df.groupby(client_col, sort=False):
        n = len(g)
        if n < min_len:
            continue
        t1 = int(n * frac_train)
        t2 = int(n * (frac_train + frac_val))
        idx = g.index.to_numpy()
        train_idx.append(idx[:t1])
        val_idx.append(idx[t1:t2])
        test_idx.append(idx[t2:])

    if not train_idx:
        # np.concatenate raises on an empty list; return empty arrays that keep
        # the index dtype so callers can still use them with df.loc.
        empty = df.index.to_numpy()[:0]
        return empty, empty.copy(), empty.copy()

    return np.concatenate(train_idx), np.concatenate(val_idx), np.concatenate(test_idx)

# Per-client split of tx into train / validation / test row labels.
train_idx, val_idx, test_idx = time_split(tx)

# .copy() gives every partition its own frame, independent of tx.
train_df, val_df, test_df = (
    tx.loc[rows].copy() for rows in (train_idx, val_idx, test_idx)
)

tuple(len(part) for part in (train_df, val_df, test_df))

(2350739, 503758, 503895)

In [23]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Calendar features derived from the timestamp, added in place to each partition.
for d in (train_df, val_df, test_df):
    d["hour"], d["dayofweek"] = d["date"].dt.hour, d["date"].dt.dayofweek

num_cols = ["amount", "hour", "dayofweek"]
cat_cols = ["use_chip", "merchant_state", "mcc", "errors"]

# Both transformers are fitted on the training partition only; validation and
# test are transformed with the fitted parameters. Categories not seen during
# fitting are encoded as -1.
enc = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")
scaler = StandardScaler()

Xtr_cat = enc.fit_transform(train_df[cat_cols])
Xva_cat, Xte_cat = (enc.transform(part[cat_cols]) for part in (val_df, test_df))

Xtr_num = scaler.fit_transform(train_df[num_cols])
Xva_num, Xte_num = (scaler.transform(part[num_cols]) for part in (val_df, test_df))

# Column layout: scaled numeric features first, then the encoded categoricals.
X_train = np.concatenate([Xtr_num, Xtr_cat], axis=1).astype(np.float32)
X_val = np.concatenate([Xva_num, Xva_cat], axis=1).astype(np.float32)
X_test = np.concatenate([Xte_num, Xte_cat], axis=1).astype(np.float32)

# Targets keep their NaNs here (rows without a label).
y_train, y_val, y_test = (
    part["target"].to_numpy() for part in (train_df, val_df, test_df)
)

X_train.shape

(2350739, 7)

In [24]:
def build_sequences_sampled(df_part, X_part, y_part, seq_len=10, max_windows=50_000,
                            client_col="client_id", max_per_client=50, seed=0):
    """Sample fixed-length history windows per client for sequence modelling.

    For every client, target positions ``t`` are drawn at random without
    replacement from the client's rows that have a non-NaN label and at least
    ``seq_len`` earlier rows of the same client. Each sample pairs the
    ``seq_len`` rows *before* ``t`` with the label of row ``t``.

    NOTE(review): the window is ``[t - seq_len, t)``, so the features of the
    labelled row itself are never part of the model input. Confirm that this
    "predict the next transaction" framing is intended.

    Parameters
    ----------
    df_part : pandas.DataFrame
        Frame aligned row-by-row (by position) with ``X_part`` and ``y_part``.
        Only ``client_col`` is read. Rows are used in their existing order
        within each client; a client's rows need not be contiguous.
    X_part : numpy.ndarray
        2-D feature matrix with one row per row of ``df_part``.
    y_part : array-like
        Labels, one per row; NaN marks an unlabelled row.
    seq_len : int
        Number of rows in each window.
    max_windows : int
        Upper bound on the number of windows returned.
    client_col : str
        Column of ``df_part`` identifying the client.
    max_per_client : int or None
        Maximum number of windows sampled per client; ``None`` disables the cap.
    seed : int
        Seed for the sampling random generator.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_seqs, y_seqs)`` where ``X_seqs`` is float32 with shape
        ``(k, seq_len, n_features)`` and ``y_seqs`` is int64 with shape ``(k,)``.

    Raises
    ------
    ValueError
        If ``df_part``, ``X_part`` and ``y_part`` differ in length.
    """
    y_part = np.asarray(y_part)
    n_rows = len(df_part)
    if len(X_part) != n_rows or len(y_part) != n_rows:
        raise ValueError(
            "df_part, X_part and y_part must have the same length, got "
            f"{n_rows}, {len(X_part)} and {len(y_part)}"
        )

    # Output buffers are allocated at full size and trimmed to k on return.
    X_seqs = np.zeros((max_windows, seq_len, X_part.shape[1]), dtype=np.float32)
    y_seqs = np.zeros((max_windows,), dtype=np.int64)

    k = 0
    rng = np.random.default_rng(seed)

    # Group *positional* row numbers by client. A running offset is only valid
    # when each client's rows form one contiguous run; explicit positions stay
    # correct for any row order and visit clients in order of first appearance.
    row_positions = pd.Series(np.arange(n_rows))
    client_keys = df_part[client_col].to_numpy()

    for _, pos in row_positions.groupby(client_keys, sort=False):
        rows = pos.to_numpy()
        if len(rows) <= seq_len:
            continue

        yg = y_part[rows]

        # indices where label exists and we have enough history
        valid_t = np.where(~np.isnan(yg))[0]
        valid_t = valid_t[valid_t >= seq_len]
        if len(valid_t) == 0:
            continue

        # sample up to some per client (keeps balance and speed)
        take = len(valid_t) if max_per_client is None else min(len(valid_t), max_per_client)
        chosen = rng.choice(valid_t, size=take, replace=False)

        for t in chosen:
            if k >= max_windows:
                return X_seqs[:k], y_seqs[:k]
            X_seqs[k] = X_part[rows[t - seq_len:t]]
            y_seqs[k] = int(yg[t])
            k += 1

    return X_seqs[:k], y_seqs[:k]


SEQ_LEN = 10  # rows per input window

# Train and validation pass an explicit window budget; test uses the function's default.
Xtr_seq, ytr_seq = build_sequences_sampled(
    train_df, X_train, y_train, max_windows=50_000, seq_len=SEQ_LEN
)
Xva_seq, yva_seq = build_sequences_sampled(
    val_df, X_val, y_val, max_windows=20_000, seq_len=SEQ_LEN
)
Xte_seq, yte_seq = build_sequences_sampled(
    test_df, X_test, y_test, seq_len=SEQ_LEN
)

# Shape of the training windows and the share of positive labels among them.
(Xtr_seq.shape, ytr_seq.mean())

((15000, 10, 7), np.float64(0.0018))

In [25]:
import torch
torch.cuda.empty_cache()
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class SeqDataset(Dataset):
    """Dataset pairing each row of a NumPy feature array with its label."""

    def __init__(self, X, y):
        # Features keep their NumPy dtype; labels are converted to float32.
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y).to(torch.float32)

    def __len__(self):
        # One sample per label.
        return self.y.shape[0]

    def __getitem__(self, i):
        sample = (self.X[i], self.y[i])
        return sample

batch_size = 128

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(dataset=SeqDataset(Xtr_seq, ytr_seq), shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=SeqDataset(Xva_seq, yva_seq), shuffle=False, batch_size=batch_size)
test_loader = DataLoader(dataset=SeqDataset(Xte_seq, yte_seq), shuffle=False, batch_size=batch_size)

class FraudLSTM(nn.Module):
    """LSTM followed by a linear layer that emits one logit per sequence."""

    def __init__(self, num_features, hidden_size=64):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, num_features).
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (last_hidden, _) = self.lstm(x)
        logits = self.fc(last_hidden[-1])
        return logits.squeeze(1)  # logits

# Seed PyTorch's global RNG so that weight initialisation, and the shuffling
# of any DataLoader that draws from that RNG, is repeatable across runs.
torch.manual_seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = FraudLSTM(num_features=Xtr_seq.shape[2], hidden_size=64).to(device)

# Weight the positive class by the negative/positive ratio of the training
# windows; max(pos, 1) avoids dividing by zero when there are no positives.
pos = ytr_seq.sum()
neg = len(ytr_seq) - pos
pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32).to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [26]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score

def eval_model(model, loader, threshold=0.5):
    """Score ``model`` on every batch of ``loader`` and summarise the results.

    Model outputs are passed through a sigmoid; a sample is predicted positive
    when its probability is at least ``threshold``. The model is switched to
    eval mode and is left in that mode.

    Returns
    -------
    dict
        Keys ``precision``, ``recall``, ``f1``, ``fpr``, ``roc_auc`` and
        ``pr_auc``. The two AUC entries are NaN when the labels contain a
        single class.
    """
    model.eval()
    label_batches, prob_batches = [], []
    with torch.no_grad():
        for features, targets in loader:
            logits = model(features.to(device))
            prob_batches.append(torch.sigmoid(logits).cpu().numpy())
            label_batches.append(targets.numpy())

    y_true = np.concatenate(label_batches).astype(int)
    y_prob = np.concatenate(prob_batches)
    y_pred = (y_prob >= threshold).astype(int)

    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )

    # False-positive rate; the denominator is floored at 1 so that a set with
    # no negatives gives 0 rather than a division error.
    negatives = y_true == 0
    false_alarms = np.logical_and(y_pred == 1, negatives).sum()
    fpr = false_alarms / max(negatives.sum(), 1)

    # Ranking metrics are only computed when both classes are present.
    if len(np.unique(y_true)) > 1:
        roc = roc_auc_score(y_true, y_prob)
        pr_auc = average_precision_score(y_true, y_prob)
    else:
        roc = np.nan
        pr_auc = np.nan

    return {"precision": prec, "recall": rec, "f1": f1, "fpr": fpr, "roc_auc": roc, "pr_auc": pr_auc}

def train_epochs(model, epochs=5):
    """Train ``model`` for ``epochs`` passes and keep the best-F1 weights.

    Uses the notebook-level ``train_loader``, ``val_loader``, ``optimizer``,
    ``criterion`` and ``device``. After each epoch the validation metrics are
    printed; once training ends, the weights from the epoch with the highest
    validation F1 are loaded back into ``model``. Returns nothing.
    """
    best_f1 = -1
    best_state = None

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()

            # Weight each batch's loss by its size so the epoch figure is a per-sample mean.
            total_loss += batch_loss.item() * len(targets)

        val_metrics = eval_model(model, val_loader)
        print(f"Epoch {epoch} | loss={total_loss/len(train_loader.dataset):.4f} | val={val_metrics}")

        if val_metrics["f1"] > best_f1:
            best_f1 = val_metrics["f1"]
            # Snapshot on CPU so later optimizer steps cannot alter the saved copy.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

    if best_state:
        model.load_state_dict(best_state)

# Run training; per-epoch loss and validation metrics are printed below.
train_epochs(model=model, epochs=5)

Epoch 1 | loss=1.3025 | val={'precision': 0.0013480722566729577, 'recall': 0.3333333333333333, 'f1': 0.0026852846401718583, 'fpr': np.float64(0.2471805138471805), 'roc_auc': 0.5441797352908464, 'pr_auc': 0.0017104245010958405}
Epoch 2 | loss=1.2639 | val={'precision': 0.0010119750379490638, 'recall': 0.4, 'f1': 0.002018842530282638, 'fpr': np.float64(0.39526192859526194), 'roc_auc': 0.5397041485930375, 'pr_auc': 0.0031269677205532998}
Epoch 3 | loss=1.1848 | val={'precision': 0.0013774104683195593, 'recall': 0.3333333333333333, 'f1': 0.0027434842249657062, 'fpr': np.float64(0.24190857524190856), 'roc_auc': 0.5550973195417639, 'pr_auc': 0.0018843177108498342}
Epoch 4 | loss=1.1540 | val={'precision': 0.0013252054068380599, 'recall': 0.3333333333333333, 'f1': 0.0026399155227032735, 'fpr': np.float64(0.2514514514514514), 'roc_auc': 0.5893137582026471, 'pr_auc': 0.0027082668578467077}
Epoch 5 | loss=1.1499 | val={'precision': 0.0015424164524421595, 'recall': 0.2, 'f1': 0.003061224489795918

In [27]:
import gc
import torch

# delete big arrays + loaders + model
# Names are removed with globals().pop(..., None) instead of `del`, so
# re-running this cell (or running it after a partial run) does not raise
# NameError for names that are already gone.
for _name in (
    "Xtr_seq", "ytr_seq", "Xva_seq", "yva_seq", "Xte_seq", "yte_seq",
    "train_loader", "val_loader", "test_loader",
    "model", "optimizer", "criterion",
):
    globals().pop(_name, None)
del _name

gc.collect()

# if on GPU:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
