## Neural Network for Credit Card Fraud

In [2]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.utils import class_weight
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import sparse
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


# Row cap for quick experiments; leave as None to use the full dataset.
SAMPLE_SIZE = None

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [3]:
# ---- Data Preprocessing Functions ----
def load_data_optimized():
    """Read the transaction and identity CSVs and left-join them.

    The ``TransactionDT`` column is skipped while reading
    ``train_transaction.csv``. The two frames are joined on
    ``TransactionID`` with a left join, so every transaction row is kept.

    When the module-level ``SAMPLE_SIZE`` is truthy, a random sample of that
    many rows (seeded with 42) is returned instead of the full frame.

    Returns:
        The merged, possibly down-sampled, DataFrame.
    """
    print("[INFO] Loading data...")

    def _keep_column(name):
        # Everything except the raw time-delta column is loaded.
        return name != "TransactionDT"

    tx_frame = pd.read_csv("train_transaction.csv", usecols=_keep_column)
    id_frame = pd.read_csv("train_identity.csv")
    merged = pd.merge(tx_frame, id_frame, on="TransactionID", how="left")

    if not SAMPLE_SIZE:
        return merged

    merged = merged.sample(SAMPLE_SIZE, random_state=42)
    print(f"[INFO] Using sample of {len(merged):,} rows")
    return merged


def preprocess_data_optimized(df):
    """Convert the merged frame into a dense float32 feature matrix and labels.

    Processing steps:
      1. Split off ``isFraud`` as the integer label vector.
      2. Fill missing numeric values with 0 and missing non-numeric values
         with the string ``"missing"``.
      3. Scale numeric columns (variance only, no centering) and one-hot
         encode non-numeric columns with a ``ColumnTransformer``.
      4. Downcast to float32 and densify.

    The preprocessor is fit on every row of ``df`` that is passed in; callers
    that need train-only statistics must pass only training rows.

    NOTE(review): every numeric column other than ``isFraud`` is used as a
    feature, which would include identifier columns such as the
    ``TransactionID`` merge key if present — confirm this is intended.

    Args:
        df: DataFrame containing an ``isFraud`` column plus feature columns.

    Returns:
        Tuple ``(X_transformed, y, preprocessor)`` where ``X_transformed`` is
        a dense float32 ``numpy.ndarray``, ``y`` is the integer label Series,
        and ``preprocessor`` is the fitted ``ColumnTransformer``.
    """
    print("[INFO] Preprocessing data...")
    y = df["isFraud"].astype(int)
    X = df.drop(columns=["isFraud"])

    numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

    # Fill missing values
    X[numeric_cols] = X[numeric_cols].fillna(0)
    X[categorical_cols] = X[categorical_cols].fillna("missing")

    preprocessor = ColumnTransformer(
        transformers=[
            # with_mean=False keeps the scaler compatible with sparse output.
            ("num", StandardScaler(with_mean=False), numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), categorical_cols)
        ]
    )

    X_transformed = preprocessor.fit_transform(X)
    print(f"[INFO] Transformed shape: {X_transformed.shape}")

    # Downcast BEFORE densifying: the dense float64 matrix is twice the size
    # of the float32 one, and the training code consumes float32 anyway.
    X_transformed = X_transformed.astype(np.float32)
    if sparse.issparse(X_transformed):
        X_transformed = X_transformed.toarray()

    return X_transformed, y, preprocessor


In [4]:
class FraudNet(nn.Module):
    """Fully connected binary classifier that emits one raw logit per row.

    The network is a stack of ``Linear -> ReLU`` pairs, one pair per entry of
    ``hidden_sizes``, followed by a single-unit ``Linear`` output layer. No
    sigmoid is applied to the output.

    Args:
        input_dim: Number of input features per row.
        hidden_sizes: Widths of the hidden layers, in order. Any number of
            layers is accepted; an empty sequence yields a single linear
            layer. For two hidden layers the module order inside ``net`` is
            Linear, ReLU, Linear, ReLU, Linear (indices 0-4).
    """

    def __init__(self, input_dim, hidden_sizes=(128, 64)):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_sizes:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, 1))  # binary logit
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape ``[batch]`` for ``x`` of shape ``[batch, input_dim]``."""
        # The output layer has width 1; drop that axis so the logits line up
        # with a 1-D target vector.
        return self.net(x).squeeze(1)  # [batch]


def train_torch_model(
    X_train,
    y_train,
    X_test,
    y_test,
    model_name=None,
    hidden_sizes=(128, 64),
    num_epochs=5,
    batch_size=1024,
    lr=1e-3
):
    """Train a ``FraudNet`` with class-weighted BCE loss and evaluate it.

    Training runs on the module-level ``device``. The positive class is
    up-weighted in the loss by the ratio of the "balanced" class weights
    (weight of class 1 divided by weight of class 0). After training, the
    whole test set is scored in one forward pass, predictions are thresholded
    at 0.5, and a classification report, ROC AUC and confusion matrix are
    printed.

    Args:
        X_train: 2-D array-like of training features (converted to float32).
        y_train: Training labels; must contain both class 0 and class 1.
        X_test: 2-D array-like of test features (converted to float32).
        y_test: Test labels.
        model_name: If not None, the trained weights are saved to
            ``models/{model_name}.pt`` (the directory is created if needed).
        hidden_sizes: Hidden layer widths forwarded to ``FraudNet``.
        num_epochs: Number of passes over the training data.
        batch_size: Mini-batch size for the training ``DataLoader``.
        lr: Adam learning rate.

    Returns:
        Tuple ``(model, history, metrics)``: the trained model, a dict with
        per-epoch ``"epoch"`` and ``"loss"`` lists, and a dict with
        ``"roc_auc"`` and ``"confusion_matrix"``.

    Raises:
        KeyError: If ``y_train`` lacks class 0 or class 1.
    """
    print(f"[INFO] Using device: {device}")
    input_dim = X_train.shape[1]
    print(f"[INFO] Input dim: {input_dim}")

    # Ensure y are NumPy arrays
    y_train_np = np.asarray(y_train)
    y_test_np = np.asarray(y_test)

    # Class imbalance -> pos_weight for BCEWithLogitsLoss
    classes = np.unique(y_train_np)
    weights = class_weight.compute_class_weight(
        class_weight="balanced",
        classes=classes,
        y=y_train_np
    )
    class_weights = dict(zip(classes, weights))
    w0, w1 = class_weights[0], class_weights[1]
    pos_weight = torch.tensor([w1 / w0], dtype=torch.float32, device=device)

    # Features are cast to float32 for the model; training labels are float32
    # because BCEWithLogitsLoss expects floating-point targets. Test labels
    # stay as the NumPy array since they are only used by the sklearn metrics.
    X_train_t = torch.from_numpy(np.asarray(X_train)).float()
    y_train_t = torch.from_numpy(y_train_np.astype(np.float32))
    X_test_t = torch.from_numpy(np.asarray(X_test)).float()

    train_ds = TensorDataset(X_train_t, y_train_t)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    model = FraudNet(input_dim, hidden_sizes).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    history = {"epoch": [], "loss": []}

    for epoch in range(1, num_epochs + 1):
        model.train()
        running_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            logits = model(batch_X)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so
            # the epoch figure is a per-sample average even when the last
            # batch is smaller.
            running_loss += loss.item() * batch_X.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        history["epoch"].append(epoch)
        history["loss"].append(epoch_loss)
        print(f"[EPOCH {epoch}/{num_epochs}] loss = {epoch_loss:.4f}")

    # ---- Evaluation ----
    model.eval()
    with torch.no_grad():
        X_test_t = X_test_t.to(device)
        logits = model(X_test_t)
        probs = torch.sigmoid(logits).cpu().numpy()
        y_pred = (probs >= 0.5).astype(int)

    print("\n[METRICS]")
    print(classification_report(y_test_np, y_pred, digits=4))
    roc = roc_auc_score(y_test_np, probs)
    cm = confusion_matrix(y_test_np, y_pred)
    print("ROC AUC:", roc)
    print("Confusion Matrix:\n", cm)

    if model_name is not None:
        os.makedirs("models", exist_ok=True)
        path = f"models/{model_name}.pt"
        torch.save(model.state_dict(), path)
        print(f"[SAVED] {path}")

    return model, history, {"roc_auc": roc, "confusion_matrix": cm}


In [None]:
# ----- Run NN on FULL Dataset -----
# Load the merged frame, then build the dense feature matrix and labels.
# NOTE(review): preprocess_data_optimized fits the scaler/encoder on all rows
# before the split below, so test rows contribute to the fitted statistics.
df = load_data_optimized()
X_full, y_full, preproc = preprocess_data_optimized(df)

# Stratified 80/20 hold-out split with a fixed seed.
full_split_options = dict(test_size=0.2, random_state=42, stratify=y_full)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full, y_full, **full_split_options
)

print(X_train_full.shape, X_test_full.shape)

# Hyperparameters for the full-feature network.
full_hparams = {
    "model_name": "torch_nn_full",
    "hidden_sizes": (128, 64),
    "num_epochs": 5,  # bump to e.g. 10-20 once it's stable
    "batch_size": 1024,
    "lr": 1e-3,
}

# Train model
full_model, full_hist, full_metrics = train_torch_model(
    X_train_full, y_train_full, X_test_full, y_test_full, **full_hparams
)
# 5, 1024, 1e-3 gave ROC: 0.891


[INFO] Loading data...
[INFO] Preprocessing data...
[INFO] Transformed shape: (590540, 2863)
(472432, 2863) (118108, 2863)
[INFO] Using device: cpu
[INFO] Input dim: 2863
[EPOCH 1/5] loss = 0.9331
[EPOCH 2/5] loss = 0.8569
[EPOCH 3/5] loss = 0.8235
[EPOCH 4/5] loss = 0.8086
[EPOCH 5/5] loss = 0.7874

[METRICS]
              precision    recall  f1-score   support

           0     0.9885    0.9033    0.9440    113975
           1     0.2104    0.7106    0.3247      4133

    accuracy                         0.8966    118108
   macro avg     0.5995    0.8070    0.6343    118108
weighted avg     0.9613    0.8966    0.9223    118108

ROC AUC: 0.8910849269467332
Confusion Matrix:
 [[102953  11022]
 [  1196   2937]]
[SAVED] models/torch_nn_full.pt


In [6]:
# ----- Run NN On PCA Dataset -----
# Load the precomputed PCA features as a float32 matrix.
pca_df = pd.read_csv("pca_features.csv")
X_pca = pca_df.to_numpy().astype(np.float32)

# Labels come from the merged training frame.
# NOTE(review): this assumes pca_features.csv has one row per row of the
# frame returned by load_data_optimized(), in the same order — confirm.
df_labels = load_data_optimized()
y_pca = df_labels["isFraud"].astype(int).to_numpy()

# Stratified 80/20 hold-out split with a fixed seed.
pca_split_options = dict(test_size=0.2, random_state=42, stratify=y_pca)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, y_pca, **pca_split_options
)

print(X_train_pca.shape, X_test_pca.shape)

# Hyperparameters for the smaller PCA-input network.
pca_hparams = {
    "model_name": "torch_nn_pca",
    "hidden_sizes": (64, 32),
    "num_epochs": 5,
    "batch_size": 1024,
    "lr": 1e-3,
}

# Train PCA model
pca_model, pca_hist, pca_metrics = train_torch_model(
    X_train_pca, y_train_pca, X_test_pca, y_test_pca, **pca_hparams
)


[INFO] Loading data...
(472432, 50) (118108, 50)
[INFO] Using device: cpu
[INFO] Input dim: 50
[EPOCH 1/5] loss = 1.0331
[EPOCH 2/5] loss = 0.9488
[EPOCH 3/5] loss = 0.9190
[EPOCH 4/5] loss = 0.8964
[EPOCH 5/5] loss = 0.8803

[METRICS]
              precision    recall  f1-score   support

           0     0.9885    0.8159    0.8939    113975
           1     0.1269    0.7380    0.2166      4133

    accuracy                         0.8132    118108
   macro avg     0.5577    0.7769    0.5552    118108
weighted avg     0.9583    0.8132    0.8702    118108

ROC AUC: 0.8622823802151611
Confusion Matrix:
 [[92990 20985]
 [ 1083  3050]]
[SAVED] models/torch_nn_pca.pt
