In [107]:
#import libraries
import pandas as pd
import numpy as np
import random
import math
#sklearn for preprocessing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
#matplotlib and seaborn to graph
import matplotlib.pyplot as plt
import seaborn as sns
#pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [108]:
#load in dataset
df = pd.read_csv('creditcard_fraud_detection.csv')
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [109]:
#check nas
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [110]:
#calculate proportion of fraud in dataset
print(sum(df['Class'])/len(df))
#dataset is sparse, much more non fraud than fraud

0.001727485630620034


In [111]:
#preprocess and split for train test

#X is everything but "Class" (PCA features, time, amount)
X = df.drop('Class', axis = 1).values
#y is class (0 = no fraud, 1 = fraud)
y= df['Class'].values

scalar = StandardScaler()
#PCA features are already scaled, but time and amount aren't
X_scaled = scalar.fit_transform(X)

#train test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size= 0.2, random_state= 50, 
    #stratify y since dataset is sparse and we want to ensureki
    stratify= y
    )

#create validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

In [112]:
#wrap dataset in classes for pytorch
class CreditFraudDataset(Dataset):
    def __init__(self, X, y):
        #convert np arrays to tensors
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    #return len dataset
    def __len__(self):
        return len(self.X)

    #return item of X, y at specific index
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 2048

#wrap datasets
train_dataset = CreditFraudDataset(X_train, y_train)
val_dataset   = CreditFraudDataset(X_val, y_val)
test_dataset  = CreditFraudDataset(X_test, y_test)

In [113]:
#set up neural net
class FraudNet(nn.Module):
    def __init__(self, input_dim, dropout=0.3):  # dropout is now a hyperparameter
        super().__init__()
        self.net = nn.Sequential(

            #layer 1, 64 neurons
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout),

            #layer 2, 64 neurons
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout),

            #layer 3, 32 neurons
            nn.Linear(64,32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(dropout),

            #only one output logit, for fraud or not
            nn.Linear(32, 1)
        )

    def forward(self, x):
        #x: (batch_size, input_dim)
        logits = self.net(x).squeeze(1)  #(batch_size,)
        return logits

input_dim = X_train.shape[1]
#init a temporary model just to inspect the architecture
tmp_model = FraudNet(input_dim)
print(tmp_model)

FraudNet(
  (net): Sequential(
    (0): Linear(in_features=30, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=64, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [114]:
#set up device
#todo: set up cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [115]:
#set up bce for model
#count number of each class
class_counts = np.bincount(y_train)
neg, pos = class_counts[0], class_counts[1]
#set weights
pos_weight = torch.tensor([neg / pos], dtype=torch.float32).to(device)

#use BCE to weight fraud more in the model. Model treats missed fraud transactions more harshly in loss function
#without weights model can get high performance just by predicting no fraud always, since 99%+ of samples aren't fraud
#BCE fixes this by penalizing no fraud much more harshly than no fraud

In [None]:
def train_eval_one_config(lr, dropout, batch_size, weight_decay=0.0):
    """
    Train FraudNet with given hyperparameters and return the best validation PR-AUC
    and the model state dict for that configuration.
    """
    #initialize loaders (depends on batch_size)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    #set up model
    model = FraudNet(input_dim, dropout=dropout).to(device)

    #set up bce for model
    #use BCE to weight fraud more in the model. Model treats missed fraud transactions more harshly in loss function
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_val_pr_auc = -np.inf
    best_state_dict = None

    num_epochs = 30 

    #run training loop for each epoch
    for epoch in range(num_epochs):
        #train
        model.train()
        running_loss = 0.0

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            logits = model(X_batch)               #output scores as logits instead of probabilities for bce
            loss = criterion(logits, y_batch)     #calc loss
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * X_batch.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)

        #evaluate on validation set for each epoch
        model.eval()
        all_logits = []
        all_targets = []

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                logits = model(X_batch)
                all_logits.append(logits.cpu())
                all_targets.append(y_batch)

        all_logits = torch.cat(all_logits)
        all_targets = torch.cat(all_targets)

        #converts logits to p
        probs = torch.sigmoid(all_logits).numpy()
        targets = all_targets.numpy()

        #calculate roc auc score
        roc = roc_auc_score(targets, probs)
        #auc pr score (validation metric we care most about)
        pr_auc = average_precision_score(targets, probs) 

        #track best validation PR-AUC for this config
        if pr_auc > best_val_pr_auc:
            best_val_pr_auc = pr_auc
            best_state_dict = model.state_dict()

        #print output for epoch (optional, you can comment this out if it's too verbose)
        print(f"  Epoch {epoch+1:02d} | Loss: {epoch_loss:.4f} | Val ROC-AUC: {roc:.4f} | Val PR-AUC: {pr_auc:.4f}")

    return best_val_pr_auc, best_state_dict


In [117]:
#hyperparameter "ranges" for random search
#we'll *sample* from these instead of trying every combination
def sample_hparams():
    #learning rate: sample log-uniform between 1e-4 and 3e-3
    log_lr_min = math.log10(1e-4)
    log_lr_max = math.log10(3e-3)
    log_lr = random.uniform(log_lr_min, log_lr_max)
    lr = 10 ** log_lr

    #dropout: uniform between 0.1 and 0.5
    dropout = random.uniform(0.1, 0.5)

    #batch size: pick from a small discrete set
    batch_size = random.choice([512, 1024, 2048])

    #weight decay: small set of options (including no weight decay)
    weight_decay = random.choice([0.0, 1e-5, 1e-4])

    return {
        "lr": lr,
        "dropout": dropout,
        "batch_size": batch_size,
        "weight_decay": weight_decay
    }

In [118]:
#number of random configs to try
num_trials = 15  # increase to 20â€“30 if you have time / GPU

best_hparams = None
best_val_score = -np.inf
best_model_state = None

for trial in range(1, num_trials + 1):
    print(f"\n=== Trial {trial}/{num_trials} ===")

    #sample a random hyperparameter configuration
    hparams = sample_hparams()
    lr           = hparams["lr"]
    dropout      = hparams["dropout"]
    batch_size   = hparams["batch_size"]
    weight_decay = hparams["weight_decay"]

    print(f"Sampled hyperparameters:")
    print(f"  lr          = {lr:.6f}")
    print(f"  dropout     = {dropout:.3f}")
    print(f"  batch_size  = {batch_size}")
    print(f"  weight_decay= {weight_decay}")

    #train and evaluate this config on the validation set
    val_pr_auc, state_dict = train_eval_one_config(
        lr=lr,
        dropout=dropout,
        batch_size=batch_size,
        weight_decay=weight_decay
    )

    print(f"--> Best val PR-AUC for this trial: {val_pr_auc:.4f}")

    #keep track of overall best config
    if val_pr_auc > best_val_score:
        best_val_score = val_pr_auc
        best_hparams = hparams
        best_model_state = state_dict

print("\n=== Best hyperparameters found (random search) ===")
print(best_hparams)
print("Best validation PR-AUC:", best_val_score)


=== Trial 1/15 ===
Sampled hyperparameters:
  lr          = 0.001776
  dropout     = 0.220
  batch_size  = 512
  weight_decay= 0.0
  Epoch 01 | Loss: 0.5421 | Val ROC-AUC: 0.9762 | Val PR-AUC: 0.6663
  Epoch 02 | Loss: 0.3486 | Val ROC-AUC: 0.9816 | Val PR-AUC: 0.6746
  Epoch 03 | Loss: 0.3466 | Val ROC-AUC: 0.9784 | Val PR-AUC: 0.6477
  Epoch 04 | Loss: 0.2606 | Val ROC-AUC: 0.9762 | Val PR-AUC: 0.6983
  Epoch 05 | Loss: 0.2864 | Val ROC-AUC: 0.9836 | Val PR-AUC: 0.6723
  Epoch 06 | Loss: 0.2204 | Val ROC-AUC: 0.9825 | Val PR-AUC: 0.6821
  Epoch 07 | Loss: 0.2189 | Val ROC-AUC: 0.9851 | Val PR-AUC: 0.6834
  Epoch 08 | Loss: 0.2273 | Val ROC-AUC: 0.9817 | Val PR-AUC: 0.6774
  Epoch 09 | Loss: 0.2119 | Val ROC-AUC: 0.9795 | Val PR-AUC: 0.6395
  Epoch 10 | Loss: 0.2081 | Val ROC-AUC: 0.9828 | Val PR-AUC: 0.6750
  Epoch 11 | Loss: 0.1678 | Val ROC-AUC: 0.9843 | Val PR-AUC: 0.7019
  Epoch 12 | Loss: 0.1640 | Val ROC-AUC: 0.9853 | Val PR-AUC: 0.6832
  Epoch 13 | Loss: 0.1839 | Val ROC-AUC:

KeyboardInterrupt: 

In [None]:
#rebuild the best model and load its weights
best_model = FraudNet(input_dim, dropout=best_hparams["dropout"]).to(device)
best_model.load_state_dict(best_model_state)
best_model.eval()

#use the best batch_size for the test loader
test_loader  = DataLoader(test_dataset, batch_size=best_hparams["batch_size"], shuffle=False)

all_logits = []
all_targets = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        logits = best_model(X_batch)
        all_logits.append(logits.cpu())
        all_targets.append(y_batch)

all_logits = torch.cat(all_logits)
all_targets = torch.cat(all_targets)

#converts logits to p
probs = torch.sigmoid(all_logits).numpy()
targets = all_targets.numpy()

#calculate roc auc score on the test set
roc = roc_auc_score(targets, probs)
#auc pr score on the test set
pr_auc = average_precision_score(targets, probs) 

print("\n=== Test set performance with best hyperparameters ===")
print(f"Test ROC-AUC: {roc:.4f}")
print(f"Test PR-AUC : {pr_auc:.4f}")

In [None]:
#threshold for confusion matrix
threshold = 0.4
#make predictions
preds = (probs >= threshold).astype(int)

#print confusion matrix
print("Confusion matrix:")
print(confusion_matrix(targets, preds))

print("\nClassification report:")
print(classification_report(targets, preds, digits=4))


Confusion matrix:
[[56353   511]
 [   10    88]]

Classification report:
              precision    recall  f1-score   support

         0.0     0.9998    0.9910    0.9954     56864
         1.0     0.1469    0.8980    0.2525        98

    accuracy                         0.9909     56962
   macro avg     0.5734    0.9445    0.6240     56962
weighted avg     0.9984    0.9909    0.9941     56962

