In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Seed PyTorch's global RNG up front: weight initialisation, latent noise and
# DataLoader shuffling all draw from it, so without a seed no two
# "Restart & Run All" passes start from the same state.
SEED = 42
torch.manual_seed(SEED)

# Set device: use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset path
# used by the loading cell resolves.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# --- Data preparation --------------------------------------------------------
# Read the raw transactions from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/archive/creditcard.csv')

# Separate the two classes (Class == 1 -> fraud, Class == 0 -> legitimate).
fraud_data = df.loc[df['Class'] == 1]
legit_data = df.loc[df['Class'] == 0]

# Hold out 20% of each class independently, using a fixed split seed.
fraud_train, fraud_test = train_test_split(
    fraud_data, test_size=0.2, random_state=42
)
legit_train, legit_test = train_test_split(
    legit_data, test_size=0.2, random_state=42
)

# The held-out rows of both classes together form the final evaluation set.
final_test_set = pd.concat([fraud_test, legit_test], axis=0)

# Only the fraud TRAINING rows feed the GAN; the label column is removed.
fraud_train_features = fraud_train.drop(columns=['Class']).to_numpy()

# Min-max scaling whose range is learned from the fraud training rows alone.
scaler = MinMaxScaler().fit(fraud_train_features)
fraud_train_scaled = scaler.transform(fraud_train_features)

# Wrap the scaled matrix as float32 tensors and batch it for training.
real_data = torch.as_tensor(fraud_train_scaled, dtype=torch.float32)
dataset = TensorDataset(real_data)
data_loader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=True,
    drop_last=True,
)

In [9]:
# Generator: latent noise -> synthetic feature vector in the scaled [0, 1] range
class Generator(nn.Module):
    """MLP that maps a latent vector of size ``z_dim`` to ``output_dim`` values.

    Three hidden stages (512 -> 1024 -> 512), each made of Linear,
    BatchNorm1d and LeakyReLU(0.2), are followed by a Linear projection to
    ``output_dim`` and a Sigmoid, so every output lies in [0, 1].
    """

    # Output width of each hidden stage, in order.
    _HIDDEN_WIDTHS = (512, 1024, 512)

    def __init__(self, z_dim, output_dim):
        super().__init__()
        layers = []
        width_in = z_dim
        for width_out in self._HIDDEN_WIDTHS:
            layers += [
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.LeakyReLU(0.2),
            ]
            width_in = width_out
        layers += [nn.Linear(width_in, output_dim), nn.Sigmoid()]
        # A single Sequential called `model` keeps the parameter names
        # (model.0.weight, model.1.weight, ...) as they were.
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Run a batch of latent vectors ``z`` through the network."""
        return self.model(z)

# WGAN-GP critic: assigns one unbounded score to each input sample
class Critic(nn.Module):
    """MLP that maps ``input_dim`` features to a single raw score.

    Hidden stages are 512 -> 256 -> 128 with LeakyReLU(0.2); the first two
    stages are followed by Dropout(0.3). The final Linear layer has no
    activation, so the score is unbounded.
    """

    def __init__(self, input_dim):
        super().__init__()
        # (in_features, out_features, followed_by_dropout) for each hidden stage
        stages = [
            (input_dim, 512, True),
            (512, 256, True),
            (256, 128, False),
        ]
        layers = []
        for n_in, n_out, use_dropout in stages:
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.LeakyReLU(0.2))
            if use_dropout:
                layers.append(nn.Dropout(0.3))
        layers.append(nn.Linear(128, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Score a batch of samples ``x``; returns one value per row."""
        return self.model(x)

In [10]:
# --- Hyper-parameters --------------------------------------------------------
input_dim = real_data.size(1)     # number of feature columns per sample
z_dim = 50                        # latent vector size (raised from 30)
epochs = 5000                     # passes over the data (raised from 2000)
n_critic = 3                      # critic updates per generator update (lowered from 5)
lambda_gp = 5                     # weight of the gradient-penalty term
batch_size = 128

# --- Models ------------------------------------------------------------------
# Build both networks and place them on the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
generator = Generator(z_dim=z_dim, output_dim=input_dim).to(device)
critic = Critic(input_dim=input_dim).to(device)

# --- Optimizers --------------------------------------------------------------
# Adam for both networks; the generator steps with a larger learning rate
# than the critic.
optimizer_G = optim.Adam(generator.parameters(), lr=2e-4, betas=(0.5, 0.9))
optimizer_C = optim.Adam(critic.parameters(), lr=5e-5, betas=(0.5, 0.9))

In [13]:
# Gradient penalty for WGAN-GP
def compute_gradient_penalty(critic, real_samples, fake_samples):
    """Return the WGAN-GP gradient penalty for one batch.

    Each real sample is mixed with its fake counterpart using a random
    per-sample weight, the critic is evaluated on the mixtures, and the
    squared deviation of the critic's input-gradient L2 norm from 1 is
    averaged over the batch.

    Args:
        critic: Callable mapping a batch of samples to a batch of scores.
        real_samples: Tensor of real samples, batch dimension first.
        fake_samples: Tensor of generated samples with the same shape and
            device as ``real_samples``.

    Returns:
        Scalar tensor holding the mean penalty. It is built with
        ``create_graph=True`` so it can be back-propagated into the critic.
    """
    # One mixing weight per sample, broadcastable over every trailing
    # dimension. It is created on the samples' own device so the function
    # does not rely on a module-level `device` variable.
    alpha_shape = (real_samples.size(0),) + (1,) * (real_samples.dim() - 1)
    alpha = torch.rand(alpha_shape, device=real_samples.device)
    interpolates = (alpha * real_samples + ((1 - alpha) * fake_samples)).requires_grad_(True)
    d_interpolates = critic(interpolates)

    gradients = torch.autograd.grad(
        outputs=d_interpolates,
        inputs=interpolates,
        grad_outputs=torch.ones_like(d_interpolates),
        create_graph=True,
        retain_graph=True
    )[0]

    # Flatten per sample; reshape (not view) also handles non-contiguous grads.
    gradients = gradients.reshape(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty

# Enable cuDNN auto-tuner for faster training
torch.backends.cudnn.benchmark = True

# The sampling cell switches the generator to eval mode. Put both networks
# back in training mode here so that re-running this cell afterwards does not
# silently train with BatchNorm/Dropout in inference behaviour.
generator.train()
critic.train()

# Training loop
for epoch in range(epochs):
    for batch in data_loader:
        real_batch = batch[0].to(device)

        # Train Critic (n_critic times)
        for _ in range(n_critic):
            # Generate fake data. Detached: the critic update needs no
            # gradients through the generator, so skip building and
            # back-propagating that part of the graph.
            z = torch.randn(real_batch.size(0), z_dim, device=device)
            fake_batch = generator(z).detach()

            # Compute critic scores
            real_validity = critic(real_batch)
            fake_validity = critic(fake_batch)

            # Gradient penalty (both inputs are already graph-free)
            gradient_penalty = compute_gradient_penalty(critic, real_batch, fake_batch)

            # Critic loss: Wasserstein estimate plus weighted gradient penalty
            c_loss = -torch.mean(real_validity) + torch.mean(fake_validity) + lambda_gp * gradient_penalty

            optimizer_C.zero_grad()
            c_loss.backward()
            optimizer_C.step()

        # Train Generator: raise the critic's score on freshly generated samples
        z = torch.randn(real_batch.size(0), z_dim, device=device)
        gen_batch = generator(z)
        g_loss = -torch.mean(critic(gen_batch))

        optimizer_G.zero_grad()
        g_loss.backward()
        optimizer_G.step()

    # Print progress (losses are those of the last batch in the epoch)
    if epoch % 200 == 0:
        print(f"Epoch {epoch}/{epochs} | C_loss: {c_loss.item():.4f} | G_loss: {g_loss.item():.4f}")


Epoch 0/5000 | C_loss: 0.0238 | G_loss: -0.2328
Epoch 200/5000 | C_loss: 0.0556 | G_loss: -0.1204
Epoch 400/5000 | C_loss: 0.1717 | G_loss: -0.1943
Epoch 600/5000 | C_loss: 0.0603 | G_loss: -0.0489
Epoch 800/5000 | C_loss: 0.0939 | G_loss: -0.0863
Epoch 1000/5000 | C_loss: 0.0757 | G_loss: -0.1036
Epoch 1200/5000 | C_loss: 0.0895 | G_loss: -0.3069
Epoch 1400/5000 | C_loss: 0.0500 | G_loss: -0.5330
Epoch 1600/5000 | C_loss: 0.0361 | G_loss: -0.4892
Epoch 1800/5000 | C_loss: 0.0347 | G_loss: -0.5312
Epoch 2000/5000 | C_loss: 0.1000 | G_loss: -0.5551
Epoch 2200/5000 | C_loss: 0.1245 | G_loss: -0.4831
Epoch 2400/5000 | C_loss: 0.0887 | G_loss: -0.4958
Epoch 2600/5000 | C_loss: 0.1020 | G_loss: -0.5078
Epoch 2800/5000 | C_loss: 0.0156 | G_loss: -0.4872
Epoch 3000/5000 | C_loss: 0.0672 | G_loss: -0.5447
Epoch 3200/5000 | C_loss: -0.0126 | G_loss: -0.5390
Epoch 3400/5000 | C_loss: 0.0180 | G_loss: -0.5799
Epoch 3600/5000 | C_loss: 0.0605 | G_loss: -0.7294
Epoch 3800/5000 | C_loss: -0.0067 | G

In [14]:
# Generate synthetic fraud samples
generator.eval()
num_original_fraud_train = len(fraud_train)
num_legit_train = len(legit_train)
# Enough synthetic rows to bring the fraud class up to the legitimate count
num_synthetic_needed = num_legit_train - num_original_fraud_train

feature_columns = fraud_train.drop(columns=['Class']).columns

synthetic_batches = []

with torch.no_grad():
    # Generate in batches; the last batch is trimmed to the remaining count
    for start in range(0, num_synthetic_needed, batch_size):
        batch_size_ = min(batch_size, num_synthetic_needed - start)
        z = torch.randn(batch_size_, z_dim, device=device)
        synthetic_batches.append(generator(z).cpu().numpy())

# The generator was trained on MinMax-scaled features and emits values in
# [0, 1]. Map them back to the original feature units with the same scaler
# before mixing them with the unscaled real rows; otherwise the synthetic rows
# sit on a different scale from every real transaction.
if synthetic_batches:
    synthetic_scaled = np.concatenate(synthetic_batches, axis=0).astype(np.float64)
    synthetic_samples = scaler.inverse_transform(synthetic_scaled)
else:
    # Nothing to generate (classes already balanced): keep an empty, well-shaped array
    synthetic_samples = np.empty((0, len(feature_columns)))

# Create synthetic fraud dataframe
synthetic_fraud = pd.DataFrame(synthetic_samples, columns=feature_columns)
synthetic_fraud['Class'] = 1

# Create balanced training set
balanced_train = pd.concat([
    fraud_train,  # Original fraud training data
    legit_train,   # Original legitimate training data
    synthetic_fraud  # Synthetic fraud (in original feature units)
], ignore_index=True)

# Shuffle the dataset
balanced_train = balanced_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Save datasets
balanced_train.to_csv('/content/drive/MyDrive/Colab Notebooks/archive/balanced_creditcard_train.csv', index=False)
final_test_set.to_csv('/content/drive/MyDrive/Colab Notebooks/archive/creditcard_test.csv', index=False)

print(f"Original fraud training samples: {len(fraud_train)}")
print(f"Synthetic fraud samples generated: {len(synthetic_fraud)}")
print(f"Balanced training set size: {len(balanced_train)}")
print(f"Test set size: {len(final_test_set)}")
print(f"Class distribution in training set:\n{balanced_train['Class'].value_counts()}")

Original fraud training samples: 393
Synthetic fraud samples generated: 227059
Balanced training set size: 454904
Test set size: 56962
Class distribution in training set:
Class
1    227452
0    227452
Name: count, dtype: int64


In [16]:

# Report split sizes and the class balance of both the training and test sets
print(
    f"Original fraud training samples: {len(fraud_train)}",
    f"Synthetic fraud samples generated: {len(synthetic_fraud)}",
    f"Balanced training set size: {len(balanced_train)}",
    f"Test set size: {len(final_test_set)}",
    f"Class distribution in training set:\n{balanced_train['Class'].value_counts()}",
    f"Class distribution in testing set:\n{final_test_set['Class'].value_counts()}",
    sep="\n",
)

Original fraud training samples: 393
Synthetic fraud samples generated: 227059
Balanced training set size: 454904
Test set size: 56962
Class distribution in training set:
Class
1    227452
0    227452
Name: count, dtype: int64
Class distribution in testing set:
Class
0    56863
1       99
Name: count, dtype: int64
