In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Data Preprocessing and Label Encoding
def df_label_encoder(df, columns):
    """Label-encode the given columns and return the encoded frame.

    Each listed column is cast to ``str`` and replaced by the integer codes
    produced by a ``LabelEncoder`` fitted on that column alone.

    Args:
        df: DataFrame containing every column named in ``columns``.
        columns: Iterable of column names to encode.

    Returns:
        A new DataFrame with the listed columns encoded. The frame passed in
        is left untouched, so the raw values stay available and calling this
        again on the same input gives the same result.
    """
    encoded = df.copy()
    for col in columns:
        # A fresh encoder per column: no fitted state is shared between columns.
        encoder = preprocessing.LabelEncoder()
        encoded[col] = encoder.fit_transform(encoded[col].astype(str))
    return encoded

In [3]:
# Preprocess the dataset and extract features
def preprocess(df):
    """Label-encode the categorical columns and min-max scale ``amt``.

    Args:
        df: DataFrame with the columns ``merchant``, ``category``, ``city``,
            ``state``, ``job`` and a numeric ``amt`` column.

    Returns:
        The frame returned by ``df_label_encoder`` with ``amt`` rescaled so
        its minimum maps to 0 and its maximum to 1.
    """
    df = df_label_encoder(df, ['merchant', 'category', 'city', 'state', 'job'])
    amt_min = df['amt'].min()
    amt_range = df['amt'].max() - amt_min
    # A constant column has zero range; dividing by it would turn every value
    # into NaN. Scale by 1 instead so the column becomes all zeros.
    scale = amt_range if amt_range != 0 else 1
    df['amt'] = (df['amt'] - amt_min) / scale  # Normalize 'amt'
    return df

In [4]:
# Read the training CSV and run it straight through the preprocessing step
df = preprocess(pd.read_csv('creditcard/fraudTrain.csv'))  # Update with your .csv file path

In [5]:
# Model inputs: four feature columns as float32, the fraud flag as int64 labels
features = torch.tensor(
    df.loc[:, ['amt', 'category', 'city', 'state']].to_numpy(),
    dtype=torch.float32,
)
labels = torch.tensor(df.loc[:, 'is_fraud'].to_numpy(), dtype=torch.int64)

In [6]:
# Function to compute statistics: mean, variance, and standard deviation
def compute_statistics(features):
    """Return the per-column mean, variance and standard deviation.

    All three reductions run over dimension 0 of ``features`` using torch's
    default settings for ``var`` and ``std``.

    Returns:
        Tuple ``(mean, var, std)`` of tensors, one entry per column.
    """
    return features.mean(dim=0), features.var(dim=0), features.std(dim=0)

In [7]:
# Print initial statistics (per feature column, before any augmentation)
initial_stats = compute_statistics(features)
initial_mean, initial_var, initial_std = initial_stats
print(f"Initial Mean: {initial_mean}, Initial Variance: {initial_var}, Initial Std Dev: {initial_std}")

Initial Mean: tensor([2.3957e-03, 6.2278e+00, 4.4526e+02, 2.6677e+01]), Initial Variance: tensor([3.0670e-05, 1.5315e+01, 6.6874e+04, 2.0538e+02]), Initial Std Dev: tensor([5.5381e-03, 3.9134e+00, 2.5860e+02, 1.4331e+01])


In [8]:
# Define WGAN Generator and Discriminator (unchanged)
class WGANGenerator(nn.Module):
    """Two-layer fully connected generator.

    Maps a latent batch of width ``input_size`` through one ReLU hidden layer
    of width ``hidden_size`` to an output of width ``output_size``. No
    activation is applied to the output layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, z):
        """Return the generated batch for the latent batch ``z``."""
        hidden = self.fc1(z).relu()
        return self.fc2(hidden)

In [9]:
class WGANDiscriminator(nn.Module):
    """Two-layer fully connected discriminator producing one score per row.

    Maps a batch of width ``input_size`` through one ReLU hidden layer of
    width ``hidden_size`` to a single unbounded output (no sigmoid).
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the score tensor, one value per row of ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

In [10]:
# WGAN sizes: the generator emits one value per feature column
input_size = output_size = features.size(1)
hidden_size = 128
latent_size = 64

In [11]:
# Initialize WGAN components
# Seed torch's global RNG first: weight initialisation and every later
# torch.randn draw would otherwise differ on each run, so a fresh
# top-to-bottom execution could not reproduce earlier results.
torch.manual_seed(42)
generator = WGANGenerator(latent_size, hidden_size, output_size)
discriminator = WGANDiscriminator(input_size, hidden_size)

In [12]:
# One Adam optimizer per network, both with the same learning rate
# NOTE(review): the original WGAN paper (weight-clipping variant) reports
# unstable training with momentum-based optimizers and uses RMSprop instead;
# confirm Adam with default betas is intended here.
optimizer_g = optim.Adam(params=generator.parameters(), lr=1e-4)
optimizer_d = optim.Adam(params=discriminator.parameters(), lr=1e-4)

In [13]:
# Training configuration
num_epochs = 16
# Number of class-0 rows: the count the class-1 rows are grown towards
target_minority_class = (labels == 0).sum()
# Real class-1 rows: the only real data shown to the discriminator
real_data = features[labels == 1]

In [14]:
# Early-stopping state: training stops after `patience` consecutive epochs
# in which the discriminator loss fails to set a new minimum
best_loss_d, trigger_times = float('inf'), 0
patience = 1

In [None]:
# WGAN training on the real class-1 rows, with early stopping on the
# discriminator loss.
n_critic = 5       # discriminator updates per generator update
clip_value = 0.01  # discriminator weights are clamped to [-clip_value, clip_value]

for epoch in range(num_epochs):
    # Stop as soon as class 1 has caught up with the target count.
    current_minority_count = torch.sum(labels == 1)
    if current_minority_count >= target_minority_class:
        break

    for _ in range(n_critic):
        z = torch.randn(real_data.size(0), latent_size)
        # The discriminator step never back-propagates into the generator,
        # so the fake batch is produced without building an autograd graph.
        with torch.no_grad():
            fake_data = generator(z)

        optimizer_d.zero_grad()
        d_real = discriminator(real_data)
        d_fake = discriminator(fake_data)
        loss_d = -torch.mean(d_real) + torch.mean(d_fake)
        loss_d.backward()
        optimizer_d.step()

        # Clip weights (in place, outside autograd tracking)
        with torch.no_grad():
            for p in discriminator.parameters():
                p.clamp_(-clip_value, clip_value)

    optimizer_g.zero_grad()
    fake_data = generator(torch.randn(real_data.size(0), latent_size))
    loss_g = -torch.mean(discriminator(fake_data))
    loss_g.backward()
    optimizer_g.step()

    # Update labels and features with generated samples.
    # The generator imitates the class-1 rows in `real_data`, so its samples
    # are labelled 1; labelling them 0 would grow the wrong class and the
    # stop condition above could never be reached.
    # detach(): stored samples are plain data and must not keep this
    # epoch's autograd graph alive inside `features`.
    synthetic = fake_data.detach()
    labels = torch.cat((labels, torch.ones(synthetic.size(0), dtype=torch.long)))
    features = torch.cat((features, synthetic))
    # Refresh the count so code after the loop sees the post-append value.
    current_minority_count = torch.sum(labels == 1)

    # Log every epoch, including the one that triggers early stopping.
    print(f'Epoch [{epoch}/{num_epochs}], Loss D: {loss_d.item()}, Loss G: {loss_g.item()}')

    if loss_d.item() < best_loss_d:
        best_loss_d = loss_d.item()
        trigger_times = 0
    else:
        trigger_times += 1

    if trigger_times >= patience:
        print("Early stopping triggered")
        break

In [None]:
# Generate enough samples to match the class distribution
# Recount class 1 from `labels` instead of relying on the training loop's
# variable, which is undefined if the loop body never ran.
current_minority_count = int(torch.sum(labels == 1))
# Plain non-negative int: a negative size would make torch.randn raise.
num_generated_samples = max(int(target_minority_class) - current_minority_count, 0)
# Inference only: without no_grad the whole batch would retain an autograd
# graph through the generator.
with torch.no_grad():
    generated_data = generator(torch.randn(num_generated_samples, latent_size))
y_generated = torch.ones(num_generated_samples, dtype=torch.long)

In [None]:
# Combine generated data with the original data
# detach(): the augmented set is plain data; it must not carry autograd
# history from generator outputs into downstream statistics and plots.
x_augmented = torch.cat([features.detach(), generated_data.detach()], dim=0)
y_augmented = torch.cat([labels, y_generated], dim=0)
assert x_augmented.size(0) == y_augmented.size(0), "features and labels are misaligned"

In [None]:
# Print final statistics (per feature column, after augmentation)
final_stats = compute_statistics(x_augmented)
final_mean, final_var, final_std = final_stats
print(f"Final Mean: {final_mean}, Final Variance: {final_var}, Final Std Dev: {final_std}")

In [None]:
def r_squared(real_data, generated_data):
    """Per-column R-squared of ``generated_data`` against ``real_data``.

    Computes ``1 - SS_res / SS_tot`` for every column, where ``SS_res`` sums
    the squared row-by-row differences and ``SS_tot`` sums the squared
    deviations of ``real_data`` from its column mean.

    NOTE(review): rows are paired purely by position; confirm that this
    pairing is meaningful for the data being compared.

    Args:
        real_data: Float tensor of reference rows.
        generated_data: Float tensor compared element-wise with ``real_data``.

    Returns:
        Tensor with one score per column. A column whose real values are all
        equal scores 1.0 when the generated values match exactly and 0.0
        otherwise, instead of dividing by zero. With fewer than two rows the
        score is undefined and NaN is returned for every column.
    """
    ss_res = torch.sum((real_data - generated_data) ** 2, dim=0)  # Residual sum of squares
    ss_tot = torch.sum((real_data - torch.mean(real_data, dim=0)) ** 2, dim=0)  # Total sum of squares

    if real_data.size(0) < 2:
        # No spread can be measured from zero or one row.
        return torch.full_like(ss_tot, float('nan'))

    zero_total = ss_tot == 0
    # Swap zero denominators for 1 so the division itself stays finite; the
    # affected columns are overwritten with the fallback value below.
    safe_tot = torch.where(zero_total, torch.ones_like(ss_tot), ss_tot)
    r2 = 1 - (ss_res / safe_tot)
    fallback = (ss_res == 0).to(r2.dtype)
    return torch.where(zero_total, fallback, r2)
# Truncate both sets to a common row count so they can be compared row by row
min_size = min(real_data.shape[0], generated_data.shape[0])
real_data_sampled, generated_data_sampled = real_data[:min_size], generated_data[:min_size]

In [None]:
# Score the truncated samples and report the per-feature and mean R-squared
r2_scores = r_squared(real_data=real_data_sampled, generated_data=generated_data_sampled)
print(
    f"R-squared for each feature: {r2_scores}",
    f"Mean R-squared: {r2_scores.mean()}",
    sep="\n",
)

In [None]:
# In[27]:

# Function to plot class distribution
def plot_class_distribution(labels, title="Class Distribution"):
    """Draw a bar chart of how many entries of ``labels`` fall in each class.

    Args:
        labels: Tensor of class ids; each distinct value becomes one bar.
        title: Text shown above the chart.
    """
    classes, counts = (t.numpy() for t in torch.unique(labels, return_counts=True))
    tick_labels = [f"Class {int(c)}" for c in classes]

    plt.bar(classes, counts, color=['blue', 'orange'])
    plt.xticks(classes, tick_labels)
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.title(title)
    plt.show()

# Show the class balance of the augmented label tensor
plot_class_distribution(labels=y_augmented, title="Class Distribution After Data Augmentation")
