In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Data Preprocessing and Label Encoding
def df_label_encoder(df, columns):
    le = preprocessing.LabelEncoder()
    for col in columns:
        df[col] = le.fit_transform(df[col].astype(str))
    return df

In [3]:
# Preprocess the dataset and extract features
def preprocess(df):
    df = df_label_encoder(df, ['type'])  # Encoding the 'type' column
    df['amount'] = (df['amount'] - df['amount'].min()) / (df['amount'].max() - df['amount'].min())  # Normalize 'amount'
    return df

In [4]:
# Load dataset and preprocess
df = pd.read_csv('paysim/paysim.csv')  # Update with your PaySim .csv file path
df = preprocess(df)

In [5]:
# Extract features and labels (remove graph-related parts)
features = torch.tensor(df[['amount', 'type', 'oldbalanceOrg', 'newbalanceOrig']].values, dtype=torch.float)
labels = torch.tensor(df['isFraud'].values, dtype=torch.long)

In [6]:
# Function to compute statistics: mean, variance, and standard deviation
def compute_statistics(features):
    mean = torch.mean(features, dim=0)
    var = torch.var(features, dim=0)
    std = torch.std(features, dim=0)
    return mean, var, std

In [7]:
# Print initial statistics
initial_mean, initial_var, initial_std = compute_statistics(features)
print(f"Initial Mean: {initial_mean}, Initial Variance: {initial_var}, Initial Std Dev: {initial_std}")

Initial Mean: tensor([1.9456e-03, 1.7141e+00, 8.3388e+05, 8.5511e+05]), Initial Variance: tensor([4.2668e-05, 1.8228e+00, 8.3419e+12, 8.5501e+12]), Initial Std Dev: tensor([6.5320e-03, 1.3501e+00, 2.8882e+06, 2.9240e+06])


In [8]:
# Define WGAN Generator and Discriminator
class WGANGenerator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(WGANGenerator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, z):
        z = torch.relu(self.fc1(z))
        return self.fc2(z)

In [9]:
class WGANDiscriminator(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(WGANDiscriminator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

In [10]:
# WGAN parameters
input_size = features.shape[1]
hidden_size = 128
output_size = input_size
latent_size = 64

In [11]:
# Initialize WGAN components
generator = WGANGenerator(latent_size, hidden_size, output_size)
discriminator = WGANDiscriminator(input_size, hidden_size)

In [12]:
# Optimizers
optimizer_g = optim.Adam(generator.parameters(), lr=0.0001)
optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0001)

In [13]:
# Training with early stopping
num_epochs = 16
target_minority_class = torch.sum(labels == 0)
real_data = features[labels == 1]

# Early stopping parameters
best_loss_d = float('inf')
patience = 1
trigger_times = 0

In [14]:
for epoch in range(num_epochs):
    current_minority_count = torch.sum(labels == 1)
    if current_minority_count >= target_minority_class:
        break

    for _ in range(5):
        z = torch.randn(real_data.size(0), latent_size)
        fake_data = generator(z)

        optimizer_d.zero_grad()
        d_real = discriminator(real_data)
        d_fake = discriminator(fake_data.detach())
        loss_d = -torch.mean(d_real) + torch.mean(d_fake)
        loss_d.backward()
        optimizer_d.step()

        # Clip weights
        for p in discriminator.parameters():
            p.data.clamp_(-0.01, 0.01)

    optimizer_g.zero_grad()
    fake_data = generator(torch.randn(real_data.size(0), latent_size))
    loss_g = -torch.mean(discriminator(fake_data))
    loss_g.backward()
    optimizer_g.step()

    # Update labels and features with generated samples
    labels = torch.cat((labels, torch.zeros(fake_data.size(0), dtype=torch.long)))
    features = torch.cat((features, fake_data))

    if loss_d.item() < best_loss_d:
        best_loss_d = loss_d.item()
        trigger_times = 0
    else:
        trigger_times += 1

    if trigger_times >= patience:
        print("Early stopping triggered")
        break

    if epoch % 1 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], Loss D: {loss_d.item()}, Loss G: {loss_g.item()}')


Epoch [0/16], Loss D: 1630.6500244140625, Loss G: 0.01010693795979023
Epoch [1/16], Loss D: 1207.72900390625, Loss G: 0.01008095033466816
Epoch [2/16], Loss D: 819.1281127929688, Loss G: 0.01005446445196867
Epoch [3/16], Loss D: 452.3153076171875, Loss G: 0.010031403973698616
Epoch [4/16], Loss D: 93.80571746826172, Loss G: 0.010013188235461712
Epoch [5/16], Loss D: -257.90582275390625, Loss G: 0.009990428574383259
Epoch [6/16], Loss D: -602.37890625, Loss G: 0.009963007643818855
Epoch [7/16], Loss D: -928.41748046875, Loss G: 0.009930303320288658
Epoch [8/16], Loss D: -1242.38818359375, Loss G: 0.009889287874102592
Epoch [9/16], Loss D: -1548.499267578125, Loss G: 0.009841403923928738
Epoch [10/16], Loss D: -1839.0980224609375, Loss G: 0.009804402478039265
Epoch [11/16], Loss D: -2121.008544921875, Loss G: 0.009762074798345566
Epoch [12/16], Loss D: -2396.053955078125, Loss G: 0.009723300114274025
Epoch [13/16], Loss D: -2664.0625, Loss G: 0.009691725485026836
Epoch [14/16], Loss D: -

In [15]:
# Generate enough samples to match the class distribution
num_generated_samples = target_minority_class - current_minority_count
generated_data = generator(torch.randn(num_generated_samples, latent_size))
y_generated = torch.ones(num_generated_samples, dtype=torch.long)

# Combine generated data with the original data
x_augmented = torch.cat([features, generated_data], dim=0)
y_augmented = torch.cat([labels, y_generated], dim=0)

In [16]:
# Print final statistics
final_mean, final_var, final_std = compute_statistics(x_augmented)
print(f"Final Mean: {final_mean}, Final Variance: {final_var}, Final Std Dev: {final_std}")

Final Mean: tensor([6.8371e-03, 9.4688e-01, 4.1321e+05, 4.2373e+05],
       grad_fn=<MeanBackward1>), Final Variance: tensor([1.6829e-02, 1.5026e+00, 4.3074e+12, 4.4195e+12],
       grad_fn=<VarBackward0>), Final Std Dev: tensor([1.2973e-01, 1.2258e+00, 2.0754e+06, 2.1023e+06],
       grad_fn=<StdBackward0>)


In [17]:
# Compute R-squared between real and generated data samples
def r_squared(real_data, generated_data):
    ss_res = torch.sum((real_data - generated_data) ** 2, dim=0)  # Residual sum of squares
    ss_tot = torch.sum((real_data - torch.mean(real_data, dim=0)) ** 2, dim=0)  # Total sum of squares
    r2 = 1 - (ss_res / ss_tot)
    return r2

In [18]:
real_data_sampled = real_data[:num_generated_samples]
generated_data_sampled = generated_data[:num_generated_samples]

In [19]:
r2_scores = r_squared(real_data_sampled, generated_data_sampled)
print(f"R-squared for each feature: {r2_scores}")
print(f"Mean R-squared: {r2_scores.mean()}")

RuntimeError: The size of tensor a (8213) must match the size of tensor b (6346194) at non-singleton dimension 0