# How to Use a Simple GAN (Generative Adversarial Network) in PyTorch

In [None]:
# Core Libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

# Data Handling & Preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Visualization & Debugging
import matplotlib.pyplot as plt
import seaborn as sns

## Data Preparation
The dataset I'm using is UNSW-NB15. The data is already split, so I don't need to split it manually. The dataset is divided into:
1. Training-set with CSV format.
2. Testing-set with CSV format.

Load the Dataset

In [None]:
# Read the two pre-split UNSW-NB15 partitions (training first, then testing)
df_train = pd.read_csv(filepath_or_buffer='UNSW_NB15_training-set.csv')
df_test = pd.read_csv(filepath_or_buffer='UNSW_NB15_testing-set.csv')

# Preview the first 5 rows of the training partition
df_train.head(n=5)

Check the class distribution and answer these questions:
1. What are the minority attack classes?
2. How severe is the imbalance?

In [None]:
# List every column available in the training set
print(df_train.columns)

# Count occurrences of each unique value in the attack_cat column
print(df_train.loc[:, "attack_cat"].value_counts())

In [None]:
# Keep only attack traffic by filtering out the rows labelled 'Normal'
df_train_attack = df_train.loc[df_train['attack_cat'].ne('Normal')]

# Show how many samples each remaining attack class has
print(df_train_attack.loc[:, "attack_cat"].value_counts())

Visualize the Class Imbalance

In [None]:
# Bar chart of the number of samples per attack class, most frequent first
plt.figure(figsize=(10, 5))
sns.countplot(
    data=df_train_attack,
    x='attack_cat',
    order=df_train_attack['attack_cat'].value_counts().index,
)
plt.xticks(rotation=45)
plt.title('Class Distribution of Attack Samples')
plt.show()

To measure how severe the imbalance is, I use the imbalance ratio (IR) and other statistical metrics.
The imbalance ratio (IR) tells us how imbalanced the dataset is. It’s calculated as:

IR = Majority class count / Minority class count

If IR > 1.5, the dataset is imbalanced.  
If IR > 10, the imbalance is severe.  
If any class makes up less than 5% of the dataset, that also indicates a significant imbalance.

In [None]:
# Per-class sample counts among the attack rows
class_counts = df_train_attack['attack_cat'].value_counts()
majority_class, minority_class = class_counts.max(), class_counts.min()

# IR = size of the largest class divided by size of the smallest class
imbalance_ratio = majority_class / minority_class
print(f"Imbalance Ratio (IR): {imbalance_ratio:.2f}\n")

# Share of the attack rows held by each class, expressed in percent
total_samples = df_train_attack.shape[0]
percentages = class_counts.div(total_samples).mul(100)
print(percentages)

The answers to the questions above are:
1. Four minority attack classes each make up **less than 5%** of the dataset: Worms, Shellcode, Backdoor, and Analysis.
2. The imbalance ratio is 307.69, which indicates the dataset is **highly imbalanced!**

Extract Minority Classes  
I focus on the attack types below **5%** (Analysis, Backdoor, Shellcode, Worms).

In [None]:
# Attack categories treated as the minority classes
minority_classes = ['Analysis', 'Backdoor', 'Shellcode', 'Worms']

# Keep only the rows whose category is one of the minority classes
minority_df = df_train_attack.loc[df_train_attack['attack_cat'].isin(minority_classes)]
print(minority_df.loc[:, 'attack_cat'].value_counts())

Normalize Features  
GANs work best when features are scaled between **0 and 1**

In [None]:
# Drop unnecessary columns first.
# df_train_attack was produced by boolean-filtering df_train, so mutating it
# in place (drop(inplace=True), column assignment) triggers pandas'
# SettingWithCopyWarning. Rebind the name to an independent copy instead so
# the later column assignment unambiguously modifies this frame.
df_train_attack = df_train_attack.drop(
    columns=['id', 'label', 'service', 'state', 'proto']
).copy()

# Select only numerical features ('attack_cat' holds string labels, so it is
# excluded here and remains available as the class column)
numerical_cols = df_train_attack.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Apply MinMaxScaler to rescale every numerical feature to the [0, 1] range.
# The fitted scaler is kept so generated samples can be mapped back later.
scaler = MinMaxScaler()
df_train_attack[numerical_cols] = scaler.fit_transform(df_train_attack[numerical_cols])

# Display the first 5 rows
print(f"Selected numerical columns: {numerical_cols}")  # Debugging step
df_train_attack.head()

## Define the GAN Architecture

GAN Architecture Overview  
A GAN consists of two networks:
1. **Generator (G)** --> Takes random noise and generates fake attack samples.
2. **Discriminator (D)** --> Determines whether a sample is real or fake.

Define the Generator  
The generator takes **random noise** as input and outputs a synthetic attack sample.

In [None]:
class Generator(nn.Module):
    """Map random noise vectors to synthetic feature rows.

    Args:
        input_dim: Size of the random-noise vector fed to the network.
        output_dim: Number of features in each generated sample.

    The output activation is a Sigmoid, so every generated value lies in
    [0, 1]. That is the range MinMaxScaler (default settings) gives the real
    training features. A Tanh head would emit values in [-1, 1]; negative
    outputs have no counterpart in the real data and map below each feature's
    observed minimum when passed through ``scaler.inverse_transform``.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),  # Fully connected layer
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),  # One output per dataset feature
            nn.Sigmoid(),  # Output values between 0 and 1, like the scaled data
        )

    def forward(self, z):
        """Return generated samples of shape (batch, output_dim) for noise ``z``."""
        return self.model(z)

**input_dim** --> Size of the random noise (e.g., 100).  
**output_dim** --> Number of features in your dataset.

Define the Discriminator  
The discriminator classifies data as **real (1)** or **fake (0)**.

In [None]:
class Discriminator(nn.Module):
    """Binary classifier scoring a feature row as real (1) or fake (0)."""

    def __init__(self, input_dim):
        super().__init__()
        # Hidden stack: Linear -> LeakyReLU(0.2) for each hidden width
        layers = []
        in_features = input_dim
        for out_features in (256, 128):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.LeakyReLU(0.2))
            in_features = out_features
        # Single-unit head squashed to a probability of the input being real
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        probability_real = self.model(x)
        return probability_real

Uses **LeakyReLU** to avoid dead neurons.  
Ends with **Sigmoid** to output probability.

Initialize Models & Optimizers

In [None]:
# Fix the RNG seed so that weight initialisation is reproducible
torch.manual_seed(42)

# Network dimensions
input_dim = 100                    # Length of the random-noise vector
output_dim = len(numerical_cols)   # One output per numerical feature column

# Build the generator first, then the discriminator
generator, discriminator = Generator(input_dim, output_dim), Discriminator(output_dim)

# Adam optimizers (same learning rate for both networks) and the BCE loss
lr = 2e-4
optimizer_G = optim.Adam(params=generator.parameters(), lr=lr)
optimizer_D = optim.Adam(params=discriminator.parameters(), lr=lr)
criterion = nn.BCELoss()

**criterion** helps measure how well the discriminator differentiates real vs. fake.  
**Adam optimizer** helps in faster convergence.

## Training the GAN

**Training Process:**
1. The **generator** tries to fool the discriminator.
2. The **discriminator** improves by correctly classifying real vs. fake samples.
3. Over time, the generator creates more realistic attack samples.

In [None]:
# Set device to GPU if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Training parameters
num_epochs = 10000  # Number of passes over each class's samples
batch_size = 64  # Number of real samples per discriminator/generator step

# Loss function: BCELoss holds no learnable state, so one instance serves every class
criterion = nn.BCELoss()

# Keep every trained generator, keyed by attack class. `generator` is rebound
# on each iteration of the loop below, so without this mapping only the model
# trained for the last entry of `minority_classes` would survive the loop.
trained_generators = {}

# Loop through each minority attack class and train a separate GAN for each
for attack_class in minority_classes:
    print(f"Training GAN for {attack_class}...")

    # Extract only samples of this attack class, created directly on `device`
    real_data_class = df_train_attack[df_train_attack['attack_cat'] == attack_class][numerical_cols].values
    real_data_class = torch.tensor(real_data_class, dtype=torch.float32, device=device)
    num_real = real_data_class.size(0)

    # Nothing to learn from: skip instead of running num_epochs empty epochs
    if num_real == 0:
        print(f"No samples found for {attack_class}; skipping.")
        continue

    # Initialize a new generator and discriminator for this class
    generator = Generator(input_dim, output_dim).to(device)
    discriminator = Discriminator(output_dim).to(device)

    # Optimizers
    optimizer_G = optim.Adam(generator.parameters(), lr=lr)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

    # Train the GAN for this specific attack class
    for epoch in range(num_epochs):
        # Shuffle data at the start of each epoch (permutation built on `device`)
        real_data_class = real_data_class[torch.randperm(num_real, device=device)]

        # Train the GAN in mini-batches (batch_size)
        for start in range(0, num_real, batch_size):
            batch_data = real_data_class[start:start + batch_size]
            current_batch = batch_data.size(0)  # The last batch may be smaller

            # Targets: 1 for real samples, 0 for generated samples
            real_labels = torch.ones(current_batch, 1, device=device)
            fake_labels = torch.zeros(current_batch, 1, device=device)

            ### Step 1: Train Discriminator ###
            optimizer_D.zero_grad()

            # Real samples
            loss_real = criterion(discriminator(batch_data), real_labels)

            # Fake samples
            noise = torch.randn(current_batch, input_dim, device=device)  # Generate random noise
            fake_data = generator(noise)  # Generate fake attack samples
            # Detach so this backward pass does not reach the generator's weights
            loss_fake = criterion(discriminator(fake_data.detach()), fake_labels)

            # Total Discriminator loss
            loss_D = loss_real + loss_fake
            loss_D.backward()
            optimizer_D.step()

            ### Step 2: Train Generator ###
            optimizer_G.zero_grad()

            # Want to fool the discriminator: score the fakes against "real" targets
            loss_G = criterion(discriminator(fake_data), real_labels)

            loss_G.backward()
            optimizer_G.step()

        print(f"Epoch [{epoch+1}/{num_epochs}] done for {attack_class}")

    # Preserve this class's trained generator before the next iteration rebinds it
    trained_generators[attack_class] = generator

    print(f"{attack_class} GAN training Done!")

**Step 1**: Train the **discriminator** on real and fake data.  
**Step 2**: Train the **generator** to fool the discriminator.  
Runs for **10,000 epochs** per attack class.

## Generate Synthetic Attack Samples

Generate New Attack Samples  
After training, use the generator to create synthetic attack data.

In [None]:
# Identify the size of the majority class
class_sizes = df_train_attack['attack_cat'].value_counts()
majority_class_size = class_sizes.max()

# Prefer a per-class generator when the notebook defines a `trained_generators`
# mapping (attack class -> trained Generator). Otherwise fall back to
# `generator`, which after the training loop is only the model trained for the
# LAST class, so every class would be sampled from that one model.
per_class_generators = globals().get('trained_generators', {})

# Loop through each minority attack class and generate synthetic samples
synthetic_dataframes = []

for attack_class in minority_classes:
    print(f"Generating synthetic samples for {attack_class}...")

    class_generator = per_class_generators.get(attack_class)
    if class_generator is None:
        print(f"WARNING: no dedicated generator for {attack_class}; "
              "using the most recently trained generator.")
        class_generator = generator

    # Number of synthetic samples needed to bring this class up to the majority size
    num_real = int(class_sizes.get(attack_class, 0))
    num_samples_needed = int(majority_class_size) - num_real
    if num_samples_needed <= 0:
        # torch.randn rejects negative sizes and the scaler rejects empty arrays
        print(f"{attack_class} already has {num_real} samples; nothing to generate.")
        continue

    # Create the noise on the generator's own device (it may live on the GPU),
    # skip autograd bookkeeping, and move the result to CPU before .numpy()
    gen_device = next(class_generator.parameters()).device
    with torch.no_grad():
        noise = torch.randn(num_samples_needed, input_dim, device=gen_device)
        synthetic_samples = class_generator(noise).cpu().numpy()

    # Convert back to original scale
    synthetic_samples = scaler.inverse_transform(synthetic_samples)

    # Create DataFrame for the synthetic samples and label them with the same attack class
    synthetic_df = pd.DataFrame(synthetic_samples, columns=numerical_cols)
    synthetic_df['attack_cat'] = attack_class  # Same name as the original attack class

    # Store for merging later
    synthetic_dataframes.append(synthetic_df)

# Combine all synthetic data into one DataFrame (pd.concat raises on an empty list)
if synthetic_dataframes:
    final_synthetic_df = pd.concat(synthetic_dataframes, ignore_index=True)
else:
    final_synthetic_df = pd.DataFrame(columns=numerical_cols + ['attack_cat'])

# Save the synthetic data to a CSV file
final_synthetic_df.to_csv('synthetic_attack_data.csv', index=False)
print("Synthetic attack samples for all minority classes saved!")

## Evaluate and Validate