## Part 1: Setup and GPU Configuration

In [None]:
# GPU Setup - MUST RUN FIRST
# The environment variable is assigned before `import torch` so it is already
# in place when PyTorch initialises CUDA; only the GPU with index 3 is exposed
# to this process.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler  # Use MinMaxScaler for [0,1] range
from sklearn.model_selection import train_test_split
from scipy import stats
from tqdm import tqdm
import json
import pickle
import time
from datetime import datetime
import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seeds
# Seeds NumPy and PyTorch (CPU and all CUDA devices) with the same value.
# NOTE(review): Python's `random` module and cuDNN determinism flags are not
# configured here, so GPU runs may still differ slightly between executions.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if torch.cuda.is_available():
    # Index 0 is the first *visible* device, i.e. the one selected through
    # CUDA_VISIBLE_DEVICES above.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Part 2: Feature Type Definitions (CRITICAL)

In [None]:
# ============================================================================
# FEATURE TYPE DEFINITIONS - Used for post-processing
# ============================================================================

# Integer features (counts - must be >= 0 and integer)
INTEGER_FEATURES = [
    'announcements', 'withdrawals', 'nlri_ann', 'dups',
    'origin_0', 'origin_2', 'origin_changes',
    'imp_wd', 'imp_wd_spath', 'imp_wd_dpath',
    'as_path_max', 'unique_as_path_max',
    'edit_distance_max',
    'edit_distance_dict_0', 'edit_distance_dict_1', 'edit_distance_dict_2',
    'edit_distance_dict_3', 'edit_distance_dict_4', 'edit_distance_dict_5',
    'edit_distance_dict_6',
    'edit_distance_unique_dict_0', 'edit_distance_unique_dict_1',
    'number_rare_ases',
    'nadas', 'flaps',
]

# Continuous features (can be float but must be >= 0)
CONTINUOUS_FEATURES = [
    'edit_distance_avg',
    'rare_ases_avg',
]

# All features, in the column order used for the model input.
ALL_FEATURES = [
    'announcements', 'withdrawals', 'nlri_ann', 'dups',
    'origin_0', 'origin_2', 'origin_changes',
    'imp_wd', 'imp_wd_spath', 'imp_wd_dpath',
    'as_path_max', 'unique_as_path_max',
    'edit_distance_avg', 'edit_distance_max',
    'edit_distance_dict_0', 'edit_distance_dict_1', 'edit_distance_dict_2',
    'edit_distance_dict_3', 'edit_distance_dict_4', 'edit_distance_dict_5',
    'edit_distance_dict_6',
    'edit_distance_unique_dict_0', 'edit_distance_unique_dict_1',
    'number_rare_ases', 'rare_ases_avg',
    'nadas', 'flaps',
]


def _check_feature_lists(integer_features, continuous_features, all_features):
    """Raise ValueError unless the typed lists exactly partition ``all_features``.

    The three lists are maintained by hand; a feature that is added to one list
    but not the others would be silently skipped (or wrongly rounded) during
    post-processing, so inconsistencies are rejected up front.
    """
    typed = list(integer_features) + list(continuous_features)
    if len(set(typed)) != len(typed):
        raise ValueError("A feature appears more than once in INTEGER_FEATURES/CONTINUOUS_FEATURES")
    if len(set(all_features)) != len(all_features):
        raise ValueError("ALL_FEATURES contains duplicate names")
    if set(typed) != set(all_features):
        mismatch = sorted(set(typed) ^ set(all_features))
        raise ValueError(f"Feature lists are inconsistent; mismatched names: {mismatch}")


_check_feature_lists(INTEGER_FEATURES, CONTINUOUS_FEATURES, ALL_FEATURES)

print(f"Integer features: {len(INTEGER_FEATURES)}")
print(f"Continuous features: {len(CONTINUOUS_FEATURES)}")
print(f"Total: {len(ALL_FEATURES)}")

In [None]:
# ============================================================================
# POST-PROCESSING FUNCTION (CRITICAL)
# ============================================================================

def post_process_synthetic_data(synthetic_data, feature_names, scaler, integer_features=None):
    """
    Post-process synthetic data:
    1. Inverse transform (denormalize)
    2. Clip negative values to 0
    3. Round integer features

    Args:
        synthetic_data: numpy array (n_samples, seq_len, n_features) - NORMALIZED
        feature_names: list of feature names, one per entry of the last axis
        scaler: fitted scaler exposing ``inverse_transform`` (e.g. MinMaxScaler)
        integer_features: optional iterable of feature names to round to whole
            numbers; defaults to the module-level ``INTEGER_FEATURES``

    Returns:
        processed_data: float numpy array of the same shape with all values
        clipped to >= 0 and integer features rounded. The input is not modified.

    Raises:
        ValueError: if ``synthetic_data`` is not 3-D or ``feature_names`` does
            not match the size of the feature axis.
    """
    synthetic_data = np.asarray(synthetic_data)
    if synthetic_data.ndim != 3:
        raise ValueError(
            f"synthetic_data must be 3-D (n_samples, seq_len, n_features), "
            f"got shape {synthetic_data.shape}"
        )
    n_samples, seq_len, n_features = synthetic_data.shape

    # A length mismatch would otherwise leave trailing columns unprocessed.
    if len(feature_names) != n_features:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but the data has "
            f"{n_features} features"
        )

    if integer_features is None:
        integer_features = INTEGER_FEATURES
    integer_names = set(integer_features)

    # Step 1: Flatten so each row is one timestep, as the scaler expects 2-D input
    flat_data = synthetic_data.reshape(-1, n_features)

    # Step 2: Inverse transform (denormalize)
    denorm_data = np.asarray(scaler.inverse_transform(flat_data), dtype=float)

    # Step 3a: Clip all features to >= 0 (all BGP features are non-negative).
    # np.maximum allocates a new array, so the caller's data is never touched.
    denorm_data = np.maximum(denorm_data, 0)

    # Step 3b: Round integer features (the array stays float-typed)
    int_cols = [i for i, feat in enumerate(feature_names) if feat in integer_names]
    if int_cols:
        denorm_data[:, int_cols] = np.round(denorm_data[:, int_cols])

    # Step 4: Reshape back
    return denorm_data.reshape(n_samples, seq_len, n_features)


def validate_synthetic_data(synthetic_data, feature_names, integer_features=None):
    """
    Validate that synthetic data meets constraints.

    A feature fails when it contains a negative value, a NaN/inf value, or -
    for integer features - a value that is not a whole number. One line is
    printed per failing feature, followed by a summary.

    Args:
        synthetic_data: array whose last axis has ``len(feature_names)`` entries
        feature_names: list of feature names, one per column
        integer_features: optional iterable of names that must be whole
            numbers; defaults to the module-level ``INTEGER_FEATURES``

    Returns:
        True if every feature passed, False otherwise.
    """
    if integer_features is None:
        integer_features = INTEGER_FEATURES
    integer_names = set(integer_features)

    flat = np.asarray(synthetic_data).reshape(-1, len(feature_names))

    print("\nValidation Report:")
    print("-" * 50)

    issues = []

    for i, feat in enumerate(feature_names):
        col = flat[:, i]
        finite = col[np.isfinite(col)]

        # NaN/inf would otherwise slip through: comparisons with NaN are False
        # and a NaN minimum hides genuinely negative entries.
        has_nonfinite = finite.size != col.size
        has_negative = bool((finite < 0).any())
        has_float = feat in integer_names and not np.allclose(finite, np.round(finite))

        if has_negative or has_float or has_nonfinite:
            issues.append(feat)
            min_val = finite.min() if finite.size else float('nan')
            max_val = finite.max() if finite.size else float('nan')
            print(f"✗ {feat}: min={min_val:.2f}, max={max_val:.2f}, "
                  f"neg={has_negative}, float={has_float}, nonfinite={has_nonfinite}")

    if not issues:
        print("✓ All features passed validation!")
    else:
        print(f"\n✗ {len(issues)} features have issues: {issues}")

    return not issues

print("Post-processing functions defined.")

## Part 3: Configuration

In [None]:
# ============================================================================
# CONFIGURATION FOR ANOMALY TRAFFIC GENERATION
# ============================================================================
# Module-level constants read by the data-loading, model and training cells.

# Paths - ANOMALY DATA
# Input CSV and the directory created below.
DATA_PATH = '/home/smotaali/BGP_Traffic_Generation/RIPE/RIPE_INCIDENTS/all_incidents_anomalies_reinforced_v2.csv'
OUTPUT_DIR = '/home/smotaali/BGP_Traffic_Generation/results/gan_anomaly_outputs/'

# Values of the `confidence_label` column that are kept when filtering rows.
# NOTE(review): despite the name, the list also contains 'medium_confidence'.
HIGH_CONFIDENCE_LABELS = ['medium_confidence', 'high_confidence', 'very_high_confidence']

# Sequence parameters
SEQ_LEN = 30  # Number of consecutive rows per sliding window
STRIDE = 1  # Step between the start rows of consecutive windows

# ============================================================================
# FIXED HYPERPARAMETERS (based on best practices)
# ============================================================================
BATCH_SIZE = 64  # Smaller batch for stability
EPOCHS = 150

# SEPARATE LEARNING RATES (key fix!)
LR_G = 0.0002  # Generator learning rate
LR_D = 0.0001  # Discriminator learning rate (LOWER to prevent D from dominating)

# Adam beta coefficients
BETA1 = 0.5
BETA2 = 0.999

# Model architecture
HIDDEN_DIM = 128
LATENT_DIM = 32
NUM_LAYERS = 2  # Reduced for stability

# Training stability
LABEL_SMOOTHING = 0.1  # Smooth labels: real=0.9, fake=0.1
NOISE_STD = 0.05  # Add noise to discriminator inputs
N_CRITIC = 1  # Train D only once per G update
CLIP_VALUE = 1.0  # Gradient clipping

# Data split
TEST_SIZE = 0.2  # Fraction of all sequences held out for testing
VAL_SIZE = 0.1  # Fraction of the remaining (non-test) sequences used for validation

# Generation
N_SYNTHETIC = 2000  # Number of synthetic sequences requested from each model

# Side effect: creates the output directory (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Configuration set for ANOMALY traffic generation.")
print(f"Data path: {DATA_PATH}")
print(f"Output dir: {OUTPUT_DIR}")
print(f"High confidence labels: {HIGH_CONFIDENCE_LABELS}")
print(f"LR_G: {LR_G}, LR_D: {LR_D}")
print(f"Label smoothing: {LABEL_SMOOTHING}")

## Part 4: Data Loading and Preprocessing

In [None]:
# Load data
print("Loading anomaly data...")
df = pd.read_csv(DATA_PATH)
print(f"Total rows: {len(df):,}")

# Check available columns
print(f"\nAvailable columns: {df.columns.tolist()}")

# The filter below needs `confidence_label`; fail with a clear message instead
# of a bare KeyError further down.
if 'confidence_label' not in df.columns:
    raise KeyError(f"Column 'confidence_label' not found in {DATA_PATH}")

# Check confidence_label distribution
print(f"\nConfidence labels distribution:")
print(df['confidence_label'].value_counts())

# Filter high confidence anomalies
df_anomaly = df[df['confidence_label'].isin(HIGH_CONFIDENCE_LABELS)].copy()
print(f"\nHigh confidence anomaly rows: {len(df_anomaly):,}")
if df_anomaly.empty:
    raise ValueError(f"No rows with confidence_label in {HIGH_CONFIDENCE_LABELS}")

# Select features (keeping the ALL_FEATURES order) and report any that the
# CSV does not provide, since they are dropped from the model input.
available_features = [f for f in ALL_FEATURES if f in df_anomaly.columns]
missing_features = [f for f in ALL_FEATURES if f not in df_anomaly.columns]
print(f"Available features: {len(available_features)}")
if missing_features:
    print(f"WARNING: {len(missing_features)} expected features are missing: {missing_features}")
if not available_features:
    raise ValueError("None of the expected feature columns are present in the data")

# Get feature indices for post-processing
integer_indices = [available_features.index(f) for f in INTEGER_FEATURES if f in available_features]
continuous_indices = [available_features.index(f) for f in CONTINUOUS_FEATURES if f in available_features]

# Extract data (missing values are replaced by 0)
data = df_anomaly[available_features].fillna(0).values
NUM_FEATURES = len(available_features)
print(f"Data shape: {data.shape}")

In [None]:
# Print min / max / mean of the raw (un-normalized) values for the first ten
# feature columns.
print("\nOriginal Anomaly Data Statistics (BEFORE normalization):")
print("-" * 60)
for col_idx, col_name in enumerate(available_features[:10]):  # Show first 10
    column = data[:, col_idx]
    print(f"{col_name}: min={column.min():.2f}, max={column.max():.2f}, "
          f"mean={column.mean():.2f}")

In [None]:
# Scale every feature column into [0, 1] with a MinMaxScaler so that the
# generators can end in a Sigmoid. The fitted scaler is kept for the inverse
# transform during post-processing.
print("\nNormalizing with MinMaxScaler [0, 1]...")
scaler = MinMaxScaler(feature_range=(0, 1))
data_norm = scaler.fit(data).transform(data)

print(f"Normalized range: [{data_norm.min():.4f}, {data_norm.max():.4f}]")
print(f"Mean: {data_norm.mean():.4f}")

In [None]:
# Create sequences: sliding windows of SEQ_LEN consecutive rows, STRIDE apart
print(f"\nCreating sequences (T={SEQ_LEN}, stride={STRIDE})...")
if len(data_norm) < SEQ_LEN:
    raise ValueError(
        f"Need at least SEQ_LEN={SEQ_LEN} rows to build one sequence, "
        f"got {len(data_norm)}"
    )
sequences = np.stack([
    data_norm[start:start + SEQ_LEN]
    for start in range(0, len(data_norm) - SEQ_LEN + 1, STRIDE)
])
print(f"Sequences: {sequences.shape}")

# Split
# NOTE(review): windows overlap (stride < SEQ_LEN) and are split at random, so
# train/val/test windows share rows; the scaler is also fitted on all rows.
# Confirm this is acceptable for the intended evaluation.
X_train_val, X_test = train_test_split(sequences, test_size=TEST_SIZE, random_state=SEED)
X_train, X_val = train_test_split(X_train_val, test_size=VAL_SIZE, random_state=SEED)

print(f"Train: {X_train.shape[0]:,}")
print(f"Val: {X_val.shape[0]:,}")
print(f"Test: {X_test.shape[0]:,}")

# With drop_last=True a training set smaller than one batch yields zero
# batches, which would make every training loop a silent no-op.
if X_train.shape[0] < BATCH_SIZE:
    raise ValueError(
        f"Training set has {X_train.shape[0]} sequences, fewer than "
        f"BATCH_SIZE={BATCH_SIZE}; no batches would be produced"
    )

# DataLoaders (the whole training set is kept on the target device)
train_tensor = torch.FloatTensor(X_train).to(device)
train_loader = DataLoader(TensorDataset(train_tensor), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
print(f"Batches: {len(train_loader)}")

In [None]:
# ============================================================================
# FIXED LSTM-GAN with Sigmoid output and stability improvements
# ============================================================================

class LSTM_Generator(nn.Module):
    """Map a noise sequence to a feature sequence with values in [0, 1].

    An LSTM processes the noise, then a per-timestep MLP head
    (Linear -> LeakyReLU -> BatchNorm1d -> Linear -> Sigmoid) produces the
    output features.
    """

    def __init__(self, latent_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=latent_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2,
        )
        head_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid(),  # OUTPUT IN [0, 1] - matches MinMaxScaler
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, z):
        """Return a (batch, seq_len, output_dim) tensor for noise ``z`` of
        shape (batch, seq_len, latent_dim)."""
        n_seqs, n_steps, _ = z.shape
        states, _ = self.lstm(z)
        # BatchNorm1d expects 2-D input, so fold time into the batch axis.
        per_step = states.reshape(-1, states.shape[-1])
        return self.fc(per_step).reshape(n_seqs, n_steps, -1)


class LSTM_Discriminator(nn.Module):
    """Score a feature sequence with a single real/fake logit.

    The LSTM state at the final timestep is passed through a small MLP. No
    sigmoid is applied, so the output is meant for BCEWithLogitsLoss.
    """

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        half_dim = hidden_dim // 2
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2,
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, half_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.4),  # More dropout
            nn.Linear(half_dim, 1),
        )

    def forward(self, x):
        """Return logits of shape (batch, 1) for ``x`` of shape
        (batch, seq_len, input_dim)."""
        states = self.lstm(x)[0]
        last_state = states[:, -1]
        return self.fc(last_state)


class FixedLSTMGAN:
    """LSTM generator/discriminator pair with a stabilised training loop.

    Attributes:
        G: ``LSTM_Generator`` mapping noise to feature sequences in [0, 1].
        D: ``LSTM_Discriminator`` returning one logit per sequence.
        criterion: ``BCEWithLogitsLoss`` shared by both networks.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim, num_layers, seq_len, device):
        self.device = device
        self.latent_dim = latent_dim
        self.seq_len = seq_len

        self.G = LSTM_Generator(latent_dim, hidden_dim, input_dim, num_layers).to(device)
        self.D = LSTM_Discriminator(input_dim, hidden_dim, num_layers).to(device)
        self.criterion = nn.BCEWithLogitsLoss()

    def _sample_noise(self, n_samples):
        """Draw standard-normal generator input directly on the target device."""
        return torch.randn(n_samples, self.seq_len, self.latent_dim, device=self.device)

    def train(self, train_loader, epochs, lr_g=0.0002, lr_d=0.0001,
              label_smoothing=0.1, noise_std=0.05):
        """
        Train with stability improvements:
        - Separate learning rates
        - Label smoothing
        - Noise injection to D
        - Gradient clipping

        Args:
            train_loader: iterable of batches whose first element is a
                (batch, seq_len, n_features) tensor with values in [0, 1]
            epochs: number of passes over ``train_loader``
            lr_g, lr_d: Adam learning rates for generator / discriminator
            label_smoothing: real targets become 1 - s, fake targets become s
            noise_std: std of Gaussian noise added to discriminator inputs

        Returns:
            dict with per-epoch means under 'g_loss', 'd_loss', 'd_real', 'd_fake'.

        Raises:
            ValueError: if ``train_loader`` yields no batches.
        """
        # generate() switches G to eval mode; make sure dropout and batch norm
        # are active again if training is (re)started afterwards.
        self.G.train()
        self.D.train()

        opt_g = optim.Adam(self.G.parameters(), lr=lr_g, betas=(BETA1, BETA2))
        opt_d = optim.Adam(self.D.parameters(), lr=lr_d, betas=(BETA1, BETA2))

        # Learning rate schedulers
        scheduler_g = optim.lr_scheduler.StepLR(opt_g, step_size=50, gamma=0.9)
        scheduler_d = optim.lr_scheduler.StepLR(opt_d, step_size=50, gamma=0.9)

        history = {'g_loss': [], 'd_loss': [], 'd_real': [], 'd_fake': []}
        start = time.time()

        # Smoothed labels
        real_label = 1.0 - label_smoothing  # 0.9
        fake_label = label_smoothing  # 0.1

        for epoch in range(epochs):
            g_losses, d_losses = [], []
            d_reals, d_fakes = [], []

            for batch in train_loader:
                # No-op when the loader already yields tensors on the device.
                real = batch[0].to(self.device)
                bs = real.size(0)

                # Smoothed labels
                real_labels = torch.full((bs, 1), real_label, device=self.device)
                fake_labels = torch.full((bs, 1), fake_label, device=self.device)

                # =====================
                # Train Discriminator
                # =====================
                opt_d.zero_grad()

                # Add noise to real data (instance noise)
                real_noisy = real + noise_std * torch.randn_like(real)
                real_noisy = torch.clamp(real_noisy, 0, 1)  # Keep in [0,1]

                d_real_out = self.D(real_noisy)
                loss_real = self.criterion(d_real_out, real_labels)

                # Generate fake; G is not updated in this step, so no graph is needed
                with torch.no_grad():
                    fake = self.G(self._sample_noise(bs))

                # Add noise to fake
                fake_noisy = fake + noise_std * torch.randn_like(fake)
                fake_noisy = torch.clamp(fake_noisy, 0, 1)

                d_fake_out = self.D(fake_noisy)
                loss_fake = self.criterion(d_fake_out, fake_labels)

                d_loss = loss_real + loss_fake
                d_loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(self.D.parameters(), CLIP_VALUE)
                opt_d.step()

                # =====================
                # Train Generator (train more when D is strong)
                # =====================
                n_g_steps = 2 if d_loss.item() < 0.5 else 1  # Train G more when D is too good

                for _ in range(n_g_steps):
                    opt_g.zero_grad()
                    fake = self.G(self._sample_noise(bs))
                    d_fake_out = self.D(fake)

                    # Generator wants D to output real_label
                    g_loss = self.criterion(d_fake_out, real_labels)

                    # Feature matching loss (stabilizes training)
                    real_mean = real.mean(dim=[0, 1])
                    fake_mean = fake.mean(dim=[0, 1])
                    fm_loss = torch.mean((real_mean - fake_mean) ** 2)

                    total_g_loss = g_loss + 0.1 * fm_loss
                    total_g_loss.backward()

                    torch.nn.utils.clip_grad_norm_(self.G.parameters(), CLIP_VALUE)
                    opt_g.step()

                # Logged values come from the last generator step of the batch.
                g_losses.append(g_loss.item())
                d_losses.append(d_loss.item())
                d_reals.append(torch.sigmoid(d_real_out).mean().item())
                d_fakes.append(torch.sigmoid(d_fake_out).mean().item())

            if not d_losses:
                raise ValueError("train_loader yielded no batches; nothing to train on")

            scheduler_g.step()
            scheduler_d.step()

            history['g_loss'].append(np.mean(g_losses))
            history['d_loss'].append(np.mean(d_losses))
            history['d_real'].append(np.mean(d_reals))
            history['d_fake'].append(np.mean(d_fakes))

            if (epoch + 1) % 20 == 0:
                print(f"  Epoch {epoch+1:3d}/{epochs} | D: {history['d_loss'][-1]:.4f} | "
                      f"G: {history['g_loss'][-1]:.4f} | D(real): {history['d_real'][-1]:.2f} | "
                      f"D(fake): {history['d_fake'][-1]:.2f}")

        print(f"  Training time: {(time.time()-start)/60:.1f} min")
        return history

    def generate(self, n_samples):
        """Return ``n_samples`` generated sequences as a NumPy array of shape
        (n_samples, seq_len, n_features). Leaves G in eval mode."""
        self.G.eval()
        with torch.no_grad():
            fake = self.G(self._sample_noise(n_samples))
        return fake.cpu().numpy()

print("Fixed LSTM-GAN defined.")

In [None]:
# ============================================================================
# FIXED TimeGAN
# ============================================================================

class TimeGAN_Embedder(nn.Module):
    """Encode feature sequences into a latent sequence with values in (0, 1)
    using a GRU followed by a sigmoid-activated linear layer."""

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.rnn = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        """Return the per-timestep embedding of ``x``."""
        states = self.rnn(x)[0]
        return self.fc(states).sigmoid()


class TimeGAN_Recovery(nn.Module):
    """Decode a latent sequence back into feature space; a GRU is followed by
    a Linear + Sigmoid head so the output lies in [0, 1]."""

    def __init__(self, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.rnn = nn.GRU(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        projection = nn.Linear(hidden_dim, output_dim)
        self.fc = nn.Sequential(projection, nn.Sigmoid())  # Output in [0, 1]

    def forward(self, h):
        """Return the reconstructed feature sequence for latent input ``h``."""
        states = self.rnn(h)[0]
        return self.fc(states)


class TimeGAN_Generator(nn.Module):
    """Turn a noise sequence into a latent-space sequence with values in
    (0, 1) via a GRU and a sigmoid-activated linear layer."""

    def __init__(self, latent_dim, hidden_dim, num_layers):
        super().__init__()
        self.rnn = nn.GRU(
            input_size=latent_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, z):
        """Return the generated latent sequence for noise ``z``."""
        states = self.rnn(z)[0]
        return self.fc(states).sigmoid()


class TimeGAN_Supervisor(nn.Module):
    """Latent-to-latent GRU with one layer fewer than ``num_layers`` (but at
    least one), followed by a sigmoid-activated linear layer."""

    def __init__(self, hidden_dim, num_layers):
        super().__init__()
        supervisor_layers = max(1, num_layers - 1)
        self.rnn = nn.GRU(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=supervisor_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, h):
        """Return the supervised latent sequence for latent input ``h``."""
        states = self.rnn(h)[0]
        return self.fc(states).sigmoid()


class TimeGAN_Discriminator(nn.Module):
    """Produce one real/fake logit per timestep of a latent sequence (GRU
    followed by a linear layer, no sigmoid)."""

    def __init__(self, hidden_dim, num_layers):
        super().__init__()
        self.rnn = nn.GRU(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, h):
        """Return logits of shape (batch, seq_len, 1) for latent input ``h``."""
        states = self.rnn(h)[0]
        return self.fc(states)


class FixedTimeGAN:
    """TimeGAN-style model: embedder/recovery autoencoder, latent generator,
    supervisor and latent-space discriminator, trained in three phases."""

    def __init__(self, input_dim, hidden_dim, latent_dim, num_layers, device):
        self.device = device
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim

        self.embedder = TimeGAN_Embedder(input_dim, hidden_dim, num_layers).to(device)
        self.recovery = TimeGAN_Recovery(hidden_dim, input_dim, num_layers).to(device)
        self.generator = TimeGAN_Generator(latent_dim, hidden_dim, num_layers).to(device)
        self.supervisor = TimeGAN_Supervisor(hidden_dim, num_layers).to(device)
        self.discriminator = TimeGAN_Discriminator(hidden_dim, num_layers).to(device)

        self.mse = nn.MSELoss()
        self.bce = nn.BCEWithLogitsLoss()

    @staticmethod
    def _epoch_mean(values):
        """Mean of the per-batch losses; rejects an epoch without batches
        instead of silently recording NaN."""
        if not values:
            raise ValueError("train_loader yielded no batches; nothing to train on")
        return np.mean(values)

    def train(self, train_loader, epochs, lr_g=0.0002, lr_d=0.0001, label_smoothing=0.1):
        """Run the three training phases, each for ``epochs // 3`` epochs.

        Phase 1 fits embedder + recovery as an autoencoder, phase 2 fits the
        supervisor to predict the next latent step, phase 3 trains generator +
        supervisor against the discriminator.

        Args:
            train_loader: iterable of batches whose first element is a
                (batch, seq_len, n_features) tensor
            epochs: total epoch budget, split evenly across the phases
            lr_g: learning rate for embedder/recovery, supervisor and generator
            lr_d: learning rate for the discriminator
            label_smoothing: real targets become 1 - s, fake targets become s

        Returns:
            dict with per-epoch mean losses under 'e_loss', 's_loss',
            'g_loss' and 'd_loss'.

        Raises:
            ValueError: if ``train_loader`` yields no batches.
        """
        # generate() puts some sub-networks in eval mode; restore train mode.
        for module in (self.embedder, self.recovery, self.generator,
                       self.supervisor, self.discriminator):
            module.train()

        # Optimizers
        opt_e = optim.Adam(list(self.embedder.parameters()) + list(self.recovery.parameters()), lr=lr_g)
        opt_s = optim.Adam(self.supervisor.parameters(), lr=lr_g)
        opt_g = optim.Adam(list(self.generator.parameters()) + list(self.supervisor.parameters()), lr=lr_g)
        opt_d = optim.Adam(self.discriminator.parameters(), lr=lr_d)  # Slower D

        history = {'e_loss': [], 's_loss': [], 'g_loss': [], 'd_loss': []}
        start = time.time()

        real_label = 1.0 - label_smoothing
        fake_label = label_smoothing
        phase_epochs = epochs // 3

        # Phase 1: Embedding
        print("  Phase 1: Embedding...")
        for epoch in range(phase_epochs):
            e_losses = []
            for batch in train_loader:
                x = batch[0].to(self.device)
                opt_e.zero_grad()
                h = self.embedder(x)
                x_tilde = self.recovery(h)
                loss = self.mse(x, x_tilde)
                loss.backward()
                opt_e.step()
                e_losses.append(loss.item())
            history['e_loss'].append(self._epoch_mean(e_losses))
            if (epoch + 1) % 15 == 0:
                print(f"    Epoch {epoch+1}/{epochs//3} | E_loss: {history['e_loss'][-1]:.4f}")

        # Phase 2: Supervised
        print("  Phase 2: Supervised...")
        for epoch in range(phase_epochs):
            s_losses = []
            for batch in train_loader:
                x = batch[0].to(self.device)
                opt_s.zero_grad()
                # Only the supervisor is updated here, so the embedding is
                # computed without building an autograd graph.
                with torch.no_grad():
                    h = self.embedder(x)
                h_s = self.supervisor(h)
                loss = self.mse(h[:, 1:, :], h_s[:, :-1, :])
                loss.backward()
                opt_s.step()
                s_losses.append(loss.item())
            history['s_loss'].append(self._epoch_mean(s_losses))
            if (epoch + 1) % 15 == 0:
                print(f"    Epoch {epoch+1}/{epochs//3} | S_loss: {history['s_loss'][-1]:.4f}")

        # Phase 3: Joint
        print("  Phase 3: Joint...")
        for epoch in range(phase_epochs):
            g_losses, d_losses = [], []
            for batch in train_loader:
                x = batch[0].to(self.device)
                bs, seq_len, _ = x.shape
                z = torch.randn(bs, seq_len, self.latent_dim, device=self.device)

                real_labels = torch.full((bs, seq_len, 1), real_label, device=self.device)
                fake_labels = torch.full((bs, seq_len, 1), fake_label, device=self.device)

                # The embedder is not updated in this phase, so the real
                # embedding is computed once per batch and reused below.
                with torch.no_grad():
                    h = self.embedder(x)

                # Generator (train 2x when D is strong)
                for _ in range(2):
                    opt_g.zero_grad()
                    h_s = self.supervisor(h)
                    e_hat = self.generator(z)
                    h_hat = self.supervisor(e_hat)
                    x_hat = self.recovery(h_hat)

                    y_fake = self.discriminator(h_hat)
                    g_loss = self.bce(y_fake, real_labels)
                    g_loss = g_loss + self.mse(h[:, 1:, :], h_s[:, :-1, :]) * 10

                    # Feature matching
                    g_loss = g_loss + torch.mean((x_hat.mean(dim=0) - x.mean(dim=0)) ** 2) * 100

                    g_loss.backward()
                    # opt_g updates generator AND supervisor, so both are clipped.
                    torch.nn.utils.clip_grad_norm_(self.generator.parameters(), CLIP_VALUE)
                    torch.nn.utils.clip_grad_norm_(self.supervisor.parameters(), CLIP_VALUE)
                    opt_g.step()

                # Discriminator
                opt_d.zero_grad()
                with torch.no_grad():
                    h_hat = self.supervisor(self.generator(z))

                y_real = self.discriminator(h)
                y_fake = self.discriminator(h_hat)

                d_loss = self.bce(y_real, real_labels) + self.bce(y_fake, fake_labels)

                # Only update D if not too strong
                if d_loss.item() > 0.3:
                    d_loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.discriminator.parameters(), CLIP_VALUE)
                    opt_d.step()

                g_losses.append(g_loss.item())
                d_losses.append(d_loss.item())

            history['g_loss'].append(self._epoch_mean(g_losses))
            history['d_loss'].append(self._epoch_mean(d_losses))

            if (epoch + 1) % 15 == 0:
                print(f"    Epoch {epoch+1}/{epochs//3} | D: {history['d_loss'][-1]:.4f} | "
                      f"G: {history['g_loss'][-1]:.4f}")

        print(f"  Training time: {(time.time()-start)/60:.1f} min")
        return history

    def generate(self, n_samples, seq_len):
        """Return ``n_samples`` sequences of length ``seq_len`` as a NumPy
        array, produced by generator -> supervisor -> recovery. Leaves those
        three sub-networks in eval mode."""
        self.generator.eval()
        self.supervisor.eval()
        self.recovery.eval()
        with torch.no_grad():
            z = torch.randn(n_samples, seq_len, self.latent_dim, device=self.device)
            e_hat = self.generator(z)
            h_hat = self.supervisor(e_hat)
            x_hat = self.recovery(h_hat)
        return x_hat.cpu().numpy()

print("Fixed TimeGAN defined.")

In [None]:
# ============================================================================
# FIXED DoppelGANger
# ============================================================================

class DG_Generator(nn.Module):
    """Map a noise sequence to a feature sequence in [0, 1] with an LSTM and a
    per-timestep Linear -> LeakyReLU -> Linear -> Sigmoid head."""

    def __init__(self, latent_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=latent_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2,
        )
        head_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid(),  # Output in [0, 1]
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, z):
        """Return the generated sequence for noise ``z``."""
        states = self.lstm(z)[0]
        return self.fc(states)


class DG_Discriminator(nn.Module):
    """Score a feature sequence with one real/fake logit using a
    bidirectional LSTM followed by a small MLP (no sigmoid)."""

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        """Return logits of shape (batch, 1) for ``x`` of shape
        (batch, seq_len, input_dim)."""
        out, _ = self.lstm(x)
        hidden = self.lstm.hidden_size
        # The last feature axis is [forward | backward]. The forward direction
        # has read the whole sequence at t = -1, but the backward direction has
        # only done so at t = 0; at t = -1 it has seen a single step.
        forward_summary = out[:, -1, :hidden]
        backward_summary = out[:, 0, hidden:]
        return self.fc(torch.cat([forward_summary, backward_summary], dim=-1))


class FixedDoppelGANger:
    """Generator/discriminator pair (``DG_Generator`` / ``DG_Discriminator``)
    with a label-smoothed, gradient-clipped adversarial training loop."""

    def __init__(self, input_dim, hidden_dim, latent_dim, num_layers, seq_len, device):
        self.device = device
        self.latent_dim = latent_dim
        self.seq_len = seq_len

        self.G = DG_Generator(latent_dim, hidden_dim, input_dim, num_layers).to(device)
        self.D = DG_Discriminator(input_dim, hidden_dim, num_layers).to(device)
        self.criterion = nn.BCEWithLogitsLoss()

    def _sample_noise(self, n_samples):
        """Draw standard-normal generator input directly on the target device."""
        return torch.randn(n_samples, self.seq_len, self.latent_dim, device=self.device)

    def train(self, train_loader, epochs, lr_g=0.0002, lr_d=0.0001, label_smoothing=0.1):
        """Train G and D for ``epochs`` passes over ``train_loader``.

        Args:
            train_loader: iterable of batches whose first element is a
                (batch, seq_len, n_features) tensor
            epochs: number of passes over the loader
            lr_g, lr_d: Adam learning rates for generator / discriminator
            label_smoothing: real targets become 1 - s, fake targets become s

        Returns:
            dict with per-epoch mean losses under 'g_loss' and 'd_loss'.

        Raises:
            ValueError: if ``train_loader`` yields no batches.
        """
        # generate() switches G to eval mode; re-enable dropout for training.
        self.G.train()
        self.D.train()

        opt_g = optim.Adam(self.G.parameters(), lr=lr_g, betas=(BETA1, BETA2))
        opt_d = optim.Adam(self.D.parameters(), lr=lr_d, betas=(BETA1, BETA2))

        history = {'g_loss': [], 'd_loss': []}
        start = time.time()

        real_label = 1.0 - label_smoothing
        fake_label = label_smoothing

        for epoch in range(epochs):
            g_losses, d_losses = [], []

            for batch in train_loader:
                # No-op when the loader already yields tensors on the device.
                real = batch[0].to(self.device)
                bs = real.size(0)

                real_labels = torch.full((bs, 1), real_label, device=self.device)
                fake_labels = torch.full((bs, 1), fake_label, device=self.device)

                # Train D
                opt_d.zero_grad()
                d_real = self.D(real)
                loss_real = self.criterion(d_real, real_labels)

                # G is not updated in this step, so no graph is needed for it.
                with torch.no_grad():
                    fake = self.G(self._sample_noise(bs))
                d_fake = self.D(fake)
                loss_fake = self.criterion(d_fake, fake_labels)

                d_loss = loss_real + loss_fake
                d_loss_value = d_loss.item()

                # Only update if not too strong
                if d_loss_value > 0.3:
                    d_loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.D.parameters(), CLIP_VALUE)
                    opt_d.step()

                # Train G (more times when D is strong). The baseline is one
                # generator step per batch, matching FixedLSTMGAN.
                n_g = 2 if d_loss_value < 0.5 else 1
                for _ in range(n_g):
                    opt_g.zero_grad()
                    fake = self.G(self._sample_noise(bs))
                    d_fake = self.D(fake)
                    g_loss = self.criterion(d_fake, real_labels)

                    # Feature matching
                    g_loss = g_loss + torch.mean((fake.mean(dim=0) - real.mean(dim=0)) ** 2) * 10

                    g_loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.G.parameters(), CLIP_VALUE)
                    opt_g.step()

                g_losses.append(g_loss.item())
                d_losses.append(d_loss_value)

            if not d_losses:
                raise ValueError("train_loader yielded no batches; nothing to train on")

            history['g_loss'].append(np.mean(g_losses))
            history['d_loss'].append(np.mean(d_losses))

            if (epoch + 1) % 20 == 0:
                print(f"  Epoch {epoch+1:3d}/{epochs} | D: {history['d_loss'][-1]:.4f} | "
                      f"G: {history['g_loss'][-1]:.4f}")

        print(f"  Training time: {(time.time()-start)/60:.1f} min")
        return history

    def generate(self, n_samples):
        """Return ``n_samples`` generated sequences as a NumPy array of shape
        (n_samples, seq_len, n_features). Leaves G in eval mode."""
        self.G.eval()
        with torch.no_grad():
            fake = self.G(self._sample_noise(n_samples))
        return fake.cpu().numpy()

print("Fixed DoppelGANger defined.")

## Part 6: Train Models

In [None]:
# Result containers, keyed by model name.
models, histories = {}, {}
synthetic_raw = {}  # Normalized synthetic data
synthetic_processed = {}  # Post-processed synthetic data

# Banner followed by the hyperparameters shared by all three models.
print("\n".join([
    "=" * 60,
    "TRAINING WITH FIXED HYPERPARAMETERS - ANOMALY DATA",
    "=" * 60,
]))
print(f"LR_G: {LR_G}, LR_D: {LR_D}")
print(f"Label smoothing: {LABEL_SMOOTHING}")
print(f"Noise std: {NOISE_STD}")

# Start of the wall-clock timer for the whole training section.
total_start = time.time()

In [None]:
# Train LSTM-GAN
print(f"\n{'=' * 60}")
print("Training Fixed LSTM-GAN on Anomaly Data")
print("=" * 60)

lstmgan = FixedLSTMGAN(
    input_dim=NUM_FEATURES,
    hidden_dim=HIDDEN_DIM,
    latent_dim=LATENT_DIM,
    num_layers=NUM_LAYERS,
    seq_len=SEQ_LEN,
    device=device,
)
histories['LSTM-GAN'] = lstmgan.train(
    train_loader=train_loader,
    epochs=EPOCHS,
    lr_g=LR_G,
    lr_d=LR_D,
    label_smoothing=LABEL_SMOOTHING,
    noise_std=NOISE_STD,
)
models['LSTM-GAN'] = lstmgan

# Sample normalized sequences from the trained generator and report their range
synthetic_raw['LSTM-GAN'] = lstmgan.generate(n_samples=N_SYNTHETIC)
print(f"Generated (normalized): {synthetic_raw['LSTM-GAN'].shape}")
print(f"Range: [{synthetic_raw['LSTM-GAN'].min():.4f}, {synthetic_raw['LSTM-GAN'].max():.4f}]")

In [None]:
# Train TimeGAN
print(f"\n{'=' * 60}")
print("Training Fixed TimeGAN on Anomaly Data")
print("=" * 60)

timegan = FixedTimeGAN(
    input_dim=NUM_FEATURES,
    hidden_dim=HIDDEN_DIM,
    latent_dim=LATENT_DIM,
    num_layers=NUM_LAYERS,
    device=device,
)
histories['TimeGAN'] = timegan.train(
    train_loader=train_loader,
    epochs=EPOCHS,
    lr_g=LR_G,
    lr_d=LR_D,
    label_smoothing=LABEL_SMOOTHING,
)
models['TimeGAN'] = timegan

# Sample normalized sequences of length SEQ_LEN from the trained model
synthetic_raw['TimeGAN'] = timegan.generate(n_samples=N_SYNTHETIC, seq_len=SEQ_LEN)
print(f"Generated (normalized): {synthetic_raw['TimeGAN'].shape}")

In [None]:
# Train DoppelGANger
print(f"\n{'=' * 60}")
print("Training Fixed DoppelGANger on Anomaly Data")
print("=" * 60)

doppelganger = FixedDoppelGANger(
    input_dim=NUM_FEATURES,
    hidden_dim=HIDDEN_DIM,
    latent_dim=LATENT_DIM,
    num_layers=NUM_LAYERS,
    seq_len=SEQ_LEN,
    device=device,
)
histories['DoppelGANger'] = doppelganger.train(
    train_loader=train_loader,
    epochs=EPOCHS,
    lr_g=LR_G,
    lr_d=LR_D,
    label_smoothing=LABEL_SMOOTHING,
)
models['DoppelGANger'] = doppelganger

# Sample normalized sequences from the trained generator
synthetic_raw['DoppelGANger'] = doppelganger.generate(n_samples=N_SYNTHETIC)
print(f"Generated (normalized): {synthetic_raw['DoppelGANger'].shape}")

# Wall-clock time since the timer started in the training setup cell
print(f"\nTotal time: {(time.time()-total_start)/60:.1f} min")

## Part 7: Post-Processing (CRITICAL)

In [None]:
# Denormalize, clip and round the raw output of every trained model, store
# the result per model and print a validation report for it.
print("=" * 60)
print("POST-PROCESSING SYNTHETIC ANOMALY DATA")
print("=" * 60)
print("Steps: Denormalize → Clip negatives → Round integers")

for name in synthetic_raw:
    raw_data = synthetic_raw[name]
    print(f"\n--- {name} ---")
    print(f"Before: range=[{raw_data.min():.4f}, {raw_data.max():.4f}]")

    # Back to original feature units, with constraints applied
    processed = post_process_synthetic_data(raw_data, available_features, scaler)
    synthetic_processed[name] = processed

    print(f"After: range=[{processed.min():.2f}, {processed.max():.2f}]")

    # Report any feature that still violates its constraints
    validate_synthetic_data(processed, available_features)

In [None]:
# Show sample of processed data
print("\nSample of processed anomaly data (first 5 features, first timestep):")
# The loop variable is deliberately not called `data`: that name holds the
# real (un-normalized) feature matrix at notebook scope and must not be
# overwritten by the last model's synthetic output.
for name, processed_sequences in synthetic_processed.items():
    print(f"\n{name}:")
    sample = processed_sequences[0, 0, :5]  # First sequence, first timestep, first 5 features
    for feat_name, val in zip(available_features, sample):
        print(f"  {feat_name}: {val}")

## Part 8: Fixed Evaluation Metrics

In [None]:
def calculate_metrics(real_data, syn_data, feature_names):
    """
    Calculate evaluation metrics with proper NaN handling.

    Both arrays are flattened to ``(-1, len(feature_names))`` so that column
    ``i`` is scored under ``feature_names[i]``.

    Args:
        real_data: Real samples; an array whose size is a multiple of
            ``len(feature_names)``.
        syn_data: Synthetic samples, laid out like ``real_data``.
        feature_names: Sequence of feature names, one per column.

    Returns:
        dict with:
            'per_feature': ``{feature: {'real_mean', 'syn_mean', 'real_std',
                'syn_std', 'ks', 'mae'}}`` for every scored feature.
            'ks_stats', 'mae_values', 'wasserstein': lists of the non-NaN
                per-feature values.
            'ks_mean', 'mae_mean', 'wasserstein_mean': means of those lists,
                NaN when a list is empty.
            'corr_error': mean absolute difference between the two feature
                correlation matrices (undefined correlations count as 0).
            'overall_score': weighted blend of the aggregates; NaN when any
                of the three per-feature means is NaN.
    """
    n_features = len(feature_names)
    real_flat = real_data.reshape(-1, n_features)
    syn_flat = syn_data.reshape(-1, n_features)

    metrics = {
        'per_feature': {},
        'ks_stats': [],
        'mae_values': [],
        'wasserstein': []
    }

    for i, feat in enumerate(feature_names):
        real_col = real_flat[:, i]
        syn_col = syn_flat[:, i]

        # Skip only when BOTH columns are constant. A synthetic column that
        # collapsed to a single value while the real one varies is a generator
        # failure and has to be scored; skipping it (as an "either is constant"
        # test would) hides the worst features and flatters the model. None of
        # the statistics below return NaN for constant input.
        if real_col.std() < 1e-10 and syn_col.std() < 1e-10:
            continue

        # KS statistic
        ks_stat, _ = stats.ks_2samp(real_col, syn_col)
        if not np.isnan(ks_stat):
            metrics['ks_stats'].append(ks_stat)

        # MAE (between means)
        mae = np.abs(real_col.mean() - syn_col.mean())
        if not np.isnan(mae):
            metrics['mae_values'].append(mae)

        # Wasserstein distance: best effort, but only for the input-validation
        # error SciPy raises -- a bare except would also swallow
        # KeyboardInterrupt / SystemExit.
        try:
            wd = stats.wasserstein_distance(real_col, syn_col)
        except ValueError:
            wd = float('nan')
        if not np.isnan(wd):
            metrics['wasserstein'].append(wd)

        metrics['per_feature'][feat] = {
            'real_mean': float(real_col.mean()),
            'syn_mean': float(syn_col.mean()),
            'real_std': float(real_col.std()),
            'syn_std': float(syn_col.std()),
            'ks': float(ks_stat) if not np.isnan(ks_stat) else None,
            'mae': float(mae) if not np.isnan(mae) else None
        }

    # Aggregate metrics
    metrics['ks_mean'] = np.mean(metrics['ks_stats']) if metrics['ks_stats'] else float('nan')
    metrics['mae_mean'] = np.mean(metrics['mae_values']) if metrics['mae_values'] else float('nan')
    metrics['wasserstein_mean'] = np.mean(metrics['wasserstein']) if metrics['wasserstein'] else float('nan')

    # Correlation matrix error. Constant columns give NaN correlations, which
    # are replaced by 0 on both sides before differencing.
    real_corr = np.nan_to_num(np.corrcoef(real_flat.T), nan=0)
    syn_corr = np.nan_to_num(np.corrcoef(syn_flat.T), nan=0)
    metrics['corr_error'] = np.mean(np.abs(real_corr - syn_corr))

    # Overall score
    metrics['overall_score'] = (
        0.3 * metrics['ks_mean'] +
        0.3 * min(metrics['mae_mean'] / 100, 1) +  # Normalize MAE
        0.2 * metrics['wasserstein_mean'] / 10 +
        0.2 * metrics['corr_error']
    )

    return metrics


print("Fixed evaluation metrics defined.")

In [None]:
# Score every model against the real test set, both in the original scale.
banner = "=" * 60
print("\n" + banner)
print("EVALUATION (on post-processed anomaly data)")
print(banner)

# The real test data goes through the same post-processing helper as the
# synthetic data so both sides are compared on the same scale.
if torch.is_tensor(X_test):
    X_test_np = X_test.cpu().numpy()
else:
    X_test_np = X_test
X_test_original = post_process_synthetic_data(X_test_np, available_features, scaler)

all_metrics = {}
for model_name in synthetic_processed:
    scores = calculate_metrics(
        X_test_original, synthetic_processed[model_name], available_features
    )
    all_metrics[model_name] = scores

    print(f"\n{model_name}:")
    print(f"  KS Statistic:     {scores['ks_mean']:.4f}")
    print(f"  MAE:              {scores['mae_mean']:.4f}")
    print(f"  Wasserstein:      {scores['wasserstein_mean']:.4f}")
    print(f"  Correlation Err:  {scores['corr_error']:.4f}")
    print(f"  Overall Score:    {scores['overall_score']:.4f}")

In [None]:
# Best model: the one with the lowest overall_score.
def _rankable_score(model_name):
    """Return the model's overall_score, with NaN mapped to +inf.

    NaN never compares as "less than" anything, so a plain min() keeps a NaN
    entry whenever it happens to be iterated first. Ranking NaN as +inf means
    an unscorable model can only be picked when every model is unscorable.
    """
    score = all_metrics[model_name]['overall_score']
    return float('inf') if np.isnan(score) else score


best_model = min(all_metrics.keys(), key=_rankable_score)
print(f"\n{'='*60}")
print(f"BEST MODEL: {best_model}")
print(f"{'='*60}")

## Part 9: Visualizations

In [None]:
# Training curves with D(real) and D(fake)
# One panel per trained model instead of a fixed three: extra models no longer
# raise IndexError and missing ones no longer leave blank panels. Three models
# still give the original 15x4 figure. squeeze=False keeps `axes` 2-D even for
# a single panel, so the row can always be iterated.
n_panels = max(len(histories), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)

for ax, (name, hist) in zip(axes[0], histories.items()):
    ax.plot(hist['g_loss'], label='G Loss', alpha=0.8)
    ax.plot(hist['d_loss'], label='D Loss', alpha=0.8)
    # Discriminator outputs are only drawn when the history recorded them.
    if 'd_real' in hist:
        ax.plot(hist['d_real'], label='D(real)', linestyle='--', alpha=0.6)
        ax.plot(hist['d_fake'], label='D(fake)', linestyle='--', alpha=0.6)
    ax.set_title(f'{name} Training (Anomaly)')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss / Prob')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'training_curves_anomaly.png'), dpi=150)
plt.show()

In [None]:
# Per-model histograms of real vs. synthetic values for a few key features,
# drawn from the post-processed (original scale) arrays.
key_features = ['announcements', 'withdrawals', 'as_path_max', 'flaps', 'nadas', 'edit_distance_avg']
# Column positions of the key features that are actually available.
key_idx = []
for feat in key_features:
    if feat in available_features:
        key_idx.append(available_features.index(feat))

real_flat = X_test_original.reshape(-1, NUM_FEATURES)
hist_style = dict(bins=50, alpha=0.5, density=True)

for name, syn in synthetic_processed.items():
    syn_flat = syn.reshape(-1, NUM_FEATURES)

    fig, axes = plt.subplots(2, 3, figsize=(14, 8))
    axes = axes.flatten()

    # At most six panels exist, so at most six features are drawn.
    for panel, col in zip(axes, key_idx[:6]):
        panel.hist(real_flat[:, col], label='Real', **hist_style)
        panel.hist(syn_flat[:, col], label='Synthetic', **hist_style)
        panel.set_title(f"{available_features[col]}")
        panel.legend()
        panel.grid(True, alpha=0.3)

    file_stem = name.lower().replace("-", "_")
    plt.suptitle(f'{name} - Distribution Comparison (Anomaly Data)', fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f'{file_stem}_dist_anomaly.png'), dpi=150)
    plt.show()

In [None]:
# Model comparison: one bar chart per aggregate metric, best bar outlined.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

metric_names = ['ks_mean', 'mae_mean', 'corr_error', 'overall_score']
titles = ['KS Statistic', 'MAE', 'Correlation Error', 'Overall Score']
model_names = list(all_metrics.keys())
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

for i, (metric, title) in enumerate(zip(metric_names, titles)):
    values = np.array([all_metrics[m][metric] for m in model_names], dtype=float)
    bars = axes[i].bar(model_names, values, color=colors)
    axes[i].set_title(f'{title} (lower is better)')
    axes[i].grid(True, alpha=0.3)

    # Highlight best. np.argmin returns the position of a NaN when one is
    # present, which would outline an unscorable model as the winner, so NaNs
    # are ignored; nothing is outlined when no value is comparable.
    if not np.isnan(values).all():
        best_idx = int(np.nanargmin(values))
        bars[best_idx].set_edgecolor('gold')
        bars[best_idx].set_linewidth(3)

plt.suptitle('Model Comparison - Anomaly Traffic Generation', fontsize=14)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'model_comparison_anomaly.png'), dpi=150)
plt.show()

## Part 10: Save Results

In [None]:
# Save post-processed synthetic anomaly data: one CSV per model (one row per
# sequence/timestep), then a copy of the best model's file as the main output.
import shutil

print("Saving post-processed synthetic anomaly data...")

# Features that must be written as integers, in column order.
int_cols = [feat for feat in available_features if feat in INTEGER_FEATURES]

for name, data in synthetic_processed.items():
    # Build the table in one shot instead of one Python dict per row and one
    # cast per cell. float64 matches what float(val) produced, so the CSV text
    # is unchanged.
    values = np.asarray(data, dtype=np.float64)
    n_seq, n_steps, n_feat = values.shape
    flat = values.reshape(-1, n_feat)[:, :len(available_features)]
    df_out = pd.DataFrame(flat, columns=available_features)

    # Ensure integer features are int. int(val) used to raise on NaN/inf;
    # astype would silently write garbage instead, so fail loudly first.
    if int_cols:
        if not np.isfinite(df_out[int_cols].to_numpy()).all():
            raise ValueError(f"{name}: non-finite value in an integer feature")
        df_out = df_out.astype({feat: np.int64 for feat in int_cols})

    # Row order is sequence-major, matching the reshape above.
    df_out.insert(0, 'sequence_id', np.repeat(np.arange(n_seq), n_steps))
    df_out.insert(1, 'timestep', np.tile(np.arange(n_steps), n_seq))
    df_out.insert(2, 'label', 'anomaly')

    path = os.path.join(OUTPUT_DIR, f'synthetic_{name.lower().replace("-", "_")}_anomaly.csv')
    df_out.to_csv(path, index=False)
    print(f"Saved: {path}")

# Save best model's data as main output
best_path = os.path.join(OUTPUT_DIR, 'synthetic_anomaly_traffic.csv')
shutil.copy(
    os.path.join(OUTPUT_DIR, f'synthetic_{best_model.lower().replace("-", "_")}_anomaly.csv'),
    best_path
)
print(f"Best model output: {best_path}")

In [None]:
# Read the main output file back and print a few sanity checks.
print("\nVerifying saved anomaly data...")
df_check = pd.read_csv(best_path)

print("\nSample of saved data:")
print(df_check.head())

print("\nData types:")
print(df_check.dtypes.iloc[:10])

print("\nValue ranges:")
for col in ('announcements', 'withdrawals', 'flaps', 'edit_distance_avg'):
    if col not in df_check.columns:
        continue
    col_min = df_check[col].min()
    col_max = df_check[col].max()
    print(f"  {col}: min={col_min}, max={col_max}, has_negative={col_min < 0}")

In [None]:
# Persist the fitted scaler and a JSON record of the run configuration.
scaler_path = os.path.join(OUTPUT_DIR, 'scaler_anomaly.pkl')
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Aggregate metrics only: the 'per_feature' breakdown is left out of the config.
metrics_summary = {}
for model_name, model_metrics in all_metrics.items():
    metrics_summary[model_name] = {
        key: value for key, value in model_metrics.items() if key != 'per_feature'
    }

config = dict(
    data_type='anomaly',
    high_confidence_labels=HIGH_CONFIDENCE_LABELS,
    seq_len=SEQ_LEN,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    lr_g=LR_G,
    lr_d=LR_D,
    label_smoothing=LABEL_SMOOTHING,
    hidden_dim=HIDDEN_DIM,
    latent_dim=LATENT_DIM,
    features=available_features,
    integer_features=INTEGER_FEATURES,
    continuous_features=CONTINUOUS_FEATURES,
    best_model=best_model,
    metrics=metrics_summary,
    timestamp=datetime.now().isoformat(),
)

config_path = os.path.join(OUTPUT_DIR, 'config_anomaly.json')
with open(config_path, 'w') as config_file:
    # default=str stringifies anything json cannot encode natively.
    json.dump(config, config_file, indent=2, default=str)

print("Saved config_anomaly.json and scaler_anomaly.pkl")

In [None]:
# Final report: run configuration, headline metrics of the best model, and a
# listing of everything written to OUTPUT_DIR.
rule = "=" * 70
print("\n" + rule)
print("COMPLETE - ANOMALY TRAFFIC GENERATION SUMMARY")
print(rule)

best_scores = all_metrics[best_model]
summary_text = f"""
Data Configuration:
  Data path: {DATA_PATH}
  High confidence labels: {HIGH_CONFIDENCE_LABELS}
  Training samples: {len(X_train):,}

Fixes Applied:
  ✓ Separate learning rates (G: {LR_G}, D: {LR_D})
  ✓ Label smoothing ({LABEL_SMOOTHING})
  ✓ Sigmoid output (data in [0,1])
  ✓ Post-processing: denormalize → clip → round
  ✓ Fixed MAE calculation (NaN handling)

Results:
  Best Model: {best_model}
  KS Statistic: {best_scores['ks_mean']:.4f}
  MAE: {best_scores['mae_mean']:.4f}

Output: {OUTPUT_DIR}
"""
print(summary_text)

# Output files in name order with their sizes in kilobytes.
for entry in sorted(os.listdir(OUTPUT_DIR)):
    size_kb = os.path.getsize(os.path.join(OUTPUT_DIR, entry)) / 1024
    print(f"  {entry}: {size_kb:.1f} KB")