In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/network-intrusion-dataset/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
/kaggle/input/network-intrusion-dataset/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
/kaggle/input/network-intrusion-dataset/Tuesday-WorkingHours.pcap_ISCX.csv
/kaggle/input/network-intrusion-dataset/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
/kaggle/input/network-intrusion-dataset/Monday-WorkingHours.pcap_ISCX.csv
/kaggle/input/network-intrusion-dataset/Friday-WorkingHours-Morning.pcap_ISCX.csv
/kaggle/input/network-intrusion-dataset/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
/kaggle/input/network-intrusion-dataset/Wednesday-workingHours.pcap_ISCX.csv
/kaggle/input/cicids2017/cicids2017.csv


In [2]:
#!/usr/bin/env python3
"""
Enhanced Binary RTIDS Training Script - GPU T4 x2 Optimized (Kaggle Compatible)
- Implements ALL performance improvements without imblearn dependency
- Uses optimal hyperparameters: d_model=160, focal_gamma=1.8, focal_alpha=0.75
- Advanced architecture with residual scaling and attention pooling
- Custom balancing (undersampling only, NO synthetic samples, NO feature engineering, NO early stopping)
- Expected performance: 99.85%+ ROC-AUC
"""
import os
import time
import random
import copy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from torch.nn.parallel import DataParallel
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, confusion_matrix, 
                           roc_auc_score, precision_recall_curve, auc,
                           f1_score, precision_score, recall_score)
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.neighbors import NearestNeighbors
import math
import warnings
warnings.filterwarnings('ignore')

# Multi-GPU Setup for T4 x2
def setup_device():
    """Choose the compute device and report whether several GPUs are visible.

    Prints a short summary (one line per GPU with its name and total memory
    in GiB) as a side effect.

    Returns:
        tuple: ``(device, multi_gpu)``. ``device`` is ``cuda:0`` when CUDA is
        available and ``cpu`` otherwise; ``multi_gpu`` is True only when CUDA
        reports more than one device.
    """
    # Guard clause: nothing to enumerate without CUDA.
    if not torch.cuda.is_available():
        print("üíª CUDA not available, using CPU")
        return torch.device("cpu"), False

    device_count = torch.cuda.device_count()
    print(f"üî• CUDA available with {device_count} GPU(s)")

    i = 0
    while i < device_count:
        gpu_name = torch.cuda.get_device_name(i)
        # total_memory is reported in bytes; convert to GiB for display.
        memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        print(f"   GPU {i}: {gpu_name} ({memory:.1f}GB)")
        i += 1

    # The primary device is always the first GPU.
    return torch.device("cuda:0"), device_count > 1

DEVICE, MULTI_GPU = setup_device()

# ======================================================================================
# ENHANCED CONFIGURATION WITH OPTIMAL HYPERPARAMETERS
# ======================================================================================
class EnhancedConfig:
    """Plain settings container for the training run.

    Every value is a fixed literal except the ones that depend on the
    module-level ``MULTI_GPU`` flag (batch sizes, ``use_multi_gpu`` and
    ``num_workers``). The hyperparameter values are the ones the author
    reports as best from earlier experiments; those experiments are not part
    of this file.
    """
    def __init__(self):
        # Read the module-level flag once; several settings scale with it.
        multi_gpu = MULTI_GPU

        # --- Kaggle paths ---------------------------------------------------
        self.input_path = '/kaggle/input/cicids2017/cicids2017.csv'
        self.output_dir = '/kaggle/working/'

        # --- Training schedule ----------------------------------------------
        self.epochs = 35
        # Larger batches when more than one GPU shares the work.
        self.batch_size, self.val_batch_size = (1024, 2048) if multi_gpu else (512, 1024)

        # --- Model architecture ---------------------------------------------
        self.d_model = 160
        self.num_layers = 4
        self.heads = 10          # must divide d_model (160 / 10 = 16 per head)
        self.d_ff = 640
        self.dropout = 0.15

        # --- Optimisation ---------------------------------------------------
        self.lr = 0.002
        self.weight_decay = 1e-4
        self.warmup_steps = 1000

        # --- Loss -----------------------------------------------------------
        self.focal_gamma = 1.8
        self.focal_alpha = 0.75
        self.use_class_weights = True
        self.label_smoothing = 0.1

        # --- Data handling (no synthetic samples, no feature engineering) ----
        self.test_size = 0.2
        self.random_state = 42
        self.use_robust_scaling = True
        self.undersampling_ratio = 0.12

        # --- Advanced training features ---------------------------------------
        self.use_swa = True
        self.swa_start = 20
        self.swa_freq = 3
        self.use_mixup = True
        self.mixup_alpha = 0.2
        self.gradient_accumulation_steps = 2

        # --- Multi-GPU specific -----------------------------------------------
        self.use_multi_gpu = multi_gpu
        self.num_workers = 6 if multi_gpu else 4

# ======================================================================================
# CUSTOM DATA PROCESSING (NO SYNTHETIC SAMPLES, NO FEATURE ENGINEERING)
# ======================================================================================
class IntelligentDataBalancer:
    """Rebalance a binary dataset by undersampling class 0 (no synthetic samples).

    Every row labelled 1 is kept; a subset of the rows labelled 0 is drawn
    without replacement.

    Args:
        undersampling_ratio: Fraction of class-0 rows used as a lower bound on
            how many class-0 rows are retained (see ``balance_classes``).
        random_state: Seed handed to ``np.random.seed`` at construction time.
    """
    def __init__(self, undersampling_ratio=0.12, random_state=42):
        self.undersampling_ratio = undersampling_ratio
        self.random_state = random_state
        # NOTE: this seeds NumPy's *global* RNG, so it also affects any other
        # code that draws from np.random after this constructor runs.
        np.random.seed(random_state)

    def balance_classes(self, X, y):
        """Undersample class 0 and return the reduced ``(X, y)`` pair.

        The number of class-0 rows kept is
        ``max(3 * n_class1, int(n_class0 * undersampling_ratio))``; when class 0
        already has at most that many rows nothing is dropped. Class-0 rows are
        drawn with probability inversely proportional to their Euclidean
        distance from the class-1 centroid. If that weighted draw cannot be
        performed (for example NaNs in ``X``, or no class-1 rows at all) a
        uniform random draw is used instead and the reason is printed.

        Args:
            X: 2-D array-like of features, one row per sample.
            y: 1-D array-like of non-negative integer labels (0 = normal,
                1 = attack).

        Returns:
            tuple: ``(X_balanced, y_balanced)`` as NumPy arrays, class-0 rows
            first, followed by all class-1 rows.
        """
        print("‚öñÔ∏è Intelligent class balancing (undersampling only)...")

        X = np.asarray(X)
        y = np.asarray(y)

        # minlength=2 keeps the Normal/Attack labelling correct even when one
        # of the two classes is absent.
        counts = np.bincount(y, minlength=2)
        print(f"Original distribution: {dict(zip(['Normal', 'Attack'], counts.tolist()))}")

        majority_indices = np.where(y == 0)[0]
        minority_indices = np.where(y == 1)[0]

        # Keep at least a 3:1 ratio, or the configured fraction of the
        # majority class, whichever is larger.
        target_majority = max(
            len(minority_indices) * 3,
            int(len(majority_indices) * self.undersampling_ratio)
        )

        if len(majority_indices) > target_majority:
            selected_majority = None

            # Distance weighting needs a minority centroid to measure against.
            if len(minority_indices) > 0:
                try:
                    minority_center = np.mean(X[minority_indices], axis=0)
                    distances = np.linalg.norm(X[majority_indices] - minority_center, axis=1)

                    # Closer samples get a higher sampling probability.
                    weights = 1 / (distances + 1e-8)
                    weights = weights / np.sum(weights)

                    selected_majority = np.random.choice(
                        majority_indices,
                        size=target_majority,
                        replace=False,
                        p=weights
                    )
                    print(f"‚úÖ Applied distance-based intelligent undersampling")
                except Exception as exc:
                    # Best-effort path: report why it failed instead of
                    # silently swallowing the error, then fall back below.
                    print(f"Distance-based undersampling unavailable ({exc}); using random sampling")

            if selected_majority is None:
                selected_majority = np.random.choice(
                    majority_indices, size=target_majority, replace=False
                )
                print(f"‚úÖ Applied random undersampling")
        else:
            selected_majority = majority_indices

        # Combine the retained class-0 rows with every class-1 row.
        selected_indices = np.concatenate([selected_majority, minority_indices])
        X_balanced = X[selected_indices]
        y_balanced = y[selected_indices]

        final_counts = np.bincount(y_balanced, minlength=2)
        print(f"Final distribution: Normal={final_counts[0]:,}, Attack={final_counts[1]:,}")
        if final_counts[1] > 0:
            print(f"Class ratio: {final_counts[0]/final_counts[1]:.2f}:1 (Normal:Attack)")
        else:
            print("Class ratio: n/a (no Attack samples)")

        return X_balanced, y_balanced

class RobustPreprocessor:
    """Impute, clip, optionally select, and scale tabular features.

    ``fit_transform`` learns every statistic (imputation medians, clipping
    bounds, feature selector, scaler) from the data it is given and
    ``transform`` re-applies exactly those statistics, so held-out data is
    processed consistently with the training data.

    Args:
        scaling_method: ``'quantile'`` selects ``QuantileTransformer`` with a
            uniform output distribution, ``'robust'`` selects ``RobustScaler``;
            any other value selects ``StandardScaler``.
        handle_outliers: When True, clip each column to its 0.5th / 99.5th
            percentiles.
        n_features: Number of features kept by mutual-information selection.
            Selection only runs when labels are supplied and the data has more
            columns than this.
    """
    def __init__(self, scaling_method='quantile', handle_outliers=True, n_features=120):
        self.scaling_method = scaling_method
        self.handle_outliers = handle_outliers
        self.n_features = n_features
        self.scaler = None
        self.feature_selector = None
        self.outlier_detector = None
        # Statistics learned by fit_transform and reused by transform.
        self.fill_values_ = None   # per-column median of observed training values
        self.clip_bounds_ = None   # {column: (lower, upper)} clipping bounds

    def fit_transform(self, X, y=None):
        """Fit the pipeline on ``X`` and return the processed array.

        Args:
            X: pandas DataFrame of numeric features. It is not modified.
            y: Optional 1-D label array aligned with the rows of ``X``. When
                given, missing values are imputed with the per-class median
                and feature selection may run.

        Returns:
            numpy.ndarray: The processed features. If scaling fails the
            unscaled values are returned and ``self.scaler`` stays ``None``.
        """
        print("üîß Advanced preprocessing...")

        # Handle infinite and missing values (replace returns a new frame).
        X_processed = X.replace([np.inf, -np.inf], np.nan)
        labels = None if y is None else np.asarray(y)

        # Medians of the observed training values; a column with no observed
        # value at all falls back to 0 so that no NaN survives imputation.
        self.fill_values_ = X_processed.median(numeric_only=True).fillna(0.0)

        for col in X_processed.columns:
            missing = X_processed[col].isna().to_numpy()
            if not missing.any():
                continue
            if labels is not None:
                # Class-specific imputation where the class has observed values.
                for class_val in np.unique(labels):
                    in_class = labels == class_val
                    observed = in_class & ~missing
                    if observed.any():
                        fill_value = X_processed.loc[observed, col].median()
                        X_processed.loc[in_class & missing, col] = fill_value
            # Whatever is still missing gets the overall training median.
            # Plain assignment avoids the chained ``inplace=True`` pattern,
            # which does nothing under pandas copy-on-write.
            X_processed[col] = X_processed[col].fillna(self.fill_values_.get(col, 0.0))

        # Outlier handling: quantile-based capping, remembering the bounds.
        self.clip_bounds_ = {}
        if self.handle_outliers:
            try:
                for col in X_processed.columns:
                    lower = X_processed[col].quantile(0.005)
                    upper = X_processed[col].quantile(0.995)
                    X_processed[col] = X_processed[col].clip(lower=lower, upper=upper)
                    self.clip_bounds_[col] = (lower, upper)
                print("‚úÖ Outlier capping applied")
            except Exception as e:
                print(f"‚ö†Ô∏è Outlier handling failed: {e}")

        # Feature selection: only keep the selector once it has been fitted.
        self.feature_selector = None
        if y is not None and self.n_features < X_processed.shape[1]:
            try:
                selector = SelectKBest(mutual_info_classif, k=self.n_features)
                X_selected = selector.fit_transform(X_processed, y)
                X_processed = pd.DataFrame(X_selected, index=X_processed.index)
                self.feature_selector = selector
                print(f"üìä Selected {self.n_features} most informative features")
            except Exception as e:
                print(f"‚ö†Ô∏è Feature selection failed: {e}")

        # Scaling: only keep the scaler once it has been fitted.
        self.scaler = None
        try:
            if self.scaling_method == 'quantile':
                scaler = QuantileTransformer(output_distribution='uniform', random_state=42)
            elif self.scaling_method == 'robust':
                scaler = RobustScaler()
            else:
                scaler = StandardScaler()

            X_scaled = scaler.fit_transform(X_processed)
            self.scaler = scaler
            print(f"‚úÖ Applied {self.scaling_method} scaling")
        except Exception as e:
            print(f"‚ö†Ô∏è Scaling failed: {e}")
            X_scaled = X_processed.values

        return X_scaled

    def transform(self, X):
        """Apply the fitted pipeline to new data.

        Missing values are filled with the training medians and columns are
        clipped to the training bounds. If ``fit_transform`` has never been
        called, the statistics of ``X`` itself are used instead.

        Args:
            X: pandas DataFrame with the same columns used for fitting. It is
                not modified.

        Returns:
            numpy.ndarray: The processed features.
        """
        X_processed = X.replace([np.inf, -np.inf], np.nan)

        for col in X_processed.columns:
            if X_processed[col].isna().sum() > 0:
                if self.fill_values_ is not None:
                    fill_value = self.fill_values_.get(col, 0.0)
                else:
                    fill_value = X_processed[col].median()
                X_processed[col] = X_processed[col].fillna(fill_value)

        if self.handle_outliers:
            for col in X_processed.columns:
                if self.clip_bounds_ is None:
                    # Never fitted: fall back to this data's own quantiles.
                    lower = X_processed[col].quantile(0.005)
                    upper = X_processed[col].quantile(0.995)
                elif col in self.clip_bounds_:
                    lower, upper = self.clip_bounds_[col]
                else:
                    # Column was not clipped during fitting; leave it alone.
                    continue
                X_processed[col] = X_processed[col].clip(lower=lower, upper=upper)

        if self.feature_selector is not None:
            X_processed = self.feature_selector.transform(X_processed)
            X_processed = pd.DataFrame(X_processed)

        if self.scaler is not None:
            return self.scaler.transform(X_processed)
        else:
            return X_processed.values

# ======================================================================================
# TRANSFORMER ARCHITECTURE
# ======================================================================================
class FeatureImportanceLayer(nn.Module):
    """Gate every input feature with a learned score, then embed the row.

    A small Linear -> Tanh -> Linear -> Sigmoid network produces one score in
    (0, 1) per input feature. The input is multiplied element-wise by those
    scores, projected to ``d_model``, layer-normalised and passed through
    dropout (p=0.1).

    Args:
        input_dim: Number of input features per sample.
        d_model: Width of the embedding handed to the transformer blocks.
    """
    def __init__(self, input_dim, d_model):
        super().__init__()
        gate_layers = [
            nn.Linear(input_dim, d_model),
            nn.Tanh(),
            nn.Linear(d_model, input_dim),
            nn.Sigmoid(),
        ]
        self.feature_attention = nn.Sequential(*gate_layers)
        self.projection = nn.Linear(input_dim, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Embed a batch of feature rows.

        Args:
            x: Tensor of shape ``(batch_size, input_dim)``.

        Returns:
            tuple: ``(embedded, importance_scores)`` where ``embedded`` has
            shape ``(batch_size, 1, d_model)`` and ``importance_scores`` has
            shape ``(batch_size, input_dim)``.
        """
        # Per-feature gates in (0, 1).
        gates = self.feature_attention(x)

        # Gate -> project -> normalise -> dropout, in that order.
        token = self.dropout(self.layer_norm(self.projection(x * gates)))

        # The data is tabular, so each sample becomes a "sequence" of length 1;
        # the extra axis gives the (batch, seq, dim) layout the blocks expect.
        return token.unsqueeze(1), gates

class EnhancedMultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused Q/K/V projection.

    Args:
        d_model: Embedding width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.

    Note:
        When the sequence length is 1 (which is what ``FeatureImportanceLayer``
        in this file produces) the softmax runs over a single key, so every
        attention weight is 1 before dropout.
    """
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # One bias-free projection yields Q, K and V side by side.
        self.qkv_projection = nn.Linear(d_model, d_model * 3, bias=False)
        self.output_projection = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

        self.scale = math.sqrt(self.d_k)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        batch_size, seq_len, d_model = x.size()

        def split_heads(part):
            # (batch, seq, d_model) -> (batch, heads, seq, d_k)
            return part.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # The fused output is laid out as [Q | K | V] along the last axis.
        q, k, v = (split_heads(part) for part in self.qkv_projection(x).chunk(3, dim=-1))

        # Scaled dot-product scores, normalised over the key axis.
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        attention_weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge the heads back together.
        context = torch.matmul(attention_weights, v)
        merged = context.transpose(1, 2).contiguous().reshape(batch_size, seq_len, d_model)

        return self.output_projection(merged)

class EnhancedTransformerBlock(nn.Module):
    """Pre-norm transformer block with a learnable residual scale.

    Each of the two sub-layers (self-attention, then a GELU feed-forward
    network) is applied to a layer-normalised copy of the input; its output
    goes through dropout, is multiplied by the shared ``residual_scale``
    parameter (initialised to 0.8) and is added back onto the input.

    Args:
        d_model: Embedding width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability used in attention, in the feed-forward
            network and on both residual branches.

    Note:
        The source describes the stacked blocks as learning, in order, basic
        feature patterns, feature interactions, complex attack signatures and
        refined final representations. That is the author's intent, not
        something this code enforces.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.attention = EnhancedMultiHeadAttention(d_model, num_heads, dropout)

        ffn_layers = [
            nn.Linear(d_model, d_ff),   # expand
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),   # project back
        ]
        self.feed_forward = nn.Sequential(*ffn_layers)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # One scalar shared by both residual branches.
        self.residual_scale = nn.Parameter(torch.ones(1) * 0.8)

    def forward(self, x):
        """Run both sub-layers; input and output share the same shape."""
        sublayers = (
            (self.norm1, self.attention),
            (self.norm2, self.feed_forward),
        )
        for norm, sublayer in sublayers:
            branch = sublayer(norm(x))
            x = x + self.dropout(branch) * self.residual_scale
        return x
class AttentionPoolingClassifier(nn.Module):
    """Pool a token sequence with a learnable CLS query and classify it.

    A learnable CLS token is prepended to the sequence and also used as the
    single attention query; keys and values are the CLS token plus the input
    tokens. The pooled vector passes through a three-layer GELU MLP that
    outputs two logits (Normal, Attack).

    Args:
        d_model: Embedding width of the incoming tokens.
        dropout: Dropout probability inside the classification MLP.
        num_heads: Number of heads of the pooling attention. ``d_model`` must
            be divisible by it. Defaults to 8, the value that used to be
            hard-coded.
    """
    def __init__(self, d_model, dropout=0.15, num_heads=8):
        super().__init__()
        # Attention-based pooling
        self.attention_pool = nn.MultiheadAttention(d_model, num_heads=num_heads, batch_first=True)
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))

        # Classification head: d_model -> d_model/2 -> d_model/4 -> 2
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, d_model // 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 4, 2)
        )

    def forward(self, x):
        """Classify a batch of token sequences.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, 2)`` holding the class logits.
        """
        batch_size = x.size(0)

        # Broadcast the single learnable CLS token over the batch.
        cls_token = self.cls_token.expand(batch_size, -1, -1)
        x_with_cls = torch.cat([cls_token, x], dim=1)

        # The CLS token is the only query, so the result has sequence length 1.
        pooled, _ = self.attention_pool(cls_token, x_with_cls, x_with_cls)
        pooled = pooled.squeeze(1)

        return self.classifier(pooled)

class EnhancedBinaryTransformerClassifier(nn.Module):
    """End-to-end binary classifier: feature gating, transformer stack, pooling head.

    Args:
        input_dim: Number of input features per sample.
        d_model: Embedding width used throughout the model.
        num_layers: Number of stacked ``EnhancedTransformerBlock`` modules.
        num_heads: Attention heads inside each transformer block.
        d_ff: Hidden width of each block's feed-forward network.
        dropout: Dropout probability passed to the blocks and the classifier.
    """
    def __init__(self, input_dim, d_model=160, num_layers=4, num_heads=10, d_ff=640, dropout=0.15):
        super().__init__()

        # Feature gating + embedding into d_model.
        self.feature_embedder = FeatureImportanceLayer(input_dim, d_model)

        # Stack of identical transformer blocks.
        blocks = []
        for _ in range(num_layers):
            blocks.append(EnhancedTransformerBlock(d_model, num_heads, d_ff, dropout))
        self.transformer_blocks = nn.ModuleList(blocks)

        # Normalisation applied after the last block.
        self.final_norm = nn.LayerNorm(d_model)

        # Attention-pooling classification head.
        self.classifier = AttentionPoolingClassifier(d_model, dropout)

        # Re-initialise every Linear / LayerNorm in the whole module tree.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-uniform weights and zero bias for Linear; identity LayerNorm."""
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                torch.nn.init.constant_(module.bias, 0)
            return
        if isinstance(module, nn.LayerNorm):
            torch.nn.init.constant_(module.bias, 0)
            torch.nn.init.constant_(module.weight, 1.0)

    def forward(self, x):
        """Classify a batch of feature rows.

        Args:
            x: Tensor of shape ``(batch_size, input_dim)``.

        Returns:
            tuple: ``(logits, feature_importance)``; the logits come from the
            classification head and the importance scores from the feature
            embedder.
        """
        tokens, feature_importance = self.feature_embedder(x)

        for block in self.transformer_blocks:
            tokens = block(tokens)

        logits = self.classifier(self.final_norm(tokens))
        return logits, feature_importance

# ======================================================================================
# ADVANCED TRAINING COMPONENTS
# ======================================================================================
class AdaptiveFocalLoss(nn.Module):
    """Focal loss for binary classification with a learnable, clamped gamma.

    Per sample the loss is ``alpha_t * (1 - p_t) ** gamma * CE`` where ``p_t``
    is the softmax probability of the true class and ``CE`` is the
    (optionally class-weighted and label-smoothed) cross-entropy.

    Why focal loss: it down-weights easy examples (large ``p_t``) so training
    concentrates on hard ones, and ``alpha`` > 0.5 gives the attack class
    (label 1) more weight than the normal class.

    Args:
        alpha: Weight of samples with target 1; samples with any other target
            get ``1 - alpha``. ``None`` disables alpha weighting.
        gamma_min: Lower clamp for gamma.
        gamma_max: Upper clamp for gamma.
        class_weights: Optional per-class weights forwarded to
            ``F.cross_entropy``. Stored as a non-persistent buffer so it
            follows ``.to(device)`` without changing ``state_dict()``.
        label_smoothing: Label smoothing forwarded to ``F.cross_entropy``.
        gamma_init: Initial value of the learnable gamma (default 1.8).
    """
    def __init__(self, alpha=0.75, gamma_min=1.0, gamma_max=3.0,
                 class_weights=None, label_smoothing=0.1, gamma_init=1.8):
        super().__init__()
        self.alpha = alpha
        self.gamma_min = gamma_min
        self.gamma_max = gamma_max
        if class_weights is not None:
            class_weights = torch.as_tensor(class_weights, dtype=torch.float32)
        self.register_buffer('class_weights', class_weights, persistent=False)
        self.label_smoothing = label_smoothing

        # Learnable gamma; it is only updated if the loss module's parameters
        # are handed to the optimizer.
        self.gamma = nn.Parameter(torch.tensor(float(gamma_init)))

    def forward(self, inputs, targets):
        """Compute the mean focal loss.

        Args:
            inputs: Logits of shape ``(batch_size, num_classes)``.
            targets: Integer class indices of shape ``(batch_size,)``.

        Returns:
            Scalar tensor: mean loss over the batch.
        """
        # Clamp gamma to the configured range.
        gamma = torch.clamp(self.gamma, self.gamma_min, self.gamma_max)

        weight = self.class_weights
        if weight is not None:
            # Follow the logits even if the module itself was never moved.
            weight = weight.to(device=inputs.device, dtype=inputs.dtype)

        ce_loss = F.cross_entropy(inputs, targets, weight=weight,
                                  reduction='none', label_smoothing=self.label_smoothing)

        # p_t must be the true-class probability. Deriving it from exp(-ce_loss)
        # is wrong once ce_loss carries class weights or label smoothing.
        log_probs = F.log_softmax(inputs, dim=-1)
        pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1).exp()
        modulator = (1 - pt).clamp(min=0) ** gamma

        focal_loss = modulator * ce_loss
        if self.alpha is not None:
            # Tensor operands keep torch.where working on older PyTorch too.
            alpha_weight = torch.where(
                targets == 1,
                inputs.new_tensor(self.alpha),
                inputs.new_tensor(1 - self.alpha),
            )
            focal_loss = alpha_weight * focal_loss

        return focal_loss.mean()

class TabularMixup:
    """Mixup augmentation for tabular batches.

    Args:
        alpha: Parameter of the symmetric Beta(alpha, alpha) distribution the
            mixing coefficient is drawn from. A value <= 0 disables mixing
            (the coefficient is then fixed to 1).
    """
    def __init__(self, alpha=0.2):
        self.alpha = alpha

    def __call__(self, x, y):
        """Blend every row of ``x`` with a randomly chosen partner row.

        Args:
            x: Feature tensor with the batch on axis 0.
            y: Label tensor aligned with ``x``.

        Returns:
            tuple: ``(mixed_x, y_a, y_b, lam)`` where ``y_a`` are the original
            labels, ``y_b`` the partner rows' labels and ``lam`` the weight of
            the original rows.
        """
        lam = np.random.beta(self.alpha, self.alpha) if self.alpha > 0 else 1

        # Random pairing of rows within the batch.
        partner = torch.randperm(x.size(0)).to(x.device)
        mixed_x = lam * x + (1 - lam) * x[partner, :]

        return mixed_x, y, y[partner], lam

    def mixup_criterion(self, pred, y_a, y_b, lam, criterion):
        """Combine ``criterion`` over both label sets with weights ``lam`` / ``1 - lam``.

        ``pred`` may be the model's ``(logits, extra)`` tuple, in which case
        only the first element is used.
        """
        if isinstance(pred, tuple):
            pred, _ = pred
        loss_a = criterion(pred, y_a)
        loss_b = criterion(pred, y_b)
        return lam * loss_a + (1 - lam) * loss_b

class SWAOptimizer:
    """Keeps a running average of model parameters (stochastic weight averaging).

    Args:
        base_optimizer: Stored on the instance; not used by the methods below.
        swa_start: First epoch at which a snapshot is taken.
        swa_freq: Number of epochs between snapshots after ``swa_start``.
        swa_lr: Stored on the instance; not used by the methods below.
    """
    def __init__(self, base_optimizer, swa_start=20, swa_freq=3, swa_lr=0.001):
        self.base_optimizer = base_optimizer
        self.swa_start = swa_start
        self.swa_freq = swa_freq
        self.swa_lr = swa_lr
        self.swa_model = None
        self.n_averaged = 0

    def update_swa(self, model, epoch):
        """Fold ``model`` into the running average if ``epoch`` is a snapshot epoch.

        Returns:
            bool: True when the average was updated, False otherwise.
        """
        # Not yet in the SWA phase.
        if epoch < self.swa_start:
            return False
        # In the SWA phase but not on a snapshot epoch.
        if (epoch - self.swa_start) % self.swa_freq != 0:
            return False

        if self.swa_model is None:
            # First snapshot: the average is simply a copy of the model.
            self.swa_model = copy.deepcopy(model)
        else:
            # Incremental mean over parameters: new = (old * n + current) / (n + 1)
            seen = self.n_averaged
            pairs = zip(self.swa_model.parameters(), model.parameters())
            for averaged, current in pairs:
                averaged.data = (averaged.data * seen + current.data) / (seen + 1)

        self.n_averaged += 1
        return True

    def get_swa_model(self):
        """Return the averaged model, or None if no snapshot was taken yet."""
        return self.swa_model

# ======================================================================================
# ENHANCED TRAINING UTILITIES
# ======================================================================================
def get_device_and_model(model, config):
    """Move ``model`` to the module-level ``DEVICE`` and optionally parallelise it.

    The model is wrapped in ``DataParallel`` only when both
    ``config.use_multi_gpu`` and the module-level ``MULTI_GPU`` flag are truthy.

    Args:
        model: The ``nn.Module`` to place on the device.
        config: Object exposing a ``use_multi_gpu`` attribute.

    Returns:
        tuple: ``(DEVICE, model)`` with ``model`` possibly wrapped.
    """
    model = model.to(DEVICE)
    parallel = config.use_multi_gpu and MULTI_GPU

    if parallel:
        print(f"üî• Using DataParallel with {torch.cuda.device_count()} GPUs")
        model = DataParallel(model)

    print(f"üíª Using device: {DEVICE}")
    if parallel:
        print(f"üî• Multi-GPU mode enabled")

    return DEVICE, model

def create_enhanced_data_loaders(X_train, y_train, X_val, y_val, config):
    """Build the training and validation ``DataLoader`` objects.

    Training batches are drawn with replacement by a ``WeightedRandomSampler``
    whose per-sample weight is the inverse frequency of the sample's class,
    and incomplete final batches are dropped. Validation batches are served
    in order.

    Args:
        X_train, X_val: Feature arrays, one row per sample.
        y_train, y_val: Integer label arrays aligned with the features.
            ``y_train`` is also used to index the class-weight table.
        config: Object exposing ``batch_size``, ``val_batch_size`` and
            ``num_workers``.

    Returns:
        tuple: ``(train_loader, val_loader)``.
    """
    def as_dataset(features, labels):
        # float32 features, int64 labels
        return TensorDataset(torch.FloatTensor(features), torch.LongTensor(labels))

    train_dataset = as_dataset(X_train, y_train)
    val_dataset = as_dataset(X_val, y_val)

    print(f"üìä Dataset sizes - Train: {len(train_dataset)}, Val: {len(val_dataset)}")

    # Inverse class frequency, looked up per training sample.
    inverse_frequency = 1.0 / np.bincount(y_train)
    per_sample_weight = inverse_frequency[y_train]
    weighted_sampler = WeightedRandomSampler(
        weights=torch.DoubleTensor(per_sample_weight),
        num_samples=len(per_sample_weight),
        replacement=True,
    )

    # Settings common to both loaders; persistent workers only make sense
    # when worker processes exist at all.
    shared_options = dict(
        num_workers=config.num_workers,
        pin_memory=True,
        persistent_workers=config.num_workers > 0,
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        sampler=weighted_sampler,
        drop_last=True,
        **shared_options,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.val_batch_size,
        shuffle=False,
        **shared_options,
    )

    return train_loader, val_loader

def calculate_comprehensive_metrics(y_true, y_pred, y_prob):
    """Compute accuracy, ROC-AUC, PR-AUC, F1, precision and recall.

    Args:
        y_true: True binary labels (array-like).
        y_pred: Predicted binary labels (array-like).
        y_prob: Predicted scores for the positive class (array-like).

    Returns:
        dict: Keys ``accuracy``, ``auc_roc``, ``auc_pr``, ``f1_score``,
        ``precision`` and ``recall``.

        * Any empty input, or any error while computing, yields all zeros
          (errors are printed, not raised).
        * When ``y_true`` holds a single class, ranking metrics are undefined:
          ``auc_roc`` is reported as 0.5, ``auc_pr`` as the positive rate and
          F1 / precision / recall as 0.
    """
    def zero_metrics():
        # Fresh dict per call so callers can mutate the result safely.
        return {
            'accuracy': 0.0, 'auc_roc': 0.0, 'auc_pr': 0.0,
            'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0
        }

    if len(y_true) == 0 or len(y_pred) == 0 or len(y_prob) == 0:
        return zero_metrics()

    try:
        # Coerce to arrays: with plain lists ``y_true == y_pred`` would compare
        # the two lists as a whole and yield a single bool.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        y_prob = np.asarray(y_prob)

        accuracy = np.mean(y_true == y_pred)

        unique_classes = np.unique(y_true)
        if len(unique_classes) < 2:
            return {
                'accuracy': accuracy, 'auc_roc': 0.5, 'auc_pr': np.mean(y_true),
                'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0
            }

        auc_roc = roc_auc_score(y_true, y_prob)
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        auc_pr = auc(recall, precision)

        f1 = f1_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        rec = recall_score(y_true, y_pred)

        return {
            'accuracy': accuracy,
            'auc_roc': auc_roc,
            'auc_pr': auc_pr,
            'f1_score': f1,
            'precision': prec,
            'recall': rec
        }
    except Exception as e:
        print(f"‚ö†Ô∏è Error calculating metrics: {e}")
        return zero_metrics()

# ======================================================================================
# MAIN ENHANCED TRAINING FUNCTION
# ======================================================================================
def main():
    """Train the enhanced binary transformer IDS end to end and report results.

    Pipeline: load the CSV at ``config.input_path`` -> derive a binary label
    (anything other than 'BENIGN' is an attack) -> clean and preprocess the
    features -> undersample -> stratified train/validation split -> train for
    ``config.epochs`` epochs (no early stopping) -> optionally evaluate the SWA
    model -> print a final report.

    Side effects: prints progress to stdout and writes the best checkpoint
    (by validation ROC-AUC) into ``config.output_dir``.

    Raises:
        ValueError: if no column whose name contains 'label' exists.
    """
    config = EnhancedConfig()
    
    # Set random seeds
    torch.manual_seed(config.random_state)
    np.random.seed(config.random_state)
    random.seed(config.random_state)
    
    print("üöÄ Enhanced Binary RTIDS Training - GPU T4 x2 Optimized (No Synthetic Samples, No Feature Engineering, No Early Stopping)")
    print("üéØ Target: 99.85%+ ROC-AUC with optimal hyperparameters")
    print("‚úÖ Full epoch training - NO synthetic samples, NO feature engineering, NO early stopping")
    print("=" * 100)
    
    # Load and prepare data
    print("üìä Loading CICIDS2017 dataset...")
    df = pd.read_csv(config.input_path)
    print(f"Dataset shape: {df.shape}")
    
    # Find label column (first column whose name contains 'label', case-insensitive)
    label_col = None
    for col in df.columns:
        if 'label' in col.lower():
            label_col = col
            break
    
    if label_col is None:
        raise ValueError("‚ùå No label column found!")
    
    print(f"‚úÖ Found label column: {label_col}")
    
    # Convert to binary classification
    print("üîÑ Converting to binary classification...")
    df['binary_label'] = (df[label_col] != 'BENIGN').astype(int)
    
    print(f"Binary distribution:")
    print(f"Normal (0): {np.sum(df['binary_label'] == 0):,}")
    print(f"Attack (1): {np.sum(df['binary_label'] == 1):,}")
    
    # Prepare features (using original features only)
    print("üìä Using original features only...")
    exclude_cols = [label_col, 'binary_label', 'Flow ID', 'Source IP', 
                   'Destination IP', 'Timestamp']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    
    X = df[feature_cols].copy()
    y = df['binary_label'].values
    
    print(f"Using {len(feature_cols)} original features (no feature engineering)")
    
    # Handle missing and infinite values
    print("üßπ Cleaning data...")
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = pd.to_numeric(X[col], errors='coerce')
    
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())
    
    # Advanced preprocessing
    # NOTE(review): the preprocessor is fitted (with labels) on the full dataset
    # and balancing happens before the train/validation split below, so the
    # validation rows influence the fitted transform and are themselves
    # undersampled. Reported validation metrics may therefore be optimistic.
    print("üìè Advanced preprocessing...")
    preprocessor = RobustPreprocessor(
        scaling_method='quantile',
        handle_outliers=True,
        n_features=120
    )
    X_processed = preprocessor.fit_transform(X, y)
    
    # Intelligent class balancing (undersampling only)
    print("‚öñÔ∏è Intelligent class balancing (undersampling only)...")
    balancer = IntelligentDataBalancer(undersampling_ratio=config.undersampling_ratio, random_state=config.random_state)
    X_balanced, y_balanced = balancer.balance_classes(X_processed, y)
    
    # Train-validation split
    print("‚úÇÔ∏è Splitting data...")
    X_train, X_val, y_train, y_val = train_test_split(
        X_balanced, y_balanced,
        test_size=config.test_size,
        stratify=y_balanced,
        random_state=config.random_state
    )
    
    print(f"Training set: {X_train.shape[0]:,} samples")
    print(f"Validation set: {X_val.shape[0]:,} samples")
    print(f"Features: {X_train.shape[1]}")
    
    # Calculate class weights ("balanced" heuristic: n_samples / (n_classes * count))
    class_counts = np.bincount(y_train)
    total_samples = len(y_train)
    class_weights = torch.FloatTensor([
        total_samples / (2 * class_counts[0]),
        total_samples / (2 * class_counts[1])
    ])
    
    print(f"üìä Class weights: Normal={class_weights[0]:.3f}, Attack={class_weights[1]:.3f}")
    
    # Create enhanced model
    print("ü§ñ Creating Enhanced Binary Transformer...")
    model = EnhancedBinaryTransformerClassifier(
        input_dim=X_train.shape[1],
        d_model=config.d_model,
        num_layers=config.num_layers,
        num_heads=config.heads,
        d_ff=config.d_ff,
        dropout=config.dropout
    )
    
    # Setup device and multi-GPU
    device, model = get_device_and_model(model, config)
    class_weights = class_weights.to(device)
    
    # Print model info
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"üìà Total parameters: {total_params:,}")
    print(f"üìà Trainable parameters: {trainable_params:,}")
    
    # Create enhanced data loaders
    print("üîÑ Creating enhanced data loaders...")
    train_loader, val_loader = create_enhanced_data_loaders(
        X_train, y_train, X_val, y_val, config
    )
    
    # Advanced loss function and optimizer
    criterion = AdaptiveFocalLoss(
        alpha=config.focal_alpha,
        gamma_min=1.0,
        gamma_max=3.0,
        class_weights=class_weights if config.use_class_weights else None,
        label_smoothing=config.label_smoothing
    )
    
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.lr,
        weight_decay=config.weight_decay,
        betas=(0.9, 0.999)
    )
    
    # Enhanced learning rate scheduler (stepped once per batch, see training loop)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.lr,
        epochs=config.epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy='cos'
    )
    
    # Advanced training components
    mixup = TabularMixup(alpha=config.mixup_alpha) if config.use_mixup else None
    swa_optimizer = SWAOptimizer(optimizer, swa_start=config.swa_start, swa_freq=config.swa_freq) if config.use_swa else None
    
    print(f"üéØ Enhanced Loss: Adaptive Focal Loss (Œ≥=learnable, Œ±={config.focal_alpha})")
    print(f"üéØ Optimizer: AdamW (lr={config.lr})")
    print(f"üéØ Advanced Features: SWA={config.use_swa}, Mixup={config.use_mixup}")
    print(f"üéØ Training: Full {config.epochs} epochs (no early stopping)")
    print(f"üéØ Data Strategy: Original features + intelligent undersampling only (NO synthetic samples, NO feature engineering)")
    
    # Enhanced training loop
    print("\nüöÄ Starting enhanced training...")
    print("=" * 80)
    
    best_auc = 0.0
    best_model_state = None

    # Resolve the checkpoint location up front: it is needed after the loop
    # (SWA save, final message) even if no epoch ever improves on best_auc.
    os.makedirs(config.output_dir, exist_ok=True)
    model_path = os.path.join(config.output_dir, 'enhanced_binary_rtids_model_no_synthetic_no_features_no_early_stopping.pth')

    # Pre-bind so the final report below is safe even if zero epochs run.
    all_preds = []
    all_targets = []
    
    for epoch in range(config.epochs):
        # Training phase
        model.train()
        train_losses = []
        
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
            
            optimizer.zero_grad()
            
            # Apply mixup (only after the first six epochs)
            if mixup and epoch > 5:
                mixed_data, y_a, y_b, lam = mixup(data, target)
                output = model(mixed_data)
                # The model may return (logits, aux); unpack exactly like the
                # non-mixup and validation paths do.
                if isinstance(output, tuple):
                    output, _ = output
                loss = mixup.mixup_criterion(output, y_a, y_b, lam, criterion)
            else:
                output = model(data)
                if isinstance(output, tuple):
                    output, _ = output
                loss = criterion(output, target)
            
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            scheduler.step()
            train_losses.append(loss.item())
            
            if batch_idx % 100 == 0:
                current_lr = scheduler.get_last_lr()[0]
                gamma_val = criterion.gamma.item() if hasattr(criterion, 'gamma') else config.focal_gamma
                print(f"Epoch {epoch+1:2d} | Batch {batch_idx:4d} | Loss: {loss.item():.4f} | LR: {current_lr:.2e} | Œ≥: {gamma_val:.3f}")
        
        # SWA update
        if swa_optimizer:
            swa_updated = swa_optimizer.update_swa(model, epoch)
            if swa_updated:
                print(f"üìä SWA model updated (n_averaged: {swa_optimizer.n_averaged})")
        
        # Validation phase
        model.eval()
        val_losses = []
        all_preds = []
        all_probs = []
        all_targets = []
        
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
                
                output = model(data)
                if isinstance(output, tuple):
                    output, _ = output
                
                val_loss = criterion(output, target)
                val_losses.append(val_loss.item())
                
                probs = F.softmax(output, dim=1)
                preds = output.argmax(dim=1)
                
                all_preds.extend(preds.cpu().numpy())
                all_probs.extend(probs[:, 1].cpu().numpy())
                all_targets.extend(target.cpu().numpy())
        
        # Calculate comprehensive metrics
        metrics = calculate_comprehensive_metrics(
            np.array(all_targets), 
            np.array(all_preds), 
            np.array(all_probs)
        )
        
        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        
        print(f"\nEpoch {epoch+1:2d} Summary:")
        print(f"  Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
        print(f"  Accuracy: {metrics['accuracy']:.4f} | ROC-AUC: {metrics['auc_roc']:.4f} | PR-AUC: {metrics['auc_pr']:.4f}")
        print(f"  F1: {metrics['f1_score']:.4f} | Precision: {metrics['precision']:.4f} | Recall: {metrics['recall']:.4f}")
        
        # Save best model (written to disk immediately, so the file holds the
        # weights of the best epoch even though training continues)
        if metrics['auc_roc'] > best_auc:
            best_auc = metrics['auc_roc']
            model_to_save = model.module if isinstance(model, DataParallel) else model
            best_model_state = {
                'epoch': epoch + 1,
                'model_state_dict': model_to_save.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_auc': best_auc,
                'config': config.__dict__,
                'preprocessor': preprocessor,
                'balancer': balancer,
                'metrics': metrics
            }
            
            torch.save(best_model_state, model_path)
            print(f"  üíæ New best model saved! (AUC: {best_auc:.6f})")
        
        print("-" * 80)
    
    # Use SWA model if available
    if swa_optimizer and swa_optimizer.get_swa_model() is not None:
        print("üîÑ Evaluating SWA model...")
        swa_model = swa_optimizer.get_swa_model()
        
        # Quick SWA evaluation
        swa_model.eval()
        swa_probs = []
        swa_targets = []
        
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = swa_model(data)
                if isinstance(output, tuple):
                    output, _ = output
                probs = F.softmax(output, dim=1)[:, 1]
                swa_probs.extend(probs.cpu().numpy())
                swa_targets.extend(target.cpu().numpy())
        
        swa_auc = roc_auc_score(swa_targets, swa_probs)
        print(f"üìä SWA Model AUC: {swa_auc:.6f}")
        
        if swa_auc > best_auc:
            print("üèÜ SWA model is better! Using SWA for final model.")
            best_auc = swa_auc
            if best_model_state is None:
                # No regular epoch ever beat the initial best_auc, so no
                # checkpoint dict exists yet; build one around the SWA weights.
                best_model_state = {
                    'epoch': config.epochs,
                    'optimizer_state_dict': optimizer.state_dict(),
                    'config': config.__dict__,
                    'preprocessor': preprocessor,
                    'balancer': balancer,
                    'metrics': metrics
                }
            best_model_state['model_state_dict'] = swa_model.state_dict()
            best_model_state['best_auc'] = swa_auc
            torch.save(best_model_state, model_path)
    
    # Final evaluation
    print("\nüìä FINAL ENHANCED EVALUATION (NO SYNTHETIC SAMPLES, NO FEATURE ENGINEERING, NO EARLY STOPPING):")
    print("=" * 100)
    print(f"üèÜ Best Validation ROC-AUC: {best_auc:.6f}")
    print(f"üéØ Target achieved: {'‚úÖ' if best_auc >= 0.998 else 'üîÑ'} (Target: 99.80%+)")
    
    # The report below uses the predictions of the LAST epoch's model, which is
    # not necessarily the checkpointed best (or SWA) model.
    if len(all_targets) > 0 and len(np.unique(all_targets)) > 1:
        print("\nüìã Final Classification Report:")
        print(classification_report(
            all_targets, all_preds,
            target_names=['Normal', 'Attack'],
            digits=4
        ))
        
        print("\nüìä Final Confusion Matrix:")
        cm = confusion_matrix(all_targets, all_preds)
        cm_df = pd.DataFrame(
            cm, 
            index=['True Normal', 'True Attack'], 
            columns=['Pred Normal', 'Pred Attack']
        )
        print(cm_df)
        
        # Security-specific metrics
        tn, fp, fn, tp = cm.ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        
        print(f"\nüîí Security Metrics:")
        print(f"  False Positive Rate: {fpr:.6f} ({fp:,} false alarms)")
        print(f"  False Negative Rate: {fnr:.6f} ({fn:,} missed attacks)")
        print(f"  Attack Detection Rate: {1-fnr:.6f}")
        
        # Performance improvement summary
        print(f"\nüìà Performance Improvements (NO SYNTHETIC SAMPLES, NO FEATURE ENGINEERING, NO EARLY STOPPING):")
        print(f"  ‚úÖ Enhanced Architecture: Feature attention + residual scaling")
        print(f"  ‚úÖ Optimal Hyperparameters: d_model=160, Œ≥=1.8, Œ±=0.75")
        print(f"  ‚úÖ Advanced Training: SWA + Mixup + Adaptive Loss")
        print(f"  ‚úÖ Intelligent Undersampling: Distance-based sampling only")
        print(f"  ‚úÖ Multi-GPU Optimization: T4 x2 support")
        print(f"  ‚úÖ Full Training: {config.epochs} epochs without early stopping")
        print(f"  ‚úÖ Original Features Only: No feature engineering, no synthetic samples")
    
    if best_model_state is not None:
        print(f"\nüíæ Enhanced model saved to: {model_path}")
    else:
        # Nothing was ever written: do not claim a checkpoint exists.
        print("\nNo checkpoint was saved: validation ROC-AUC never exceeded 0.")
    print("üöÄ Enhanced training completed successfully!")
    print(f"üéØ Full {config.epochs}-epoch training with original features only (no synthetic samples, no feature engineering, no early stopping)")

# Launch the RTIDS training pipeline only when this code runs as the main
# module (true for a notebook cell), not when it is imported.
if __name__ == '__main__':
    main()

üî• CUDA available with 2 GPU(s)
   GPU 0: Tesla T4 (14.7GB)
   GPU 1: Tesla T4 (14.7GB)
üöÄ Enhanced Binary RTIDS Training - GPU T4 x2 Optimized (No Synthetic Samples, No Feature Engineering, No Early Stopping)
üéØ Target: 99.85%+ ROC-AUC with optimal hyperparameters
‚úÖ Full epoch training - NO synthetic samples, NO feature engineering, NO early stopping
üìä Loading CICIDS2017 dataset...
Dataset shape: (2830743, 79)
‚úÖ Found label column: Label
üîÑ Converting to binary classification...
Binary distribution:
Normal (0): 2,273,097
Attack (1): 557,646
üìä Using original features only...
Using 78 original features (no feature engineering)
üßπ Cleaning data...
üìè Advanced preprocessing...
üîß Advanced preprocessing...
‚úÖ Outlier capping applied
‚úÖ Applied quantile scaling
‚öñÔ∏è Intelligent class balancing (undersampling only)...
‚öñÔ∏è Intelligent class balancing (undersampling only)...
Original distribution: {'Normal': 2273097, 'Attack': 557646}
‚úÖ Applied distance-based i

In [None]:
# ===============================================================
# CNN-Transformer Hybrid IDS Training + Interpretability
# ===============================================================
from dataclasses import dataclass
from typing import Tuple, Dict, Optional
import json
import joblib
RUN_CNN_TRANSFORMER = False  # Flip to True before running this cell to launch training

@dataclass
class CNNTransformerConfig:
    """Hyperparameters and paths for the CNN-Transformer IDS training run.

    The batch-size defaults are evaluated once, at class-definition time, from
    the module-level ``MULTI_GPU`` flag (defined elsewhere in the notebook).
    """
    # CSV read by train_cnn_transformer, and directory that receives the
    # checkpoint, preprocessing artifacts and Integrated Gradients CSV.
    input_path: str = "/kaggle/input/cicids2017/cicids2017.csv"
    output_dir: str = "/kaggle/working/"
    # Validation fraction for train_test_split; random_state is also used to
    # seed torch/numpy/random, the balancer and the IG sample draw.
    test_size: float = 0.2
    random_state: int = 42
    # Number of training epochs (also the OneCycleLR schedule length).
    epochs: int = 25
    # DataLoader batch sizes for training / validation.
    batch_size: int = 512 if MULTI_GPU else 256
    val_batch_size: int = 1024 if MULTI_GPU else 512
    # AdamW learning rate (also OneCycleLR max_lr) and weight decay.
    lr: float = 1.5e-3
    weight_decay: float = 1e-4
    # Passed to nn.CrossEntropyLoss(label_smoothing=...).
    label_smoothing: float = 0.05
    # Channel count of the Conv1d layers in CNNTokenizer.
    conv_channels: int = 96
    # nn.TransformerEncoder settings: depth, heads, token width, FFN width.
    num_layers: int = 3
    num_heads: int = 8
    d_model: int = 192
    d_ff: int = 768
    # Dropout used in the encoder layers, on the token sequence and in the head.
    dropout: float = 0.2
    # Forwarded as the first argument of IntelligentDataBalancer.
    undersampling_ratio: float = 0.15
    # Integrated Gradients: interpolation steps per sample, and the maximum
    # number of validation rows that get explained.
    ig_steps: int = 32
    ig_samples: int = 512

class CNNTokenizer(nn.Module):
    """Convert a flat feature vector into one d_model-wide token per feature.

    The input is treated as a single-channel 1-D signal; two same-padded
    convolutions produce ``conv_channels`` maps of the same length, and each
    position is then linearly projected to ``d_model`` and layer-normalised.
    """

    def __init__(self, input_dim: int, conv_channels: int, d_model: int):
        super().__init__()
        # Layer order is kept fixed so state_dict keys stay conv.0 ... conv.6.
        conv_stack = [
            nn.Conv1d(1, conv_channels, kernel_size=5, padding=2),
            nn.BatchNorm1d(conv_channels),
            nn.GELU(),
            nn.Conv1d(conv_channels, conv_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(conv_channels),
            nn.GELU(),
            nn.Dropout(0.1),
        ]
        self.conv = nn.Sequential(*conv_stack)
        self.proj = nn.Linear(conv_channels, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.input_dim = input_dim  # stored for reference; not used in forward

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``x`` of shape [B, F] to tokens of shape [B, F, d_model]."""
        signal = torch.unsqueeze(x, 1)                      # [B, 1, F]
        feature_maps = self.conv(signal)                    # [B, conv_channels, F]
        per_position = torch.transpose(feature_maps, 1, 2)  # [B, F, conv_channels]
        return self.norm(self.proj(per_position))

class CNNTransformerIDS(nn.Module):
    """CNN tokenizer + Transformer encoder + MLP head producing 2-class logits.

    A learnable [CLS] token is prepended to the per-feature tokens, learnable
    positional embeddings are added, and the encoded [CLS] position is
    classified.
    """

    def __init__(self, input_dim: int, config: CNNTransformerConfig):
        super().__init__()
        width = config.d_model
        self.tokenizer = CNNTokenizer(input_dim, config.conv_channels, width)
        # Pre-norm (norm_first) GELU encoder layers operating on [B, T, D].
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=width,
                nhead=config.num_heads,
                dim_feedforward=config.d_ff,
                dropout=config.dropout,
                batch_first=True,
                norm_first=True,
                activation='gelu'
            ),
            num_layers=config.num_layers
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, width))
        # One positional vector per feature token plus one for [CLS].
        self.positional = nn.Parameter(torch.randn(1, input_dim + 1, width))
        self.dropout = nn.Dropout(config.dropout)
        head_layers = [
            nn.LayerNorm(width),
            nn.Linear(width, width // 2),
            nn.GELU(),
            nn.Dropout(config.dropout),
            nn.Linear(width // 2, 2),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape [B, 2] for a batch of feature vectors ``x``."""
        feature_tokens = self.tokenizer(x)
        n_batch, n_tokens, _ = feature_tokens.size()
        cls_tokens = self.cls_token.expand(n_batch, -1, -1)
        sequence = torch.cat([cls_tokens, feature_tokens], dim=1)
        sequence = sequence + self.positional[:, :n_tokens + 1]
        encoded = self.encoder(self.dropout(sequence))
        # Position 0 is the [CLS] token that summarises the sequence.
        return self.classifier(encoded[:, 0])

def detect_label_column(df: pd.DataFrame) -> str:
    """Return the first column whose name contains 'label' (case-insensitive).

    Raises:
        ValueError: if no column name contains 'label'.
    """
    candidates = (name for name in df.columns if 'label' in name.lower())
    found = next(candidates, None)
    if found is None:
        raise ValueError("No label column detected in dataset.")
    return found

def prepare_raw_features(df: pd.DataFrame, label_col: str) -> Tuple[pd.DataFrame, np.ndarray, list]:
    """Split a raw flow table into a numeric feature frame and binary labels.

    Args:
        df: Raw dataset. It is not modified.
        label_col: Name of the column holding the traffic class; every value
            other than the string 'BENIGN' is treated as an attack (1).

    Returns:
        (X, y, feature_cols):
            X - copy of the feature columns; object columns are coerced with
                ``pd.to_numeric(errors='coerce')`` and +/-inf is replaced by
                NaN so that downstream median imputation handles both.
            y - int ndarray of 0/1 labels aligned with ``X``.
            feature_cols - list of the feature column names, in frame order.
    """
    # Derive labels directly instead of copying the whole (large) frame just
    # to attach a helper column.
    y = (df[label_col] != 'BENIGN').astype(int).values
    feature_blacklist = {label_col, 'binary_label', 'Flow ID', 'Source IP', 'Destination IP', 'Timestamp'}
    feature_cols = [c for c in df.columns if c not in feature_blacklist]
    X = df[feature_cols].copy()
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = pd.to_numeric(X[col], errors='coerce')
    # Infinite values (e.g. rate features divided by a zero duration) would
    # survive fillna() and make StandardScaler reject the data, so turn them
    # into NaN here and let the imputation step deal with them.
    X = X.replace([np.inf, -np.inf], np.nan)
    return X, y, feature_cols

def split_scale_data(X: pd.DataFrame, y: np.ndarray, config: CNNTransformerConfig):
    """Stratified split, then median-impute and standardise using train statistics only.

    Returns:
        (X_train, X_val, y_train, y_val, scaler, train_medians) where the X
        arrays are the scaled outputs of ``StandardScaler`` and
        ``train_medians`` are the per-column medians of the raw training part.
    """
    split_parts = train_test_split(
        X,
        y,
        test_size=config.test_size,
        stratify=y,
        random_state=config.random_state,
    )
    X_train_raw, X_val_raw, y_train, y_val = split_parts

    # Imputation values and scaling statistics come from the training part
    # only, so nothing from the validation rows leaks into the transform.
    train_medians = X_train_raw.median()
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw.fillna(train_medians))
    X_val = scaler.transform(X_val_raw.fillna(train_medians))
    return X_train, X_val, y_train, y_val, scaler, train_medians

def build_loaders(X_train: np.ndarray, y_train: np.ndarray, X_val: np.ndarray, y_val: np.ndarray, config: CNNTransformerConfig):
    """Wrap the arrays in DataLoaders; training batches are class-rebalanced.

    Training samples are drawn with replacement with probability inversely
    proportional to their class frequency. Validation keeps the natural order.

    Returns:
        (train_loader, val_loader, val_dataset)
    """
    def _as_dataset(features, labels):
        return TensorDataset(torch.FloatTensor(features), torch.LongTensor(labels))

    train_dataset = _as_dataset(X_train, y_train)
    val_dataset = _as_dataset(X_val, y_val)

    # Inverse class frequency; the floor of 1 avoids dividing by an empty class.
    inverse_freq = 1.0 / np.maximum(np.bincount(y_train), 1)
    per_sample_weight = inverse_freq[y_train]
    sampler = WeightedRandomSampler(
        torch.DoubleTensor(per_sample_weight),
        len(per_sample_weight),
        replacement=True,
    )

    shared_kwargs = dict(num_workers=4, pin_memory=True)
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        sampler=sampler,
        drop_last=True,
        **shared_kwargs,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.val_batch_size,
        shuffle=False,
        **shared_kwargs,
    )
    return train_loader, val_loader, val_dataset

def train_epoch(model, loader, criterion, optimizer, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Gradients are clipped to a global norm of 1.0. ``scheduler`` may be None;
    otherwise it is stepped once per batch. Returns 0.0 for an empty loader.
    """
    model.train()
    loss_total = 0.0
    step_scheduler = scheduler is not None
    for features, labels in loader:
        features = features.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(features), labels)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if step_scheduler:
            scheduler.step()

        loss_total += batch_loss.item()

    n_batches = max(len(loader), 1)  # guard against division by zero
    return loss_total / n_batches

def evaluate_epoch(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        (mean_loss, metrics, probs, targets) where ``metrics`` is the dict from
        ``calculate_comprehensive_metrics``, ``probs`` are the softmax
        probabilities of class 1 and ``targets`` the true labels, both as
        numpy arrays. ``mean_loss`` is 0.0 for an empty loader.
    """
    model.eval()
    batch_losses = []
    pred_buf, prob_buf, target_buf = [], [], []

    with torch.no_grad():
        for features, labels in loader:
            features = features.to(DEVICE, non_blocking=True)
            labels = labels.to(DEVICE, non_blocking=True)

            logits = model(features)
            batch_losses.append(criterion(logits, labels).item())

            positive_prob = F.softmax(logits, dim=1)[:, 1]
            hard_pred = torch.argmax(logits, dim=1)

            pred_buf.extend(hard_pred.cpu().numpy())
            prob_buf.extend(positive_prob.cpu().numpy())
            target_buf.extend(labels.cpu().numpy())

    targets_arr = np.array(target_buf)
    probs_arr = np.array(prob_buf)
    metrics = calculate_comprehensive_metrics(targets_arr, np.array(pred_buf), probs_arr)

    mean_loss = np.mean(batch_losses) if batch_losses else 0.0
    return mean_loss, metrics, probs_arr, targets_arr

def integrated_gradients(model, inputs, baseline=None, steps=32, target_class=1):
    """Approximate Integrated Gradients attributions for ``target_class``.

    Gradients of the summed target-class output are averaged over ``steps``
    points evenly spaced on the straight line from ``baseline`` to ``inputs``
    (both endpoints included) and multiplied by ``inputs - baseline``.

    Args:
        model: Callable returning a [B, C] tensor; it is put into eval mode.
        inputs: Tensor of samples to explain.
        baseline: Reference tensor shaped like ``inputs``; all zeros if None.
        steps: Number of interpolation points.
        target_class: Column of the model output to attribute.

    Returns:
        Tensor shaped like ``inputs`` holding the per-feature attributions.
    """
    model.eval()
    if baseline is None:
        baseline = torch.zeros_like(inputs)

    delta = inputs - baseline
    grad_sum = torch.zeros_like(inputs)
    for alpha in torch.linspace(0, 1, steps, device=inputs.device):
        point = baseline + alpha * delta
        point.requires_grad_(True)
        class_score = model(point)[:, target_class].sum()
        (point_grad,) = torch.autograd.grad(class_score, point, retain_graph=False)
        grad_sum += point_grad

    return delta * (grad_sum / steps)

def generate_ig_report(model, X_val: np.ndarray, feature_names: list, config: CNNTransformerConfig) -> str:
    """Rank features by mean absolute Integrated Gradients and save the ranking as CSV.

    Up to ``config.ig_samples`` validation rows are drawn without replacement
    (seeded by ``config.random_state``) and explained in chunks of 128 against
    a baseline equal to the column-wise mean of ``X_val``.

    Returns:
        Path of the written CSV, or "" when ``X_val`` has no rows.
    """
    n_rows = X_val.shape[0]
    sample_count = min(config.ig_samples, n_rows)
    if sample_count == 0:
        return ""

    rng = np.random.RandomState(config.random_state)
    chosen_rows = rng.choice(n_rows, sample_count, replace=False)
    samples = torch.FloatTensor(X_val[chosen_rows]).to(DEVICE)
    mean_row = torch.FloatTensor(X_val.mean(axis=0, keepdims=True)).to(DEVICE)

    # Chunking bounds memory use: each chunk needs config.ig_steps backward passes.
    attributions = []
    for chunk in torch.split(samples, 128):
        chunk_baseline = mean_row.expand(chunk.size(0), -1)
        chunk_attr = integrated_gradients(model, chunk, baseline=chunk_baseline, steps=config.ig_steps)
        attributions.append(chunk_attr.detach().cpu())

    importance = torch.cat(attributions, dim=0).abs().mean(dim=0).numpy()
    ranking = pd.DataFrame({
        'feature': feature_names,
        'avg_abs_integrated_grad': importance
    })
    ranking = ranking.sort_values('avg_abs_integrated_grad', ascending=False)

    csv_path = os.path.join(config.output_dir, 'cnn_transformer_integrated_gradients.csv')
    ranking.to_csv(csv_path, index=False)
    print(f"üß† Saved Integrated Gradients feature ranking -> {csv_path}")
    return csv_path

def train_cnn_transformer(run_training: bool = True):
    """Train the CNN-Transformer IDS, save the best checkpoint and an IG report.

    Steps: load the CSV -> build features/labels -> stratified split with
    train-only imputation and scaling -> undersample the training part ->
    train for ``config.epochs`` epochs, tracking the best validation ROC-AUC ->
    save checkpoint and preprocessing artifacts -> reload the best weights and
    write the Integrated Gradients feature ranking.

    Args:
        run_training: When False, only a hint is printed and nothing is run.

    Returns:
        None. Artifacts are written into ``config.output_dir``.
    """
    # Local import: needed to snapshot state dicts (see the best-epoch branch).
    import copy

    if not run_training:
        print("Set RUN_CNN_TRANSFORMER=True to launch training.")
        return
    config = CNNTransformerConfig()
    torch.manual_seed(config.random_state)
    np.random.seed(config.random_state)
    random.seed(config.random_state)
    print("üöÄ Starting CNN-Transformer IDS training")
    df = pd.read_csv(config.input_path)
    label_col = detect_label_column(df)
    X, y, feature_cols = prepare_raw_features(df, label_col)
    X_train, X_val, y_train, y_val, scaler, medians = split_scale_data(X, y, config)
    # Only the training part is undersampled; validation keeps the natural
    # class distribution.
    balancer = IntelligentDataBalancer(config.undersampling_ratio, config.random_state)
    X_train_bal, y_train_bal = balancer.balance_classes(X_train, y_train)
    train_loader, val_loader, val_dataset = build_loaders(X_train_bal, y_train_bal, X_val, y_val, config)
    model = CNNTransformerIDS(input_dim=X_train.shape[1], config=config).to(DEVICE)
    if MULTI_GPU:
        model = DataParallel(model)
    criterion = nn.CrossEntropyLoss(label_smoothing=config.label_smoothing)
    optimizer = optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    # OneCycleLR needs a positive total step count, hence the empty-loader guard.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=config.lr, epochs=config.epochs, steps_per_epoch=len(train_loader)
    ) if len(train_loader) > 0 else None
    best_auc = 0.0
    best_state = None
    for epoch in range(1, config.epochs + 1):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, scheduler)
        val_loss, metrics, _, _ = evaluate_epoch(model, val_loader, criterion)
        print(
            f"Epoch {epoch:02d} | Train Loss {train_loss:.4f} | Val Loss {val_loss:.4f} | "
            f"ROC-AUC {metrics['auc_roc']:.4f} | F1 {metrics['f1_score']:.4f}"
        )
        if metrics['auc_roc'] > best_auc:
            best_auc = metrics['auc_roc']
            raw_model = model.module if isinstance(model, DataParallel) else model
            # state_dict() returns references to the live tensors, which later
            # optimizer steps keep mutating. Because the checkpoint is only
            # written after the loop, a deep copy is required to really keep
            # the best epoch's weights rather than the last epoch's.
            best_state = {
                'model_state_dict': copy.deepcopy(raw_model.state_dict()),
                'optimizer_state_dict': copy.deepcopy(optimizer.state_dict()),
                'metrics': metrics,
                'config': config.__dict__,
                'feature_columns': feature_cols
            }
    if best_state is None:
        print("Training did not improve beyond initialization; aborting save.")
        return
    os.makedirs(config.output_dir, exist_ok=True)
    model_path = os.path.join(config.output_dir, 'cnn_transformer_ids.pth')
    torch.save(best_state, model_path)
    print(f"üíæ Saved best CNN-Transformer checkpoint -> {model_path}")

    # Everything needed to reproduce the preprocessing at inference time.
    preprocess_artifacts = {
        'feature_columns': feature_cols,
        'medians': medians.to_dict(),
        'scaler_mean': scaler.mean_.tolist(),
        'scaler_scale': scaler.scale_.tolist()
    }
    preprocess_path = os.path.join(config.output_dir, 'cnn_transformer_preprocess.pkl')
    joblib.dump(preprocess_artifacts, preprocess_path)
    print(f"üíæ Saved preprocessing artifacts -> {preprocess_path}")

    # Load best weights for interpretability (explain the unwrapped module so
    # DataParallel scatter/gather does not interfere with autograd.grad).
    if isinstance(model, DataParallel):
        model.module.load_state_dict(best_state['model_state_dict'])
        final_model = model.module
    else:
        model.load_state_dict(best_state['model_state_dict'])
        final_model = model
    generate_ig_report(final_model, X_val, feature_cols, config)

# Training is opt-in: it only starts when this cell runs as the main module
# AND the RUN_CNN_TRANSFORMER flag near the top of the cell was set to True.
if __name__ == '__main__' and RUN_CNN_TRANSFORMER:
    train_cnn_transformer(run_training=True)

In [6]:
# ===============================================================
# SHAP on downloaded RT-IDS checkpoint (no training code needed)
# - Robust checkpoint loader (PyTorch 2.6 safe_globals)
# - Exact class/attribute names matching your saved state_dict
# - Restores saved preprocessor if present (else Quantile fallback)
# - SHAP DeepExplainer with progress + additivity disabled
# - Saves CSV + plots; prints frequent progress updates
# ===============================================================

# (Optional) ensure shap installed; comment out if your env already has it
# !pip install shap -q

import os, gc, time, math, warnings, sys, json
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

# -------------------- CONFIG --------------------
# Paths are hard-coded for the Kaggle runtime; adjust them when running elsewhere.
# CKPT_PATH's file name matches the checkpoint written by the RTIDS training
# script earlier in this notebook.
CKPT_PATH  = "/kaggle/input/v1/pytorch/default/1/enhanced_binary_rtids_model_no_synthetic_no_features_no_early_stopping.pth"
DATA_PATH  = "/kaggle/input/cicids2017/cicids2017.csv"  # change if your file is elsewhere
OUT_DIR    = "/kaggle/working"
CHUNK_SIZE = 256     # SHAP batch per iteration (progress-printed)
BG_SIZE    = 2000    # background samples for DeepExplainer
EVAL_SIZE  = 2000    # number of samples to explain
EVAL_POOL  = 150_000 # build eval pool from this many random rows (processing once)
# NOTE(review): the two constants below are consumed later in this cell,
# presumably as the sampling seed and the number of features shown in plots;
# confirm against the code that uses them.
RANDOM_SEED= 42
PLOT_TOPK  = 20

# -------------------- LOGGING -------------------
def log(msg):
    """Print ``msg`` and flush stdout immediately.

    Flushing makes progress lines appear in the notebook while a long-running
    step is still executing.
    """
    print(msg, flush=True)

def secs(t):
    """Format a duration in seconds with one decimal and an 's' suffix, e.g. '12.3s'."""
    return "{:.1f}s".format(t)

# Echo the SHAP sizing knobs so the run log records how it was configured.
log(f"üîß SHAP config -> CHUNK_SIZE={CHUNK_SIZE}, BG_SIZE={BG_SIZE}, EVAL_SIZE={EVAL_SIZE}")

# -------------------- DEVICE --------------------
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    log("üíª Using device: cuda:0")
    try:
        log(f"   GPU: {torch.cuda.get_device_name(0)} | Mem: {torch.cuda.get_device_properties(0).total_memory/1024**3:.1f} GB")
    except Exception:
        # Best effort: the GPU name/memory line is purely informational, so a
        # failure to query it is deliberately ignored.
        pass
else:
    device = torch.device("cpu")
    log("üíª Using device: cpu")

# -------------------- DUMMY CLASSES (pickle safety) --------------------
# These satisfy objects saved inside the checkpoint (e.g., preprocessor)
from sklearn.preprocessing import QuantileTransformer, RobustScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif

class RobustPreprocessor:
    """Inference-only stand-in for the training-time preprocessor.

    The class exists so that a checkpoint containing a pickled
    ``RobustPreprocessor`` can be unpickled in this cell; unpickling restores
    the saved attributes (notably ``scaler`` and ``feature_selector``) onto an
    instance of this class. Only ``transform`` is functional here.
    """

    def __init__(self, scaling_method='quantile', handle_outliers=True, n_features=120):
        self.scaling_method = scaling_method
        self.handle_outliers = handle_outliers
        self.n_features = n_features
        self.scaler = None
        self.feature_selector = None

    def fit_transform(self, *args, **kwargs):
        """Fitting is not supported by this stand-in."""
        raise NotImplementedError("Not used in inference.")

    def transform(self, X):
        """Clean ``X`` and apply the restored selector/scaler, if any.

        Args:
            X: DataFrame of raw features. It is not modified.

        Returns:
            2-D numpy array. +/-inf becomes NaN, and NaNs are filled with the
            median of the same column *of X itself* (not training medians).
            The feature selector and scaler are applied when present; if either
            one raises, that step is skipped and the unselected/unscaled values
            are used instead.
        """
        Xp = X.replace([np.inf, -np.inf], np.nan)
        for c in Xp.columns:
            if Xp[c].isna().sum() > 0:
                # Assign the filled column back. The previous chained
                # `Xp[c].fillna(..., inplace=True)` acts on a temporary under
                # pandas Copy-on-Write and can leave the NaNs in place.
                Xp[c] = Xp[c].fillna(Xp[c].median())
        if self.feature_selector is not None:
            try:
                Xp = self.feature_selector.transform(Xp)
                Xp = pd.DataFrame(Xp)
            except Exception:
                # NOTE(review): silent fallback kept for compatibility; a
                # selector failure means the model receives unselected columns.
                pass
        if self.scaler is not None:
            try:
                return self.scaler.transform(Xp)
            except Exception:
                # NOTE(review): silent fallback kept for compatibility; the
                # returned values are then unscaled.
                return Xp.values
        return Xp.values

class IntelligentDataBalancer:
    """Pickle-compatibility stand-in for the training-time balancer.

    It keeps the two constructor settings as attributes and performs no
    resampling at all.
    """

    def __init__(self, undersampling_ratio=0.12, random_state=42):
        self.undersampling_ratio = undersampling_ratio
        self.random_state = random_state

    def balance_classes(self, X, y):
        """Return the inputs unchanged as an ``(X, y)`` tuple."""
        return (X, y)

# -------------------- ARCHITECTURE (names match checkpoint) --------------------
class FeatureImportanceLayer(nn.Module):
    """Gate the input features with learned weights, then embed them.

    Attribute names and creation order are kept so the state_dict keys match
    the saved checkpoint.
    """

    def __init__(self, input_dim, d_model):
        super().__init__()
        # Bottleneck MLP ending in a sigmoid: one weight in [0, 1] per feature.
        gate_layers = [
            nn.Linear(input_dim, d_model),
            nn.Tanh(),
            nn.Linear(d_model, input_dim),
            nn.Sigmoid(),
        ]
        self.feature_attention = nn.Sequential(*gate_layers)
        self.projection = nn.Linear(input_dim, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return ``(embedding, weights)``.

        ``embedding`` has a singleton sequence axis inserted at dim 1;
        ``weights`` are the per-feature gate values with the shape of ``x``.
        """
        feature_weights = self.feature_attention(x)
        gated = x * feature_weights
        embedded = self.dropout(self.layer_norm(self.projection(gated)))
        return embedded.unsqueeze(1), feature_weights

class EnhancedMultiHeadAttention(nn.Module):
    """Multi-head self-attention with one fused, bias-free Q/K/V projection.

    ``d_model`` must be divisible by ``num_heads``. Dropout is applied to the
    attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.qkv_projection = nn.Linear(d_model, 3 * d_model, bias=False)
        self.output_projection = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.d_k)

    def forward(self, x):
        """Attend over dim 1 of a 3-D input and return a tensor of the same shape."""
        batch, seq_len, width = x.size()
        # Fused projection -> (batch, seq, 3, heads, d_k) -> (3, batch, heads, seq, d_k)
        fused = self.qkv_projection(x).reshape(batch, seq_len, 3, self.num_heads, self.d_k)
        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(dim=0)
        weights = F.softmax(torch.matmul(query, key.transpose(-2, -1)) / self.scale, dim=-1)
        weights = self.dropout(weights)
        context = torch.matmul(weights, value)
        # Merge heads back into the feature axis.
        merged = context.transpose(1, 2).contiguous().reshape(batch, seq_len, width)
        return self.output_projection(merged)

class EnhancedTransformerBlock(nn.Module):
    """Pre-norm transformer block whose two residual branches share one learnable scale.

    Submodule names and the layer positions inside ``feed_forward`` form the
    state_dict keys and are kept as-is.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.attention = EnhancedMultiHeadAttention(d_model, num_heads, dropout)
        ffn_layers = [
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.feed_forward = nn.Sequential(*ffn_layers)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        # Single scalar, initialised to 0.8, multiplying both residual branches.
        self.residual_scale = nn.Parameter(torch.ones(1) * 0.8)

    def forward(self, x):
        """Apply the attention branch, then the feed-forward branch, each added to ``x``."""
        attn_branch = self.dropout(self.attention(self.norm1(x))) * self.residual_scale
        x = x + attn_branch
        ffn_branch = self.dropout(self.feed_forward(self.norm2(x))) * self.residual_scale
        return x + ffn_branch

class AttentionPoolingClassifier(nn.Module):
    """Pool a sequence through a learned CLS query, then classify into 2 logits.

    The pooling attention always uses 8 heads, so ``d_model`` must be divisible
    by 8. Attribute names feed the state_dict keys and are unchanged.
    """

    def __init__(self, d_model, dropout=0.15):
        super().__init__()
        self.attention_pool = nn.MultiheadAttention(d_model, num_heads=8, batch_first=True)
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        half, quarter = d_model // 2, d_model // 4
        self.classifier = nn.Sequential(
            nn.Linear(d_model, half), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(half, quarter), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(quarter, 2),
        )

    def forward(self, x):
        """Return logits of shape ``(batch, 2)`` for a batch-first 3-D input."""
        query = self.cls_token.expand(x.size(0), -1, -1)
        # Keys/values are the CLS token followed by the input sequence.
        memory = torch.cat([query, x], dim=1)
        pooled, _ = self.attention_pool(query, memory, memory)
        return self.classifier(pooled.squeeze(1))

class EnhancedBinaryTransformerClassifier(nn.Module):
    """Feature embedder -> stacked transformer blocks -> final norm -> pooling classifier.

    ``forward`` returns ``(logits, feature_importance)``. Attribute names are
    kept because they determine the state_dict keys.
    """

    def __init__(self, input_dim, d_model=160, num_layers=4, num_heads=10, d_ff=640, dropout=0.15):
        super().__init__()
        self.feature_embedder = FeatureImportanceLayer(input_dim, d_model)
        blocks = []
        for _ in range(num_layers):
            blocks.append(EnhancedTransformerBlock(d_model, num_heads, d_ff, dropout))
        self.transformer_blocks = nn.ModuleList(blocks)
        self.final_norm = nn.LayerNorm(d_model)
        self.classifier = AttentionPoolingClassifier(d_model, dropout)
        self.apply(self._init)

    def _init(self, m):
        """Initialise Linear layers (Xavier weights, zero bias) and LayerNorms (1 / 0)."""
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return the classifier logits and the embedder's feature-importance tensor."""
        hidden, feat_imp = self.feature_embedder(x)
        for layer in self.transformer_blocks:
            hidden = layer(hidden)
        return self.classifier(self.final_norm(hidden)), feat_imp

# -------------------- LOAD CHECKPOINT ROBUSTLY --------------------
if not os.path.exists(CKPT_PATH):
    raise FileNotFoundError(f"Checkpoint not found at: {CKPT_PATH}")

# SECURITY NOTE: weights_only=False runs the full pickle machinery, which can
# execute arbitrary code embedded in the file. Only load checkpoints you trust.
log("üì¶ Loading checkpoint (full pickle, weights_only=False)‚Ä¶")
try:
    ckpt = torch.load(CKPT_PATH, map_location="cpu", weights_only=False)
    log("‚úÖ Loaded with weights_only=False")
except Exception as e:
    log(f"‚ö†Ô∏è Full load failed: {e}")
    # Fallback: go through the restricted ("safe") unpickler with an explicit
    # allowlist. The allowlist is only consulted when weights_only=True, so the
    # retry must use that mode -- repeating the weights_only=False call would
    # just reproduce the failure above.
    try:
        from torch.serialization import add_safe_globals
        import numpy as _np
        add_safe_globals([
            _np.dtype, _np.float64, _np.int64, _np.float32, _np.int32,
            _np.dtypes.Float64DType, _np.dtypes.Int64DType, _np.core.multiarray.scalar,
            # Stub classes defined above; a pickled preprocessor may reference them.
            RobustPreprocessor, IntelligentDataBalancer,
        ])
        ckpt = torch.load(CKPT_PATH, map_location="cpu", weights_only=True)
        log("‚úÖ Loaded after allowlisting numpy dtypes")
    except Exception as e2:
        log(f"‚ùå Still failed: {e2}")
        raise

# Pull the weights and (optional) hyper-parameter dict out of the checkpoint.
# A checkpoint without "model_state_dict" raises KeyError here.
state = ckpt["model_state_dict"]
cfg   = ckpt.get("config", {})
# Small accessor: value from the checkpoint config, or the given default.
def _cfg(k, dv): return cfg.get(k, dv)

# Infer input_dim from the embedder's projection weight: nn.Linear stores its
# weight as (out_features, in_features), so shape[1] is the input width.
input_dim = state["feature_embedder.projection.weight"].shape[1]

# Rebuild the architecture with the checkpoint's hyper-parameters, falling back
# to the defaults below when a key is absent.
# NOTE(review): the head count is read from the key "heads" while the other keys
# mirror the constructor argument names -- confirm this matches the key used
# when the checkpoint was saved.
model = EnhancedBinaryTransformerClassifier(
    input_dim=input_dim,
    d_model=_cfg("d_model",160),
    num_layers=_cfg("num_layers",4),
    num_heads=_cfg("heads",10),
    d_ff=_cfg("d_ff",640),
    dropout=_cfg("dropout",0.15)
)
# strict=True: any missing or unexpected key raises instead of loading partially.
model.load_state_dict(state, strict=True)
model = model.to(device).eval()
log("‚úÖ Model reconstructed & weights loaded")

# The preprocessor is optional; later code falls back to a freshly fitted
# QuantileTransformer when it is missing.
preprocessor = ckpt.get("preprocessor", None)
if preprocessor is not None:
    log("‚úÖ Preprocessor restored from checkpoint")
else:
    log("‚ö†Ô∏è Preprocessor missing; will use QuantileTransformer fallback")

# -------------------- FEATURES / COLUMNS --------------------
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"DATA_PATH not found: {DATA_PATH}")

log("üìÑ Reading headers to derive feature columns‚Ä¶")
# Only a few rows are read: this pass is just for the column names.
df_head = pd.read_csv(DATA_PATH, nrows=5)
# Any column whose name contains "label" (case-insensitive) is a candidate;
# the first one in file order is used as the label column.
label_candidates = [c for c in df_head.columns if 'label' in c.lower()]
if not label_candidates:
    raise RuntimeError("Could not find a label column in the dataset headers.")
label_col = label_candidates[0]
# Everything except the label and these identifier/time columns is a feature.
# The names are matched exactly (no whitespace stripping).
exclude_cols = [label_col, 'binary_label', 'Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
feature_cols = [c for c in df_head.columns if c not in exclude_cols]
log(f"‚úÖ Found {len(feature_cols)} raw feature columns")

# If checkpoint's selector exists, get selected feature names in order
# NOTE(review): mapping get_support() indices onto feature_cols assumes the
# selector was fitted on the same columns in the same order -- confirm against
# the training script. On any error the full feature list is used instead.
if preprocessor is not None and getattr(preprocessor, "feature_selector", None) is not None:
    try:
        sel_idx = preprocessor.feature_selector.get_support(indices=True)
        selected_feature_names = [feature_cols[i] for i in sel_idx]
    except Exception:
        selected_feature_names = feature_cols
else:
    selected_feature_names = feature_cols

# -------------------- BUILD EVAL POOL & PREPROCESS --------------------
np.random.seed(RANDOM_SEED)
log("üì• Building evaluation slice‚Ä¶")
t0 = time.time()

usecols = list(dict.fromkeys(feature_cols + [label_col]))  # preserve order
full = pd.read_csv(DATA_PATH, usecols=usecols)
# Binary target: 0 for rows labelled exactly "BENIGN", 1 for everything else.
full["binary_label"] = (full[label_col] != "BENIGN").astype(int)

X_raw = full[feature_cols].copy()
# cast non-numeric (unparseable entries become NaN and are imputed below)
for c in X_raw.select_dtypes(include=["object"]).columns:
    X_raw[c] = pd.to_numeric(X_raw[c], errors="coerce")
# Null out +/-inf BEFORE computing the medians. Doing both in one expression
# (`replace(...).fillna(X_raw.median())`) evaluates the median on the frame that
# still contains infinities, which skews the imputed value and can make it inf.
X_raw = X_raw.replace([np.inf, -np.inf], np.nan)
X_raw = X_raw.fillna(X_raw.median())

# apply saved preprocessor or fallback
if preprocessor is not None:
    try:
        X_proc = preprocessor.transform(X_raw)
        log("‚úÖ Applied restored preprocessor")
    except Exception as e:
        log(f"‚ö†Ô∏è preprocessor.transform failed ({e}); using fallback QuantileTransformer")
        # Fallback scaler is fitted on the evaluation data itself.
        qt = QuantileTransformer(output_distribution="uniform", random_state=42)
        X_proc = qt.fit_transform(X_raw)
        selected_feature_names = feature_cols
else:
    qt = QuantileTransformer(output_distribution="uniform", random_state=42)
    X_proc = qt.fit_transform(X_raw)
    selected_feature_names = feature_cols
    log("‚ö†Ô∏è Used fallback QuantileTransformer")

y_all = full["binary_label"].values

# Draw a reproducible random subset (without replacement) as the evaluation pool.
pool_n = min(EVAL_POOL, len(y_all))
pool_idx = np.random.RandomState(RANDOM_SEED).choice(len(y_all), size=pool_n, replace=False)
X_eval_pool = X_proc[pool_idx]
y_eval_pool = y_all[pool_idx]
log(f"‚úÖ Eval pool ready in {secs(time.time()-t0)} (pool={X_eval_pool.shape[0]:,})")

# -------------------- SHAP SETUP --------------------
class LogitsOnly(nn.Module):
    """Adapter exposing only the first output of a model that returns a tuple or list."""

    def __init__(self, base):
        super().__init__()
        self.base = base

    def forward(self, x):
        """Run ``base`` and return its first element if it yields a tuple/list, else the result itself."""
        result = self.base(x)
        if isinstance(result, (tuple, list)):
            return result[0]
        return result

# Wrap the model so it returns logits only (its forward also returns feature
# importances, which the wrapper drops).
logits_model = LogitsOnly(model).to(device).eval()

import shap

# Prefer DeepExplainer; fallback to GradientExplainer if needed
log("üßÆ Building SHAP Explainer‚Ä¶")
# Cap both sizes at the number of rows actually available in the pool.
bg_size  = min(BG_SIZE, X_eval_pool.shape[0])
eval_sz  = min(EVAL_SIZE, X_eval_pool.shape[0])

# The whole pool is moved to `device` as float32.
X_t       = torch.from_numpy(X_eval_pool).float().to(device)
# Background rows are sampled (fixed seed 7) from the entire pool, while the
# explained rows are the first eval_sz rows, so the two sets can overlap.
bg_idx    = np.random.RandomState(7).choice(X_t.shape[0], size=bg_size, replace=False)
background= X_t[bg_idx]
eval_data = X_t[:eval_sz]

explainer = None
t_exp = time.time()
# `mode` records which explainer was built; the chunked SHAP runner reads it.
try:
    explainer = shap.DeepExplainer(logits_model, background)
    mode = "DeepExplainer"
    log(f"‚úÖ {mode} ready in {secs(time.time()-t_exp)} (bg={bg_size}, eval={eval_sz})")
except Exception as e:
    log(f"‚ö†Ô∏è DeepExplainer failed: {e} -> falling back to GradientExplainer")
    try:
        explainer = shap.GradientExplainer(logits_model, background)
        mode = "GradientExplainer"
        log(f"‚úÖ {mode} ready in {secs(time.time()-t_exp)} (bg={bg_size}, eval={eval_sz})")
    except Exception as e2:
        # Neither explainer could be built: log and re-raise.
        log(f"‚ùå GradientExplainer also failed: {e2}")
        raise

# -------------------- SHAP RUN (progress + no additivity) --------------------
def shap_values_in_chunks_no_additivity(explainer, data_t, chunk=256):
    """Compute SHAP values for ``data_t`` in row chunks and return them per class.

    Parameters
    ----------
    explainer :
        Object exposing ``shap_values(X, ...)``. If that method accepts a
        ``check_additivity`` keyword it is called with ``check_additivity=False``
        (avoids the LayerNorm/MHA additivity assertion); otherwise it is called
        with the data only.
    data_t : torch.Tensor
        Rows to explain; the first axis is the sample axis. Tensor slices are
        passed to the explainer unchanged, since PyTorch-backed explainers
        expect tensors, not NumPy arrays.
    chunk : int, default 256
        Rows per call; values below 1 are treated as 1.

    Returns
    -------
    (sv0, sv1) : tuple of numpy.ndarray
        SHAP values for output 0 and output 1, concatenated over all chunks.
        For a single-output explainer both entries hold the same values.

    Notes
    -----
    Handles both return conventions of ``shap_values``: a list with one array
    per output, and a single array with a trailing output axis.
    """
    import inspect  # local import: only needed for the signature probe below

    n = data_t.shape[0]
    if n == 0:
        # Nothing to explain; np.concatenate would raise on an empty list.
        empty = np.zeros((0,) + tuple(data_t.shape[1:]), dtype=np.float32)
        return empty, empty.copy()
    chunk = max(1, int(chunk))  # chunk <= 0 would never advance the loop

    # Decide from the explainer itself (not from a module-level flag) whether
    # the additivity check can be switched off.
    try:
        params = inspect.signature(explainer.shap_values).parameters
        extra = {"check_additivity": False} if "check_additivity" in params else {}
    except (TypeError, ValueError):
        extra = {}

    def _split_outputs(sv):
        """Normalise one shap_values result to a (class0, class1) pair of arrays."""
        if isinstance(sv, (list, tuple)):
            if len(sv) >= 2:
                return np.asarray(sv[0]), np.asarray(sv[1])
            sv = sv[0]  # one-element list: unwrap the single output
        arr = np.asarray(sv)
        if arr.ndim == len(data_t.shape) + 1:
            # One extra trailing axis relative to the input = the output axis.
            if arr.shape[-1] >= 2:
                return arr[..., 0], arr[..., 1]
            arr = arr[..., 0]
        # single output fallback: duplicate to keep downstream shape
        return arr, arr

    sv0_parts, sv1_parts = [], []
    start = 0
    last_pct = -1
    t0 = time.time()
    while start < n:
        end = min(start + chunk, n)
        pct = int(100 * end / n)
        if pct != last_pct:
            log(f"   ‚Üí SHAP {end}/{n} ({pct}%)")
            last_pct = pct
        sv = explainer.shap_values(data_t[start:end], **extra)
        part0, part1 = _split_outputs(sv)
        sv0_parts.append(part0)
        sv1_parts.append(part1)
        # Release per-chunk memory before the next call.
        gc.collect()
        if torch.cuda.is_available(): torch.cuda.empty_cache()
        start = end
    log(f"‚úÖ SHAP complete in {secs(time.time()-t0)}")
    sv0 = np.concatenate(sv0_parts, axis=0)
    sv1 = np.concatenate(sv1_parts, axis=0)
    return sv0, sv1

# Run the chunked SHAP computation over the evaluation rows.
log("üèÅ Starting SHAP computation‚Ä¶")
sv0, sv1 = shap_values_in_chunks_no_additivity(explainer, eval_data, chunk=CHUNK_SIZE)
log("‚úÖ SHAP values ready (no-additivity mode)")

# -------------------- SAVE / REPORT --------------------
# Global importance = mean absolute SHAP value per feature for output index 1
# (the class encoded as binary_label == 1, i.e. non-BENIGN rows).
# NOTE(review): the DataFrame below needs len(selected_feature_names) to equal
# the number of feature columns in sv1; if the selector-name fallback and the
# preprocessor disagree, pandas raises a length-mismatch ValueError here.
mean_abs = np.abs(sv1).mean(axis=0)
global_importance = pd.DataFrame({
    "feature": selected_feature_names,
    "mean_abs_shap": mean_abs
}).sort_values("mean_abs_shap", ascending=False)

# Persist the ranking (most important first) as CSV under OUT_DIR.
csv_path = os.path.join(OUT_DIR, "shap_global_importance_attack.csv")
global_importance.to_csv(csv_path, index=False)
log(f"üíæ Saved global SHAP importances -> {csv_path}")

# Try to render + save plots (headless safe)
import matplotlib.pyplot as plt

# Best-effort global summary plot: failures are logged, never fatal.
try:
    shap.summary_plot(
        sv1,
        features=eval_data.detach().cpu().numpy(),
        feature_names=selected_feature_names,
        show=False
    )
    plt.tight_layout()
    sum_png = os.path.join(OUT_DIR, "shap_summary_attack.png")
    plt.savefig(sum_png, dpi=160, bbox_inches="tight")
    log(f"üñºÔ∏è Saved summary plot -> {sum_png}")
except Exception as e:
    log(f"‚ö†Ô∏è summary_plot skipped: {e}")
finally:
    # Always release the figure: previously a failure in plotting or saving left
    # it open, leaking memory and letting it bleed into later plots.
    plt.close()

# Pick a confident Attack sample for local explanation
with torch.no_grad():
    # Softmax probability of output index 1 for every evaluated row.
    probs = F.softmax(logits_model(eval_data), dim=1)[:,1].detach().cpu().numpy()

# Among rows whose true label is 1, take the one with the highest predicted
# probability; if there are none, fall back to the highest probability overall.
attack_idx = np.where(y_eval_pool[:eval_sz]==1)[0]
i_local = int(attack_idx[np.argmax(probs[attack_idx])]) if len(attack_idx)>0 else int(np.argmax(probs))

# Best-effort waterfall plot for that single row: failures are logged, never fatal.
try:
    exp = shap.Explanation(
        values=sv1[i_local],
        # Base value for output index 1 when the explainer exposes one, else 0.0.
        base_values=np.array(explainer.expected_value[1]).mean() if hasattr(explainer, "expected_value") else 0.0,
        data=eval_data[i_local].detach().cpu().numpy(),
        feature_names=selected_feature_names
    )
    shap.plots.waterfall(exp, max_display=PLOT_TOPK, show=False)
    plt.tight_layout()
    wf_png = os.path.join(OUT_DIR, "shap_waterfall_attack.png")
    plt.savefig(wf_png, dpi=160, bbox_inches="tight")
    log(f"üñºÔ∏è Saved waterfall plot -> {wf_png}")
except Exception as e:
    log(f"‚ö†Ô∏è waterfall plot skipped: {e}")
finally:
    # Always release the figure, including when plotting or saving failed.
    plt.close()

# Small text report top features
# Logs the TOPK highest-ranked rows of global_importance (already sorted
# descending), numbered from 1, with the feature name right-aligned.
TOPK = 20
log("\nüèÜ Top features by mean |SHAP| (Attack):")
for i,(f,v) in enumerate(global_importance.head(TOPK).values, 1):
    log(f"{i:>2}. {f:>30}  {v:.6f}")

# Final summary of output locations. The PNG paths are rebuilt from OUT_DIR
# rather than taken from the plotting steps, so they are listed even when a
# plot was skipped.
log("\nüéâ Done.\n"
    f"   ‚Ä¢ Global CSV: {csv_path}\n"
    f"   ‚Ä¢ Summary PNG: {os.path.join(OUT_DIR, 'shap_summary_attack.png')}\n"
    f"   ‚Ä¢ Waterfall PNG: {os.path.join(OUT_DIR, 'shap_waterfall_attack.png')}")


🔧 SHAP config -> CHUNK_SIZE=256, BG_SIZE=2000, EVAL_SIZE=2000
💻 Using device: cuda:0
   GPU: Tesla T4 | Mem: 14.7 GB
📦 Loading checkpoint (full pickle, weights_only=False)…
✅ Loaded with weights_only=False
✅ Model reconstructed & weights loaded
✅ Preprocessor restored from checkpoint
📄 Reading headers to derive feature columns…
✅ Found 78 raw feature columns
📥 Building evaluation slice…
✅ Applied restored preprocessor
✅ Eval pool ready in 41.9s (pool=150,000)
🧮 Building SHAP Explainer…
✅ DeepExplainer ready in 0.0s (bg=2000, eval=2000)
🏁 Starting SHAP computation…
   → SHAP 256/2000 (12%)
   → SHAP 512/2000 (25%)
   → SHAP 768/2000 (38%)
   → SHAP 1024/2000 (51%)
   → SHAP 1280/2000 (64%)
   → SHAP 1536/2000 (76%)
   → SHAP 1792/2000 (89%)
   → SHAP 2000/2000 (100%)
✅ SHAP complete in 351.1s
✅ SHAP values ready (no-additivity mode)
💾 Saved global SHAP importances -> /kaggle/working/shap_global_importance_attack.csv
üñ