<a href="https://colab.research.google.com/github/rezahamzeh69/Intrusion-Detection-LSTM/blob/main/unsw_nb15_ids_bilstm_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install pyarrow

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
import time
import os
import kagglehub
import io

# --- Model architecture (forwarded to BiLSTMTransformerModel below) ---
LSTM_HIDDEN_DIM = 128  # hidden size of each LSTM direction (the LSTM is bidirectional)
LSTM_LAYERS = 2  # number of stacked LSTM layers
TRANSFORMER_DIM = 128  # d_model of the Transformer encoder layers
NHEAD = 8  # attention heads per Transformer encoder layer
NUM_TRANSFORMER_LAYERS = 2  # number of stacked Transformer encoder layers
NUM_CLASSES = 2  # width of the final classifier layer
DROPOUT = 0.3  # shared dropout rate for the LSTM, encoder layers and classifier input
# --- Data / optimisation settings (used by the driver script at the bottom) ---
SEQUENCE_LENGTH = 15  # rows per sliding window built by create_sequences
BATCH_SIZE = 256  # batch size for the train/val/test DataLoaders
LEARNING_RATE = 5e-4  # AdamW learning rate
NUM_EPOCHS = 30  # upper bound on training epochs (early stopping may end sooner)
VALIDATION_SPLIT_FROM_TRAIN = 0.15  # fraction of training sequences held out for validation
RANDOM_SEED = 42  # seed for torch, numpy and train_test_split
EARLY_STOPPING_PATIENCE = 5  # epochs without validation-loss improvement before stopping
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# File names expected inside the downloaded dataset directory.
TRAIN_FILE_NAME = 'UNSW_NB15_training-set.parquet'
TEST_FILE_NAME = 'UNSW_NB15_testing-set.parquet'

# Seed every RNG the script relies on so repeated runs are comparable.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
    # Ask cuDNN for deterministic kernels and disable its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def load_and_preprocess_unsw(dataset_dir):
    """Load the UNSW-NB15 train/test Parquet files and convert them to arrays.

    Steps: read both files, normalise column names, drop ``id`` and
    ``attack_cat``, impute missing values, one-hot encode categorical columns
    on the concatenated frame (so both splits share the same columns), and
    min-max scale numeric features with a scaler fitted on the training rows.

    Args:
        dataset_dir: Directory containing ``TRAIN_FILE_NAME`` and
            ``TEST_FILE_NAME``.

    Returns:
        ``(X_train, y_train, X_test, y_test, input_dim)`` where the ``X``
        arrays are float32, the ``y`` arrays are int64 and ``input_dim`` is
        the number of feature columns. On any failure a message is printed
        and ``(None, None, None, None, -1)`` is returned instead of raising.
    """
    failure = (None, None, None, None, -1)

    train_file_path = os.path.join(dataset_dir, TRAIN_FILE_NAME)
    test_file_path = os.path.join(dataset_dir, TEST_FILE_NAME)
    print(f"Attempting to load training data from: {train_file_path}")
    print(f"Attempting to load testing data from: {test_file_path}")
    if not os.path.exists(train_file_path) or not os.path.exists(test_file_path):
        print("Error: Training or Testing file not found in the downloaded dataset directory.")
        print(f"Contents of {dataset_dir}: {os.listdir(dataset_dir)}")
        return failure
    try:
        df_train = pd.read_parquet(train_file_path)
        df_test = pd.read_parquet(test_file_path)
        print("Datasets loaded successfully.")
    except Exception as e:
        # Deliberately broad: Parquet engines raise many unrelated error types
        # and the caller only needs the failure tuple.
        print(f"Error loading Parquet files: {e}")
        return failure

    print("Preprocessing datasets...")
    train_len = len(df_train)
    df = pd.concat([df_train, df_test], ignore_index=True)
    # Normalise names BEFORE dropping so variants such as "ID" or
    # "attack_cat " are also removed instead of leaking into the features.
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace(r'[^a-z0-9_]', '', regex=True)
    df = df.drop(['id', 'attack_cat'], axis=1, errors='ignore')

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

    # Imputation statistics come from the training rows only, so nothing
    # about the test split influences the training features.
    for col in numerical_cols:
        if df[col].isnull().any():
            train_values = df[col].iloc[:train_len]
            fill_value = train_values.median() if train_values.nunique() > 10 else 0
            if pd.isna(fill_value):
                # Training rows are entirely missing for this column.
                fill_value = 0
            df[col] = df[col].fillna(fill_value)
    for col in categorical_cols:
        if df[col].isnull().any():
            if isinstance(df[col].dtype, pd.CategoricalDtype):
                # fillna on a categorical rejects values outside its
                # categories, so fall back to plain objects first.
                df[col] = df[col].astype('object')
            modes = df[col].iloc[:train_len].mode()
            if modes.empty:
                modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
            # An all-null column is left as-is; get_dummies(dummy_na=False)
            # then emits no indicator columns for it.

    if 'label' in numerical_cols:
        numerical_cols.remove('label')
    elif 'label' in categorical_cols:
        categorical_cols.remove('label')
        try:
            df['label'] = pd.to_numeric(df['label'])
        except (ValueError, TypeError):
            print("Error: Could not convert 'label' column to numeric.")
            return failure
    if 'label' not in df.columns or not pd.api.types.is_numeric_dtype(df['label']):
        print("Error: 'label' column is missing or not numeric after cleaning.")
        return failure

    # Encode on the concatenated frame so train and test get identical columns.
    df_encoded = pd.get_dummies(df, columns=categorical_cols, dummy_na=False)
    df_train_processed = df_encoded.iloc[:train_len].copy()
    df_test_processed = df_encoded.iloc[train_len:].copy()

    scaler = MinMaxScaler()
    numerical_cols_encoded = df_train_processed.drop('label', axis=1).select_dtypes(include=np.number).columns.tolist()
    if numerical_cols_encoded:
        train_numeric = df_train_processed[numerical_cols_encoded].astype(np.float32)
        test_numeric = df_test_processed[numerical_cols_encoded].astype(np.float32)
        # Fit on training rows only; the test rows are merely transformed.
        scaler.fit(train_numeric)
        df_train_processed[numerical_cols_encoded] = scaler.transform(train_numeric)
        df_test_processed[numerical_cols_encoded] = scaler.transform(test_numeric)

    feature_cols = df_train_processed.columns.drop('label')
    try:
        X_train = df_train_processed[feature_cols].astype(np.float32).values
        X_test = df_test_processed[feature_cols].astype(np.float32).values
    except (ValueError, TypeError) as e:
        print(f"Error converting feature columns to float32 before .values: {e}")
        # Re-try column by column purely to report which ones are at fault.
        for col in feature_cols:
            try:
                df_train_processed[col].astype(np.float32)
            except (ValueError, TypeError) as col_e:
                print(f"  - Column '{col}' failed conversion: {col_e}, dtype: {df_train_processed[col].dtype}")
        return failure

    y_train = df_train_processed['label'].astype(np.int64).values
    y_test = df_test_processed['label'].astype(np.int64).values
    input_dim = X_train.shape[1]
    print(f"Preprocessing finished. Input dimension: {input_dim}")
    return X_train, y_train, X_test, y_test, input_dim

def create_sequences(features, labels, sequence_length):
    """Build overlapping sliding windows (stride 1) over ``features``.

    Window ``i`` covers rows ``i .. i + sequence_length - 1`` and carries the
    label of its last row.

    Args:
        features: Array-like of per-row features, indexed along axis 0.
        labels: Array-like of per-row labels, at least as long as ``features``.
        sequence_length: Number of consecutive rows per window (>= 1).

    Returns:
        ``(sequences, sequence_labels)``: a float32 array of shape
        ``(n_windows, sequence_length, *features.shape[1:])`` and an int64
        array with ``n_windows`` entries. When ``features`` has fewer than
        ``sequence_length`` rows both arrays have zero windows but keep the
        same dtypes and trailing shape as the non-empty case.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1, or ``labels``
            is shorter than ``features``.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    features = np.asarray(features)
    labels = np.asarray(labels)
    n_rows = len(features)

    if n_rows < sequence_length:
        empty_shape = (0, sequence_length) + features.shape[1:]
        return np.empty(empty_shape, dtype=np.float32), np.empty((0,), dtype=np.int64)
    if len(labels) < n_rows:
        raise ValueError("labels must provide one entry per feature row")

    # sliding_window_view appends the window axis last; move it right after
    # the window index so the layout is (window, step, *feature_dims).
    windows = np.lib.stride_tricks.sliding_window_view(features, sequence_length, axis=0)
    windows = np.moveaxis(windows, -1, 1)
    # Materialise a fresh C-contiguous, writeable copy (the view is read-only
    # and its windows alias each other).
    sequences = np.array(windows, dtype=np.float32, order='C')
    sequence_labels = np.array(labels[sequence_length - 1:n_rows], dtype=np.int64)
    return sequences, sequence_labels

class BiLSTMTransformerModel(nn.Module):
    """Sequence classifier: BiLSTM -> linear projection -> Transformer encoder -> mean pool -> linear head.

    All sub-modules are built with ``batch_first=True``, so ``forward``
    expects input laid out as ``(batch, seq_len, input_dim)`` and returns
    unnormalised logits of shape ``(batch, num_classes)``.
    """

    def __init__(self, input_dim, lstm_hidden_dim, lstm_layers, transformer_dim, nhead, num_transformer_layers, num_classes, dropout):
        super().__init__()

        # Dropout is only handed to the LSTM when it has more than one layer.
        inter_layer_dropout = dropout if lstm_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=True,
        )

        # Both directions are concatenated, hence twice the hidden size.
        bilstm_width = lstm_hidden_dim * 2
        self.fc_lstm_transformer = nn.Linear(bilstm_width, transformer_dim)

        single_encoder_layer = nn.TransformerEncoderLayer(
            d_model=transformer_dim,
            nhead=nhead,
            dim_feedforward=transformer_dim * 4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            single_encoder_layer,
            num_layers=num_transformer_layers,
        )

        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout_layer = nn.Dropout(dropout)
        self.classifier = nn.Linear(transformer_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences ``x``."""
        recurrent_features, _final_state = self.lstm(x)
        encoder_input = self.fc_lstm_transformer(recurrent_features)
        encoded = self.transformer_encoder(encoder_input)
        # AdaptiveAvgPool1d averages over the last axis, so put time there.
        time_last = encoded.transpose(1, 2)
        sequence_summary = self.global_pool(time_last).squeeze(-1)
        return self.classifier(self.dropout_layer(sequence_summary))

def evaluate_model(model, data_loader, criterion, device, is_test_set=False):
    """Evaluate ``model`` on ``data_loader`` and return loss, accuracy and F1.

    The model is switched to eval mode and left in that mode.

    Args:
        model: Module mapping a batch of inputs to class logits.
        data_loader: DataLoader yielding ``(inputs, labels)`` batches; may be
            ``None`` or empty.
        criterion: Loss taking ``(logits, labels)`` and returning a batch mean.
        device: Device the batches are moved to before the forward pass.
        is_test_set: When True, also print a report (loss, accuracy, F1,
            confusion matrix, classification report).

    Returns:
        ``(avg_loss, accuracy, f1)``. For a missing/empty loader this is
        ``(nan, nan, nan)`` for validation and ``(0.0, 0.0, 0.0)`` (plus a
        warning) when ``is_test_set`` is True.
    """
    model.eval()
    if not data_loader or len(data_loader.dataset) == 0:
        if not is_test_set:
            return np.nan, np.nan, np.nan
        print("Warning: Cannot evaluate on empty or invalid dataloader.")
        return 0.0, 0.0, 0.0

    total_loss = 0.0
    correct_predictions = 0
    samples_seen = 0
    label_chunks = []
    prediction_chunks = []
    with torch.no_grad():
        for batch_data, batch_labels in data_loader:
            batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels)
            batch_count = batch_data.size(0)
            # criterion returns a batch mean; re-weight by batch size so the
            # final average is per-sample even when the last batch is short.
            total_loss += loss.item() * batch_count
            samples_seen += batch_count
            predicted = outputs.argmax(dim=1)
            correct_predictions += (predicted == batch_labels).sum().item()
            label_chunks.append(batch_labels.cpu())
            prediction_chunks.append(predicted.cpu())

    # Average over the samples actually evaluated, which differs from the
    # dataset length if the loader drops an incomplete final batch.
    avg_loss = total_loss / samples_seen
    accuracy = correct_predictions / samples_seen
    all_labels = torch.cat(label_chunks).numpy()
    all_predictions = torch.cat(prediction_chunks).numpy()

    f1 = 0.0
    if len(all_labels) > 0 and len(all_predictions) > 0:
        f1 = f1_score(all_labels, all_predictions, average='binary', zero_division=0)

    if is_test_set:
        print("\n--- Final Test Set Evaluation ---")
        print(f"Test Loss: {avg_loss:.4f}")
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test F1 Score (Binary): {f1:.4f}")
        if len(all_labels) > 0 and len(all_predictions) > 0:
            # Pin the label set so the report keeps both rows/columns (and
            # matches the two target names) even if one class never occurs.
            binary_labels = [0, 1]
            print("\nConfusion Matrix:")
            print(confusion_matrix(all_labels, all_predictions, labels=binary_labels))
            print("\nClassification Report:")
            print(classification_report(all_labels, all_predictions, labels=binary_labels, target_names=['Normal (0)', 'Attack (1)'], zero_division=0))
        else:
            print("\nNo labels/predictions available for Confusion Matrix/Classification Report.")
        print("--------------------------------")
    return avg_loss, accuracy, f1

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, patience,
                best_model_path='best_bilstm_transformer_model.pth',
                last_model_path='last_epoch_bilstm_transformer_model.pth'):
    """Train ``model`` with optional validation-based early stopping.

    With a validation loader, the weights with the lowest validation loss are
    checkpointed to ``best_model_path`` and restored into ``model`` before
    returning; training stops once the loss has not improved for ``patience``
    consecutive epochs. Without one, all epochs run and the final weights are
    saved to ``last_model_path``.

    Args:
        model: Module to train (already on ``device``).
        train_loader: DataLoader yielding ``(inputs, labels)`` batches.
        val_loader: Validation DataLoader, or ``None`` to skip validation.
        criterion: Loss taking ``(logits, labels)`` and returning a batch mean.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        device: Device the batches are moved to.
        patience: Non-improving epochs tolerated before early stopping.
        best_model_path: Where the best-validation checkpoint is written.
        last_model_path: Where the final weights go when not validating.

    Returns:
        Dict with per-epoch lists ``train_loss``, ``val_loss``,
        ``val_accuracy`` and ``val_f1_score`` (NaN when not validating).
    """
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': [], 'val_f1_score': []}
    best_val_loss = float('inf')
    epochs_no_improve = 0
    # Only restore a checkpoint written by THIS run; a file left over from an
    # earlier run must not be loaded when validation never improves.
    saved_best_checkpoint = False
    print(f"\n--- Starting Training on {device} ---")
    total_start_time = time.time()
    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        model.train()
        running_loss = 0.0
        samples_seen = 0
        for batch_data, batch_labels in train_loader:
            batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()
            batch_count = batch_data.size(0)
            running_loss += loss.item() * batch_count
            samples_seen += batch_count
        # Divide by the samples actually processed: with drop_last=True the
        # dataset length over-counts and would under-report the loss.
        train_loss = running_loss / samples_seen if samples_seen else float('nan')
        history['train_loss'].append(train_loss)

        if not val_loader:
            history['val_loss'].append(np.nan)
            history['val_accuracy'].append(np.nan)
            history['val_f1_score'].append(np.nan)
            epoch_duration = time.time() - epoch_start_time
            print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Time: {epoch_duration:.2f}s")
            continue

        val_loss, val_accuracy, val_f1_score = evaluate_model(model, val_loader, criterion, device)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_f1_score'].append(val_f1_score)
        epoch_duration = time.time() - epoch_start_time
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f} | Val F1: {val_f1_score:.4f} | Time: {epoch_duration:.2f}s")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), best_model_path)
            saved_best_checkpoint = True
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after epoch {epoch+1}!")
                break

    total_training_time = time.time() - total_start_time
    print(f"--- Training Finished --- Total time: {total_training_time // 60:.0f}m {total_training_time % 60:.0f}s")
    if val_loader:
        if saved_best_checkpoint and os.path.exists(best_model_path):
            try:
                model.load_state_dict(torch.load(best_model_path, map_location=device))
                print("Loaded best model weights for final evaluation.")
            except Exception as e:
                # Best-effort restore: fall back to the in-memory weights.
                print(f"Warning: Could not load best model weights ({e}). Using last epoch model.")
    else:
        torch.save(model.state_dict(), last_model_path)
        print("Saved model from last epoch (no validation performed).")
    return history

# --- Driver: download the dataset, build loaders, train, then evaluate ---
print("Downloading UNSW-NB15 dataset using kagglehub...")
try:
    dataset_path = kagglehub.dataset_download("dhoogla/unswnb15")
    print(f"Dataset downloaded to: {dataset_path}")
except Exception as e:
    # Any download failure is reported and aborts the run further below.
    print(f"Error downloading dataset: {e}")
    print("Please ensure Kaggle API credentials are set up correctly in your environment.")
    dataset_path = None

if dataset_path and os.path.isdir(dataset_path):
    X_train_raw, y_train_raw, X_test_raw, y_test_raw, input_dim = load_and_preprocess_unsw(dataset_path)
    if X_train_raw is not None and input_dim != -1:
        INPUT_DIM = input_dim
        print("Creating sequences...")
        X_train_seq, y_train_seq = create_sequences(X_train_raw, y_train_raw, SEQUENCE_LENGTH)
        X_test_seq, y_test_seq = create_sequences(X_test_raw, y_test_raw, SEQUENCE_LENGTH)
        if len(X_train_seq) == 0 or len(X_test_seq) == 0:
            print("Error: Not enough data to create sequences with the specified length.")
        else:
            print(f"Raw training sequences: {len(X_train_seq)}")
            print(f"Raw testing sequences: {len(X_test_seq)}")
            if len(X_train_seq) > 1 and VALIDATION_SPLIT_FROM_TRAIN > 0:
                # NOTE(review): the windows overlap (stride 1) and
                # train_test_split shuffles, so validation windows share rows
                # with training windows; validation scores are optimistic.
                try:
                    X_train_final_seq, X_val_seq, y_train_final_seq, y_val_seq = train_test_split(
                        X_train_seq, y_train_seq, test_size=VALIDATION_SPLIT_FROM_TRAIN, random_state=RANDOM_SEED, stratify=y_train_seq
                    )
                    print(f"Splitting {VALIDATION_SPLIT_FROM_TRAIN*100:.1f}% of training sequences for validation.")
                except ValueError as e:
                    print(f"Warning: Could not stratify split ({e}). Performing non-stratified split.")
                    X_train_final_seq, X_val_seq, y_train_final_seq, y_val_seq = train_test_split(
                        X_train_seq, y_train_seq, test_size=VALIDATION_SPLIT_FROM_TRAIN, random_state=RANDOM_SEED
                    )
            else:
                # At least one training sequence exists here (checked above),
                # so the only remaining case is "train on everything".
                print("Using all training sequences for training, no validation split performed.")
                X_train_final_seq, y_train_final_seq = X_train_seq, y_train_seq
                X_val_seq, y_val_seq = np.array([]), np.array([])
            print(f"Final Train sequences: {len(X_train_final_seq)}")
            print(f"Validation sequences: {len(X_val_seq)}")
            print(f"Final Test sequences: {len(X_test_seq)}")
            if len(X_train_final_seq) > 0:
                X_train_tensor = torch.from_numpy(X_train_final_seq)
                y_train_tensor = torch.from_numpy(y_train_final_seq)
                X_val_tensor = torch.from_numpy(X_val_seq)
                y_val_tensor = torch.from_numpy(y_val_seq)
                X_test_tensor = torch.from_numpy(X_test_seq)
                y_test_tensor = torch.from_numpy(y_test_seq)
                # Worker processes and pinned memory are only used on GPU.
                num_workers = 2 if DEVICE.type == 'cuda' else 0
                pin_memory_flag = DEVICE.type == 'cuda'
                train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
                val_dataset = TensorDataset(X_val_tensor, y_val_tensor) if len(X_val_seq) > 0 else None
                test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
                train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers, pin_memory=pin_memory_flag, drop_last=True)
                val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers, pin_memory=pin_memory_flag) if val_dataset else None
                test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=num_workers, pin_memory=pin_memory_flag)
                model = BiLSTMTransformerModel(INPUT_DIM, LSTM_HIDDEN_DIM, LSTM_LAYERS, TRANSFORMER_DIM, NHEAD, NUM_TRANSFORMER_LAYERS, NUM_CLASSES, DROPOUT).to(DEVICE)
                # Default to an unweighted loss; switch to inverse-frequency
                # class weights when every expected class is present.
                criterion = nn.CrossEntropyLoss()
                if len(y_train_final_seq) > 0:
                    class_counts = np.bincount(y_train_final_seq)
                    if len(class_counts) == NUM_CLASSES and 0 not in class_counts:
                        class_weights = torch.tensor([1.0 / c for c in class_counts], dtype=torch.float32).to(DEVICE)
                        criterion = nn.CrossEntropyLoss(weight=class_weights)
                        print(f"Using class weights for CrossEntropyLoss: {class_weights.cpu().numpy()}")
                    else:
                        print("Using standard CrossEntropyLoss (no weighting or class count issue).")
                optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
                history = train_model(model, train_loader, val_loader, criterion, optimizer, NUM_EPOCHS, DEVICE, EARLY_STOPPING_PATIENCE)
                evaluate_model(model, test_loader, criterion, DEVICE, is_test_set=True)
                print("\nExecution finished successfully.")
            else:
                print("\nExecution aborted: No training data available after sequencing/splitting.")
    else:
        print("\nExecution aborted due to data loading/preprocessing errors.")
else:
    print("\nExecution aborted: Dataset download failed or directory not found.")


Downloading UNSW-NB15 dataset using kagglehub...
Downloading from https://www.kaggle.com/api/v1/datasets/download/dhoogla/unswnb15?dataset_version_number=5...


100%|██████████| 11.7M/11.7M [00:00<00:00, 127MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/dhoogla/unswnb15/versions/5
Attempting to load training data from: /root/.cache/kagglehub/datasets/dhoogla/unswnb15/versions/5/UNSW_NB15_training-set.parquet
Attempting to load testing data from: /root/.cache/kagglehub/datasets/dhoogla/unswnb15/versions/5/UNSW_NB15_testing-set.parquet
Datasets loaded successfully.
Preprocessing datasets...
Preprocessing finished. Input dimension: 188
Creating sequences...
Raw training sequences: 175327
Raw testing sequences: 82318
Splitting 15.0% of training sequences for validation.
Final Train sequences: 149027
Validation sequences: 26300
Final Test sequences: 82318
Using class weights for CrossEntropyLoss: [2.101370e-05 9.858141e-06]

--- Starting Training on cuda ---
Epoch 1/30 | Train Loss: 0.1508 | Val Loss: 0.0906 | Val Acc: 0.9681 | Val F1: 0.9767 | Time: 13.20s
Epoch 2/30 | Train Loss: 0.0930 | Val Loss: 0.0803 | Val Acc: 0.9676 | Val F1: 0.9760 | Time: 12.13s
Epoch 3/30 | Train Loss: 0.08