# CYBERML Project 2025-2026
## IoT Intrusion Detection and Attack Analysis

### Objectives:
1. **Classification and Anomaly Detection** for tracking attacks
2. **Adversarial Attacks** against classification (bonus)

### Dataset: CIC IoT-DIAD 2024
Source: https://www.unb.ca/cic/datasets/iot-diad-2024.html

**Note:** This notebook uses stratified sampling to handle the large 50GB dataset efficiently.

## 1. Environment Setup and Dependencies

In [None]:
# Install required packages
# !pip install pandas numpy scikit-learn matplotlib seaborn imbalanced-learn xgboost lightgbm plotly nbformat tensorflow

# (UV IS PREFERRED, DON'T USE THIS &v&)

In [None]:
# Download http://cicresearch.ca/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/ into the local data/ directory
# Download the full directory recursively using wget under data/
# wget flags: -r recurse, -np never ascend to the parent directory, -nH do not create a
# host-name directory, --cut-dirs=3 drop the three leading remote path components,
# -R "index.html*" discard the generated directory listings, -P data/ save under data/.
!wget -r -np -nH --cut-dirs=3 -R "index.html*" -P data/ "http://cicresearch.ca/IOTDataset/CIC%20IoT-IDAD%20Dataset%202024/Dataset/"

In [None]:
# Import required libraries
import os
import glob
import warnings
import gc
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults: seaborn-like grid style, wide figures, small font
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.utils import shuffle

# Unsupervised Learning (Anomaly Detection)
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# Supervised Learning (Classification)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Metrics
from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, balanced_accuracy_score, matthews_corrcoef,
    precision_recall_curve, roc_curve, ConfusionMatrixDisplay
)

# Imbalanced data handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Single seed shared by the notebook; it is applied to NumPy's global RNG here
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

print("All libraries imported successfully!")

## 2. Memory-Efficient Data Loading with Stratified Sampling

Since the dataset is ~50GB, we use stratified sampling to:
- Sample a fixed number of rows from each attack category
- Ensure every class is represented (the sample is balanced across categories rather than kept at the original class proportions)
- Keep memory usage manageable

In [None]:
# Sampling configuration
DATA_ROOT = "data/Anomaly Detection - Flow Based features/"
SAMPLES_PER_CATEGORY = 10000  # Rows requested per category; lower it if memory is tight
CHUNK_SIZE = 50000  # Default value of sample_from_csv's chunk_size parameter

# Category label -> folder name under DATA_ROOT (each label maps to a folder of the same name)
ATTACK_CATEGORIES = {
    name: name
    for name in (
        'Benign',
        'BruteForce',
        'DDoS',
        'DoS',
        'Mirai',
        'Recon',
        'Spoofing',
        'Web-Based',
    )
}

print(f"Sampling {SAMPLES_PER_CATEGORY} rows per category")
print(f"Categories: {list(ATTACK_CATEGORIES)}")

In [None]:
def get_csv_files_for_category(category_path):
    """Return the paths of every ``.csv`` file below ``category_path``.

    The search descends into sub-folders; an empty list is returned when
    nothing matches.
    """
    recursive_pattern = os.path.join(category_path, "**/*.csv")
    return glob.glob(recursive_pattern, recursive=True)

def sample_from_csv(file_path, n_samples, chunk_size=CHUNK_SIZE):
    """Return a uniform random sample of at most ``n_samples`` rows of a CSV file.

    The file is read twice: a first pass counts its lines, then the rows to
    keep are drawn without replacement with ``np.random.choice`` (global NumPy
    RNG) and every other data row is skipped by ``pd.read_csv``. Files holding
    ``n_samples`` data rows or fewer are returned whole.

    Args:
        file_path: Path of the CSV file; its first line is treated as the header.
        n_samples: Maximum number of data rows to return.
        chunk_size: Unused by the current implementation; kept so existing
            callers that pass it remain valid.

    Returns:
        A DataFrame with the sampled rows, or None when the file has no data
        rows or cannot be read (the error is printed, not raised).
    """
    # First pass: count lines. The context manager guarantees the handle is
    # closed even when iteration fails part-way through.
    try:
        with open(file_path, 'r') as handle:
            total_rows = sum(1 for _ in handle) - 1  # -1 for header
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return None

    if total_rows <= 0:
        return None

    if total_rows <= n_samples:
        # File is small enough, read entirely
        try:
            return pd.read_csv(file_path, low_memory=False)
        except Exception as e:  # deliberate best-effort: one bad file must not abort the load
            print(f"Error reading {file_path}: {e}")
            return None

    # 1-based positions of the data rows to keep (position 0 is the header).
    # Only the kept positions are stored, so memory scales with n_samples
    # instead of with the size of the file.
    keep_rows = set((np.random.choice(total_rows, n_samples, replace=False) + 1).tolist())

    try:
        return pd.read_csv(
            file_path,
            skiprows=lambda row_idx: row_idx > 0 and row_idx not in keep_rows,
            low_memory=False,
        )
    except Exception as e:  # deliberate best-effort, see above
        print(f"Error sampling {file_path}: {e}")
        return None

def load_category_sample(category_name, category_folder, n_samples):
    """Load up to ``n_samples`` rows for one category, spread over its CSV files.

    Each file is asked for an equal share of the rows that are still missing,
    so a file that holds fewer rows than its share passes the shortfall on to
    the files visited after it. The concatenated result gets a ``Label``
    column set to ``category_name`` (replacing any column of that name).

    Args:
        category_name: Value written to the ``Label`` column.
        category_folder: Folder name below ``DATA_ROOT`` holding the CSV files.
        n_samples: Total number of rows wanted for the category.

    Returns:
        The sampled DataFrame, or None when the folder contains no CSV file or
        no file yielded any row.
    """
    category_path = os.path.join(DATA_ROOT, category_folder)
    # Sorted so the files are visited in the same order on every platform;
    # glob order is filesystem-dependent and would defeat the fixed seed.
    csv_files = sorted(get_csv_files_for_category(category_path))

    if not csv_files:
        print(f"No CSV files found in {category_path}")
        return None

    print(f"\n{category_name}: Found {len(csv_files)} CSV files")

    dfs = []
    total_sampled = 0

    for file_idx, csv_file in enumerate(csv_files):
        remaining = n_samples - total_sampled
        if remaining <= 0:
            break

        # Equal share of what is still missing, rounded up so the target can
        # be reached exactly; never more than `remaining`.
        files_left = len(csv_files) - file_idx
        to_sample = -(-remaining // files_left)

        df_sample = sample_from_csv(csv_file, to_sample)
        if df_sample is not None and len(df_sample) > 0:
            dfs.append(df_sample)
            total_sampled += len(df_sample)
            print(f"  - {os.path.basename(csv_file)}: {len(df_sample)} samples")

    if not dfs:
        return None

    result = pd.concat(dfs, ignore_index=True)
    result['Label'] = category_name

    # Release the per-file frames before the next category is loaded
    del dfs
    gc.collect()

    print(f"  Total samples for {category_name}: {len(result)}")
    return result

In [None]:
# Load stratified samples from each attack category
print("Loading stratified samples from each attack category...")
print("="*60)

all_samples = []

for category_name, category_folder in ATTACK_CATEGORIES.items():
    df_category = load_category_sample(category_name, category_folder, SAMPLES_PER_CATEGORY)
    # Categories whose folder is missing or empty come back as None and are skipped
    if df_category is not None:
        all_samples.append(df_category)
    gc.collect()

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing could be loaded at all.
if not all_samples:
    raise FileNotFoundError(
        f"No samples could be loaded from {DATA_ROOT!r}; "
        "check that the dataset was downloaded and that the category folders exist."
    )

# Combine all samples
df = pd.concat(all_samples, ignore_index=True)
del all_samples
gc.collect()

print("\n" + "="*60)
print(f"Total dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Size of the sampled frame and the names of its columns
n_rows, n_cols = df.shape
print("Dataset Info:")
print("=" * 50)
print(f"Number of samples: {n_rows}")
print(f"Number of features: {n_cols}")
print(f"\nColumn names:")
print(list(df.columns))

In [None]:
# Display first few rows
# (head() with no argument returns the first five rows; as the last expression
# of the cell it is rendered by the notebook)
df.head()

In [None]:
# Column dtypes, then a table restricted to the columns that contain missing values
print("Data Types:")
print(df.dtypes)

missing = df.isnull().sum()
missing_pct = 100 * missing / len(df)
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})

print("\nMissing Values:")
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])

In [None]:
# Statistical summary
# (describe() with default arguments; on a frame with mixed dtypes pandas
# summarises the numeric columns only)
df.describe()

## 3. Dataset Characterization

In [None]:
# Rows per class, counted on the column that holds the category name
label_col = 'Label'
label_counts = df[label_col].value_counts()

print(f"Label column: {label_col}")
print(f"\nLabel distribution:")
print(label_counts)

In [None]:
# Bar chart of the per-class row counts, with the count written above each bar
fig, ax = plt.subplots(figsize=(12, 6))
colors = plt.cm.Set3(np.linspace(0, 1, len(label_counts)))
bars = ax.bar(label_counts.index, label_counts.values, color=colors)
ax.set(
    xlabel='Attack Type',
    ylabel='Count',
    title='Distribution of Attack Types (Sampled Dataset)',
)
plt.xticks(rotation=45, ha='right')
for bar, count in zip(bars, label_counts.values):
    x_center = bar.get_x() + bar.get_width()/2
    y_text = bar.get_height() + 100  # fixed offset, in data units, above the bar top
    ax.text(x_center, y_text, f'{count}', ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.show()

In [None]:
# Donut chart of the class proportions (a pie with a white disc drawn over its centre)
fig, ax = plt.subplots(figsize=(10, 8))
colors = plt.cm.Set3(np.linspace(0, 1, len(label_counts)))
pie_options = dict(
    labels=label_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    pctdistance=0.85,
    explode=[0.02] * len(label_counts),
)
wedges, texts, autotexts = ax.pie(label_counts.values, **pie_options)
centre_circle = plt.Circle((0, 0), 0.50, fc='white')
ax.add_patch(centre_circle)
ax.set_title('Proportion of Attack Types')
plt.tight_layout()
plt.show()

In [None]:
# Binary target for anomaly detection: 1 for every non-Benign row, 0 for Benign
df['is_attack'] = df[label_col].ne('Benign').astype(int)

binary_counts = df['is_attack'].value_counts()
print("Binary Classification Distribution:")
print(binary_counts)
print(f"\nAttack ratio: {df['is_attack'].mean()*100:.2f}%")

## 4. Data Preprocessing

In [None]:
# Detach both targets from the frame before it is reduced to features
y_binary = df['is_attack'].copy()
y_multiclass = df[label_col].copy()

# Everything except the two target columns is treated as a feature
df_features = df.drop([label_col, 'is_attack'], axis=1)

print(f"Features shape: {df_features.shape}")

In [None]:
# Object-typed columns: drop identifier-like ones, integer-encode the rest
non_numeric_cols = df_features.select_dtypes(include=['object']).columns.tolist()
print(f"Non-numeric columns: {non_numeric_cols}")

# A column is dropped when its lower-cased name contains any of these substrings
identifier_markers = ['ip', 'address', 'id', 'time', 'stamp']
cols_to_drop = []
for col in non_numeric_cols:
    lowered = col.lower()
    if any(marker in lowered for marker in identifier_markers):
        cols_to_drop.append(col)
df_features = df_features.drop(columns=cols_to_drop, errors='ignore')

# The same encoder object is refit for every remaining object column
remaining_object_cols = df_features.select_dtypes(include=['object']).columns.tolist()
le = LabelEncoder()
for col in remaining_object_cols:
    as_text = df_features[col].astype(str)
    df_features[col] = le.fit_transform(as_text)

print(f"Features shape after encoding: {df_features.shape}")

In [None]:
# Treat +/-inf as missing so they are imputed together with the NaNs
df_features = df_features.replace([np.inf, -np.inf], np.nan)

# Impute each incomplete column with its median (0 when the median itself is NaN)
for col in df_features.columns:
    column = df_features[col]
    if not column.isnull().any():
        continue
    median_val = column.median()
    fill_value = median_val if pd.notna(median_val) else 0
    df_features[col] = column.fillna(fill_value)

# Columns with at most one distinct value are removed
is_constant = df_features.nunique() <= 1
constant_cols = df_features.columns[is_constant].tolist()
df_features = df_features.drop(columns=constant_cols)
print(f"Removed {len(constant_cols)} constant columns")
print(f"Final features shape: {df_features.shape}")

In [None]:
# Correlation heatmap limited to the first 30 feature columns (fewer if the frame is narrower)
plt.figure(figsize=(16, 14))
n_shown = min(30, len(df_features.columns))
sample_cols = df_features.columns[:n_shown]
corr_matrix = df_features[sample_cols].corr()
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, annot=False)
plt.title('Feature Correlation Matrix (First 30 Features)')
plt.tight_layout()
plt.show()

In [None]:
# Map the category names to integer codes; classes_ keeps the code -> name lookup
le_multiclass = LabelEncoder().fit(y_multiclass)
y_multiclass_encoded = le_multiclass.transform(y_multiclass)
class_names = le_multiclass.classes_
print(f"Classes: {class_names}")
print(f"Number of classes: {len(class_names)}")

In [None]:
# Feature matrix and the two targets (0/1 attack flag, integer class code)
X = df_features.values
y = y_binary.values  # Binary classification
y_multi = y_multiclass_encoded  # Multiclass classification

# Both splits hold out 20% with the same seed; each is stratified on its own
# target, so the two partitions are produced independently of each other.
split_options = {'test_size': 0.2, 'random_state': RANDOM_STATE}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X, y_multi, stratify=y_multi, **split_options
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Scale features
# One scaler per split: the binary and multiclass partitions come from separate
# train_test_split calls, so each fitted scaler stays paired with the arrays it
# produced (refitting a single object would leave `scaler` describing the
# multiclass split while X_*_scaled were produced from the binary one).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

scaler_multi = StandardScaler()
X_train_multi_scaled = scaler_multi.fit_transform(X_train_multi)
X_test_multi_scaled = scaler_multi.transform(X_test_multi)

print("Features scaled successfully!")

## 5. Unsupervised Learning - Anomaly Detection

We benchmark 3 complementary unsupervised algorithms:
1. **Isolation Forest** - Tree-based anomaly detection
2. **Local Outlier Factor (LOF)** - Density-based anomaly detection
3. **One-Class SVM** - Support vector-based anomaly detection

In [None]:
def evaluate_anomaly_detector(y_true, y_pred, model_name):
    """Score an anomaly detector against 0/1 ground truth, print and plot the result.

    Args:
        y_true: Ground-truth labels, 0 for benign and 1 for attack.
        y_pred: Detector output in the scikit-learn convention
            (-1 for an anomaly, 1 for an inlier).
        model_name: Name used in the printed report, the plot title and the
            returned record.

    Returns:
        A dict with the keys 'Model', 'Precision', 'Recall', 'F1-Score',
        'Balanced Accuracy' and 'MCC'.
    """
    # Convert predictions: -1 (anomaly) -> 1 (attack), 1 (normal) -> 0
    y_pred_binary = np.where(y_pred == -1, 1, 0)

    # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs in
    # y_true and y_pred; otherwise it would not match the two display labels.
    cm = confusion_matrix(y_true, y_pred_binary, labels=[0, 1])
    precision = precision_score(y_true, y_pred_binary, zero_division=0)
    recall = recall_score(y_true, y_pred_binary, zero_division=0)
    f1 = f1_score(y_true, y_pred_binary, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_true, y_pred_binary)
    mcc = matthews_corrcoef(y_true, y_pred_binary)

    print(f"\n{'='*50}")
    print(f"Model: {model_name}")
    print(f"{'='*50}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Matthews Correlation Coefficient: {mcc:.4f}")

    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(6, 5))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Benign', 'Attack'])
    disp.plot(ax=ax, cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.tight_layout()
    plt.show()

    return {
        'Model': model_name,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Balanced Accuracy': balanced_acc,
        'MCC': mcc
    }

In [None]:
# Share of training rows flagged as attack (y_train is the 0/1 is_attack target)
contamination_rate = np.mean(y_train)
print(f"Contamination rate (attack ratio): {contamination_rate:.4f}")

In [None]:
# 1. Isolation Forest
# Fit on benign traffic only. The sample holds seven attack categories for one
# benign category, so the full training split is expected to be dominated by
# attacks; a detector fitted on it would learn attacks as "normal" while the
# evaluation maps -1 (anomaly) to "attack".
print("Training Isolation Forest...")
X_train_benign = X_train_scaled[y_train == 0]
iso_forest = IsolationForest(
    n_estimators=100,
    contamination="auto",
    random_state=RANDOM_STATE,
    n_jobs=-1
)
iso_forest.fit(X_train_benign)
y_pred_iso = iso_forest.predict(X_test_scaled)
iso_results = evaluate_anomaly_detector(y_test, y_pred_iso, "Isolation Forest")

In [None]:
# 2. Local Outlier Factor
# novelty=True is the novelty-detection mode of LOF: the model is fitted on
# data assumed to be free of outliers and then scores unseen samples. It is
# therefore fitted on the benign training rows only, not on the mixed split.
print("Training Local Outlier Factor...")
X_train_benign = X_train_scaled[y_train == 0]
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination="auto",
    novelty=True,
    n_jobs=-1
)
lof.fit(X_train_benign)
y_pred_lof = lof.predict(X_test_scaled)
lof_results = evaluate_anomaly_detector(y_test, y_pred_lof, "Local Outlier Factor")

In [None]:
# 3. One-Class SVM (using a subsample due to computational complexity)
# Fitted on benign rows only: a one-class SVM learns the boundary of the data
# it is given, so fitting it on an attack-dominated split would make attacks
# the "normal" class.
print("Training One-Class SVM...")
X_train_benign = X_train_scaled[y_train == 0]

# Subsample for OCSVM (kernel SVM training cost grows roughly as O(n²))
OCSVM_MAX_SAMPLES = 10000
if len(X_train_benign) > OCSVM_MAX_SAMPLES:
    ocsvm_idx = np.random.choice(len(X_train_benign), OCSVM_MAX_SAMPLES, replace=False)
    X_train_ocsvm = X_train_benign[ocsvm_idx]
else:
    X_train_ocsvm = X_train_benign

# nu (valid range (0, 1]) is an upper bound on the fraction of training rows
# left outside the learned boundary. With benign-only training it is the
# tolerated false-alarm rate on training data; 0.05 is a tunable starting point.
OCSVM_NU = 0.05
ocsvm = OneClassSVM(
    kernel='rbf',
    gamma='scale',
    nu=OCSVM_NU
)
ocsvm.fit(X_train_ocsvm)
y_pred_ocsvm = ocsvm.predict(X_test_scaled)
ocsvm_results = evaluate_anomaly_detector(y_test, y_pred_ocsvm, "One-Class SVM")

In [None]:
# One row per detector, printed as a plain-text table
anomaly_results_df = pd.DataFrame([iso_results, lof_results, ocsvm_results])
banner = "="*70
print("\n" + banner)
print("ANOMALY DETECTION BENCHMARK SUMMARY")
print(banner)
print(anomaly_results_df.to_string(index=False))

In [None]:
# Visualize anomaly detection results
# Left panel: precision and recall per detector. Right panel: balanced accuracy and MCC.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = anomaly_results_df['Model'].tolist()
x = np.arange(len(models))
width = 0.35

# Precision & Recall
axes[0].bar(x - width/2, anomaly_results_df['Precision'], width, label='Precision', color='steelblue')
axes[0].bar(x + width/2, anomaly_results_df['Recall'], width, label='Recall', color='darkorange')
axes[0].set_xlabel('Model')
axes[0].set_ylabel('Score')
axes[0].set_title('Precision & Recall')
axes[0].set_xticks(x)
axes[0].set_xticklabels(models, rotation=15, ha='right')
axes[0].legend()
axes[0].set_ylim(0, 1)

# Balanced Accuracy & MCC
axes[1].bar(x - width/2, anomaly_results_df['Balanced Accuracy'], width, label='Balanced Accuracy', color='seagreen')
axes[1].bar(x + width/2, anomaly_results_df['MCC'], width, label='MCC', color='crimson')
axes[1].set_xlabel('Model')
axes[1].set_ylabel('Score')
axes[1].set_title('Balanced Accuracy & MCC')
axes[1].set_xticks(x)
axes[1].set_xticklabels(models, rotation=15, ha='right')
axes[1].legend()
# MCC ranges over [-1, 1]; extend the axis below zero when a detector scores
# negatively so its bar is not clipped out of view.
mcc_floor = min(0.0, float(anomaly_results_df['MCC'].min()) - 0.05)
axes[1].set_ylim(mcc_floor, 1)
if mcc_floor < 0:
    axes[1].axhline(0, color='black', linewidth=0.8)

plt.suptitle('Anomaly Detection Performance Comparison', fontsize=14)
plt.tight_layout()
plt.show()

## 6. Supervised Learning - Classification

We benchmark 3 complementary classification algorithms:
1. **Random Forest** - Ensemble tree-based classifier
2. **XGBoost** - Gradient boosting classifier
3. **LightGBM** - Light gradient boosting classifier

In [None]:
def evaluate_classifier(model, X_train, X_test, y_train, y_test, model_name, class_names=None):
    """Fit a classifier, then print, plot and return its test-set metrics.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba``
            is used for AUPRC when available.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        model_name: Name used in the report, the plot title and the result.
        class_names: Optional display names, one per class. When given, the
            labels are assumed to be the integer codes ``0..len(class_names)-1``.

    Returns:
        A tuple ``(metrics, model)``: ``metrics`` is a dict with the keys
        'Model', 'Precision', 'Recall', 'AUPRC', 'Balanced Accuracy' and
        'MCC'; ``model`` is the fitted estimator. 'AUPRC' is 0.0 when the
        model has no ``predict_proba`` or the score cannot be computed (a
        message is printed in the latter case).
    """
    from sklearn.preprocessing import label_binarize

    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Probabilities for AUPRC
    auprc = 0.0
    if hasattr(model, 'predict_proba'):
        y_prob = model.predict_proba(X_test)
        # predict_proba columns follow the fitted class order, so binarise
        # y_test against that order rather than against class_names (which
        # may be None).
        classes = getattr(model, 'classes_', None)
        if classes is None:
            classes = np.unique(y_train)
        try:
            if len(classes) == 2:
                # label_binarize yields a single column for two classes, which
                # would not line up with the two probability columns.
                auprc = average_precision_score(np.asarray(y_test) == classes[1], y_prob[:, 1])
            else:
                y_test_bin = label_binarize(y_test, classes=classes)
                auprc = average_precision_score(y_test_bin, y_prob, average='weighted')
        except ValueError as e:
            # Report the failure instead of silently showing a 0.0 score
            print(f"Could not compute AUPRC for {model_name}: {e}")
            auprc = 0.0

    # Metrics
    # Fix the label set when names are supplied so the matrix always has one
    # row and column per name, even if a class is absent from the test split.
    cm_labels = np.arange(len(class_names)) if class_names is not None else None
    cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    print(f"\n{'='*50}")
    print(f"Model: {model_name}")
    print(f"{'='*50}")
    print(f"Precision (weighted): {precision:.4f}")
    print(f"Recall (weighted): {recall:.4f}")
    print(f"AUPRC (weighted): {auprc:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Matthews Correlation Coefficient: {mcc:.4f}")

    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(ax=ax, cmap='Blues', xticks_rotation=45)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.tight_layout()
    plt.show()

    return {
        'Model': model_name,
        'Precision': precision,
        'Recall': recall,
        'AUPRC': auprc,
        'Balanced Accuracy': balanced_acc,
        'MCC': mcc
    }, model

In [None]:
# 1. Random Forest: 100 trees, depth capped at 20, class weights balanced
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight='balanced',
)
rf = RandomForestClassifier(**rf_params)
rf_results, rf_model = evaluate_classifier(
    rf,
    X_train_multi_scaled,
    X_test_multi_scaled,
    y_train_multi,
    y_test_multi,
    "Random Forest",
    class_names,
)

In [None]:
# 2. XGBoost: 100 boosting rounds, depth 10, multiclass log-loss as eval metric
xgb_params = dict(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    eval_metric='mlogloss',
    verbosity=0,
)
xgb = XGBClassifier(**xgb_params)
xgb_results, xgb_model = evaluate_classifier(
    xgb,
    X_train_multi_scaled,
    X_test_multi_scaled,
    y_train_multi,
    y_test_multi,
    "XGBoost",
    class_names,
)

In [None]:
# 3. LightGBM: 100 boosting rounds, depth 10, class weights balanced, logging silenced
lgbm_params = dict(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight='balanced',
    verbose=-1,
)
lgbm = LGBMClassifier(**lgbm_params)
lgbm_results, lgbm_model = evaluate_classifier(
    lgbm,
    X_train_multi_scaled,
    X_test_multi_scaled,
    y_train_multi,
    y_test_multi,
    "LightGBM",
    class_names,
)

In [None]:
# One row per classifier, printed as a plain-text table
classification_results_df = pd.DataFrame([rf_results, xgb_results, lgbm_results])
banner = "="*70
print("\n" + banner)
print("CLASSIFICATION BENCHMARK SUMMARY")
print(banner)
print(classification_results_df.to_string(index=False))

In [None]:
# Grouped bar charts of the classifier metrics: three metrics on the left
# panel, two on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = classification_results_df['Model'].tolist()
x = np.arange(len(models))
width = 0.25

# (column, colour, horizontal offset of the bar within its group)
left_bars = [
    ('Precision', 'steelblue', -width),
    ('Recall', 'darkorange', 0),
    ('AUPRC', 'forestgreen', width),
]
right_bars = [
    ('Balanced Accuracy', 'seagreen', -width/2),
    ('MCC', 'crimson', width/2),
]
panels = [
    (axes[0], left_bars, 'Precision, Recall & AUPRC'),
    (axes[1], right_bars, 'Balanced Accuracy & MCC'),
]

for axis, bar_specs, panel_title in panels:
    for column, colour, offset in bar_specs:
        axis.bar(x + offset, classification_results_df[column], width, label=column, color=colour)
    axis.set_xlabel('Model')
    axis.set_ylabel('Score')
    axis.set_title(panel_title)
    axis.set_xticks(x)
    axis.set_xticklabels(models, rotation=15, ha='right')
    axis.legend()
    axis.set_ylim(0, 1)

plt.suptitle('Classification Performance Comparison', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the 20 largest Random Forest feature importances
feature_names = df_features.columns.tolist()

importance_table = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_model.feature_importances_,
})
# Ascending sort followed by tail(20) keeps the 20 largest values, smallest first,
# which is the order in which barh draws them from bottom to top
rf_importance = importance_table.sort_values('Importance', ascending=True).tail(20)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(rf_importance['Feature'], rf_importance['Importance'], color='steelblue')
ax.set(
    xlabel='Importance',
    ylabel='Feature',
    title='Top 20 Feature Importances (Random Forest)',
)
plt.tight_layout()
plt.show()

## 7. Adversarial Attacks (Bonus - Objective 2)

We implement adversarial attacks against a neural-network classifier trained on the same features (the gradient-based attacks need a differentiable model):
1. **FGSM (Fast Gradient Sign Method)** - White-box attack
2. **PGD (Projected Gradient Descent)** - Iterative white-box attack
3. **Noise-based perturbation** - Simple black-box attack

In [None]:
# Import TensorFlow for neural network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Feed-forward network used as the target of the adversarial attacks
def build_nn_model(input_shape, num_classes):
    """Build and compile a dense softmax classifier.

    Args:
        input_shape: Number of input features (an int, not a tuple).
        num_classes: Number of units of the softmax output layer.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    layer_stack = [
        layers.Input(shape=(input_shape,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]
    model = keras.Sequential(layer_stack)
    compile_options = {
        'optimizer': 'adam',
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    model.compile(**compile_options)
    return model

print("Neural network model builder ready.")

In [None]:
# Build the network for the multiclass task and fit it on the scaled training split
num_classes = len(class_names)
input_shape = X_train_multi_scaled.shape[1]

nn_model = build_nn_model(input_shape, num_classes)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

fit_options = dict(
    epochs=50,
    batch_size=256,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

print("Training Neural Network...")
history = nn_model.fit(X_train_multi_scaled, y_train_multi, **fit_options)

In [None]:
# Clean (unperturbed) accuracy of the network on the full multiclass test split
baseline_loss, baseline_acc = nn_model.evaluate(
    X_test_multi_scaled,
    y_test_multi,
    verbose=0,
)
print(f"Baseline Neural Network Accuracy: {baseline_acc:.4f}")

In [None]:
# FGSM: a single step of size epsilon along the sign of the input gradient
def fgsm_attack(model, x, y, epsilon=0.1):
    """Return FGSM adversarial examples for ``x``.

    Args:
        model: Callable model returning class probabilities for a float32 batch.
        x: Input samples, converted to a float32 tensor.
        y: Integer class labels, converted to an int32 tensor.
        epsilon: Step size applied to every feature.

    Returns:
        NumPy array ``x + epsilon * sign(d loss / d x)``.
    """
    inputs = tf.convert_to_tensor(x, dtype=tf.float32)
    labels = tf.convert_to_tensor(y, dtype=tf.int32)

    with tf.GradientTape() as tape:
        # Inputs are plain tensors, not variables, so they must be watched explicitly
        tape.watch(inputs)
        probabilities = model(inputs)
        loss = keras.losses.sparse_categorical_crossentropy(labels, probabilities)

    step_direction = tf.sign(tape.gradient(loss, inputs))
    adversarial = inputs + epsilon * step_direction
    return adversarial.numpy()

# PGD: repeated signed-gradient steps, each followed by a projection onto the epsilon box
def pgd_attack(model, x, y, epsilon=0.1, alpha=0.01, num_iter=10):
    """Return PGD adversarial examples for ``x``.

    Args:
        model: Callable model returning class probabilities for a float32 batch.
        x: Input samples, converted to a float32 tensor.
        y: Integer class labels, converted to an int32 tensor.
        epsilon: Maximum absolute change allowed per feature.
        alpha: Size of each signed-gradient step.
        num_iter: Number of steps.

    Returns:
        NumPy array of perturbed samples, each within ``epsilon`` of ``x``
        in every feature.
    """
    clean = tf.convert_to_tensor(x, dtype=tf.float32)
    labels = tf.convert_to_tensor(y, dtype=tf.int32)
    adversarial = tf.identity(clean)

    for _ in range(num_iter):
        with tf.GradientTape() as tape:
            tape.watch(adversarial)
            probabilities = model(adversarial)
            loss = keras.losses.sparse_categorical_crossentropy(labels, probabilities)

        stepped = adversarial + alpha * tf.sign(tape.gradient(loss, adversarial))
        # Projection: clip the accumulated perturbation back into [-epsilon, epsilon]
        delta = tf.clip_by_value(stepped - clean, -epsilon, epsilon)
        adversarial = clean + delta

    return adversarial.numpy()

# Random-noise baseline (needs no access to the model)
def noise_attack(x, epsilon=0.1):
    """Return ``x`` plus uniform noise drawn from ``[-epsilon, epsilon)``.

    The noise has the shape of ``x`` and comes from NumPy's global RNG.
    """
    perturbation = np.random.uniform(low=-epsilon, high=epsilon, size=x.shape)
    return x + perturbation

print("Attack functions defined.")

In [None]:
# Evaluate adversarial attacks with different epsilon values
epsilons = [0.01, 0.05, 0.1, 0.2, 0.3]
adversarial_results = []

# Use a subset for faster evaluation
ADV_SAMPLES = min(2000, len(X_test_multi_scaled))
X_adv_test = X_test_multi_scaled[:ADV_SAMPLES]
y_adv_test = y_test_multi[:ADV_SAMPLES]

print("Evaluating adversarial attacks...")
print(f"Using {ADV_SAMPLES} test samples")

# Clean accuracy measured on the same rows that are attacked, so the reported
# drops compare like with like (baseline_acc refers to the full test split).
_, baseline_subset_acc = nn_model.evaluate(X_adv_test, y_adv_test, verbose=0)
print(f"Clean accuracy on this subset: {baseline_subset_acc:.4f}")

# PGD step size is tied to epsilon: with a fixed step the total reachable
# perturbation is alpha * num_iter, which would cap PGD below the larger
# epsilon values. 2.5 * eps / num_iter lets the iterate reach the boundary.
PGD_ITERATIONS = 10

for eps in epsilons:
    print(f"\nEpsilon = {eps}")

    # FGSM
    X_fgsm = fgsm_attack(nn_model, X_adv_test, y_adv_test, epsilon=eps)
    _, fgsm_acc = nn_model.evaluate(X_fgsm, y_adv_test, verbose=0)

    # PGD
    X_pgd = pgd_attack(
        nn_model, X_adv_test, y_adv_test,
        epsilon=eps, alpha=2.5 * eps / PGD_ITERATIONS, num_iter=PGD_ITERATIONS
    )
    _, pgd_acc = nn_model.evaluate(X_pgd, y_adv_test, verbose=0)

    # Noise
    X_noise = noise_attack(X_adv_test, epsilon=eps)
    _, noise_acc = nn_model.evaluate(X_noise, y_adv_test, verbose=0)

    adversarial_results.append({
        'Epsilon': eps,
        'Baseline': baseline_subset_acc,
        'FGSM': fgsm_acc,
        'PGD': pgd_acc,
        'Noise': noise_acc
    })

    print(f"  FGSM Accuracy: {fgsm_acc:.4f} (drop: {(baseline_subset_acc - fgsm_acc)*100:.2f}%)")
    print(f"  PGD Accuracy: {pgd_acc:.4f} (drop: {(baseline_subset_acc - pgd_acc)*100:.2f}%)")
    print(f"  Noise Accuracy: {noise_acc:.4f} (drop: {(baseline_subset_acc - noise_acc)*100:.2f}%)")

In [None]:
# Accuracy as a function of epsilon, one line per attack plus the baseline
adv_df = pd.DataFrame(adversarial_results)

fig, ax = plt.subplots(figsize=(10, 6))

# (column, line format, marker, legend label)
line_specs = [
    ('Baseline', 'k--', 'o', 'Baseline'),
    ('FGSM', 'r-', 's', 'FGSM Attack'),
    ('PGD', 'b-', '^', 'PGD Attack'),
    ('Noise', 'g-', 'd', 'Noise Attack'),
]
for column, line_format, marker, legend_label in line_specs:
    ax.plot(adv_df['Epsilon'], adv_df[column], line_format, marker=marker, label=legend_label, linewidth=2)

ax.set(
    xlabel='Epsilon (Perturbation Strength)',
    ylabel='Accuracy',
    title='Model Accuracy Under Adversarial Attacks',
)
ax.set_ylim(0, 1)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Plain-text table with one row per epsilon value
banner = "="*70
print("\n" + banner)
print("ADVERSARIAL ATTACKS SUMMARY")
print(banner)
print(adv_df.to_string(index=False))

## 8. Conclusions and Security Analysis

In [None]:
# Final Summary
# Sections 1-4 are derived from the objects computed earlier in the notebook;
# section 5 is a fixed list of recommendations.
print("="*70)
print("CYBERML PROJECT - FINAL SUMMARY")
print("="*70)

print("\n1. DATASET CHARACTERIZATION")
print(f"   - Sampled dataset size: {len(df)}")
print(f"   - Number of features: {len(df_features.columns)}")
print(f"   - Number of attack classes: {len(class_names)}")
print(f"   - Classes: {list(class_names)}")
print(f"   - Attack ratio in sample: {y_binary.mean()*100:.2f}%")

print("\n2. ANOMALY DETECTION RESULTS")
# The best model is the row with the highest MCC
best_anomaly = anomaly_results_df.loc[anomaly_results_df['MCC'].idxmax()]
print(f"   Best Model: {best_anomaly['Model']}")
print(f"   - MCC: {best_anomaly['MCC']:.4f}")
print(f"   - Balanced Accuracy: {best_anomaly['Balanced Accuracy']:.4f}")
print(f"   - Precision: {best_anomaly['Precision']:.4f}")
print(f"   - Recall: {best_anomaly['Recall']:.4f}")

print("\n3. CLASSIFICATION RESULTS")
best_classifier = classification_results_df.loc[classification_results_df['MCC'].idxmax()]
print(f"   Best Model: {best_classifier['Model']}")
print(f"   - MCC: {best_classifier['MCC']:.4f}")
print(f"   - Balanced Accuracy: {best_classifier['Balanced Accuracy']:.4f}")
print(f"   - AUPRC: {best_classifier['AUPRC']:.4f}")

print("\n4. ADVERSARIAL ATTACKS ANALYSIS")
print(f"   - Baseline accuracy: {baseline_acc:.4f}")
# Report what was measured rather than fixed conclusions, so the summary
# cannot contradict the table it summarises.
attack_columns = ['FGSM', 'PGD', 'Noise']
mean_attack_acc = adv_df[attack_columns].mean()
for attack_name in attack_columns:
    print(f"   - Mean accuracy under {attack_name}: {mean_attack_acc[attack_name]:.4f}")
print(f"   - Most damaging attack on average: {mean_attack_acc.idxmin()}")
largest_eps_row = adv_df.loc[adv_df['Epsilon'].idxmax()]
largest_eps_scores = ", ".join(
    f"{attack_name} {largest_eps_row[attack_name]:.4f}" for attack_name in attack_columns
)
print(f"   - Accuracy at epsilon = {largest_eps_row['Epsilon']}: {largest_eps_scores}")

print("\n5. SECURITY RECOMMENDATIONS")
print("   - Implement adversarial training for improved robustness")
print("   - Use ensemble methods combining multiple detection approaches")
print("   - Regular model retraining with new attack patterns")
print("   - Deploy anomaly detection as first defense layer")
print("   - Consider input validation and feature monitoring")

In [None]:
# Write each result table to a CSV file in the working directory (no index column)
result_files = {
    'anomaly_detection_results.csv': anomaly_results_df,
    'classification_results.csv': classification_results_df,
    'adversarial_attack_results.csv': adv_df,
}
for csv_name, frame in result_files.items():
    frame.to_csv(csv_name, index=False)

print("Results saved to CSV files:")
for csv_name in result_files:
    print(f"  - {csv_name}")