# Group Assignment: Comparative Analysis
## Network Intrusion Detection - Performance Comparison of Different Classifiers

**Module:** CT115-3-M Data Analytics in Cyber Security  
**Dataset:** NSL-KDD (Boosted Train + Preprocessed Test)  

---

### Classifiers Compared:
1. **Linear:** Linear Discriminant Analysis (LDA)
2. **Ensemble (Bagging):** Random Forest
3. **Non-Linear:** K-Nearest Neighbors (KNN)
4. **Ensemble (Boosting):** Gradient Boosting

---
## 1. Import Libraries and Load Results

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import warnings
# Install a blanket 'ignore' filter so warnings raised after this point are
# not shown in the cell outputs.
warnings.filterwarnings('ignore')

# Set style
# The matplotlib style sheet is applied before the seaborn palette is set;
# presumably this order keeps the style sheet from overriding the palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need
# matplotlib >= 3.6 - confirm against the target environment.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

In [None]:
# Load results from individual notebooks
results_path = 'results/'

# Display name of each classifier -> JSON file written by its notebook
results_files = {
    'Linear (LDA)': 'linear_lda_results.json',
    'Ensemble - RF': 'ensemble_rf_results.json',
    'Non-Linear (KNN)': 'nonlinear_knn_results.json',
    'Ensemble - GB': 'ensemble_gb_results.json'
}


def load_results(directory, files):
    """Read every available JSON results file found in `directory`.

    Args:
        directory: Folder that holds the results files.
        files: Mapping of display name -> file name inside `directory`.

    Returns:
        dict mapping display name -> parsed JSON content, containing only
        the files that exist. Missing files are reported on stdout and
        skipped.
    """
    loaded = {}
    for name, filename in files.items():
        filepath = os.path.join(directory, filename)
        try:
            # Explicit UTF-8: without it the platform locale codec is used,
            # which can mis-decode non-ASCII JSON text on some systems.
            with open(filepath, 'r', encoding='utf-8') as f:
                loaded[name] = json.load(f)
        except FileNotFoundError:
            print(f"File not found: {filename} - Run individual notebooks first!")
        else:
            print(f"Loaded: {name}")
    return loaded


all_results = load_results(results_path, results_files)

---
## 2. Overview of Selected Algorithms

In [None]:
# Create algorithm overview table
# Reference notes for each classifier in the comparison. Only the name,
# category and characteristics end up in the table; strengths and
# weaknesses are kept here for reference.
algorithm_descriptions = {
    'Linear (LDA)': {
        'Full Name': 'Linear Discriminant Analysis',
        'Category': 'Linear',
        'Characteristics': 'Assumes linear decision boundaries, projects data to maximize class separability',
        'Strengths': 'Fast training, interpretable, works well with normally distributed data',
        'Weaknesses': 'Assumes linear separability, sensitive to outliers',
    },
    'Ensemble - RF': {
        'Full Name': 'Random Forest',
        'Category': 'Ensemble (Bagging)',
        'Characteristics': 'Builds multiple decision trees and aggregates predictions via voting',
        'Strengths': 'Robust to overfitting, handles high dimensionality, feature importance',
        'Weaknesses': 'Can be slow with large datasets, less interpretable than single trees',
    },
    'Non-Linear (KNN)': {
        'Full Name': 'K-Nearest Neighbors',
        'Category': 'Non-Linear (Instance-based)',
        'Characteristics': 'Classifies based on majority vote of k nearest neighbors in feature space',
        'Strengths': 'Simple, no training phase, naturally handles multi-class',
        'Weaknesses': 'Slow prediction, sensitive to irrelevant features and scaling',
    },
    'Ensemble - GB': {
        'Full Name': 'Gradient Boosting',
        'Category': 'Ensemble (Boosting)',
        'Characteristics': 'Sequentially builds weak learners, each correcting predecessor errors',
        'Strengths': 'High accuracy, handles non-linear relationships, feature importance',
        'Weaknesses': 'Prone to overfitting, slower training, sensitive to hyperparameters',
    },
}

# One table row per algorithm, in the order the descriptions are declared
overview_data = [
    {
        'Algorithm': algo,
        'Full Name': info['Full Name'],
        'Category': info['Category'],
        'Characteristics': info['Characteristics'],
    }
    for algo, info in algorithm_descriptions.items()
]
overview_df = pd.DataFrame(overview_data)

rule = "=" * 80
print("\n" + rule)
print("OVERVIEW OF SELECTED ALGORITHMS")
print(rule)
for entry in overview_df.to_dict('records'):
    print(f"\n{entry['Algorithm']} ({entry['Full Name']})")
    print(f"  Category: {entry['Category']}")
    print(f"  Characteristics: {entry['Characteristics']}")

---
## 3. Performance Metrics Comparison

In [None]:
# Create comparison dataframes
def _metrics_row(classifier, scores, auc):
    """Flatten one classifier's metric dict plus its AUC into a table row."""
    return {
        'Classifier': classifier,
        'Accuracy': scores['accuracy'],
        'Precision': scores['precision'],
        'Recall': scores['recall'],
        'F1-Score': scores['f1'],
        'MCC': scores['mcc'],
        'AUC': auc,
    }


# Baseline and optimised rows are built in the same classifier order, so
# the two frames line up row by row.
baseline_data = [
    _metrics_row(name, result['baseline_metrics'], result['baseline_auc'])
    for name, result in all_results.items()
]
optimised_data = [
    _metrics_row(name, result['optimised_metrics'], result['optimised_auc'])
    for name, result in all_results.items()
]

baseline_df = pd.DataFrame(baseline_data)
optimised_df = pd.DataFrame(optimised_data)

In [None]:
# Banner heading followed by the baseline metrics, rounded to 4 decimals
baseline_rule = "=" * 80
print(f"\n{baseline_rule}")
print("BASELINE MODELS PERFORMANCE")
print(baseline_rule)
baseline_table = baseline_df.round(4).to_string(index=False)
print(baseline_table)

In [None]:
# Banner heading followed by the optimised metrics, rounded to 4 decimals
optimised_rule = "=" * 80
print(f"\n{optimised_rule}")
print("OPTIMISED MODELS PERFORMANCE")
print(optimised_rule)
optimised_table = optimised_df.round(4).to_string(index=False)
print(optimised_table)

In [None]:
# Calculate improvement
# Relative change of every metric after optimisation, as a percentage of
# the baseline value. Rows of baseline_df and optimised_df are matched by
# their index.
improvement_metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'MCC', 'AUC']

improvement_df = pd.DataFrame({'Classifier': baseline_df['Classifier']})
for metric in improvement_metrics:
    change = optimised_df[metric] - baseline_df[metric]
    # Divide by the magnitude of the baseline: MCC can be negative, and a
    # signed denominator would report a real improvement as a negative %.
    reference = baseline_df[metric].abs()
    # A zero baseline has no meaningful relative change; show NaN, not inf.
    reference = reference.where(reference != 0)
    improvement_df[metric] = change / reference * 100

print("\n" + "="*80)
print("IMPROVEMENT AFTER OPTIMISATION (%)")
print("="*80)
print(improvement_df.round(2).to_string(index=False))

---
## 4. Visual Comparison

In [None]:
# Bar chart comparison of optimised models
# `metrics` and `colors` are also used by the heatmap, radar and ranking cells.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'MCC', 'AUC']
colors = ['#3498db', '#27ae60', '#9b59b6', '#e74c3c']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One subplot per metric, one bar per classifier
for ax, metric in zip(axes, metrics):
    classifiers = optimised_df['Classifier'].values
    values = optimised_df[metric].values

    bars = ax.bar(classifiers, values, color=colors)
    ax.set_title(metric, fontsize=14, fontweight='bold')
    ax.set_ylim(0, 1.1)
    ax.set_ylabel('Score')
    ax.tick_params(axis='x', rotation=45)

    # Write the score just above the top of each bar
    for bar, val in zip(bars, values):
        bar_centre = bar.get_x() + bar.get_width()/2
        ax.annotate(
            f'{val:.3f}',
            xy=(bar_centre, val),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            fontsize=9,
        )

plt.suptitle('Optimised Models Performance Comparison', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of optimised metrics
# Rows are classifiers, columns are metrics; the colour scale is pinned to
# the 0.5 - 1.0 range.
plt.figure(figsize=(12, 6))
heatmap_df = optimised_df.set_index('Classifier')[metrics]
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    linewidths=0.5,
    vmin=0.5,
    vmax=1.0,
)
sns.heatmap(heatmap_df, **heatmap_options)
plt.title('Performance Metrics Heatmap - Optimised Models', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Radar chart comparison
from math import pi

categories = metrics
N = len(categories)

# One spoke per metric, evenly spaced around the circle; the first angle is
# repeated at the end so each outline closes on itself.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles.extend(angles[:1])

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

for idx, row in optimised_df.iterrows():
    # Scores in spoke order, closed the same way as the angles
    values = row[metrics].values.flatten().tolist()
    values.extend(values[:1])
    line_colour = colors[idx]
    ax.plot(angles, values, 'o-', linewidth=2, label=row['Classifier'], color=line_colour)
    ax.fill(angles, values, alpha=0.1, color=line_colour)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, size=12)
ax.set_ylim(0, 1)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
plt.title('Radar Chart - Optimised Models Performance', fontsize=14, fontweight='bold', y=1.08)
plt.tight_layout()
plt.show()

In [None]:
# Baseline vs Optimised comparison
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

classifiers = baseline_df['Classifier'].values
x = np.arange(len(classifiers))
width = 0.35

metrics_to_plot = ['Accuracy', 'F1-Score', 'MCC', 'AUC']

# axes.flat walks the 2x2 grid row by row, one subplot per metric
for ax, metric in zip(axes.flat, metrics_to_plot):
    # Paired bars: baseline to the left of each tick, optimised to the right
    bars1 = ax.bar(x - width/2, baseline_df[metric], width, label='Baseline', color='#95a5a6')
    bars2 = ax.bar(x + width/2, optimised_df[metric], width, label='Optimised', color='#2ecc71')

    ax.set_ylabel('Score')
    ax.set_title(f'{metric} - Baseline vs Optimised', fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(classifiers, rotation=45, ha='right')
    ax.legend()
    ax.set_ylim(0, 1.1)

    # Label every bar with its height, baseline bars first
    for bar in [*bars1, *bars2]:
        height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width()/2
        ax.annotate(f'{height:.3f}', xy=(bar_centre, height),
                    xytext=(0, 3), textcoords="offset points", ha='center', fontsize=8)

plt.suptitle('Baseline vs Optimised Performance', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---
## 5. Optimisation Strategies Summary

In [None]:
# Per-classifier list of optimisation strategies and feature-count reduction
section_rule = "=" * 80
print("\n" + section_rule)
print("OPTIMISATION STRATEGIES APPLIED BY EACH CLASSIFIER")
print(section_rule)

for name, result in all_results.items():
    print(f"\n{name}:")
    print(f"  Strategies: {', '.join(result['optimisation_strategies'])}")
    n_before = result['n_features_original']
    n_after = result['n_features_selected']
    print(f"  Features: {n_before} -> {n_after}")
    # Share of the original features that was dropped, in percent
    reduction_pct = (n_before - n_after) / n_before * 100
    print(f"  Feature reduction: {reduction_pct:.1f}%")

---
## 6. Ranking and Best Model Selection

In [None]:
# Rank classifiers by each metric
# Higher scores are better, so ranks are assigned in descending order.
ranking_df = optimised_df.set_index('Classifier')

rankings = {}
for metric in metrics:
    # method='min': tied classifiers share the best rank. The default
    # 'average' method yields fractional ranks (e.g. 1.5) that the integer
    # cast would silently truncate.
    # na_option='bottom': a missing score is ranked last instead of
    # producing NaN, which cannot be cast to int.
    rankings[metric] = (
        ranking_df[metric]
        .rank(ascending=False, method='min', na_option='bottom')
        .astype(int)
    )

ranking_result = pd.DataFrame(rankings)
# Lower average rank across all metrics = better overall classifier
ranking_result['Average Rank'] = ranking_result.mean(axis=1)
ranking_result = ranking_result.sort_values('Average Rank')

print("\n" + "="*80)
print("CLASSIFIER RANKINGS (1 = Best)")
print("="*80)
print(ranking_result.to_string())

In [None]:
# Find best performing classifier
# The winner is the classifier with the lowest average rank; its optimised
# metrics are then looked up by name.
best_classifier = ranking_result['Average Rank'].idxmin()
is_best = optimised_df['Classifier'] == best_classifier
best_metrics = optimised_df[is_best].iloc[0]

winner_rule = "=" * 80
print("\n" + winner_rule)
print(f"BEST PERFORMING CLASSIFIER: {best_classifier}")
print(winner_rule)

winner_lines = [
    f"  Accuracy:  {best_metrics['Accuracy']:.4f}",
    f"  Precision: {best_metrics['Precision']:.4f}",
    f"  Recall:    {best_metrics['Recall']:.4f}",
    f"  F1-Score:  {best_metrics['F1-Score']:.4f}",
    f"  MCC:       {best_metrics['MCC']:.4f}",
    f"  AUC:       {best_metrics['AUC']:.4f}",
]
print("\n".join(winner_lines))

---
## 7. Discussion and Findings

In [None]:
# Fixed discussion text for the report; none of it is computed from the
# loaded results.
findings_rule = "=" * 80
print("\n" + findings_rule)
print("KEY FINDINGS AND DISCUSSION")
print(findings_rule)

findings = (
    "\n1. ALGORITHM PERFORMANCE COMPARISON:",
    "   - Ensemble methods (Random Forest, Gradient Boosting) generally outperform",
    "     single classifiers due to their ability to combine multiple weak learners",
    "   - Linear classifiers (LDA) show competitive performance with faster training",
    "   - KNN provides good baseline but is computationally expensive for prediction",
    "\n2. OPTIMISATION IMPACT:",
    "   - Hyperparameter tuning consistently improved all classifiers",
    "   - Feature selection reduced model complexity without significant performance loss",
    "   - Class weighting helped address the slight class imbalance in the dataset",
    "\n3. METRIC ANALYSIS:",
    "   - For intrusion detection, Recall (detecting attacks) is crucial",
    "   - MCC provides balanced evaluation considering both classes",
    "   - AUC shows the trade-off between true positive and false positive rates",
    "\n4. RECOMMENDATIONS FOR NETWORK INTRUSION DETECTION:",
    "   - Use ensemble methods for production systems requiring high accuracy",
    "   - Consider LDA for real-time detection where speed is critical",
    "   - Feature selection can significantly reduce computational requirements",
    "   - Regular model retraining is recommended as attack patterns evolve",
)
for finding in findings:
    print(finding)

---
## 8. Summary Table

In [None]:
# Create comprehensive summary table
# One row per classifier: category, F1 before/after with relative change,
# optimised AUC, and how many features survived selection.
summary_data = []

for name, result in all_results.items():
    category = result['category']
    f1_before = result['baseline_metrics']['f1']
    f1_after = result['optimised_metrics']['f1']
    auc_after = result['optimised_auc']
    kept_features = result['n_features_selected']
    total_features = result['n_features_original']

    summary_data.append({
        'Classifier': name,
        'Category': category,
        'Baseline F1': f1_before,
        'Optimised F1': f1_after,
        'F1 Improvement': (f1_after - f1_before) / f1_before * 100,
        'Optimised AUC': auc_after,
        'Features Used': kept_features,
        'Feature Reduction %': (total_features - kept_features) / total_features * 100,
    })

summary_df = pd.DataFrame(summary_data)

summary_rule = "=" * 100
print("\n" + summary_rule)
print("COMPREHENSIVE SUMMARY")
print(summary_rule)
print(summary_df.round(2).to_string(index=False))

In [None]:
# Save summary to CSV for report
# Write next to the input JSON files by reusing results_path rather than
# repeating the directory name, and make sure the folder exists first.
os.makedirs(results_path, exist_ok=True)

# Output file name -> table to export (all rounded to 4 decimals)
csv_outputs = {
    'group_summary.csv': summary_df,
    'baseline_comparison.csv': baseline_df,
    'optimised_comparison.csv': optimised_df,
}
for csv_name, frame in csv_outputs.items():
    frame.round(4).to_csv(os.path.join(results_path, csv_name), index=False)

print("\nSummary tables saved to results folder:")
for csv_name in csv_outputs:
    print(f"  - {csv_name}")