# Individual Assignment: Ensemble Classifier (Bagging)
## Network Intrusion Detection using Random Forest

**Classifier Category:** Ensemble (Bagging)  
**Algorithm:** Random Forest Classifier  
**Dataset:** NSL-KDD (Boosted Train + Preprocessed Test)

---
## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import warnings
warnings.filterwarnings('ignore')

import os
# Relative directory that holds the dataset CSV files read by the loading cell.
data_path = '../datasets'

In [None]:
# Make the project-local helper module importable, then pull in its helpers.
import sys

_lib_root = '../..'
if _lib_root not in sys.path:
    # Prepend so this location is searched first when resolving `mylib`.
    sys.path.insert(0, _lib_root)

from mylib import show_labels_dist, show_metrics, bias_var_metrics

---
## 2. Load Dataset

In [None]:
# Read the boosted training set and the preprocessed test set from data_path,
# reporting the shape of each frame right after it is loaded.
_shape_msg = '{} Dataset: {} rows, {} columns'

data_file = os.path.join(data_path, 'NSL_boosted-2.csv')
train_df = pd.read_csv(data_file)
print(_shape_msg.format('Train', *train_df.shape))

data_file = os.path.join(data_path, 'NSL_ppTest.csv')
test_df = pd.read_csv(data_file)
print(_shape_msg.format('Test', *test_df.shape))

In [None]:
# Preview the first rows of the training frame as the cell's output.
train_df.head()

---
## 3. Data Preparation

In [None]:
# Report the total number of missing cells (summed over all columns) per dataset.
for _set_name, _frame in (('Train', train_df), ('Test', test_df)):
    _missing_total = _frame.isna().sum().sum()
    print(f'Missing Values - {_set_name} Set:', _missing_total)

In [None]:
# Stack the training rows on top of the test rows so both sets go through the
# same preprocessing. Row labels are kept as-is (no re-indexing here).
combined_df = pd.concat([train_df, test_df])
_n_rows, _n_cols = combined_df.shape
print(f'Combined Dataset: {_n_rows} rows, {_n_cols} columns')

In [None]:
# Print the value counts of both target columns over the combined data.
for _heading, _column in (
    ("Label distribution:", 'label'),
    ("\nAttack category distribution:", 'atakcat'),
):
    print(_heading)
    print(combined_df[_column].value_counts())

In [None]:
# Choose the prediction target.
#   twoclass = True  -> binary target: 'normal' vs 'attack'
#   twoclass = False -> multi-class target taken from the 'atakcat' column
twoclass = True

if not twoclass:
    # Use the attack category as the target, exposed as a Series named 'label'.
    labels_df = combined_df['atakcat'].copy().rename('label')
else:
    # Keep 'normal' untouched and collapse every other label into 'attack'.
    labels_df = combined_df['label'].copy()
    labels_df = labels_df.where(labels_df == 'normal', 'attack')

# Remove both target columns so combined_df holds predictors only.
for _target_col in ('label', 'atakcat'):
    combined_df.drop(columns=_target_col, inplace=True)

In [None]:
# One-hot encode every object-typed column; all other columns pass through.
categori = combined_df.select_dtypes(include=['object']).columns
category_cols = list(categori)
features_df = pd.get_dummies(combined_df, columns=category_cols)
print(f'Features after encoding: {len(features_df.columns)} columns')

In [None]:
# Get numeric columns for scaling.
# The names are read from combined_df (the frame before one-hot encoding), so
# the dummy columns that get_dummies added to features_df are not included.
numeri = combined_df.select_dtypes(include=['float64','int64']).columns

In [None]:
# Undo the earlier concatenation: the first len(train_df) rows are the training
# set and the remainder is the test set. The split is positional, and each
# piece gets a fresh 0..n-1 index.
_n_train = len(train_df)

X_train = features_df.iloc[:_n_train, :].reset_index(drop=True)
X_test = features_df.iloc[_n_train:, :].reset_index(drop=True)

y_train = labels_df.iloc[:_n_train].reset_index(drop=True)
y_test = labels_df.iloc[_n_train:].reset_index(drop=True)

for _tag, _X, _y in (('train', X_train, y_train), ('test', X_test, y_test)):
    print(f"X_{_tag}: {_X.shape}, y_{_tag}: {_y.shape}")

In [None]:
# Min-max scale each numeric column. The scaler for a column is fitted on the
# training values only and then reused, unchanged, for the test values.
from sklearn.preprocessing import MinMaxScaler

for _col in numeri:
    _train_values = X_train[_col].to_numpy().reshape(-1, 1)
    _test_values = X_test[_col].to_numpy().reshape(-1, 1)

    _col_scaler = MinMaxScaler()
    X_train[_col] = _col_scaler.fit_transform(_train_values)
    X_test[_col] = _col_scaler.transform(_test_values)

print("Scaling completed using MinMaxScaler")

In [None]:
# Keep independent copies of the prepared data before any later cell derives
# reduced or modified versions from it.
X_train_original, X_test_original = X_train.copy(), X_test.copy()
y_train_original = y_train.copy()

---
## 4. BASELINE MODEL: Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with library defaults; only the seed is fixed.
baseline_model = RandomForestClassifier(random_state=42)
print("Baseline Model:", baseline_model)

# List every parameter of the estimator together with its current value.
print("\nDefault Parameters:")
for _param_name, _param_value in baseline_model.get_params().items():
    print(f"  {_param_name}: {_param_value}")

In [None]:
# Show label distribution
# NOTE(review): show_labels_dist is imported from mylib; presumably it reports
# the class counts of the train and test labels - confirm against mylib.
show_labels_dist(X_train, X_test, y_train, y_test)

In [None]:
# Train and evaluate the baseline model.
# Fitting and prediction are timed separately so that the figure reported as
# "Training Time" covers model fitting only; previously it also included the
# time spent predicting on the test set.
_banner = "=" * 60
print(_banner)
print("BASELINE MODEL EVALUATION")
print(_banner)

trs = time()
baseline_model.fit(X_train, y_train)
tre = time() - trs  # fit only

_predict_start = time()
y_pred_baseline = baseline_model.predict(X_test)
_predict_elapsed = time() - _predict_start

print(f"Training Time: {tre:.2f} seconds")
print(f"Prediction Time: {_predict_elapsed:.2f} seconds\n")
show_metrics(y_test, y_pred_baseline, baseline_model.classes_)

In [None]:
# Bias-Variance Decomposition for baseline
# A new, unfitted RandomForestClassifier with the same random_state as the
# baseline is passed in, rather than the already-fitted baseline_model.
# NOTE(review): bias_var_metrics comes from mylib; presumably it refits the
# estimator `folds` times to estimate bias and variance - confirm in mylib.
print("\nBias-Variance Decomposition (Baseline):")
bias_var_metrics(X_train, X_test, y_train, y_test, RandomForestClassifier(random_state=42), folds=10)

In [None]:
# Collect the baseline's headline test-set metrics in one dictionary.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Scorers that are computed with 'attack' as the positive class.
_attack_scorers = (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
)

baseline_metrics = {'accuracy': accuracy_score(y_test, y_pred_baseline)}
for _metric_key, _scorer in _attack_scorers:
    baseline_metrics[_metric_key] = _scorer(y_test, y_pred_baseline, pos_label='attack')
baseline_metrics['mcc'] = matthews_corrcoef(y_test, y_pred_baseline)

print("Baseline Metrics:", baseline_metrics)

---
## 5. OPTIMISATION STRATEGY 1: Hyperparameter Tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

# Candidate values for each Random Forest hyperparameter explored by the search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

print("Parameter Grid for Random Forest:")
for _hp_name, _hp_values in param_grid.items():
    print(f"  {_hp_name}: {_hp_values}")

In [None]:
# Randomized hyperparameter search: evaluates 20 sampled combinations from
# param_grid with 3-fold cross-validation, scored by weighted F1.
from sklearn.model_selection import RandomizedSearchCV

print("Running Randomized Search CV (this may take a few minutes)...")
trs = time()

_search_options = {
    'estimator': RandomForestClassifier(random_state=42),
    'param_distributions': param_grid,
    'n_iter': 20,
    'cv': 3,
    'scoring': 'f1_weighted',
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}
rf_random = RandomizedSearchCV(**_search_options)

rf_random.fit(X_train, y_train)
tre = time() - trs

print(f"\nSearch Time: {tre:.2f} seconds")
print(f"\nBest Parameters: {rf_random.best_params_}")
print(f"Best CV Score: {rf_random.best_score_:.4f}")

In [None]:
# Store best parameters
# Keep the winning parameter combination from the randomized search under a
# shorter name so later cells can reuse it.
best_params = rf_random.best_params_

---
## 6. OPTIMISATION STRATEGY 2: Feature Importance Based Selection

In [None]:
# Rank every feature by the importance the fitted baseline forest assigned to
# it, most important first.
_importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': baseline_model.feature_importances_,
})
feature_importances = _importance_table.sort_values(by='importance', ascending=False)

print("Top 20 Most Important Features:")
_top_twenty = feature_importances.head(20)
print(_top_twenty.to_string(index=False))

In [None]:
# Horizontal bar chart of the top_n highest-ranked features.
top_n = 30
top_features = feature_importances.head(top_n)

plt.figure(figsize=(12, 10))
_importance_ax = sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
_importance_ax.set_title(f'Top {top_n} Feature Importances - Random Forest')
_importance_ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()

In [None]:
# Select the smallest set of top-ranked features whose summed importance
# reaches 95% of the total. This relies on feature_importances already being
# sorted by descending importance.
_coverage_target = 0.95
_min_features = 20

feature_importances['cumulative'] = feature_importances['importance'].cumsum()

# Count the leading features whose running total is still below the target,
# then add one so the feature that actually reaches/crosses the target is kept.
# (Filtering with `cumulative <= 0.95` drops that feature and leaves the
# selection short of 95% coverage.)
_n_below_target = int((feature_importances['cumulative'] < _coverage_target).sum())
_n_for_coverage = min(_n_below_target + 1, len(feature_importances))
threshold_95 = feature_importances.head(_n_for_coverage)

selected_features = threshold_95['feature'].tolist()
# Enforce a minimum feature count: when the coverage rule keeps fewer than 20
# features, use the top 20 instead (a superset, so coverage stays >= 95%).
if len(selected_features) < _min_features:
    selected_features = feature_importances.head(_min_features)['feature'].tolist()

print(f"\nSelected {len(selected_features)} features (95% cumulative importance)")

In [None]:
# Restrict both feature matrices to the selected columns (same column order).
X_train_reduced = X_train.loc[:, selected_features]
X_test_reduced = X_test.loc[:, selected_features]

_n_kept, _n_total = X_train_reduced.shape[1], X_train.shape[1]
print(f"Reduced feature set: {_n_kept} features (from {_n_total})")

---
## 7. OPTIMISATION STRATEGY 3: Handling Class Imbalance

In [None]:
# Show how many training rows fall in each class and the attack/normal ratio.
_class_counts = y_train.value_counts()

print("Class Distribution in Training Set:")
print(_class_counts)

_attack_to_normal = _class_counts['attack'] / _class_counts['normal']
print(f"\nClass Ratio: {_attack_to_normal:.2f}")

In [None]:
# Use class_weight='balanced' to handle imbalance
# This cell only prints a notice; it does not create or modify any model.
print("\nUsing class_weight='balanced' to handle class imbalance")

---
## 8. OPTIMISED MODEL

In [None]:
# Build the optimised model: tuned hyperparameters + balanced class weights,
# trained on the reduced (importance-selected) feature set.
# Fitting and prediction are timed separately so that the figure reported as
# "Training Time" covers model fitting only; previously it also included the
# time spent predicting on the test set.
optimised_params = best_params.copy()
optimised_params['class_weight'] = 'balanced'
optimised_params['random_state'] = 42

optimised_model = RandomForestClassifier(**optimised_params)

_banner = "=" * 60
print(_banner)
print("OPTIMISED MODEL EVALUATION")
print(_banner)
print(f"Parameters: {optimised_params}")
print(f"Features: {len(selected_features)} (reduced from {X_train.shape[1]})")

trs = time()
optimised_model.fit(X_train_reduced, y_train)
tre = time() - trs  # fit only

_predict_start = time()
y_pred_optimised = optimised_model.predict(X_test_reduced)
_predict_elapsed = time() - _predict_start

print(f"\nTraining Time: {tre:.2f} seconds")
print(f"Prediction Time: {_predict_elapsed:.2f} seconds\n")
show_metrics(y_test, y_pred_optimised, optimised_model.classes_)

In [None]:
# Bias-Variance Decomposition for optimised model
# A new, unfitted estimator built from the same optimised_params is passed in,
# rather than the already-fitted optimised_model; the reduced feature matrices
# are used for both train and test.
# NOTE(review): bias_var_metrics comes from mylib; presumably it refits the
# estimator `folds` times to estimate bias and variance - confirm in mylib.
print("\nBias-Variance Decomposition (Optimised):")
opt_model_for_bv = RandomForestClassifier(**optimised_params)
bias_var_metrics(X_train_reduced, X_test_reduced, y_train, y_test, opt_model_for_bv, folds=10)

In [None]:
# Collect the optimised model's headline test-set metrics in one dictionary.
# Scorers that are computed with 'attack' as the positive class.
_attack_scorers = (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
)

optimised_metrics = {'accuracy': accuracy_score(y_test, y_pred_optimised)}
for _metric_key, _scorer in _attack_scorers:
    optimised_metrics[_metric_key] = _scorer(y_test, y_pred_optimised, pos_label='attack')
optimised_metrics['mcc'] = matthews_corrcoef(y_test, y_pred_optimised)

print("Optimised Metrics:", optimised_metrics)

---
## 9. COMPARISON: Baseline vs Optimised Model

In [None]:
# Side-by-side table of baseline vs optimised scores, with the absolute and
# relative (percent of baseline) change for each metric.
_metric_rows = (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-Score', 'f1'),
    ('MCC', 'mcc'),
)

comparison_df = pd.DataFrame({
    'Metric': [_label for _label, _ in _metric_rows],
    'Baseline': [baseline_metrics[_key] for _, _key in _metric_rows],
    'Optimised': [optimised_metrics[_key] for _, _key in _metric_rows],
})
comparison_df['Improvement'] = comparison_df['Optimised'].sub(comparison_df['Baseline'])
comparison_df['Improvement %'] = (
    comparison_df['Improvement'].div(comparison_df['Baseline']).mul(100).round(2)
)

_banner = "=" * 60
print("\n" + _banner)
print("PERFORMANCE COMPARISON: BASELINE vs OPTIMISED")
print(_banner)
print(comparison_df.to_string(index=False))

In [None]:
# Grouped bar chart: one pair of bars (baseline, optimised) per metric, each
# bar annotated with its value to three decimals.
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(comparison_df['Metric']))
width = 0.35
_half_width = width / 2

bars1 = ax.bar(x - _half_width, comparison_df['Baseline'], width, label='Baseline', color='steelblue')
bars2 = ax.bar(x + _half_width, comparison_df['Optimised'], width, label='Optimised', color='forestgreen')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Random Forest: Baseline vs Optimised')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Metric'])
ax.legend()
ax.set_ylim(0, 1.1)

# Label every bar just above its top edge, baseline bars first.
for _bar in (*bars1, *bars2):
    _bar_height = _bar.get_height()
    _bar_centre = _bar.get_x() + _bar.get_width() / 2
    ax.annotate(
        f'{_bar_height:.3f}',
        xy=(_bar_centre, _bar_height),
        xytext=(0, 3),
        textcoords="offset points",
        ha='center',
        va='bottom',
        fontsize=8,
    )

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrices of both models, drawn side by side.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

cm_baseline = confusion_matrix(y_test, y_pred_baseline, labels=baseline_model.classes_)
cm_optimised = confusion_matrix(y_test, y_pred_optimised, labels=optimised_model.classes_)

# (axes, matrix, class labels, colour map, panel title) for each panel.
_panels = (
    (axes[0], cm_baseline, baseline_model.classes_, 'Blues', 'Baseline Model'),
    (axes[1], cm_optimised, optimised_model.classes_, 'Greens', 'Optimised Model'),
)
for _panel_ax, _matrix, _class_labels, _cmap_name, _panel_title in _panels:
    _display = ConfusionMatrixDisplay(confusion_matrix=_matrix, display_labels=_class_labels)
    _display.plot(ax=_panel_ax, cmap=_cmap_name)
    _panel_ax.set_title(_panel_title)

plt.tight_layout()
plt.show()

In [None]:
# ROC Curve Comparison, with 'attack' as the positive class.
from sklearn.metrics import roc_curve, auc

# predict_proba columns follow the order of model.classes_, so the column for
# 'attack' must be looked up rather than assumed. Hard-coding column 1 picks
# the second class in classes_, which is not necessarily 'attack', and would
# then score the wrong class against the attack=1 ground truth below.
_baseline_attack_col = list(baseline_model.classes_).index('attack')
_optimised_attack_col = list(optimised_model.classes_).index('attack')

y_prob_baseline = baseline_model.predict_proba(X_test)[:, _baseline_attack_col]
y_prob_optimised = optimised_model.predict_proba(X_test_reduced)[:, _optimised_attack_col]

# Ground truth encoded as 1 = attack, 0 = anything else.
y_test_binary = (y_test == 'attack').astype(int)

fpr_base, tpr_base, _ = roc_curve(y_test_binary, y_prob_baseline)
fpr_opt, tpr_opt, _ = roc_curve(y_test_binary, y_prob_optimised)

auc_base = auc(fpr_base, tpr_base)
auc_opt = auc(fpr_opt, tpr_opt)

plt.figure(figsize=(8, 6))
plt.plot(fpr_base, tpr_base, 'b-', label=f'Baseline (AUC = {auc_base:.4f})')
plt.plot(fpr_opt, tpr_opt, 'g-', label=f'Optimised (AUC = {auc_opt:.4f})')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison - Random Forest')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()

---
## 10. Summary and Conclusions

In [None]:
# Print a plain-text recap: classifier, optimisation strategies applied,
# per-metric change and ROC-AUC before/after.
_rule = "=" * 70
_n_original = X_train.shape[1]
_n_selected = len(selected_features)
_reduction_pct = (_n_original - _n_selected) / _n_original * 100

print(_rule)
print("SUMMARY: RANDOM FOREST FOR INTRUSION DETECTION")
print(_rule)

print("\n1. CLASSIFIER CATEGORY: Ensemble (Bagging)")
print("   Algorithm: Random Forest Classifier")

print("\n2. OPTIMISATION STRATEGIES APPLIED:")
print("   a) Hyperparameter Tuning with RandomizedSearchCV")
for _param_name, _param_value in best_params.items():
    print(f"      - {_param_name}: {_param_value}")
print("   b) Feature Selection based on Feature Importance")
print(f"      - Original features: {_n_original}")
print(f"      - Selected features: {_n_selected}")
print(f"      - Feature reduction: {_reduction_pct:.1f}%")
print("   c) Class Imbalance Handling")
print("      - Method: class_weight='balanced'")

print("\n3. PERFORMANCE IMPROVEMENT:")
_summary_columns = zip(
    comparison_df['Metric'],
    comparison_df['Baseline'],
    comparison_df['Optimised'],
    comparison_df['Improvement %'],
)
for _metric, _before, _after, _pct in _summary_columns:
    print(f"   {_metric}: {_before:.4f} -> {_after:.4f} ({_pct:+.2f}%)")

print(f"\n4. ROC-AUC: {auc_base:.4f} -> {auc_opt:.4f}")
print("\n" + _rule)

In [None]:
# Save results for group comparison as a JSON file.
import json
import os

results_dict = {
    'classifier': 'Random Forest',
    'category': 'Ensemble (Bagging)',
    'baseline_metrics': baseline_metrics,
    'optimised_metrics': optimised_metrics,
    'baseline_auc': auc_base,
    'optimised_auc': auc_opt,
    'optimisation_strategies': ['Hyperparameter Tuning', 'Feature Selection (Importance)', 'Class Weighting'],
    # Values are stringified so entries such as None serialise uniformly.
    'best_params': {k: str(v) for k, v in best_params.items()},
    'n_features_original': X_train.shape[1],
    'n_features_selected': len(selected_features)
}

results_path = '../results/ensemble_rf_results.json'
# Create the output directory if it is missing; otherwise open() raises
# FileNotFoundError and the results of the whole run are lost.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w') as f:
    json.dump(results_dict, f, indent=2)
# Report the path that was actually written.
print(f"Results saved to: {results_path}")