In [None]:
# Pin the global RNG state of both the standard library and NumPy so that
# re-running the notebook from a fresh kernel gives the same results.
import random
import numpy as np
random.seed(42)
np.random.seed(42)

# Train a machine learning model on the preprocessed data
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os

# Directory layout: preprocessed CSV splits live in DATA_DIR, trained models in MODEL_DIR
DATA_DIR = '../data/'
MODEL_DIR = '../models/'
X_TRAIN_FILE, Y_TRAIN_FILE, X_VAL_FILE, Y_VAL_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_val.csv', 'y_val.csv')
)

# Fail fast, naming the first missing input, before any CSV is read
required_files = (X_TRAIN_FILE, Y_TRAIN_FILE, X_VAL_FILE, Y_VAL_FILE)
for file_path in required_files:
    if os.path.exists(file_path):
        continue
    raise FileNotFoundError(f"Required file not found: {file_path}")

# Read the splits; the label CSVs are flattened into 1-D arrays
print('Loading data...')
X_train = pd.read_csv(X_TRAIN_FILE)
y_train = pd.read_csv(Y_TRAIN_FILE).to_numpy().ravel()
X_val = pd.read_csv(X_VAL_FILE)
y_val = pd.read_csv(Y_VAL_FILE).to_numpy().ravel()
print('Data loaded.')

### Hyperparameter Tuning with GridSearchCV

Let's use GridSearchCV to find the best hyperparameters for the Random Forest model after SMOTE oversampling.

In [None]:
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

CV_FOLDS = 3
# k_neighbors=1 lets SMOTE work with classes that have as few as 2 samples
SMOTE_K_NEIGHBORS = 1

# SMOTE now runs inside every cross-validation training fold (see the pipeline
# below), so each class must still have more than SMOTE_K_NEIGHBORS samples
# after one fold is held out. With stratified K-fold a class of size c loses
# at most ceil(c / CV_FOLDS) samples per split, so for cv=3 and k_neighbors=1
# this keeps classes with at least 3 samples (the previous filter kept >= 2).
unique, counts = np.unique(y_train, return_counts=True)
min_per_training_fold = counts - np.ceil(counts / CV_FOLDS)
to_keep = np.isin(y_train, unique[min_per_training_fold > SMOTE_K_NEIGHBORS])
X_train_smote = X_train[to_keep]
y_train_smote = y_train[to_keep]

# Oversampling is a pipeline step rather than a one-off transform of the whole
# training set. Resampling before cross-validation leaks synthetic neighbours
# of validation-fold samples into the training folds, which inflates the CV
# score and biases the hyperparameter choice. The imblearn Pipeline applies
# samplers during fit only, never during predict/score.
smote = SMOTE(random_state=42, k_neighbors=SMOTE_K_NEIGHBORS)
rf = RandomForestClassifier(random_state=42, oob_score=True, n_jobs=-1)
tuning_pipeline = Pipeline([('smote', smote), ('rf', rf)])

# Parameter grid for the Random Forest step ('rf__' routes each entry to it).
# 5 * 3 * 3 * 2 * 2 * 1 * 1 = 180 combinations, x 3 folds = 540 fits (+ 1 refit)
param_grid = {
    'rf__n_estimators': [100, 200, 300, 400, 500],  # 5 options
    'rf__max_depth': [None, 10, 20],                # 3 options
    'rf__min_samples_split': [2, 5, 10],            # 3 options
    'rf__min_samples_leaf': [1, 2],                 # 2 options
    'rf__max_features': ['sqrt', 'log2'],           # 2 options
    'rf__bootstrap': [True],                        # 1 option (required by oob_score=True)
    'rf__class_weight': ['balanced'],               # 1 option
}

print('Starting GridSearchCV with parameter grid and cv=3...')
grid_search = GridSearchCV(
    tuning_pipeline,
    param_grid,
    cv=CV_FOLDS,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)
grid_search.fit(X_train_smote, y_train_smote)

# Report the winning parameters without the pipeline step prefix
best_rf_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print('Best parameters found:', best_rf_params)
print('Best cross-validated score:', grid_search.best_score_)

# Use the best estimator for evaluation: the refit pipeline's forest, which was
# trained on the SMOTE-resampled (filtered) training set. Extracting the step
# keeps best_clf a plain RandomForestClassifier for the cells that follow.
best_clf = grid_search.best_estimator_.named_steps['rf']
# Score the tuned model on the held-out validation split
import html
from IPython.display import display, HTML

y_pred = best_clf.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {acc:.4f}")

# Display classification report in a scrollable output area.
# The report is plain text aligned with spaces, so it is HTML-escaped (class
# labels may contain '<', '>' or '&') and rendered in a monospace font so the
# columns line up.
report = classification_report(y_val, y_pred)
display(HTML(
    '<div style="height:300px; overflow:auto; border:1px solid #ccc; '
    'white-space:pre; font-family:monospace;">'
    f'{html.escape(report)}</div>'
))

# Display confusion matrix heatmap in a scrollable output area
import base64
import io
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Cover every class seen in either the ground truth or the predictions
labels = sorted(set(y_val) | set(y_pred))
cm = confusion_matrix(y_val, y_pred, labels=labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_title('Confusion Matrix Heatmap (Tuned Model)')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
fig.tight_layout()

# Each display() call produces its own independent output, so an opening <div>
# in one call cannot wrap a figure shown by a later plt.show(). Render the
# figure to PNG and embed it in a single HTML output instead.
png_buffer = io.BytesIO()
fig.savefig(png_buffer, format='png')
plt.close(fig)  # already embedded below; avoid a second, unwrapped copy
png_b64 = base64.b64encode(png_buffer.getvalue()).decode('ascii')
display(HTML(
    '<div style="height:600px; overflow:auto; border:1px solid #ccc;">'
    f'<img src="data:image/png;base64,{png_b64}" alt="Confusion matrix heatmap"/>'
    '</div>'
))

# Persist the tuned classifier under MODEL_DIR
import os
import joblib

os.makedirs(MODEL_DIR, exist_ok=True)  # no-op when the directory already exists
model_path = os.path.join(MODEL_DIR, 'random_forest_model_tuned.joblib')
joblib.dump(best_clf, model_path)
print(f"Tuned model saved to {model_path}")

## Workflow Complete

- Data loaded and checked.
- SMOTE oversampling applied to handle class imbalance.
- Random Forest trained and evaluated.
- Hyperparameter tuning performed with GridSearchCV.
- Tuned model saved to disk as `random_forest_model_tuned.joblib`.

You can now proceed to test the model on new data or integrate it into your intrusion detection pipeline.

## Testing with kdd_test_processed.csv


In [None]:
# Test the tuned model with kdd_test_processed.csv
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

TEST_FILE = '../data/kdd_test_processed.csv'
MODEL_DIR = '../models/'  # Ensure MODEL_DIR is defined when this cell runs on its own
model_path = os.path.join(MODEL_DIR, 'random_forest_model_tuned.joblib')

# Validate inputs with an explicit exception: `assert` is stripped when Python
# runs with -O, and the data-loading cell raises FileNotFoundError as well.
for required_path, description in ((TEST_FILE, 'Test file'), (model_path, 'Model file')):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"{description} not found: {required_path}")

# Load test data
test_df = pd.read_csv(TEST_FILE)

# Assume the last column is the label and every other column is a feature.
# NOTE(review): the feature columns must match the ones the model was trained
# on (names and order) - confirm against X_train.csv.
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

# Load the tuned model.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files produced by a trusted source.
best_clf = joblib.load(model_path)

# Run the reloaded model on the test set and print the headline metrics
y_pred_test = best_clf.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {acc_test:.4f}")
print("Classification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix (Test):")

# Label axis covers every class present in the ground truth or the predictions
labels = sorted(set(y_test).union(y_pred_test))
cm = confusion_matrix(y_test, y_pred_test, labels=labels)

# Draw the heatmap through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title('Confusion Matrix Heatmap (Test Data)')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
fig.tight_layout()
plt.show()