# NSL-KDD Network Intrusion Detection System

This notebook walks through the end-to-end workflow: data loading, preprocessing, model training, evaluation, and best-model selection.

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

from src.data_utils import load_train_test
from src.preprocessing import split_features_labels, build_preprocessor
from src.model_utils import get_models, get_param_grids, compute_metrics
from src.config import RANDOM_STATE


In [None]:
# Load the train and test splits through the project helper imported from src.data_utils.
train_df, test_df = load_train_test()
# Bare last expression so the notebook renders a preview of the training data
# (presumably a pandas DataFrame, in which case head() shows the first five rows).
train_df.head()

In [None]:
# Separate features from labels for both splits (train first, then test).
(X_train, y_train), (X_test, y_test) = (
    split_features_labels(frame) for frame in (train_df, test_df)
)

# The preprocessor is built from the training features only; the test
# features are never passed to build_preprocessor.
preprocessor = build_preprocessor(X_train)

# Candidate estimators and their hyperparameter grids. The search loop looks
# both up by the same model name, so the two mappings must share their keys.
models = get_models(RANDOM_STATE)
param_grids = get_param_grids()

In [None]:
# Stratified 3-fold CV, shuffled with a fixed seed so fold assignment is reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# One metrics record per candidate model; assembled into a DataFrame after the loop.
results = []
for name, model in models.items():
    # Preprocessing is a pipeline step, so the grid search fits it together
    # with the model on each CV training fold.
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('model', model)])
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        scoring='f1',
        cv=cv,
        n_jobs=1,
    )
    grid.fit(X_train, y_train)

    # Score the best pipeline found by the search on the held-out test split.
    preds = grid.best_estimator_.predict(X_test)
    metrics = compute_metrics(y_test, preds)
    # Assumes compute_metrics returns a plain dict that includes an 'f1' key
    # (required by the sort below) - TODO confirm against src.model_utils.
    metrics.update({'model': name, 'cv_f1': grid.best_score_})
    results.append(metrics)

# NOTE(review): rows are ranked by 'f1', which is computed on the test split.
# If this table drives model selection, 'cv_f1' is the unbiased column to rank on.
results_df = (
    pd.DataFrame(results)
    .set_index('model')
    .sort_values(by='f1', ascending=False)
)
results_df

This notebook only displays the comparison table. To persist the best model, run the training script (`python -m src.train`), which saves it to `models/best_model.joblib` and generates plots in `reports/`.