In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
# Load the training data.
# The file is read with header=None, so pandas assigns integer column labels.
try:
    train_data = pd.read_csv('datasets/KDDTrain+.txt', header=None)
except FileNotFoundError as err:
    # Re-raise instead of print + exit(1): `exit` is an interactive-shell helper
    # that can shut down the notebook kernel, and it hides the original traceback
    # (which names the path that was actually tried). Chaining keeps both.
    raise FileNotFoundError(
        "Training file not found. Please check the file path and try again."
    ) from err

In [3]:
# Column names for the headerless training file.
# They are attached positionally, so the order of this list must match the
# order of the columns in the file. 'attack' is the label column used as the
# training target; 'level' is the final column.
columns = [
    'duration', 'protocol_type', 'service',
    'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack', 'level',
]
# Replaces the default integer labels; pandas raises if the count does not match.
train_data.columns = columns

In [4]:
# Label encode categorical features.
# One LabelEncoder per column, kept in `label_encoders` keyed by column name;
# each column of train_data is overwritten with its integer codes.
# NOTE(review): because the columns are overwritten in place, re-running this
# cell refits the encoders on the integer codes rather than the original values.
label_encoders = {col: LabelEncoder() for col in ('protocol_type', 'service', 'flag')}
for col, le in label_encoders.items():
    train_data[col] = le.fit_transform(train_data[col])

In [5]:
# Binary classification for 'attack' column: 0 for 'normal', 1 for any other label.
# The column is overwritten in place, so convert only while it still holds the
# raw labels. Without this guard, re-running the cell on the already-encoded
# 0/1 values would silently turn every row into 1, because neither 0 nor 1
# equals 'normal'.
if not pd.api.types.is_numeric_dtype(train_data['attack']):
    # Vectorized comparison; int64 matches the dtype the per-row lambda produced.
    train_data['attack'] = (train_data['attack'] != 'normal').astype('int64')


In [6]:
# Separate the target ('attack') from the predictors (every other column).
# NOTE(review): 'level' stays in X_train as a feature — confirm it is a
# legitimate model input and that it is available at prediction time.
y_train = train_data['attack']
X_train = train_data.drop(columns=['attack'])

In [7]:
# Standardize the data.
# X_train is rebound from the feature DataFrame to the scaled output, so fit
# only while it is still the DataFrame. Without this guard, re-running the cell
# would refit the scaler on data that is already standardized, and that
# near-identity scaler is the object pickled later for use in testing.
if isinstance(X_train, pd.DataFrame):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

In [8]:
# Hyper-parameter candidates for the KNN grid search.
# 3 neighbour counts x 2 weighting schemes x 2 metrics = 12 combinations.
neighbor_counts = [5, 10, 20]
weighting_schemes = ['uniform', 'distance']
distance_metrics = ['euclidean', 'manhattan']

param_grid = {
    'n_neighbors': neighbor_counts,
    'weights': weighting_schemes,
    'metric': distance_metrics,
}

In [9]:
# Exhaustive search over param_grid with 5-fold cross-validation.
# No `scoring` is passed, so the estimator's default score is used;
# verbose=2 prints one line per fit.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    verbose=2,
)
# Kept as the last bare expression so the fitted search object is the cell output.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   2.8s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   3.2s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   2.9s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   3.0s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   3.1s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   2.7s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   2.8s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   2.8s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   3.5s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   3.4s
[CV] END ..metric=euclidean, n_neighbors=10, weights=uniform; total time=   3.8s
[CV] END ..metric=euclidean, n_neighbors=10, wei

  _data = np.array(data, dtype=dtype, copy=copy,


In [14]:
# Persist the best estimator found by the grid search to 'knn_model.pkl'.
best_model = grid.best_estimator_
with open('knn_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [15]:
# Persist the fitted scaler to 'scaler.pkl' so the same standardization
# can be applied when testing.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [16]:
# Report the winning hyper-parameters and their mean cross-validation score.
for label, value in (
    ("Best Parameters:", grid.best_params_),
    ("Best Cross-Validation Score:", grid.best_score_),
):
    print(label, value)

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Best Cross-Validation Score: 0.999158552494323


In [17]:
# Final status line, printed once the preceding cells have run.
completion_message = "Model trained and saved successfully."
print(completion_message)


Model trained and saved successfully.
