In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# 1. Load Data

### The dataset does not have headers, so we'll add them.

### The column names are available from the NSL-KDD documentation.

In [None]:
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class', 'difficulty'
]

### Load the training and test sets

In [None]:
df_train = pd.read_csv('KDDTrain+.txt', header=None, names=columns)
df_test = pd.read_csv('KDDTest+.txt', header=None, names=columns)

### Drop the 'difficulty' column as it's not needed for detection

In [None]:
df_train.drop(['difficulty'], axis=1, inplace=True)
df_test.drop(['difficulty'], axis=1, inplace=True)

# 2. Data Preprocessing

### Identify categorical and numerical features

In [None]:
categorical_cols = ['protocol_type', 'service', 'flag']
numerical_cols = df_train.select_dtypes(include=np.number).columns.tolist()

### The 'class' column is our target label
### Remove it from the numerical columns list

### These are actually binary, but we'll scale them anyway

In [None]:
numerical_cols.remove('land') 
numerical_cols.remove('logged_in')
numerical_cols.remove('is_host_login')
numerical_cols.remove('is_guest_login')

### --- Feature Encoding for Categorical Features ---
### We use one-hot encoding to convert categorical features into a numerical format.

In [None]:
df = pd.concat([df_train, df_test])
for col in categorical_cols:
    dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, dummies], axis=1)
    df.drop(col, axis=1, inplace=True)

### Separate the combined data back into train and test sets

In [None]:
train_rows = len(df_train)
train_df = df.iloc[:train_rows]
test_df = df.iloc[train_rows:]

### --- Label Encoding for the 'class' column ---
### We change the 'class' label to be binary: 0 for 'normal' and 1 for 'attack'

In [None]:
train_labels = train_df['class'].apply(lambda x: 0 if x == 'normal' else 1)
test_labels = test_df['class'].apply(lambda x: 0 if x == 'normal' else 1)

### Drop the original class column

In [None]:
train_df.drop(['class'], axis=1, inplace=True)
test_df.drop(['class'], axis=1, inplace=True)

### --- Normalization for Numerical Features ---
### Scale numerical features to a range of [0, 1] for better model performance

In [None]:
scaler = MinMaxScaler()
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

### Convert to numpy arrays from model training

In [None]:
# Convert to numpy arrays for model training
X_train = train_df.values
y_train = train_labels.values
X_test = test_df.values
y_test = test_labels.values

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

### Step 3: Training Traditional ML Models

### Now, we'll train the traditional models you listed: 
### KNN, LDA, and SVM. We will use 
### scikit-learn for this.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

### --- Initialize Models ---

In [None]:
models = {
    "K-Nearest Neighbours (KNN)": KNeighborsClassifier(n_neighbors=5),
    "Linear Discriminant Analysis (LDA)": LinearDiscriminantAnalysis(),
    "Support Vector Machine (SVM)": SVC(kernel='linear', probability=True) # Using a linear kernel for speed
}

### --- Train and Evaluate Models ---

In [None]:
for name, model in models.items():
    print(f"--- Training {name} ---")
    model.fit(X_train, y_train)
    
    print(f"--- Evaluating {name} ---")
    y_pred = model.predict(X_test)

### --- Performance Metrics ---

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
    
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("----------------------------------\n")

In [None]:
results_list = []

In [None]:
model_results = {
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }
results_list.append(model_results)

In [None]:
print("--- Final Model Performance Summary ---")

# Print the table header
print(f"{'Model':<35} | {'Accuracy':<10} | {'Precision':<10} | {'Recall':<10} | {'F1-Score':<10}")
print("-" * 85)

# Print the results for each model
for result in results_list:
    print(f"{result['Model']:<35} | {result['Accuracy']:.4f}     | {result['Precision']:.4f}    | {result['Recall']:.4f}   | {result['F1-Score']:.4f}")

In [125]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import warnings

# Suppress warnings from LDA about collinear features
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

# --- Step 2: Data Loading and Preprocessing ---

print("--- Starting Data Loading and Preprocessing ---")

# Define the column names for the NSL-KDD dataset
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class', 'difficulty'
]

# Load the training and test datasets
df_train = pd.read_csv('KDDTrain+.txt', header=None, names=columns)
df_test = pd.read_csv('KDDTest+.txt', header=None, names=columns)

# Drop the 'difficulty' column
df_train.drop(['difficulty'], axis=1, inplace=True)
df_test.drop(['difficulty'], axis=1, inplace=True)

# Identify categorical and numerical features
categorical_cols = ['protocol_type', 'service', 'flag']
numerical_cols = df_train.select_dtypes(include=np.number).columns.tolist()

# Combine train and test sets for consistent one-hot encoding
df = pd.concat([df_train, df_test])

# Perform one-hot encoding
for col in categorical_cols:
    dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, dummies], axis=1)
    df.drop(col, axis=1, inplace=True)

# Separate back into training and testing sets
train_rows = len(df_train)
train_df = df.iloc[:train_rows]
test_df = df.iloc[train_rows:]

# Create binary labels: 0 for 'normal' and 1 for 'attack'
train_labels = train_df['class'].apply(lambda x: 0 if x == 'normal' else 1)
test_labels = test_df['class'].apply(lambda x: 0 if x == 'normal' else 1)

# Drop the original 'class' column
train_df.drop(['class'], axis=1, inplace=True)
test_df.drop(['class'], axis=1, inplace=True)

# Align columns - crucial for models to work correctly
# Some 'service' types might be in train but not test, or vice-versa
train_cols = train_df.columns
test_cols = test_df.columns
missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    test_df[c] = 0
missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    train_df[c] = 0
test_df = test_df[train_cols] # Ensure order is the same

# Normalize numerical features
scaler = MinMaxScaler()
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

# Convert dataframes to numpy arrays
X_train = train_df.values
y_train = train_labels.values
X_test = test_df.values
y_test = test_labels.values

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print("--- Preprocessing Complete ---\n")

# --- Step 3: Train, Evaluate, and Store Results ---

# Initialize the models
# Added solver='svd' to LDA to make it more robust against collinear features from one-hot encoding
models = {
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(n_neighbors=5),
    "Linear Discriminant Analysis (LDA)": LinearDiscriminantAnalysis(solver='svd'),
    "Support Vector Machine (SVM)": SVC(kernel='linear')
}

# Create a list to store the results dictionaries
results_list = []

# Loop through each model
for name, model in models.items():
    print(f"--- Training {name} ---")
    model.fit(X_train, y_train)
    
    print(f"--- Evaluating {name} ---")
    y_pred = model.predict(X_test)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Print the individual report for this model
    print(classification_report(y_test, y_pred))
    
    # Store results in a dictionary and append to the list
    model_results = {
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }
    results_list.append(model_results)
    print(f"'{name}' results have been stored.")
    print("--------------------------------------------------\n")

# --- Step 4: Display Final Summary Table ---

print("--- Final Model Performance Summary ---")

# Create a DataFrame from the results list for pretty printing
results_df = pd.DataFrame(results_list)
print(results_df.to_string(index=False))

--- Starting Data Loading and Preprocessing ---


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(['class'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(['class'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])


Training data shape: (125973, 122)
Test data shape: (22544, 122)
--- Preprocessing Complete ---

--- Training K-Nearest Neighbors (KNN) ---
--- Evaluating K-Nearest Neighbors (KNN) ---
              precision    recall  f1-score   support

           0       0.67      0.93      0.78      9711
           1       0.92      0.65      0.76     12833

    accuracy                           0.77     22544
   macro avg       0.80      0.79      0.77     22544
weighted avg       0.81      0.77      0.77     22544

'K-Nearest Neighbors (KNN)' results have been stored.
--------------------------------------------------

--- Training Linear Discriminant Analysis (LDA) ---
--- Evaluating Linear Discriminant Analysis (LDA) ---
              precision    recall  f1-score   support

           0       0.66      0.93      0.77      9711
           1       0.92      0.63      0.75     12833

    accuracy                           0.76     22544
   macro avg       0.79      0.78      0.76     22544
weig