In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [None]:
# Column names for the NSL-KDD records, listed in file order.
# The final two entries ("label", "difficulty_level") are dropped from the
# feature matrix later on; everything before them is used as a feature.
column_names = [
    # positions 1-9
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    # positions 10-22
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    # positions 23-31
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # positions 32-41 (all prefixed "dst_host_")
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    # positions 42-43
    "label", "difficulty_level",
]

In [None]:
# Read the training file; it carries no header row, so the names come from
# `column_names`.
file_path = 'nsl-kdd/KDDTrain+.txt'
df = pd.read_csv(
    filepath_or_buffer=file_path,
    names=column_names,
    header=None,
)

In [None]:
# Expand the string-valued columns into one indicator column per category.
categorical_columns = [
    'protocol_type',
    'service',
    'flag',
]
df = pd.get_dummies(data=df, columns=categorical_columns)

In [None]:
# Persist the ordered feature-column list (everything except the two target
# columns) so other code can align its features to the same layout.
train_columns = df.columns.drop(['label', 'difficulty_level']).tolist()
joblib.dump(value=train_columns, filename='train_columns.pkl')

In [None]:
# Map the string labels to integer ids and persist the fitted encoder.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
joblib.dump(value=label_encoder, filename='label_encoder.pkl')

In [None]:

# Sorted array of every encoded class id present in the full dataset.
all_classes = np.unique(df['label'].to_numpy())

In [None]:
# Separate the target from the features; `difficulty_level` is not used as a
# feature and is dropped together with the label.
y = df['label']
X = df.drop(columns=['label', 'difficulty_level'])

In [None]:
# Hold out 20% of the rows for evaluation, keeping the class proportions of `y`
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Drop classes that have too few training rows before resampling.
min_samples_threshold = 5  # Adjust as needed

# Count once, then keep only the class ids that meet the threshold.
class_counts = y_train.value_counts()
filtered_classes = class_counts[class_counts >= min_samples_threshold].index

# A single boolean mask is applied to features and labels so they stay aligned.
keep_mask = y_train.isin(filtered_classes)
X_train = X_train[keep_mask]
y_train = y_train[keep_mask]

In [None]:
# Apply SMOTE to balance classes.
#
# SMOTE looks up `k_neighbors` neighbours inside each class it oversamples, so a
# class needs at least `k_neighbors + 1` rows. With the library default of 5 a
# class that has exactly 5 training rows (which a ">= 5" filter lets through)
# makes `fit_resample` raise a ValueError. Cap the neighbour count by the
# smallest remaining class; when every class has 6+ rows this is still 5, i.e.
# identical to the default behaviour.
smallest_class_size = int(y_train.value_counts().min())
smote_k_neighbors = max(1, min(5, smallest_class_size - 1))

smote = SMOTE(random_state=42, k_neighbors=smote_k_neighbors)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [None]:

# Ensure `y_train_balanced` contains all possible classes.
#
# Any class id in `all_classes` that is absent from the resampled labels gets
# one all-zero placeholder row, so the label set seen at training time covers
# every encoded id.
# NOTE(review): these placeholder rows are synthetic and carry no real signal;
# presumably they exist only to keep the class ids contiguous for the
# classifier — confirm this is acceptable for the downstream metrics.
missing_classes = np.setdiff1d(all_classes, np.unique(y_train_balanced))
if len(missing_classes) > 0:
    # Build every placeholder row in one allocation rather than re-stacking the
    # whole matrix once per missing class.
    padding_rows = np.zeros((len(missing_classes), X_train_balanced.shape[1]))
    # Cast explicitly to float: stacking a frame that mixes bool indicator
    # columns with numeric ones would otherwise produce an object-dtype array.
    X_train_balanced = np.vstack(
        [np.asarray(X_train_balanced, dtype=float), padding_rows]
    )
    # `missing_classes` is already sorted, so appending it in one call yields
    # the same label order as appending its elements one by one.
    y_train_balanced = np.append(y_train_balanced, missing_classes)

In [None]:

# Standardization: the scaler is fitted on the balanced training features only
# and then reused, unchanged, on the held-out features.
scaler = StandardScaler()

train_features = pd.DataFrame(X_train_balanced, columns=train_columns)
X_train_balanced = scaler.fit_transform(train_features)

test_features = pd.DataFrame(X_test, columns=train_columns)
X_test = scaler.transform(test_features)

# Persist the fitted scaler.
joblib.dump(value=scaler, filename='scaler.pkl')

In [None]:
# Fit a 100-tree XGBoost classifier on the balanced, standardized training set.
clf = XGBClassifier(
    random_state=42,
    n_estimators=100,
)
clf.fit(X_train_balanced, y_train_balanced)

In [None]:
# Evaluate the model on the held-out split.
y_pred = clf.predict(X_test)

# Pin the label set to every class id the encoder knows. Without this the
# metrics use only the ids that happen to occur in y_test / y_pred, so the
# confusion-matrix rows and columns are unlabelled and can shift position when
# a rare class is absent.
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))

print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,  # show original label strings, not encoded ids
        zero_division=0,  # classes with no predictions/support score 0 without a warning
    )
)

print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Persist the trained classifier to disk and confirm on stdout.
model_path = 'ids_model.pkl'
joblib.dump(value=clf, filename=model_path)
print("Model saved successfully!")