In [5]:
# Binary DDoS-vs-Normal classification with sklearn

import pandas as pd  # For loading and handling CSV data
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.impute import SimpleImputer  # For filling in missing feature values
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix  # Performance metrics
from sklearn.model_selection import train_test_split  # For splitting dataset into training and testing sets
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors classifier
from sklearn.preprocessing import LabelEncoder, StandardScaler  # For encoding labels and normalizing features
from sklearn.svm import SVC  # Support Vector Machine classifier
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier

In [6]:
# Read the raw records from the CSV file into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer='data_t.csv')

In [7]:
# 1) Encode the 'category' column as the integer target.
#    LabelEncoder numbers the classes in sorted order, so for this data
#    'DDoS' -> 0 and 'Normal' -> 1; the crosstab printed below shows the mapping.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data["category"])

print(label_encoder.classes_)
print(pd.crosstab(data["category"], y))

# 2) Drop attack/category/subcategory from the features.
#    'category' is the target itself; 'attack' and 'subcategory' are presumably
#    label-derived as well and would leak the answer — TODO confirm.
cols_to_drop = ["attack", "category", "subcategory"]
features_df = data.drop(columns=cols_to_drop)

# 3) One-hot encode the non-numeric columns (first level of each is dropped).
# NOTE(review): saddr/daddr look like source/destination addresses. One-hot
# encoding them lets a model memorize individual hosts instead of traffic
# behavior, which may explain the perfect scores further down — confirm
# whether they should be features at all.
categorical_cols = ["proto", "flgs", "state", "saddr", "daddr"]

features_encoded = pd.get_dummies(
    features_df,
    columns=categorical_cols,
    drop_first=True,
)

# 4) Remove columns that contain no observed value at all: a median cannot be
#    computed for them, and dropping them here keeps the columns of
#    features_encoded aligned with the columns of X.
na_counts = features_encoded.isna().sum().sort_values(ascending=False)
cols_all_nan = na_counts[na_counts == len(features_encoded)].index
features_encoded = features_encoded.drop(columns=cols_all_nan)

# 5) Replace the remaining NaNs with the per-column median.
# NOTE(review): the medians are computed from every row, before the train/test
# split performed in a later cell, so held-out rows influence the imputation.
# Fitting the imputer on the training rows only would avoid that leakage.
imputer = SimpleImputer(strategy="median")  # or "mean"
X = imputer.fit_transform(features_encoded)

['DDoS' 'Normal']
col_0        0   1
category          
DDoS      1044   0
Normal       0  65


In [8]:
# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler statistics are learned from all rows of X, and the
# train/test split only happens in a later cell, so test rows influence the
# scaling. Fitting on the training rows only would avoid that leakage.
X = StandardScaler().fit(X).transform(X)

In [9]:
# Split dataset into training (70%) and testing (30%).
# stratify=y keeps the class proportions identical in both parts. The target
# is heavily imbalanced (1044 'DDoS' rows vs 65 'Normal' rows in the crosstab
# above), so an unstratified draw can leave the test set with very few
# minority samples and make the metrics unstable.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [10]:
# Candidate classifiers, keyed by the display name used when printing results.
# The tree-based models draw random numbers while fitting, so they are seeded
# with the same value as the train/test split to make a re-run produce the
# same results.
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),  # iteration cap set explicitly to 200
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
}

In [11]:
# Function to print evaluation metrics
def print_metrics(y_true, y_pred):
    """
    Print a classification report for one set of predictions.

    Printed, in order: accuracy, F1-score, recall, precision, the confusion
    matrix, a crosstab of true vs predicted labels with row/column totals,
    and a separator line.

    F1, recall and precision use average='weighted', i.e. per-class scores
    averaged by class support. On an imbalanced target these are dominated by
    the majority class, so read the confusion matrix for per-class behavior.

    All scores are printed with three decimals so that a near-perfect value
    such as 0.997 is not displayed as 1.00.

    Args:
        y_true: true labels (the notebook passes a NumPy array).
        y_pred: predicted labels, same length as y_true.

    Returns:
        None. The report is written to stdout.
    """
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")  # Overall accuracy
    print(f"F1-score: {f1_score(y_true, y_pred, average='weighted'):.3f}")  # F1-score (weighted)
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.3f}")  # Recall (weighted)
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.3f}")  # Precision (weighted)
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))  # Rows: true class, columns: predicted class
    print(pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))  # Same counts plus 'All' totals
    print('-'*50)  # Separator for readability

In [12]:
# Fit every candidate on the training split, then report how it performs
# on the held-out test split.
for name in models:
    model = models[name]
    # fit() returns the fitted estimator, so prediction can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f"{name} Results:")
    print_metrics(y_test, y_pred)

Logistic Regression Results:
Accuracy: 1.00
F1-score: 1.000
Recall: 1.000
Precision: 1.000
Confusion Matrix:
 [[313   0]
 [  0  20]]
Predicted    0   1  All
True                   
0          313   0  313
1            0  20   20
All        313  20  333
--------------------------------------------------
Decision Tree Results:
Accuracy: 1.00
F1-score: 1.000
Recall: 1.000
Precision: 1.000
Confusion Matrix:
 [[313   0]
 [  0  20]]
Predicted    0   1  All
True                   
0          313   0  313
1            0  20   20
All        313  20  333
--------------------------------------------------
Random Forest Results:
Accuracy: 1.00
F1-score: 1.000
Recall: 1.000
Precision: 1.000
Confusion Matrix:
 [[313   0]
 [  0  20]]
Predicted    0   1  All
True                   
0          313   0  313
1            0  20   20
All        313  20  333
--------------------------------------------------
