This Notebook contains the code for training and measuring training time, inference time of the XGB, Naive Bayes. They are evaluated over dataset of various sizes like 3.5M,350k,35000 rows. These are evaluated before feature selection

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report,roc_curve

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier

import warnings
warnings.filterwarnings('ignore')

In [26]:
df = pd.read_csv("iot23_final_preprocessed.csv")

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,duration,orig_bytes,resp_bytes,orig_pkts,resp_pkts,label,proto_tcp,proto_udp,conn_state_REJ,...,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,service_dhcp,service_dns,service_http,service_irc,service_ssh,service_ssl
0,20,6.1e-05,0.0,0.0,3.0,0.0,PartOfAHorizontalPortScan,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21,0.0,0.0,0.0,1.0,0.0,PartOfAHorizontalPortScan,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,23,6.1e-05,0.0,0.0,3.0,0.0,PartOfAHorizontalPortScan,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,24,0.0,0.0,0.0,1.0,0.0,PartOfAHorizontalPortScan,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,27,0.0,0.0,0.0,1.0,0.0,Benign,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:


print(df.columns)

Index(['Unnamed: 0', 'duration', 'orig_bytes', 'resp_bytes', 'orig_pkts',
       'resp_pkts', 'label', 'proto_tcp', 'proto_udp', 'conn_state_REJ',
       'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR',
       'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2',
       'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR',
       'service_dhcp', 'service_dns', 'service_http', 'service_irc',
       'service_ssh', 'service_ssl'],
      dtype='object')


In [29]:
df['label'].unique()

array(['PartOfAHorizontalPortScan', 'Benign', 'Okiru', 'DDoS',
       'C&C-HeartBeat', 'C&C', 'Attack'], dtype=object)

In [30]:
df['label'].value_counts()

label
PartOfAHorizontalPortScan    825417
Okiru                        262503
Benign                       197274
DDoS                         138718
C&C                           15003
Attack                         3914
C&C-HeartBeat                   308
Name: count, dtype: int64

In [31]:
df['label'] = df['label'].apply(lambda x: 0 if x == 'Benign' else 1)


In [32]:
X = df.drop("label",axis=1)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69,stratify=y)

In [33]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
import time

In [49]:
xgb_model_improved = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='aucpr',  
    n_estimators=1,  
    gamma=0,
    learning_rate=0.1,
    max_depth=3,
    reg_lambda=1,
    scale_pos_weight=0.3,
    subsample=0.9,
    colsample_bytree=0.5
)

start_time = time.time()

xgb_model_improved.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  
    verbose=True
)

end_time = time.time()
print(f"Training time: {end_time - start_time:.4f} seconds")

[0]	validation_0-aucpr:0.96249
Training time: 0.6191 seconds


In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

y_pred_probs = xgb_model_improved.predict_proba(X_test)[:, 1]
y_pred = (y_pred_probs >= 0.5).astype(int)  

# Step 2: Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_probs)
conf_matrix = confusion_matrix(y_test, y_pred)

# Step 3: Print metrics
print(f"Accuracy     : {accuracy:.4f}")
print(f"Precision    : {precision:.4f}")
print(f"Recall       : {recall:.4f}")
print(f"F1 Score     : {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

# Optional: Full report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy     : 0.9348
Precision    : 0.9335
Recall       : 0.9954
F1 Score     : 0.9634
ROC-AUC Score: 0.8795

Confusion Matrix:
[[ 27215  22104]
 [  1425 310041]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.55      0.70     49319
           1       0.93      1.00      0.96    311466

    accuracy                           0.93    360785
   macro avg       0.94      0.77      0.83    360785
weighted avg       0.94      0.93      0.93    360785



In [43]:
import joblib
import os

model_filename = 'xgb_model_improved.pkl'
joblib.dump(xgb_model_improved, model_filename)

model_size_bytes = os.path.getsize(model_filename)
model_size_kb = model_size_bytes / 1024
model_size_mb = model_size_kb / 1024

print(f"Model size: {model_size_kb:.2f} KB")
print(f"Model size: {model_size_mb:.2f} MB")

# Step 4: Delete the file
os.remove(model_filename)
print("Model file deleted.")


Model size: 9.42 KB
Model size: 0.01 MB
Model file deleted.


In [44]:
X_test_10x = np.concatenate([X_test] * 10, axis=0)

In [47]:
# Measure inference time


inference_start = time.time()
y_pred = xgb_model_improved.predict(X_test_10x)
inference_end = time.time()

print(f"Inference time: {inference_end - inference_start:.4f} seconds")


Inference time: 0.1820 seconds


In [50]:




nb_model = GaussianNB(var_smoothing=1e-9)

start_time = time.time()
nb_model.fit(X_train, y_train)
end_time = time.time()

training_time = end_time - start_time




# Print results
print(f"Training Time: {training_time:.4f} seconds")


Training Time: 0.3308 seconds


In [147]:
inference_start = time.time()
y_pred = nb_model.predict(X_test_10x)
inference_end = time.time()

print(f"Inference time: {inference_end - inference_start:.4f} seconds")

Inference time: 3.3566 seconds


In [51]:
y_pred = nb_model.predict(X_test)
y_pred_proba = nb_model.predict_proba(X_test)[:, 1]  

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# 4. Print metrics
print(f"\nðŸ§  NaÃ¯ve Bayes Model Evaluation:")
print(f"Training Time : {training_time:.4f} seconds")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1 Score      : {f1:.4f}")
print(f"ROC-AUC Score : {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

model_filename = "nb_model.pkl"
joblib.dump(nb_model, model_filename)
model_size_bytes = os.path.getsize(model_filename)
print(f"\nModel Size    : {model_size_bytes / 1024:.2f} KB")

os.remove(model_filename)
print("Model file deleted âœ…")


ðŸ§  NaÃ¯ve Bayes Model Evaluation:
Training Time : 0.3308 seconds
Accuracy      : 0.9036
Precision     : 0.9004
Recall        : 0.9987
F1 Score      : 0.9470
ROC-AUC Score : 0.6508

Confusion Matrix:
[[ 14919  34400]
 [   392 311074]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.30      0.46     49319
           1       0.90      1.00      0.95    311466

    accuracy                           0.90    360785
   macro avg       0.94      0.65      0.70    360785
weighted avg       0.91      0.90      0.88    360785


Model Size    : 1.57 KB
Model file deleted âœ…


In [116]:
X_train=X_train[:357000]
y_train=y_train[:357000]
print(len(X_train))
print(len(X_test))

357000
360785


In [117]:
xgb_model_improved = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='aucpr',  
    n_estimators=1,  
    gamma=0,
    learning_rate=0.1,
    max_depth=3,
    reg_lambda=1,
    scale_pos_weight=0.3,
    subsample=0.9,
    colsample_bytree=0.5
)

start_time = time.time()

xgb_model_improved.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  
    verbose=True
)

end_time = time.time()
print(f"Training time: {end_time - start_time:.4f} seconds")

[0]	validation_0-aucpr:0.54860
Training time: 0.3316 seconds


In [118]:


inference_start = time.time()
y_pred = xgb_model_improved.predict(X_test)
inference_end = time.time()

print(f"Inference time: {inference_end - inference_start:.4f} seconds")


Inference time: 0.0406 seconds


In [119]:




nb_model = GaussianNB(var_smoothing=1e-9)

start_time = time.time()
nb_model.fit(X_train, y_train)
end_time = time.time()

training_time = end_time - start_time



print(f"Training Time: {training_time:.4f} seconds")


Training Time: 0.1173 seconds


In [121]:
inference_start = time.time()
y_pred = nb_model.predict(X_test)
inference_end = time.time()

print(f"Inference time: {inference_end - inference_start:.4f} seconds")

Inference time: 0.2972 seconds


In [125]:
X_train=X_train[:35000]
y_train=y_train[:35000]
X_test=X_test[:35000]
y_test=y_test[:35000]
print(len(X_test))

35000


In [126]:
len(X_train)

35000

In [12]:
import time

In [127]:

xgb_model_improved = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='aucpr',  
    n_estimators=1,  
    gamma=0,
    learning_rate=0.1,
    max_depth=3,
    reg_lambda=1,
    scale_pos_weight=0.3,
    subsample=0.9,
    colsample_bytree=0.5
)

start_time = time.time()

xgb_model_improved.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  
    verbose=True
)

end_time = time.time()
print(f"Training time: {end_time - start_time:.4f} seconds")

[0]	validation_0-aucpr:0.53355
Training time: 0.0676 seconds


In [130]:


inference_start = time.time()
y_pred = xgb_model_improved.predict(X_test)
inference_end = time.time()

print(f"Inference time: {inference_end - inference_start:.4f} seconds")


Inference time: 0.0226 seconds


In [131]:
from sklearn.naive_bayes import GaussianNB




nb_model = GaussianNB(var_smoothing=1e-9)

start_time = time.time()
nb_model.fit(X_train, y_train)
end_time = time.time()

training_time = end_time - start_time




# Print results
print(f"Training Time: {training_time:.4f} seconds")


Training Time: 0.0137 seconds


In [132]:
inference_start = time.time()
y_pred = nb_model.predict(X_test)
inference_end = time.time()

print(f"Inference time: {inference_end - inference_start:.4f} seconds")

Inference time: 0.0209 seconds
