# Training a ML model using CICIoT2023

This notebook shows how a LogisticRegression model can be trained using the CICIoT2023 csv files.

In [1]:
#Regular EDA and plotting libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# We want our plots to appear in the notebook
%matplotlib inline

## Models
from tqdm import tqdm, trange
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.metrics import RocCurveDisplay

import joblib


In [2]:
DATASET_DIRECTORY = 'CICIoT2023/'

### Importing Dataset

In [4]:
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

In [5]:
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'

### Scaling

In [6]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()

In [7]:
for train_set in tqdm(training_sets):
    scaler.fit(pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 135/135 [03:43<00:00,  1.66s/it]


### Classification: 34 (33+1) classes

In [8]:
ML_models = [
    LogisticRegression(n_jobs=-1),
    Perceptron(),
    AdaBoostClassifier(),
    RandomForestClassifier(n_jobs=-1),
    DecisionTreeClassifier()
]

ML_names = [
    "LogisticRegression",
    "Perceptron",
    "AdaBoost",
    "RandomForest",
    "DecisionTree"
]

ML_models = [joblib.load(f"model_34classes_{name}.sav") for name in ML_names]



In [9]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    y_test += list(d_test[y_column].values)

    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [09:48<00:00, 17.32s/it]


In [10]:
# Save y_test and preds using joblib
joblib.dump(y_test, "y_test.sav")
joblib.dump(preds, "predictions.sav")

['predictions.sav']

In [12]:
for k, v in preds.items():
    y_pred = v
    print(f"##### {ML_names[k]} (34 classes) #####")
    print('accuracy_score: ', accuracy_score(y_test, y_pred))  # Swapped y_test and y_pred for correctness
    print('recall_score: ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score: ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()
    
    # Save the metrics_store using joblib
joblib.dump(metrics_store, "model_metrics_34_classes.sav")


##### LogisticRegression (34 classes) #####
accuracy_score:  0.8023150703359454


KeyboardInterrupt: 

In [None]:
# Store the metrics for each model in lists
accuracies = []
recalls = []
precisions = []
f1_scores = []

# Calculate metrics for each model and append to the lists
for k, v in preds.items():
    y_pred = v
    accuracies.append(accuracy_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred, average='macro'))
    precisions.append(precision_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Save the metrics using joblib
metrics_dict = {
    'accuracies': accuracies,
    'recalls': recalls,
    'precisions': precisions,
    'f1_scores': f1_scores
}

joblib.dump(metrics_dict, "metrics_data.sav")
    
# Set up the matplotlib figure
plt.figure(figsize=(15, 10))

# Create subplots for each metric
metrics = [accuracies, recalls, precisions, f1_scores]
metric_names = ['Accuracy', 'Recall', 'Precision', 'F1-Score']

for i, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
    plt.subplot(2, 2, i+1)

    # Create a bar plot for the metric
    sns.barplot(x=ML_names, y=metric, palette="viridis")

    plt.title(f'Model Comparison: {metric_name}')
    plt.ylabel(metric_name)
    plt.xticks(rotation=45)
    plt.ylim(0, 1)  # Since all metrics are in the range [0, 1]

plt.tight_layout()
plt.show()

# Classification: 8 (7+1) classes

In [None]:
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [None]:
# Define the models
ML_models = [
    LogisticRegression(n_jobs=-1),
    Perceptron(),
    AdaBoostClassifier(),
    RandomForestClassifier(n_jobs=-1),
    DecisionTreeClassifier()
]

# Define the model names
ML_names = [
    "LogisticRegression",
    "Perceptron",
    "AdaBoost",
    "RandomForest",
    "DecisionTree"
]

# Train each model on each dataset
for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])

    # Convert the labels using the dictionary
    new_y = [dict_7classes[k] for k in d[y_column]]
    d[y_column] = new_y

    # Train the models
    for i, model in enumerate(ML_models):
        model.fit(d[X_columns], d[y_column])
        print(f"{ML_names[i]} has been trained on {train_set}")
        # Save the trained model
        filename = f"model_7classes_{ML_names[i]}.sav"
        joblib.dump(model, filename)
    del d


In [None]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_7classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y

    y_test += list(d_test[y_column].values)

    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred
    # Saving the predictions and y_test using joblib
    joblib.dump(preds, "predictions_data_7classes.sav")
    joblib.dump(y_test, "y_test_7classes.sav")

In [None]:
for k, v in preds.items():
    y_pred = v
    print(f"##### {ML_names[k]} (7 classes) #####")  # Corrected the name and number of classes
    print('accuracy_score = ', accuracy_score(y_test, y_pred))  # Swapped y_test and y_pred for consistency
    print('recall_score = ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score = ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score = ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()
# Saving the metrics_data using joblib
joblib.dump(metrics_data, "metrics_data_7classes.sav")

In [None]:
# Lists to store the evaluation metrics for each model
accuracies = []
recalls = []
precisions = []
f1_scores = []

# Calculate and store metrics for each model
for k, v in preds.items():
    y_pred = v
    accuracies.append(accuracy_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred, average='macro'))
    precisions.append(precision_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Set up the matplotlib figure
plt.figure(figsize=(15, 12))

# Define metrics and their names for plotting
metrics = [accuracies, recalls, precisions, f1_scores]
metric_names = ['Accuracy', 'Recall', 'Precision', 'F1-Score']

# Plot each metric
for i, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
    plt.subplot(2, 2, i+1)
    sns.barplot(x=ML_names, y=metric, palette="viridis")
    plt.title(f'Model Comparison: {metric_name}')
    plt.ylabel(metric_name)
    plt.xticks(rotation=45)
    plt.ylim(0, 1)  # Since all metrics are in the range [0, 1]

plt.tight_layout()
plt.show()


# Classification: 2 (1+1) Classes

In [None]:
dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 'Attack'
dict_2classes['DDoS-PSHACK_Flood'] = 'Attack'
dict_2classes['DDoS-SYN_Flood'] = 'Attack'
dict_2classes['DDoS-UDP_Flood'] = 'Attack'
dict_2classes['DDoS-TCP_Flood'] = 'Attack'
dict_2classes['DDoS-ICMP_Flood'] = 'Attack'
dict_2classes['DDoS-SynonymousIP_Flood'] = 'Attack'
dict_2classes['DDoS-ACK_Fragmentation'] = 'Attack'
dict_2classes['DDoS-UDP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-ICMP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-SlowLoris'] = 'Attack'
dict_2classes['DDoS-HTTP_Flood'] = 'Attack'

dict_2classes['DoS-UDP_Flood'] = 'Attack'
dict_2classes['DoS-SYN_Flood'] = 'Attack'
dict_2classes['DoS-TCP_Flood'] = 'Attack'
dict_2classes['DoS-HTTP_Flood'] = 'Attack'


dict_2classes['Mirai-greeth_flood'] = 'Attack'
dict_2classes['Mirai-greip_flood'] = 'Attack'
dict_2classes['Mirai-udpplain'] = 'Attack'

dict_2classes['Recon-PingSweep'] = 'Attack'
dict_2classes['Recon-OSScan'] = 'Attack'
dict_2classes['Recon-PortScan'] = 'Attack'
dict_2classes['VulnerabilityScan'] = 'Attack'
dict_2classes['Recon-HostDiscovery'] = 'Attack'

dict_2classes['DNS_Spoofing'] = 'Attack'
dict_2classes['MITM-ArpSpoofing'] = 'Attack'

dict_2classes['BenignTraffic'] = 'Benign'

dict_2classes['BrowserHijacking'] = 'Attack'
dict_2classes['Backdoor_Malware'] = 'Attack'
dict_2classes['XSS'] = 'Attack'
dict_2classes['Uploading_Attack'] = 'Attack'
dict_2classes['SqlInjection'] = 'Attack'
dict_2classes['CommandInjection'] = 'Attack'

dict_2classes['DictionaryBruteForce'] = 'Attack'

In [None]:
# Define the models
ML_models = [
    LogisticRegression(n_jobs=-1),
    Perceptron(),
    AdaBoostClassifier(),
    RandomForestClassifier(n_jobs=-1),
    DecisionTreeClassifier()
]

# Define the model names
ML_names = [
    "LogisticRegression",
    "Perceptron",
    "AdaBoost",
    "RandomForest",
    "DecisionTree"
]

# Train each model on each dataset
for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])

    # Convert the labels using the dictionary
    new_y = [dict_2classes[k] for k in d[y_column]]
    d[y_column] = new_y

    # Train the models
    for i, model in enumerate(ML_models):
        model.fit(d[X_columns], d[y_column])
        print(f"{ML_names[i]} has been trained on {train_set}")
        # Save the trained model
        filename = f"model_2classes_{ML_names[i]}.sav"
        joblib.dump(model, filename)
    del d


In [None]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y

    y_test += list(d_test[y_column].values)

    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred
        
# Saving the predictions using joblib
joblib.dump(preds, "predictions_2classes.sav")
joblib.dump(y_test, "y_test_2classes.sav")


In [None]:
for k, v in preds.items():
    y_pred = v
    print(f"##### {ML_names[k]} (2 classes) #####")  # Corrected the name
    print('accuracy_score = ', accuracy_score(y_test, y_pred))
    print('recall_score = ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score = ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score = ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()

# Saving the evaluation metrics using joblib
joblib.dump(evaluation_metrics_2classes, "evaluation_metrics_2classes.sav")

In [None]:
# Lists to store the evaluation metrics for each model
accuracies = []
recalls = []
precisions = []
f1_scores = []

# Calculate and store metrics for each model
for k, v in preds.items():
    y_pred = v
    accuracies.append(accuracy_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred, average='macro'))
    precisions.append(precision_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Set up the matplotlib figure
plt.figure(figsize=(15, 12))

# Define metrics and their names for plotting
metrics = [accuracies, recalls, precisions, f1_scores]
metric_names = ['Accuracy', 'Recall', 'Precision', 'F1-Score']

# Plot each metric
for i, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
    plt.subplot(2, 2, i+1)
    sns.barplot(x=ML_names, y=metric, palette="viridis")
    plt.title(f'Model Comparison: {metric_name}')
    plt.ylabel(metric_name)
    plt.xticks(rotation=45)
    plt.ylim(0, 1)  # Since all metrics are in the range [0, 1]

plt.tight_layout()
plt.show()

