# Training an ML model using CICIoT2023

This notebook shows how a LogisticRegression model can be trained using the CICIoT2023 csv files.

In [1]:
#Regular EDA and plotting libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# We want our plots to appear in the notebook
%matplotlib inline

## Models
from tqdm import tqdm, trange
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Importing Perceptron and AdaBoost
from sklearn.linear_model import Perceptron
from sklearn.ensemble import AdaBoostClassifier

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay

In [2]:
# Directory holding the CICIoT2023 CSV files (a relative path). The trailing
# slash matters: several later cells build file paths with
# `DATASET_DIRECTORY + file_name`.
DATASET_DIRECTORY = 'CICIoT2023/'

### Importing Dataset

In [3]:
# Collect the CSV file names found in the dataset directory, in sorted order
# so that the split below is deterministic.
df_sets = [k for k in tqdm(os.listdir(DATASET_DIRECTORY), desc="Filtering .csv files") if k.endswith('.csv')]
df_sets.sort()

# File-level train/test split. The scaling, training and evaluation cells
# further down iterate over `training_sets` and `test_sets`, so they have to
# be defined here: the first TRAIN_FRACTION of the files is used for training
# and the remaining files are held out for evaluation.
TRAIN_FRACTION = 0.8
split_at = int(len(df_sets) * TRAIN_FRACTION)
training_sets = df_sets[:split_at]
test_sets = df_sets[split_at:]

Filtering .csv files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169/169 [00:00<00:00, 114402.42it/s]


In [4]:
# Number of CSV files found in the dataset directory.
length_of_list = len(df_sets)
print(length_of_list)

169


In [5]:
# Read the first 50 CSV files (in sorted file-name order), one DataFrame each.
dfs = [
    pd.read_csv(os.path.join(DATASET_DIRECTORY, csv_file))
    for csv_file in tqdm(df_sets[:50], desc="Loading CSVs")
]

# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

Loading CSVs: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:17<00:00,  1.54s/it]


In [6]:
# (rows, columns) of the concatenated frame.
df.shape

(12859545, 47)

In [7]:
# All column names of the frame as a plain Python list.
df.columns.to_list()

['flow_duration',
 'Header_Length',
 'Protocol Type',
 'Duration',
 'Rate',
 'Srate',
 'Drate',
 'fin_flag_number',
 'syn_flag_number',
 'rst_flag_number',
 'psh_flag_number',
 'ack_flag_number',
 'ece_flag_number',
 'cwr_flag_number',
 'ack_count',
 'syn_count',
 'fin_count',
 'urg_count',
 'rst_count',
 'HTTP',
 'HTTPS',
 'DNS',
 'Telnet',
 'SMTP',
 'SSH',
 'IRC',
 'TCP',
 'UDP',
 'DHCP',
 'ARP',
 'ICMP',
 'IPv',
 'LLC',
 'Tot sum',
 'Min',
 'Max',
 'AVG',
 'Std',
 'Tot size',
 'IAT',
 'Number',
 'Magnitue',
 'Radius',
 'Covariance',
 'Variance',
 'Weight',
 'label']

## Exploratory Data Analysis

Let us try and make sense of the data

In [8]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.0,54.0,6.0,64.0,0.329807,0.329807,0.0,1.0,0.0,1.0,...,0.0,54.0,83343830.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-RSTFINFlood
1,0.0,57.04,6.33,64.0,4.290556,4.290556,0.0,0.0,0.0,0.0,...,2.822973,57.04,82926070.0,9.5,10.464666,4.010353,160.987842,0.05,141.55,DoS-TCP_Flood
2,0.0,0.0,1.0,64.0,33.396799,33.396799,0.0,0.0,0.0,0.0,...,0.0,42.0,83127990.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood
3,0.328175,76175.0,17.0,64.0,4642.13301,4642.13301,0.0,0.0,0.0,0.0,...,0.0,50.0,83015700.0,9.5,10.0,0.0,0.0,0.0,141.55,DoS-UDP_Flood
4,0.11732,101.73,6.11,65.91,6.202211,6.202211,0.0,0.0,1.0,0.0,...,23.113111,57.88,82973000.0,9.5,11.346876,32.716243,3016.808286,0.19,141.55,DoS-SYN_Flood
5,0.0,0.0,47.0,64.0,1.954123,1.954123,0.0,0.0,0.0,0.0,...,0.0,592.0,83698400.0,9.5,34.409301,0.0,0.0,0.0,141.55,Mirai-greeth_flood
6,1.052463,108.0,6.0,64.0,1.902353,1.902353,0.0,0.0,1.0,0.0,...,0.0,54.0,83365480.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SynonymousIP_Flood
7,0.142555,2322.79,6.66,79.77,493.283636,493.283636,0.0,0.0,0.0,0.0,...,35.347029,94.41,83707170.0,9.5,13.422817,48.122903,3201.472632,0.38,141.55,Mirai-udpplain
8,0.002135,192.52,16.89,65.73,16.88324,16.88324,0.0,0.0,0.0,0.0,...,7.624944,180.72,83007320.0,9.5,18.809396,10.792987,328.325187,0.19,141.55,DoS-UDP_Flood
9,0.0,54.2,6.0,64.0,11.243547,11.243547,0.0,0.0,1.0,0.0,...,0.619849,54.2,83089060.0,9.5,10.409168,0.878113,3.254011,0.12,141.55,DDoS-SYN_Flood


`value_counts()` allows us to see how many times each value of a **categorical** column appears.

In [9]:
# Number of rows per distinct value of the `label` column.
df.label.value_counts()

DDoS-ICMP_Flood            1983254
DDoS-UDP_Flood             1491863
DDoS-TCP_Flood             1239990
DDoS-PSHACK_Flood          1126510
DDoS-SYN_Flood             1118677
DDoS-RSTFINFlood           1113910
DDoS-SynonymousIP_Flood     989865
DoS-UDP_Flood               914419
DoS-TCP_Flood               736157
DoS-SYN_Flood               557565
BenignTraffic               302896
Mirai-greeth_flood          272706
Mirai-udpplain              245167
Mirai-greip_flood           207678
DDoS-ICMP_Fragmentation     124468
MITM-ArpSpoofing             84747
DDoS-UDP_Fragmentation       79529
DDoS-ACK_Fragmentation       78956
DNS_Spoofing                 49429
Recon-HostDiscovery          36740
Recon-OSScan                 26846
Recon-PortScan               22604
DoS-HTTP_Flood               19759
VulnerabilityScan            10343
DDoS-HTTP_Flood               7942
DDoS-SlowLoris                6508
DictionaryBruteForce          3590
BrowserHijacking              1621
CommandInjection    

In [10]:
# Number of rows whose label is exactly "BenignTraffic".
(df.label == "BenignTraffic").sum()

302896

The dataset contains many types of cyber attacks, each labelled with its own category. The label "BenignTraffic" marks benign (harmless) traffic, whereas every other label denotes a specific type of malicious traffic. The presence of numerous attack types alongside benign traffic makes the dataset diverse and therefore suitable for building and evaluating intrusion detection systems.

In [15]:
# Feature frame: every column of `df` apart from the target column.
X = df.drop(columns="label")

# Target: the values of the `label` column.
y = df["label"].values

In [16]:
# Display the feature frame (all columns of `df` except `label`).
X

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
0,0.000000,54.00,6.00,64.00,0.329807,0.329807,0.0,1.0,0.0,1.0,...,54.000000,0.000000,54.00,8.334383e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55
1,0.000000,57.04,6.33,64.00,4.290556,4.290556,0.0,0.0,0.0,0.0,...,54.796404,2.822973,57.04,8.292607e+07,9.5,10.464666,4.010353,160.987842,0.05,141.55
2,0.000000,0.00,1.00,64.00,33.396799,33.396799,0.0,0.0,0.0,0.0,...,42.000000,0.000000,42.00,8.312799e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55
3,0.328175,76175.00,17.00,64.00,4642.133010,4642.133010,0.0,0.0,0.0,0.0,...,50.000000,0.000000,50.00,8.301570e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55
4,0.117320,101.73,6.11,65.91,6.202211,6.202211,0.0,0.0,1.0,0.0,...,67.959230,23.113111,57.88,8.297300e+07,9.5,11.346876,32.716243,3016.808286,0.19,141.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12859540,0.006055,68.22,6.22,64.64,45.026574,45.026574,0.0,0.0,1.0,0.0,...,56.941648,9.692716,57.18,8.308897e+07,9.5,10.655768,13.730903,634.163598,0.35,141.55
12859541,0.557748,118.80,6.00,64.00,3.231386,3.231386,0.0,0.0,1.0,0.0,...,54.000000,0.000000,54.00,8.336253e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55
12859542,0.000000,0.00,1.00,64.00,2.171149,2.171149,0.0,0.0,0.0,0.0,...,42.000000,0.000000,42.00,8.312408e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55
12859543,0.000000,53.46,5.95,64.00,13.324980,13.324980,0.0,0.0,0.0,0.0,...,54.095485,0.398463,54.44,8.333215e+07,9.5,10.401502,0.566842,4.018273,0.04,141.55


In [None]:
# Names of the 46 feature columns fed to the scaler and the models.
# The opening line of this list must not be commented out: the scaling,
# training and evaluation cells below all index the data with `X_columns`.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight',
]

# Name of the target column.
y_column = 'label'

### Scaling

In [None]:
# NOTE: MinMaxScaler is imported here but not used anywhere in this notebook.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Single scaler instance shared by the fitting, training and evaluation cells below.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from all training files, one file at a time.
# `fit` resets the scaler on every call, which would leave it describing the
# last file only; `partial_fit` updates the running statistics incrementally,
# so the result reflects every training file.
for train_set in tqdm(training_sets):
    scaler.partial_fit(pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))[X_columns])

### Classification: 34 (33+1) classes

In [None]:
# Models to train and, at the same positions, the names used when reporting.
ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]

# `fit` trains from scratch on every call, so fitting inside the per-file
# loop would leave each model trained on the last file only. Instead, scale
# every training file, stack them, and fit each model exactly once.
# NOTE(review): this holds the whole training split in memory; if that does
# not fit, subsample the files or switch to an estimator that supports
# `partial_fit`.
train_frames = []
for train_set in tqdm(training_sets):
    d = pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    d[X_columns] = scaler.transform(d[X_columns])
    train_frames.append(d)
train_data = pd.concat(train_frames, ignore_index=True)
del train_frames, d  # drop the per-file frames; `train_data` owns a copy

for model in ML_models:
    model.fit(train_data[X_columns], train_data[y_column])
del train_data

In [None]:
# Ground-truth labels and, keyed by model position in `ML_models`, the
# predicted labels — both accumulated over every held-out file.
y_test = []
preds = {idx: [] for idx, _ in enumerate(ML_models)}

for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    # Apply the already-fitted scaler to the feature columns.
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    y_test.extend(d_test[y_column].values)

    for idx, clf in enumerate(ML_models):
        preds[idx].extend(clf.predict(d_test[X_columns]))


In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Report the metrics for each trained model (macro-averaged over classes).
# scikit-learn metrics take the arguments as (y_true, y_pred). Passing them
# the other way round exchanges the macro recall and macro precision values,
# so the ground truth `y_test` must come first.
for k, y_pred in preds.items():
    print(f"##### {ML_neams[k]} (34 classes) #####")
    print('accuracy_score: ', accuracy_score(y_test, y_pred))
    print('recall_score: ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score: ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()

### Classification: 8 (7+1) classes

In [None]:
# Fine-grained labels grouped by the coarse class they collapse into
# (7 attack families plus benign traffic, i.e. 8 classes in total).
_LABELS_BY_FAMILY = {
    'DDoS': [
        'DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood',
        'DDoS-UDP_Flood', 'DDoS-TCP_Flood', 'DDoS-ICMP_Flood',
        'DDoS-SynonymousIP_Flood', 'DDoS-ACK_Fragmentation',
        'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
        'DDoS-SlowLoris', 'DDoS-HTTP_Flood',
    ],
    'DoS': [
        'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-HTTP_Flood',
    ],
    'Mirai': [
        'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain',
    ],
    'Recon': [
        'Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan',
        'VulnerabilityScan', 'Recon-HostDiscovery',
    ],
    'Spoofing': [
        'DNS_Spoofing', 'MITM-ArpSpoofing',
    ],
    'Benign': [
        'BenignTraffic',
    ],
    'Web': [
        'BrowserHijacking', 'Backdoor_Malware', 'XSS',
        'Uploading_Attack', 'SqlInjection', 'CommandInjection',
    ],
    'BruteForce': [
        'DictionaryBruteForce',
    ],
}

# Flat lookup: original label -> coarse class. The "7classes" name is kept
# because the training and evaluation cells look the mapping up under it.
dict_7classes = {
    fine_label: family
    for family, fine_labels in _LABELS_BY_FAMILY.items()
    for fine_label in fine_labels
}

In [None]:
# Fresh models for the 8-class task and, at the same positions, their names.
# (LogisticRegression is already imported in the notebook's first cell.)
ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]


# `fit` trains from scratch on every call, so fitting inside the per-file
# loop would leave each model trained on the last file only. Instead, scale
# and relabel every training file, stack them, and fit each model once.
# NOTE(review): this holds the whole training split in memory; if that does
# not fit, subsample the files or switch to an estimator that supports
# `partial_fit`.
train_frames = []
for train_set in tqdm(training_sets):
    d = pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    d[X_columns] = scaler.transform(d[X_columns])
    # Collapse the original labels into the coarse classes; a label missing
    # from the mapping raises KeyError rather than being silently dropped.
    d[y_column] = [dict_7classes[k] for k in d[y_column]]
    train_frames.append(d)
train_data = pd.concat(train_frames, ignore_index=True)
del train_frames, d  # drop the per-file frames; `train_data` owns a copy

for model in ML_models:
    model.fit(train_data[X_columns], train_data[y_column])
del train_data

In [None]:
# Ground-truth coarse labels and, keyed by model position in `ML_models`,
# the predicted labels — both accumulated over every held-out file.
y_test = []
preds = {idx: [] for idx, _ in enumerate(ML_models)}

for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    # Apply the already-fitted scaler to the feature columns.
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    # Replace each original label with its coarse class.
    d_test[y_column] = [dict_7classes[original] for original in d_test[y_column]]

    y_test.extend(d_test[y_column].values)

    for idx, clf in enumerate(ML_models):
        preds[idx].extend(clf.predict(d_test[X_columns]))


In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Report the metrics for each trained model (macro-averaged over classes).
# scikit-learn metrics take the arguments as (y_true, y_pred). Passing them
# the other way round exchanges the macro recall and macro precision values,
# so the ground truth `y_test` must come first.
for k, y_pred in preds.items():
    print(f"##### {ML_neams[k]} (8 classes) #####")
    print('accuracy_score = ', accuracy_score(y_test, y_pred))
    print('recall_score = ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score = ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score = ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()

### Classification: 2 (1+1) classes

In [None]:
# Every original label, in the order used throughout this notebook.
_ALL_LABELS = [
    'DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood',
    'DDoS-UDP_Flood', 'DDoS-TCP_Flood', 'DDoS-ICMP_Flood',
    'DDoS-SynonymousIP_Flood', 'DDoS-ACK_Fragmentation',
    'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
    'DDoS-SlowLoris', 'DDoS-HTTP_Flood',

    'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-HTTP_Flood',

    'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain',

    'Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan',
    'VulnerabilityScan', 'Recon-HostDiscovery',

    'DNS_Spoofing', 'MITM-ArpSpoofing',

    'BenignTraffic',

    'BrowserHijacking', 'Backdoor_Malware', 'XSS',
    'Uploading_Attack', 'SqlInjection', 'CommandInjection',

    'DictionaryBruteForce',
]

# Binary lookup: 'BenignTraffic' maps to 'Benign', every other label to 'Attack'.
dict_2classes = {
    original: ('Benign' if original == 'BenignTraffic' else 'Attack')
    for original in _ALL_LABELS
}

In [None]:
# Fresh models for the binary task and, at the same positions, their names.
# (LogisticRegression is already imported in the notebook's first cell.)
ML_models = [
        LogisticRegression(n_jobs=-1),
]

ML_neams = [
        "LogisticRegression",
]


# `fit` trains from scratch on every call, so fitting inside the per-file
# loop would leave each model trained on the last file only. Instead, scale
# and relabel every training file, stack them, and fit each model once.
# NOTE(review): this holds the whole training split in memory; if that does
# not fit, subsample the files or switch to an estimator that supports
# `partial_fit`.
train_frames = []
for train_set in tqdm(training_sets):
    d = pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
    d[X_columns] = scaler.transform(d[X_columns])
    # Map the original labels to 'Attack' / 'Benign'; a label missing from
    # the mapping raises KeyError rather than being silently dropped.
    d[y_column] = [dict_2classes[k] for k in d[y_column]]
    train_frames.append(d)
train_data = pd.concat(train_frames, ignore_index=True)
del train_frames, d  # drop the per-file frames; `train_data` owns a copy

for model in ML_models:
    model.fit(train_data[X_columns], train_data[y_column])
del train_data

In [None]:
# Ground-truth binary labels and, keyed by model position in `ML_models`,
# the predicted labels — both accumulated over every held-out file.
y_test = []
preds = {idx: [] for idx, _ in enumerate(ML_models)}

for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    # Apply the already-fitted scaler to the feature columns.
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    # Replace each original label with 'Attack' or 'Benign'.
    d_test[y_column] = [dict_2classes[original] for original in d_test[y_column]]

    y_test.extend(d_test[y_column].values)

    for idx, clf in enumerate(ML_models):
        preds[idx].extend(clf.predict(d_test[X_columns]))


In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Report the metrics for each trained model (macro-averaged over classes).
# scikit-learn metrics take the arguments as (y_true, y_pred). Passing them
# the other way round exchanges the macro recall and macro precision values,
# so the ground truth `y_test` must come first.
for k, y_pred in preds.items():
    print(f"##### {ML_neams[k]} (2 classes) #####")
    print('accuracy_score: ', accuracy_score(y_test, y_pred))
    print('recall_score: ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score: ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()