# Centralised Learning and Federated Learning on the CICIoT2023 dataset

This notebook extends the CICIoT2023 example notebook with improvements to the centralised training over all data instances.

In [3]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Folder holding the CICIoT2023 CSV files. The trailing slash matters: file
# names are appended to this string directly when the CSVs are read.
DATASET_DIRECTORY = "../datasets/CICIoT2023/"

In [5]:
# Gather every CSV file of the dataset in sorted order, so the file-level
# train/test split below is the same on every run.
df_sets = sorted(
    file_name
    for file_name in os.listdir(DATASET_DIRECTORY)
    if file_name.endswith('.csv')
)
n_files = len(df_sets)

# The split is made per file. The full-size experiment trains on the first 80%:
#     training_sets = df_sets[:int(len(df_sets)*.8)]
#     test_sets = df_sets[int(len(df_sets)*.8):]
#
# LOW MEMORY KLUDGE: train on the first 20% of the files only, and keep the
# last 20% as the test set (the files in between are not used).
training_sets = df_sets[:int(n_files * .2)]
test_sets = df_sets[int(n_files * .8):]

In [2]:
# Input feature columns, in the order they are passed to the scaler and models.
# The spellings (e.g. 'Magnitue') are used verbatim as DataFrame column names,
# so they must not be "corrected" here.
X_columns = [
    # flow-level statistics
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    # TCP flag indicators
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
    'cwr_flag_number',
    # flag counters
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    # protocol indicators
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    # aggregate statistics
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
    'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Name of the target (class label) column.
y_column = 'label'

# Create a new DataFrame that consists of all CSV data

This is **memory intensive** as it will create a DataFrame with 36 million rows.

In [11]:
# Deprecated method (DataFrame.append inside a loop; DataFrame.append was removed in pandas 2.0)
# df = []

# count = 0
# for train_set in tqdm(training_sets):
#     if count == 0:
#         df = pd.read_csv(DATASET_DIRECTORY + train_set)
#     else:
#         df_new = pd.read_csv(DATASET_DIRECTORY + train_set)
#         df = df.append(df_new, ignore_index=True)
#     count = count + 1

 78%|███████▊  | 105/135 [18:44<10:34, 21.16s/it]

In [1]:
# New method: read every training CSV and concatenate them in a single call.
#
# The per-file frames are produced by a generator rather than collected in a
# named list, so no notebook variable keeps them alive once the combined frame
# exists. (A leftover list of parts would hold a second copy of the data in
# memory for the rest of the session.)
if not training_sets:
    # pd.concat raises an opaque "No objects to concatenate" on empty input;
    # fail early with a message that points at the real cause instead.
    raise ValueError(f"No training CSV files selected from {DATASET_DIRECTORY!r}")

df = pd.concat(
    (
        pd.read_csv(os.path.join(DATASET_DIRECTORY, train_set))
        for train_set in tqdm(training_sets)
    ),
    ignore_index=True,
)

NameError: name 'tqdm' is not defined

In [6]:
# Show the combined training frame as the cell output.
df

NameError: name 'df' is not defined

# Save this output to a Pickle file

In [15]:
# Cache the combined training frame on disk so later sessions can skip the CSV
# parsing step. The file is written to the current working directory.
df.to_pickle("training_data.pkl")

We can now retrieve the dataset from the pkl in further work (pickle file approx 2GB compared to 12GB of CSV data).

---

# Read the pickle file


In [6]:
# Restore the cached training frame written by the `to_pickle` cell, replacing
# whatever `df` currently refers to.
df = pd.read_pickle("training_data.pkl")


# Scale the input features

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Learn the per-feature scaling parameters from the training data only, then
# apply them to the training frame. The same fitted `scaler` is reused later to
# transform each test file.
#
# NOTE: this overwrites the feature columns of `df` in place. Running the cell
# a second time would re-fit the scaler on already-scaled values; reload `df`
# before re-running.
scaler = StandardScaler()
scaler.fit(df[X_columns])
df[X_columns] = scaler.transform(df[X_columns])

# Classification Problem (2-class, 8-class, or 34-class)

In [8]:
# Label granularity for the experiment. Set exactly one flag to True:
#   binary_classifier     -> 2 classes (Attack / Benign)
#   group_classifier      -> 8 classes (attack family, plus Benign)
#   individual_classifier -> labels are used exactly as found in the CSV files
# If neither of the first two is set, the individual labels are used.
binary_classifier = False
group_classifier = False
individual_classifier = True

# Individual label -> attack family (8 distinct values; the "7" in the name is
# kept because other cells refer to the dict by this name). Defined
# unconditionally so later cells can use it whichever flag is selected.
dict_7classes = {
    'DDoS-RSTFINFlood': 'DDoS',
    'DDoS-PSHACK_Flood': 'DDoS',
    'DDoS-SYN_Flood': 'DDoS',
    'DDoS-UDP_Flood': 'DDoS',
    'DDoS-TCP_Flood': 'DDoS',
    'DDoS-ICMP_Flood': 'DDoS',
    'DDoS-SynonymousIP_Flood': 'DDoS',
    'DDoS-ACK_Fragmentation': 'DDoS',
    'DDoS-UDP_Fragmentation': 'DDoS',
    'DDoS-ICMP_Fragmentation': 'DDoS',
    'DDoS-SlowLoris': 'DDoS',
    'DDoS-HTTP_Flood': 'DDoS',
    'DoS-UDP_Flood': 'DoS',
    'DoS-SYN_Flood': 'DoS',
    'DoS-TCP_Flood': 'DoS',
    'DoS-HTTP_Flood': 'DoS',
    'Mirai-greeth_flood': 'Mirai',
    'Mirai-greip_flood': 'Mirai',
    'Mirai-udpplain': 'Mirai',
    'Recon-PingSweep': 'Recon',
    'Recon-OSScan': 'Recon',
    'Recon-PortScan': 'Recon',
    'VulnerabilityScan': 'Recon',
    'Recon-HostDiscovery': 'Recon',
    'DNS_Spoofing': 'Spoofing',
    'MITM-ArpSpoofing': 'Spoofing',
    'BenignTraffic': 'Benign',
    'BrowserHijacking': 'Web',
    'Backdoor_Malware': 'Web',
    'XSS': 'Web',
    'Uploading_Attack': 'Web',
    'SqlInjection': 'Web',
    'CommandInjection': 'Web',
    'DictionaryBruteForce': 'BruteForce',
}

# Individual label -> 'Attack' / 'Benign'. Derived from the family mapping so
# the two dicts cannot drift apart: everything except benign traffic is an
# attack.
dict_2classes = {
    label: ('Benign' if family == 'Benign' else 'Attack')
    for label, family in dict_7classes.items()
}

# Pick the mapping to apply. The precedence (binary first, then group) matches
# the order in which the evaluation code remaps the test labels, so training
# and test labels agree even if both flags are set by mistake.
if binary_classifier:
    label_map = dict_2classes
elif group_classifier:
    label_map = dict_7classes
else:
    label_map = None

if label_map is None:
    print ("Assuming individual_classifier...")
else:
    present_labels = set(df[y_column].unique())
    if present_labels <= set(label_map.values()):
        # The labels were already remapped by an earlier run of this cell;
        # leave them untouched so the cell can be re-run safely.
        pass
    else:
        unknown_labels = present_labels - set(label_map)
        if unknown_labels:
            # Series.map would silently turn these into NaN.
            raise KeyError(f"Labels without a mapping: {sorted(unknown_labels)}")
        # Relabel the training frame `df` (the original code referred to an
        # undefined name `d` here and raised NameError).
        df[y_column] = df[y_column].map(label_map)
    

Assuming individual_classifier...


# Model Creation (LR, RF, MLP)

In [9]:
%%time

import pickle
from datetime import datetime

logreg = True
random_forest = True
mlp = True

if logreg:
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(n_jobs=-1)

    print (datetime.now(), " : Fit LR model...")
    model.fit(df[X_columns], df[y_column])
    print (datetime.now(), " : Fit LR model complete...")
    
    with open("logreg-34class-model.pkl", "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(len(ML_models))}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    for k,v in preds.items():
        y_pred = v
        print(f"##### {ML_neams[k]} (34 classes) #####")
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    
    
if random_forest:
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()

    print (datetime.now(), " : Fit RF model...")
    model.fit(df[X_columns], df[y_column])
    print (datetime.now(), " : Fit RF model complete...")
    
    

    with open("rf-34class-model.pkl", "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(len(ML_models))}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    for k,v in preds.items():
        y_pred = v
        print(f"##### {ML_neams[k]} (34 classes) #####")
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
        
if mlp:
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier()
    print (datetime.now(), " : Fit MLP model...")
    model.fit(df[X_columns], df[y_column])
    print (datetime.now(), " : Fit MLP model complete...")
    
    with open("mlp-34class-model.pkl", "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = {i:[] for i in range(len(ML_models))}
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y


        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_7classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds[0] = preds[0] + y_pred

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    for k,v in preds.items():
        y_pred = v
        print(f"##### {ML_neams[k]} (34 classes) #####")
        print('accuracy_score: ', accuracy_score(y_pred, y_test))
        print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
        print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
        print('f1_score: ', f1_score(y_pred, y_test, average='macro'))



2023-07-07 17:44:01.168036  : Fit LR model...


# Load in a Pickled model result

In [None]:
# Path of the pickled model to evaluate. The training cell writes
# "logreg-34class-model.pkl", "rf-34class-model.pkl" and
# "mlp-34class-model.pkl"; point this at whichever one should be evaluated.
MODEL_PATH = "logreg-34class-model.pkl"

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load pickles that were produced by this notebook or come from a trusted source.
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# Calculate Test Performance metrics

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Evaluate the currently loaded `model` on every test file. Files are read,
# scaled with the already-fitted `scaler`, and predicted one at a time so that
# only a single test file is held in memory at once.
y_test = []
y_pred = []
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    if binary_classifier:
        # binary classifier (2-class)
        d_test[y_column] = [dict_2classes[k] for k in d_test[y_column]]
    elif group_classifier:
        # group classifier (8-class)
        d_test[y_column] = [dict_7classes[k] for k in d_test[y_column]]
    # otherwise: individual_classifier, labels are used as found in the file

    # extend() grows the lists in place; rebuilding them with `+` for every
    # file would copy all earlier predictions again each time.
    y_test.extend(d_test[y_column].values)
    y_pred.extend(model.predict(d_test[X_columns]))

# Number of target classes implied by the classifier flags (34 by default).
if binary_classifier:
    n_classes = 2
elif group_classifier:
    n_classes = 8
else:
    n_classes = 34

# scikit-learn's metric functions take (y_true, y_pred) in that order; passing
# them the other way round swaps macro recall and macro precision.
print(f"##### {type(model).__name__} ({n_classes} classes) #####")
print('accuracy_score: ', accuracy_score(y_test, y_pred))
print('recall_score: ', recall_score(y_test, y_pred, average='macro'))
print('precision_score: ', precision_score(y_test, y_pred, average='macro'))
print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
print("\n\n")