# Centralised Learning and Federated Learning on the CICIoT2023 dataset

This notebook extends on the functionality of the CICIoT2023 example notebook, to account for improvement to the centralised training of all data instances.

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm
import warnings
#warnings.filterwarnings('ignore')


In [2]:
DATASET_DIRECTORY = '../datasets/CICIoT2023/'

Include the defines for the dataframe columns and the attack labels and their mappings

In [3]:
from includes import X_columns, y_column, dict_34_classes, dict_8_classes, dict_7_classes, dict_2_classes

In [4]:
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# # Create the training and test sets
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

# TODO - REMOVE THIS - Works on 20% of the data for low memory machines
# Create the training and test sets - LOW MEMORY CLUDGE FOR JON
# training_sets = df_sets[:int(len(df_sets)*.2)]
# test_sets = df_sets[int(len(df_sets)*.8):]

---
# TEMP CODE

In [5]:
# Set training_sets to the last entry of training_sets
training_sets = training_sets[-1:]
print(f"HACK TO REPLICATE ORIGINAL AUTHORS CODE WITH ONE FILE TRAIN - {training_sets}")

HACK TO REPLICATE ORIGINAL AUTHORS CODE WITH ONE FILE TRAIN - ['part-00134-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']


Remove this if you have more than a morsel of memory

---

# Create a new DataFrame that consists of all CSV datA

This is **memory intensive** as it will create a DataFrame with 36 million rows.

In [6]:
# Depreciated method
# df = []

# count = 0
# for train_set in tqdm(training_sets):
#     if count == 0:
#         df = pd.read_csv(DATASET_DIRECTORY + train_set)
#     else:
#         df_new = pd.read_csv(DATASET_DIRECTORY + train_set)
#         df = df.append(df_new, ignore_index=True)
#     count = count + 1

In [7]:
# New faster method not using depreciated pandas append
dfs = []
for train_set in tqdm(training_sets):
    df_new = pd.read_csv(DATASET_DIRECTORY + train_set)
    dfs.append(df_new)
df = pd.concat(dfs, ignore_index=True)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.52it/s]


In [8]:
df

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000838,54.62,6.05,64.00,11.961779,11.961779,0.0,0.0,0.0,0.0,...,0.111473,54.45,8.307598e+07,9.5,10.392912,0.037895,0.035900,0.02,141.55,DDoS-TCP_Flood
1,0.005486,75.88,6.00,64.00,29.502125,29.502125,0.0,0.0,1.0,0.0,...,0.100314,54.24,8.309325e+07,9.5,10.395361,0.143036,0.346802,0.03,141.55,DDoS-SYN_Flood
2,0.000000,0.00,45.61,65.81,151.517376,151.517376,0.0,0.0,0.0,0.0,...,57.165223,576.80,8.369379e+07,9.5,33.783684,80.958879,8638.780727,0.40,141.55,Mirai-greeth_flood
3,0.000000,54.00,6.00,64.00,1.500542,1.500542,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.309408e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DDoS-SYN_Flood
4,0.004568,745.42,5.95,65.13,8.082100,8.082100,0.0,0.0,0.0,0.0,...,549.190629,927.04,8.333561e+07,9.5,41.550978,776.661367,318084.344439,0.95,141.55,DDoS-ACK_Fragmentation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243644,0.000000,54.00,6.00,64.00,19.582485,19.582485,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.331443e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DDoS-PSHACK_Flood
243645,0.037146,78.22,36.21,63.18,24.542045,24.542045,0.0,0.0,0.0,0.0,...,110.233513,453.78,8.358187e+07,9.5,30.338676,154.660856,23401.960226,0.53,141.55,Mirai-greip_flood
243646,3.293075,1025996.92,17.00,64.00,572.160392,572.160392,0.0,0.0,0.0,0.0,...,0.000000,554.00,8.378910e+07,9.5,33.286634,0.000000,0.000000,0.00,141.55,Mirai-udpplain
243647,0.047343,35223.00,17.00,64.00,15083.107398,15083.107398,0.0,0.0,0.0,0.0,...,0.000000,50.00,8.309852e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,DDoS-UDP_Flood


## Map the y labels to integers

In [9]:
# Map y column to the dict_34_classes values
df['label'] = df['label'].map(dict_34_classes)

# Save this output to a Pickle file

In [10]:
df.to_pickle('training_data.pkl')

We can now retrieve the dataset from the pkl in further work (pickle file approx 2GB compared to 12GB of CSV data).

---

# Read the pickle file


In [11]:
# Read the pickle file
df = pd.read_pickle('training_data.pkl')

# Scale the input features

In [12]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()
df[X_columns] = scaler.fit_transform(df[X_columns])

# Classification Problem (2-class, 8-class, or 34-class)
Select which size classification problem you want to solve.

In [13]:
binary_classifier = True
group_classifier = False
individual_classifier = False

if group_classifier:
    print("Group 8 Class Classifier...")
    # Map y column to the dict_7_classes values
    df['label'] = df['label'].map(dict_8_classes)
    class_size = "8"
        
elif binary_classifier:
    print("Binary 2 Class Classifier...")
    # Map y column to the dict_2_classes values
    df['label'] = df['label'].map(dict_2_classes)
    class_size = "2"

else:
    print ("Individual 34 Class classifier...")
    class_size = "34"
    
    

Binary 2 Class Classifier...


# Model Creation (LR, RF, MLP)

In [14]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import pickle
from datetime import datetime

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

ML_models = [
    ("LogisticRegression", LogisticRegression(n_jobs=-1), f"logreg-{class_size}class-model.pkl"),
    ("RandomForestClassifier", RandomForestClassifier(), f"rf-{class_size}class-model.pkl"),
    ("MLPClassifier", MLPClassifier(), f"mlp-{class_size}class-model.pkl")
]

def train_and_evaluate(name, model, model_file, df):
    print(datetime.now(), f" : Fit {name} model...")
    model.fit(df[X_columns], df[y_column])
    print(datetime.now(), f" : Fit {name} model complete...")
    
    with open(model_file, "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = []
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        # Always map the y column to the dict_34_classes values
        new_y = [dict_34_classes[k] for k in d_test[y_column]]
        d_test[y_column] = new_y

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2_classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_8_classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds += y_pred

    print(f"##### {name} ({class_size} classes) #####")
    print('accuracy_score: ', accuracy_score(preds, y_test))
    print('recall_score: ', recall_score(preds, y_test, average='macro'))
    print('precision_score: ', precision_score(preds, y_test, average='macro'))
    print('f1_score: ', f1_score(preds, y_test, average='macro'))
    print('\n')

for name, model, model_file in ML_models:
    train_and_evaluate(name, model, model_file, df)

2023-07-08 12:30:43.940463  : Fit LogisticRegression model...
2023-07-08 12:30:59.935965  : Fit LogisticRegression model complete...


100%|██████████| 34/34 [01:00<00:00,  1.77s/it]


##### LogisticRegression (2 classes) #####
accuracy_score:  0.9890232850339564
recall_score:  0.8904023664038468
precision_score:  0.8631580080482479
f1_score:  0.8762598157224799


2023-07-08 12:32:29.668409  : Fit RandomForestClassifier model...
2023-07-08 12:32:46.538445  : Fit RandomForestClassifier model complete...


100%|██████████| 34/34 [01:58<00:00,  3.47s/it]


##### RandomForestClassifier (2 classes) #####
accuracy_score:  0.9968563352156702
recall_score:  0.963039124051402
precision_score:  0.9690356854646043
f1_score:  0.9660170961654488


2023-07-08 12:35:14.283741  : Fit MLPClassifier model...
2023-07-08 12:37:39.177590  : Fit MLPClassifier model complete...


100%|██████████| 34/34 [01:12<00:00,  2.13s/it]


##### MLPClassifier (2 classes) #####
accuracy_score:  0.9933677048162016
recall_score:  0.9224608691470502
precision_score:  0.9356547334449308
f1_score:  0.928950718304631


CPU times: total: 19min 25s
Wall time: 8min 37s


# Load in a Pickled model result

In [15]:
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'model.pkl'

# Calculate Test Performance metrics

In [None]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    if binary_classifier:
        # binary classifier (2-class)
        new_y = [dict_2_classes[k] for k in d_test[y_column]]
        d_test[y_column] = new_y


    elif group_classifier:
        # group classifier (8-class)
        new_y = [dict_7_classes[k] for k in d_test[y_column]]
        d_test[y_column] = new_y

    else:
        # individual_classifier
        pass

    y_test += list(d_test[y_column].values)

    y_pred = list(model.predict(d_test[X_columns]))
    preds[0] = preds[0] + y_pred

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_names[k]} (34 classes) #####")
    print('accuracy_score: ', accuracy_score(y_pred, y_test))
    print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()