# 03 - Federated Learning with Edge IIoT Dataset using Flower and TensorFlow/Keras

In this notebook we use the Flower Federated Learning library (flower.dev) with scikit-learn to distribute the Edge-IIoT data across multiple clients in various different ways.

In [196]:
### THIS SECTION NEEDS TO BE SET TO DETERMINE WHICH CONFIGURATION METHOD TO UTILISE

AVAILABLE_METHODS = ['INDIVIDUAL_IP', 'GROUP_IP', 'INDIVIDUAL_ATTACK', 'GROUP_ATTACK', 'STRATIFIED']
METHOD = AVAILABLE_METHODS[0]
NUM_OF_STRATIFIED_CLIENTS = 10 # only applies to stratified method
print (METHOD)

INDIVIDUAL_IP


In [197]:
%%capture
%pip install flwr[simulation] torch torchvision matplotlib sklearn openml

In [198]:
import os
import pandas as pd
import numpy as np
import flwr as fl
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from flwr.common import Metrics
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10


In [199]:
print("flwr", fl.__version__)
print("numpy", np.__version__)
print("torch", torch.__version__)
print("torchvision", torchvision.__version__)

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Training on {DEVICE}")

flwr 1.4.0
numpy 1.24.3
torch 2.0.1+cpu
torchvision 0.15.2+cpu
Training on cpu


In [200]:
dataset_path = "../datasets/Edge-IIoT/"

df = pd.read_pickle(dataset_path + "Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.pkl")   

## Make two dataframes for 2 models. EI Binary classification and Multi-class classification

In [228]:
# Binary attack dataframe
binary_df=df[['frame.time', 'ip.src_host', 'ip.dst_host', 'arp.dst.proto_ipv4',
       'arp.opcode', 'arp.hw.size', 'arp.src.proto_ipv4', 'icmp.checksum',
       'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused',
       'http.file_data', 'http.content_length', 'http.request.uri.query',
       'http.request.method', 'http.referer', 'http.request.full_uri',
       'http.request.version', 'http.response', 'http.tls_port', 'tcp.ack',
       'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.options',
       'tcp.payload', 'tcp.seq', 'tcp.srcport', 'udp.port', 'udp.stream',
       'udp.time_delta', 'dns.qry.name', 'dns.qry.name.len', 'dns.qry.qu',
       'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request',
       'dns.retransmit_request_in', 'mqtt.conack.flags',
       'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len',
       'mqtt.msg_decoded_as', 'mqtt.msg', 'mqtt.msgtype', 'mqtt.proto_len',
       'mqtt.protoname', 'mqtt.topic', 'mqtt.topic_len', 'mqtt.ver',
       'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id', 'Attack_label']]

In [229]:
# Multiclass attack dataframe
multiclass_df = df[['frame.time', 'ip.src_host', 'ip.dst_host', 'arp.dst.proto_ipv4',
       'arp.opcode', 'arp.hw.size', 'arp.src.proto_ipv4', 'icmp.checksum',
       'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused',
       'http.file_data', 'http.content_length', 'http.request.uri.query',
       'http.request.method', 'http.referer', 'http.request.full_uri',
       'http.request.version', 'http.response', 'http.tls_port', 'tcp.ack',
       'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.options',
       'tcp.payload', 'tcp.seq', 'tcp.srcport', 'udp.port', 'udp.stream',
       'udp.time_delta', 'dns.qry.name', 'dns.qry.name.len', 'dns.qry.qu',
       'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request',
       'dns.retransmit_request_in', 'mqtt.conack.flags',
       'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len',
       'mqtt.msg_decoded_as', 'mqtt.msg', 'mqtt.msgtype', 'mqtt.proto_len',
       'mqtt.protoname', 'mqtt.topic', 'mqtt.topic_len', 'mqtt.ver',
       'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id',
       'Attack_type']]

In [230]:
known_sensor_ip_addresses = [ '192.168.0.101', '192.168.2.194', '192.168.3.18', '192.168.4.73', '192.168.5.47', '192.168.6.56', '192.768.7.62', '192.168.8.163']
print ("known_sensor_ip_addresses:", known_sensor_ip_addresses)

known_sensor_ip_addresses: ['192.168.0.101', '192.168.2.194', '192.168.3.18', '192.168.4.73', '192.168.5.47', '192.168.6.56', '192.768.7.62', '192.168.8.163']


In [251]:
# For each known sensor IP address, show how many row it is associated with the ip.src_host column and ip.dst_host column
for ip in known_sensor_ip_addresses:
    print ("\nip:", ip)
    print ("ip.src_host:", len(df[df['ip.src_host'] == ip]))
    print ("ip.dst_host:", len(df[df['ip.dst_host'] == ip]))



ip: 192.168.0.101
ip.src_host: 667281
ip.dst_host: 665336

ip: 192.168.2.194
ip.src_host: 0
ip.dst_host: 0

ip: 192.168.3.18
ip.src_host: 0
ip.dst_host: 0

ip: 192.168.4.73
ip.src_host: 0
ip.dst_host: 0

ip: 192.168.5.47
ip.src_host: 0
ip.dst_host: 0

ip: 192.168.6.56
ip.src_host: 0
ip.dst_host: 0

ip: 192.768.7.62
ip.src_host: 0
ip.dst_host: 0

ip: 192.168.8.163
ip.src_host: 0
ip.dst_host: 0


## Binary Classification

Categorical data encoding (Dummy Encoding):

EG. Takes a product category and converts it to a binary vector

In [231]:
def encode_text_dummy(df, name):

    dummies = pd.get_dummies(df[name])

    for x in dummies.columns:

        dummy_name = f"{name}-{x}"

        df[dummy_name] = dummies[x]

    df.drop(name, axis=1, inplace=True)

encode_text_dummy(binary_df,'http.request.method')

encode_text_dummy(binary_df,'http.referer')

encode_text_dummy(binary_df,"http.request.version")

encode_text_dummy(binary_df,"dns.qry.name.len")

encode_text_dummy(binary_df,"mqtt.conack.flags")

encode_text_dummy(binary_df,"mqtt.protoname")

encode_text_dummy(binary_df,"mqtt.topic")

In [232]:
# print max index value of binary_df
print("max index value of binary_df:", max(binary_df.index))
print(binary_df.shape)

max index value of binary_df: 2219200
(2219201, 111)


We need to drop some unrequired columns from the DF, but we need to keep the original around as we may need to split the data differently for different models based on things like the IP address

In [233]:
#drop_columns = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4","arp.dst.proto_ipv4", 
drop_columns = ["frame.time", "arp.src.proto_ipv4","arp.dst.proto_ipv4", 

         "http.file_data","http.request.full_uri","icmp.transmit_timestamp",

         "http.request.uri.query", "tcp.options","tcp.payload","tcp.srcport",

         "tcp.dstport", "udp.port", "mqtt.msg"]

binary_df = binary_df.drop(drop_columns, axis=1)

binary_df = binary_df.dropna(axis=0, how='any')

binary_df = binary_df.drop_duplicates(subset=None, keep="first")

# We cant shuffle at this point as we need to keep the order so we can split the dataset later based on things like IP address
#binary_df_copy = shuffle(binary_df_copy)

# Compute the number of missing values (NaN or null) in each column of a pandas DataFrame object named df.
binary_df.isna().sum()

ip.src_host                            0
ip.dst_host                            0
arp.opcode                             0
arp.hw.size                            0
icmp.checksum                          0
                                      ..
mqtt.protoname-0.0                     0
mqtt.protoname-MQTT                    0
mqtt.topic-0                           0
mqtt.topic-0.0                         0
mqtt.topic-Temperature_and_Humidity    0
Length: 98, dtype: int64

In [234]:
# print max index value of binary_df
print("max index value of binary_df:", max(binary_df.index))
print(binary_df.shape)

binary_df.reset_index(drop=True, inplace=True)

# print max index value of binary_df
print("max index value of binary_df:", max(binary_df.index))
print(binary_df.shape)


max index value of binary_df: 2219200
(1957467, 98)
max index value of binary_df: 1957466
(1957467, 98)


In [241]:

label = binary_df['Attack_label']
le = preprocessing.LabelEncoder()
label_n = le.fit_transform(label.values)

# Stratify based on the attack label to balance the dataset - This is our original copy of the data include IP addresses
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(binary_df, label_n, stratify=label_n, test_size=0.2, random_state=42)

print(X_train_df.shape)

X_train_df.reset_index(drop=True, inplace=True)

#print the max index of X_train_df
print(X_train_df.index.max())

#  print the max index of binary_df
print(binary_df.index.max())

(1565973, 98)
1565972
1957466


In [237]:
label = binary_df['Attack_label']
le = preprocessing.LabelEncoder()
label_n = le.fit_transform(label.values)

# Stratify based on the attack label to balance the dataset - This is our original copy of the data include IP addresses
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(binary_df, label_n, stratify=label_n, test_size=0.2, random_state=42)

print(X_train_df.shape)

#print the max index of X_train_df
print(X_train_df.index.max())

#  print the max index of binary_df
print(binary_df.index.max())

(1565973, 98)
1957466
1957466


In [242]:
binary_df_copy = binary_df.copy()

binary_df_copy = binary_df_copy.drop(["ip.src_host", "ip.dst_host", "Attack_label"], axis=1)

# This is our copy of the data without IP addresses
scaled_features = StandardScaler().fit_transform(binary_df_copy.values)
X_train, X_test, y_train, y_test = train_test_split(scaled_features, Y, stratify=Y, test_size=0.2, random_state=42)

print ("Train:", X_train.shape, y_train.shape)
print ("Test:", X_test.shape, y_test.shape)


Train: (1565973, 95) (1565973,)
Test: (391494, 95) (391494,)


In [205]:
# Binary Attack classification
N_CLASSES = 2
# Number of features = size of binary_df - 1 (Attack_label)
N_FEATURES = binary_df.shape[1] - 1

In [249]:
fl_X_train = []
fl_Y_train = []

if METHOD == AVAILABLE_METHODS[0]:
    # Individual IP address
    for ip in known_sensor_ip_addresses:
        new_ip = [ip]
        print("new_ip:", new_ip)

        X_train_df['ip.src_host']
        
        print("Shape X_Train:", X_train.shape)
        print("Shape y_Train:", y_train.shape)

        # Filter dataframe by IP address
        new_df_src = X_train_df[ X_train_df['ip.src_host'].isin(new_ip) ]
        new_df_dst = X_train_df[ X_train_df['ip.dst_host'].isin(new_ip) ]

        print("Shape new_df_src:", new_df_src.shape)
        print("Shape new_df_dst:", new_df_dst.shape)

        X_np = np.vstack([ X_train[ new_df_src.index, : ], X_train[ new_df_dst.index, :] ])
        y_np = np.hstack([ y_train[ new_df_src.index ], y_train[ new_df_dst.index ] ])

        print ("x_np:", X_np.shape)
        print ("y_np:", y_np.shape)

        fl_X_train.append(X_np)
        fl_Y_train.append(y_np)



new_ip: 192.168.0.101
Shape X_Train: (1565973, 95)
Shape y_Train: (1565973,)
Shape new_df_src: (438417, 98)
Shape new_df_dst: (436184, 98)
x_np: (874601, 95)
y_np: (874601,)
new_ip: 192.168.2.194
Shape X_Train: (1565973, 95)
Shape y_Train: (1565973,)
Shape new_df_src: (0, 98)
Shape new_df_dst: (0, 98)
x_np: (0, 95)
y_np: (0,)
new_ip: 192.168.3.18
Shape X_Train: (1565973, 95)
Shape y_Train: (1565973,)
Shape new_df_src: (0, 98)
Shape new_df_dst: (0, 98)
x_np: (0, 95)
y_np: (0,)
new_ip: 192.168.4.73
Shape X_Train: (1565973, 95)
Shape y_Train: (1565973,)
Shape new_df_src: (0, 98)
Shape new_df_dst: (0, 98)
x_np: (0, 95)
y_np: (0,)
new_ip: 192.168.5.47
Shape X_Train: (1565973, 95)
Shape y_Train: (1565973,)
Shape new_df_src: (0, 98)
Shape new_df_dst: (0, 98)
x_np: (0, 95)
y_np: (0,)
new_ip: 192.168.6.56
Shape X_Train: (1565973, 95)
Shape y_Train: (1565973,)
Shape new_df_src: (0, 98)
Shape new_df_dst: (0, 98)
x_np: (0, 95)
y_np: (0,)
new_ip: 192.768.7.62
Shape X_Train: (1565973, 95)
Shape y_Tr

# FL Part

In [None]:
from typing import Tuple, Union, List
import numpy as np
from sklearn.linear_model import LogisticRegression
import openml

XY = Tuple[np.ndarray, np.ndarray]
Dataset = Tuple[XY, XY]
LogRegParams = Union[XY, Tuple[np.ndarray]]
XYList = List[XY]

def get_model_parameters(model: LogisticRegression) -> LogRegParams:
    """Returns the paramters of a sklearn LogisticRegression model."""
    if model.fit_intercept:
        params = [
            model.coef_,
            model.intercept_,
        ]
    else:
        params = [
            model.coef_,
        ]
    return params

def set_model_params(
    model: LogisticRegression, params: LogRegParams
) -> LogisticRegression:
    """Sets the parameters of a sklean LogisticRegression model."""
    model.coef_ = params[0]
    if model.fit_intercept:
        model.intercept_ = params[1]
    return model

def set_initial_params(model: LogisticRegression):
    """Sets initial parameters as zeros Required since model params are
    uninitialized until model.fit is called.
    But server asks for initial parameters from clients at launch. Refer
    to sklearn.linear_model.LogisticRegression documentation for more
    information.
    """
    n_classes = N_CLASSES
    n_features = N_FEATURES
    model.classes_ = np.array([i for i in range(10)])

    model.coef_ = np.zeros((n_classes, n_features))
    if model.fit_intercept:
        model.intercept_ = np.zeros((n_classes,))

        