## Refer to paper, codebase and google doc you have

## Experiment 1: Refer to Fig. 2 and Appendix Fig. 7. Use 105 features of Magnifier

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve, classification_report, average_precision_score
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from fcntl import F_SETFL
import pickle
import time
import random
from queue import Queue
import warnings
warnings.filterwarnings("ignore")
criterion = nn.MSELoss()
scaler = preprocessing.MinMaxScaler()

In [None]:
# Returning all files in directory.
def file_name_walk(file_dir):
    """Recursively collect every ``.csv`` file found below *file_dir*.

    Each entry is formatted as ``"<directory>/<file name>"`` and the entries
    appear in the order in which ``os.walk`` visits them.
    """
    return [
        "{}/{}".format(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
        if os.path.splitext(name)[1] == ".csv"
    ]

In [None]:
# Loading benign data from a list of devices.
def load_iot_data_seq(device_list=('philips_camera',), begin=0, end=5):
    """Load benign traffic CSVs for the given devices into one DataFrame.

    For every device name, the ``.csv`` files below
    ``./DataSets/normal-kitsune_test/<device>`` are sorted by path and the
    slice ``[begin:end]`` of that sorted list is read (without a header row).

    Args:
        device_list: iterable of device directory names.
        begin: index of the first file (per device) to read.
        end: index one past the last file (per device) to read.

    Returns:
        A DataFrame with all rows stacked, a ``class`` column set to 0 and
        every missing value replaced by 0. When nothing could be read the
        frame is empty apart from the ``class`` column.
    """
    frames = []
    for type_name in device_list:
        file_list = file_name_walk(
            './DataSets/normal-kitsune_test/{:}'.format(type_name))
        file_list.sort()
        for file_path in file_list[begin:end]:
            try:
                frames.append(pd.read_csv(file_path, header=None))
            except Exception:
                # Best effort: report the unreadable file and keep going.
                # ``Exception`` (not a bare ``except``) so Ctrl-C still works.
                print(file_path)

    # One concat at the end instead of re-copying the accumulated frame on
    # every file.
    df_normal = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df_normal['class'] = 0
    df_normal.fillna(0, inplace=True)
    return df_normal

In [None]:
# Loading malicious data from a list of attacks.
def load_iot_attack_seq(attack_name='all'):
    """Load malicious traffic CSVs (and their labels) into one DataFrame.

    Args:
        attack_name: name of a sub-directory of
            ``./DataSets/Anomaly/attack_kitsune/``; ``'all'`` loads every
            entry that ``os.listdir`` reports for that directory.

    Returns:
        A DataFrame with the feature rows of every non-label CSV, a ``class``
        column taken from the sibling ``*_label.csv`` file when one exists
        (otherwise every row is labelled 1), and missing values set to 0.
    """
    attack_path = './DataSets/Anomaly/attack_kitsune/'
    if attack_name == 'all':
        attack_list = os.listdir(attack_path)
    else:
        attack_list = [attack_name]

    feature_frames = []
    label_frames = []
    for type_name in attack_list:
        # Reuse attack_path rather than repeating the literal.
        file_list = file_name_walk('{}{}'.format(attack_path, type_name))
        known_files = set(file_list)  # O(1) lookup for the label files
        for file_path in file_list:
            # Any path containing 'label' is treated as a label file.
            if 'label' in file_path:
                continue
            tmp_df = pd.read_csv(file_path, header=None)
            label_path = file_path.replace('.csv', '_label.csv')
            if label_path in known_files:
                tmp_label = pd.read_csv(label_path, header=None)
            else:
                # No label file: the whole capture counts as malicious.
                tmp_label = pd.DataFrame(data=([1] * len(tmp_df)), index=None)
            feature_frames.append(tmp_df)
            label_frames.append(tmp_label)

    # Concatenate once; the two lists stay aligned file by file.
    if feature_frames:
        df_attack = pd.concat(feature_frames, ignore_index=True)
        df_attack_label = pd.concat(label_frames, ignore_index=True)
    else:
        df_attack = pd.DataFrame()
        df_attack_label = pd.DataFrame()
    df_attack['class'] = np.ravel(df_attack_label.values)
    df_attack.fillna(0, inplace=True)
    return df_attack

#### Getting stats from the built-in isolation forest.

#### Visualise

In [None]:
from sklearn.tree import _tree

# Getting path lengths of each data point.
def get_path_length(estimator, X):
    """Return, for every row of *X*, the number of edges on its decision path.

    Args:
        estimator: a fitted tree exposing ``decision_path(X)`` that returns a
            CSR node-indicator matrix (one row per sample).
        X: a DataFrame or array-like of shape ``(n_samples, n_features)``.

    Returns:
        Float array of length ``n_samples``; each entry is the number of
        nodes visited minus one.
    """
    features = np.asarray(X.values if isinstance(X, pd.DataFrame) else X)
    if features.shape[0] == 0:
        return np.zeros(0)
    # One vectorised call instead of one call per row: in a CSR matrix the
    # number of stored entries of row i is indptr[i + 1] - indptr[i].
    node_indicator = estimator.decision_path(features)
    return np.diff(node_indicator.indptr).astype(float) - 1

# Calculating average path length.
def average_path_length(iforest, X):
    """Average the per-sample path length over every tree in *iforest*.

    ``iforest.estimators_`` supplies the trees; the result has one entry per
    row of *X*.
    """
    trees = iforest.estimators_
    totals = np.zeros(X.shape[0])
    for tree in trees:
        totals = totals + get_path_length(tree, X)
    return totals / len(trees)

# Plotting the distribution of path lengths for attack data.
def justification_plot(a):
    """Plot average path lengths of benign vs. malicious evaluation samples.

    A scikit-learn ``IsolationForest`` is fitted on the notebook globals
    ``x_train`` and ``attack_x_train`` (first column dropped). The average
    path length of every row of ``df_normal_eval`` / ``df_attack_eval`` is
    then scattered against its row position, and the figure is saved as
    ``"<a with '_' replaced by ' '>.pdf"`` and shown.

    Args:
        a: attack name, used only to build the output file name.
    """
    forest = IsolationForest(n_estimators=50, max_samples=200, random_state=114514,
                             contamination=0.1, n_jobs=8)
    fit_frame = pd.concat([x_train, attack_x_train])
    fit_frame.fillna(0, inplace=True)
    forest.fit(fit_frame.iloc[:, 1:])

    # Evaluation features: drop the label and the first column, as in fit.
    benign_eval = df_normal_eval.drop(columns=['class']).iloc[:, 1:]
    malicious_eval = df_attack_eval.drop(columns=['class']).iloc[:, 1:]

    benign_lengths = average_path_length(forest, benign_eval)
    malicious_lengths = average_path_length(forest, malicious_eval)

    plt.figure(figsize=(10, 8), dpi = 1000)
    series = (
        (benign_lengths, 'blue', 'Benign Samples'),
        (malicious_lengths, 'red', 'Malicious Samples'),
    )
    for lengths, colour, legend in series:
        plt.scatter(lengths, range(len(lengths)), color=colour, label=legend, marker='o')

    plt.xlabel('Expected Path Lengths', fontsize=30)
    plt.ylabel('Samples Distribution', fontsize=30)
    plt.xticks(fontsize=30)
    plt.yticks([])
    plt.legend(loc='upper left',fontsize=30, markerscale=5)
    filename = a.replace('_', ' ') + ".pdf"
    plt.savefig(filename, dpi=1000, bbox_inches='tight')
    plt.show()

In [None]:
# Experiment 1 driver: for every attack, build the train/eval splits as
# notebook globals and draw the path-length justification plot.
device_list=['360_camera']
attack_list=['http_ddos','data_theft','keylogging','service_scan','tcp_ddos','mirai','os_scan','aidra','bashlite','mirai_router_filter','os_scan_router','port_scan_router','tcp_ddos_router','udp_ddos','udp_ddos_router']

# Benign data: the first file (begin=0, end=1) of each listed device.
df_normal_train_data = load_iot_data_seq(device_list=device_list, begin=0, end=1)
contamination = 0.4
for a in attack_list:

    df_attack_train = load_iot_attack_seq(a)
    # contamination == -1 disables the sub-sampling below.
    if contamination != -1:
      num = int((1000 / contamination - 1000) / 0.2)
      # NOTE(review): df_normal_train_data is overwritten here, so from the
      # second attack on the sample is drawn from the previous sample rather
      # than from the full benign set -- confirm this is intended.
      df_normal_train_data = df_normal_train_data.sample(n=num, replace=False, random_state=20)
      df_attack_train = df_attack_train.sample(n=5000, replace=False, random_state=20)
    # 80/20 train/eval split of the benign rows.
    df_normal_train, df_normal_eval = train_test_split(df_normal_train_data, test_size=0.2, random_state=20)

    # 80/20 train/eval split of the attack rows.
    df_attack_train, df_attack_eval = train_test_split(df_attack_train, test_size=0.2, random_state=20)
    x_train, y_train = df_normal_train.drop(columns=['class']), df_normal_train['class']
    attack_x_train, attack_y_train = df_attack_train.drop(columns=['class']), df_attack_train['class']
    # Evaluation set = benign eval rows followed by attack eval rows.
    df_eval = pd.concat([df_normal_eval, df_attack_eval])
    x_eval, y_eval = df_eval.drop(columns=['class']), df_eval['class']
    # NOTE: this global shadows the builtin eval(); iGuard_control_plane and
    # iGuard_data_plane read it under this name, so it cannot be renamed here.
    eval = x_eval
    print("------------------------------Attack : ",a, "--------------------------------\n")
    justification_plot(a)


# Train iGuard using Magnifier on 105 features.

In [None]:
class Node:
    """One node of an isolation tree.

    Attributes set by the constructor:
        data:  the rows that reached this node.
        left / right: child nodes (``None`` when absent).
        depth: distance from the root.
    Attributes filled in later:
        label: leaf label assigned by ``isolationTree.label_tree``.
        val / col: split value and split column chosen by ``build_tree``.
    """

    def __init__(self, data, left=None, right=None, depth=0):
        self.data = data
        self.left = left
        self.right = right
        self.depth = depth
        self.label = None
        self.val = None
        self.col = None


class isolationTree:
    """Randomly built isolation tree whose leaves are labelled by an autoencoder.

    ``max_depth`` is stored but ``build_tree`` does not consult it: a node is
    split until it holds a single row or no column can separate its rows.
    """

    def __init__(self, data, depth=0, max_depth=0, thres = None):
        self.max_depth = max_depth
        self.thres = thres       # RMSE above this marks a leaf as label 1
        self.model = None        # autoencoder, injected before label_tree()
        self.num_rows = 50       # rows sampled per leaf when labelling
        self.root = self.build_tree(data, depth)

    def build_tree(self, data, depth):
        """Build the tree breadth-first and return its root.

        Returns ``None`` for empty *data*. Uses the global ``random`` module
        for the split column and split value.
        """
        if len(data) == 0:
            return None

        root = Node(data, depth=depth)
        n_cols = data.shape[1]
        pending = Queue()
        pending.put(root)

        while not pending.empty():
            current = pending.get()
            rows = current.data
            if len(rows) <= 1 or n_cols == 0:
                continue

            # Start at a random column and probe forward (wrapping around)
            # for one whose values differ. The probe is bounded: when every
            # column is constant (duplicate rows) the node stays a leaf
            # instead of looping forever.
            split_column = random.randint(0, n_cols - 1)
            for _ in range(n_cols):
                values = rows.iloc[:, split_column]
                low, high = values.min(), values.max()
                if low != high:
                    break
                split_column = (split_column + 1) % n_cols
            else:
                continue

            split_value = random.uniform(low, high)
            left_data = rows[values <= split_value]
            right_data = rows[values > split_value]
            current.val = split_value
            current.col = split_column

            if len(left_data) > 0:
                current.left = Node(left_data, depth=current.depth + 1)
                if len(left_data) != 1:
                    pending.put(current.left)

            if len(right_data) > 0:
                current.right = Node(right_data, depth=current.depth + 1)
                if len(right_data) != 1:
                    pending.put(current.right)
        return root

    @staticmethod
    def _magnifier_column_order():
        """Column order that turns the padded feature vector into 5 rows of 21.

        Row i takes one port column, 3 MIstat, 7 HHstat, 3 HHstat_jit and
        7 HpHpstat columns, in that order.
        """
        groups = (
            np.arange(4, -1, -1).reshape(5, -1),   # Port_index
            np.arange(5, 20).reshape(5, -1),       # MIstat_index
            np.arange(20, 55).reshape(5, -1),      # HHstat_index
            np.arange(55, 70).reshape(5, -1),      # HHstat_jit_index
            np.arange(70, 105).reshape(5, -1),     # HpHpstat_index
        )
        return [int(col) for i in range(5) for group in groups for col in group[i]]

    def label_tree(self, root):
        """Assign ``label`` (0 or 1) to every leaf below *root*.

        For each leaf, ``num_rows`` rows are sampled with replacement, scaled
        with the module-level ``scaler``, perturbed with Gaussian noise and
        reshaped to ``(-1, 5, 21)``. The leaf is labelled 1 when the RMSE of
        ``self.model``'s reconstruction exceeds ``self.thres``.
        """
        if root is None:
            return

        if root.left is None and root.right is None:
            sampled = root.data.sample(n=self.num_rows, replace=True, random_state=42)
            noise = np.random.normal(0, 0.01, sampled.shape)

            # Scale the sampled rows (not root.data) so the shape always
            # matches the noise, including leaves that hold several rows.
            features = scaler.transform(sampled) + noise
            features = np.pad(features, ((0, 0), (3, 0)), 'constant')
            features = features[:, self._magnifier_column_order()].reshape(-1, 5, 21)
            batch = torch.tensor(features, dtype=torch.float32)

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                pred, _ = self.model(batch)
                rmse = torch.sqrt(criterion(pred, batch))
            root.label = 1 if rmse > self.thres else 0
        else:
            # Left before right keeps the order in which noise is drawn.
            if root.left:
                self.label_tree(root.left)
            if root.right:
                self.label_tree(root.right)

    def pred_tree(self, data, node):
        """Route one sample down the tree.

        Args:
            data: a row indexable by column position (``data[node.col]``).
            node: node to start from, normally ``self.root``.

        Returns:
            ``(label, path_len)``. When the walk falls off the tree through
            a missing child the label is 1. A NaN feature value is routed to
            the right child.
        """
        path_len = 0
        while node is not None:
            if node.left is None and node.right is None:
                return node.label, path_len
            node = node.left if data[node.col] <= node.val else node.right
            path_len += 1
        return 1, path_len

class isolationForest(nn.Module):
    """Ensemble of ``isolationTree`` objects with autoencoder-labelled leaves.

    Args:
        data: DataFrame the trees are sampled from.
        n_trees: number of trees built by ``fit``.
        max_depth: forwarded to every ``isolationTree``.
        subspace: rows per tree when > 1, otherwise the fraction of *data*
            sampled per tree.
        model: optional autoencoder reference (stored only).
    """

    def __init__(self, data, n_trees=100, max_depth=5, subspace=256, model=None):
        super().__init__()
        self.data = data
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.subspace = subspace
        self.model = model
        self.thres = 0.4
        # Normalising constant c(n) of the isolation score, computed from
        # the number of rows each tree actually sees. Using a fractional
        # subspace directly would take the log of a negative number.
        self.avg_path_len = self._expected_path_length(self._subsample_size())
        self.trees = []

    def _subsample_size(self):
        """Number of rows each tree is built from (mirrors ``fit``)."""
        if self.subspace > 1:
            return self.subspace
        return int(round(self.subspace * len(self.data)))

    @staticmethod
    def _expected_path_length(n):
        """c(n) = 2 * (ln(n - 1) + 0.5772) - 2 * (n - 1) / n.

        Evaluated in NumPy floats so degenerate sizes (n <= 1) yield a
        non-finite value instead of raising.
        """
        n = np.float64(n)
        return 2 * (np.log(n - 1) + 0.5772) - 2 * (n - 1) / n

    def forward(self, x):
        # Placeholder required by nn.Module; inference goes through pred().
        return None

    def fit(self):
        """(Re)build ``n_trees`` trees, each on a fresh random sample of ``data``."""
        # Start from scratch so a second fit() does not leave stale trees
        # that would skew the averages taken in pred().
        self.trees = []
        for _ in range(self.n_trees):
            if self.subspace > 1:
                subdata = self.data.sample(self.subspace)
            else:
                subdata = self.data.sample(frac=self.subspace)
            self.trees.append(
                isolationTree(subdata, depth=0, max_depth=self.max_depth, thres=self.thres)
            )

    def label(self, model):
        """Attach *model* to every tree and label all leaves with it."""
        for tree in self.trees:
            tree.model = model
        for tree in self.trees:
            tree.label_tree(tree.root)

    def pred(self, data, alpha, verbose=True):
        """Classify every row of *data*.

        Args:
            data: DataFrame whose rows are passed to each tree's ``pred_tree``.
            alpha: weight of the averaged leaf label; ``1 - alpha`` weights
                the isolation score.
            verbose: when true (the default, matching the previous
                behaviour) two diagnostic lines are printed per row.

        Returns:
            List with -1 where the blended score exceeds 0.5, else 1.

        Raises:
            ValueError: when no tree has been fitted.
        """
        if not self.trees:
            raise ValueError("isolationForest.pred() called before fit()")

        n_trees = len(self.trees)
        predictions = []
        for _, row in data.iterrows():
            votes = [tree.pred_tree(row, tree.root) for tree in self.trees]
            if verbose:
                print((min(votes)[1], max(votes)[1]))
            agg_label = sum(label for label, _ in votes) / n_trees
            avg_len = sum(length for _, length in votes) / n_trees

            iso_score = 2 ** (- avg_len / self.avg_path_len)
            if verbose:
                print(f"len: {avg_len} score: {iso_score}")
            blended = alpha * agg_label + (1 - alpha) * iso_score
            predictions.append(-1 if blended > 0.5 else 1)
        return predictions


In [None]:
def iGuard_control_plane(auto):
    """Train, label and evaluate the custom isolation forest (control plane).

    Reads the notebook globals ``x_train``, ``attack_x_train``, ``eval`` and
    ``y_eval``. The first column of the training and evaluation frames is
    dropped and the remaining columns are renumbered from 0.

    Args:
        auto: trained autoencoder; it is moved to the CPU and used to label
            the leaves of the forest.

    Returns:
        The fitted and labelled ``isolationForest``.
    """
    forest_input = pd.concat([x_train, attack_x_train])
    forest_input.fillna(0, inplace=True)
    forest_input = forest_input.iloc[:, 1:]
    forest_input.columns = range(len(forest_input.columns))

    clf_model = isolationForest(forest_input, n_trees=50, max_depth=8, subspace=200)
    clf_model.fit()
    print("Isolation Forest training completed\n")
    clf_model.label(auto.cpu())
    print("Isolation Forest labeling completed\n")

    # ``eval`` is the notebook-global evaluation frame (it shadows the
    # builtin); it is filled in place.
    eval.fillna(0, inplace=True)
    eval_features = eval.iloc[:, 1:]
    eval_features.columns = range(len(eval_features.columns))

    # NOTE(review): ``pred`` is called as a free function, but the visible
    # code only defines ``isolationForest.pred`` -- confirm a module-level
    # ``pred`` exists or switch to ``clf_model.pred(eval_features, 1)``.
    y_pred_eval = np.array(pred(clf_model, eval_features, 1))

    # Map 1 (inlier) to class 0 and -1 (outlier) to class 1.
    y_pred_eval[y_pred_eval == 1] = 0
    y_pred_eval[y_pred_eval == -1] = 1

    report = classification_report(y_true=y_eval, y_pred=y_pred_eval)
    print("Our Isolation Forest + Autoencoder")
    print(report)
    print("Confusion Matrix")
    print(confusion_matrix(y_eval, y_pred_eval))
    metrics = (
        ("\n F1 Score", f1_score),
        ("\n roc-auc", roc_auc_score),
        ("\n pr-auc", average_precision_score),
    )
    for title, metric in metrics:
        print(title)
        print(metric(y_eval, y_pred_eval))
    return clf_model

In [None]:
# Mini-batch size of the DataLoaders built in train_magnifier.
BATCH_SIZE = 256
# Passed to train_magnifier: reshape each sample to a 5 x 21 grid when true.
TWO_D = True
# Passed as input_size to Magnifier in magnifier_trainer.
INPUTSIZE = 105
# Not referenced by the code visible in this notebook.
TEST_BATCH_SIZE = 60000

def train_data_processing(df_normal_train, df_attack_eval, x_eval, y_eval, df_normal_eval, TWO_D):
    """Scale the splits and turn them into tensors for the Magnifier model.

    Column ``0`` (and ``class`` where present) is dropped, the module-level
    ``scaler`` is fitted on the benign training rows and applied to every
    other split. With ``TWO_D`` each sample is left-padded with three zeros,
    re-ordered and reshaped to ``(5, 21)``.

    Returns:
        ``(X_train, y_train, x_eval, y_eval, X_normal_eval, X_attack_eval)``
        as tensors; the feature tensors are float32.
    """
    train_frame = df_normal_train.drop(columns=[0, 'class'])
    y_train = df_normal_train['class']
    normal_frame = df_normal_eval.drop(columns=[0, 'class'])
    attack_frame = df_attack_eval.drop(columns=[0, 'class'])

    # Fit on benign training data only, then reuse the fitted scaler.
    X_train = scaler.fit_transform(train_frame.values)
    X_normal_eval = scaler.transform(normal_frame.values)
    X_attack_eval = scaler.transform(attack_frame.values)
    x_eval = scaler.transform(x_eval.drop(columns=[0]))
    y_train, y_eval = y_train.values, y_eval.values

    arrays = [X_train, x_eval, X_normal_eval, X_attack_eval]
    if TWO_D:
        # Row i of the 5 x 21 grid: 1 port, 3 MIstat, 7 HHstat,
        # 3 HHstat_jit and 7 HpHpstat columns.
        groups = (
            np.arange(4, -1, -1).reshape(5, -1),   # Port_index
            np.arange(5, 20).reshape(5, -1),       # MIstat_index
            np.arange(20, 55).reshape(5, -1),      # HHstat_index
            np.arange(55, 70).reshape(5, -1),      # HHstat_jit_index
            np.arange(70, 105).reshape(5, -1),     # HpHpstat_index
        )
        order = [int(col) for i in range(5) for group in groups for col in group[i]]
        arrays = [
            np.pad(arr, ((0, 0), (3, 0)), 'constant')[:, order].reshape(-1, 5, 21)
            for arr in arrays
        ]

    X_train, x_eval, X_normal_eval, X_attack_eval = (
        torch.tensor(arr, dtype=torch.float32) for arr in arrays
    )
    return X_train, torch.tensor(y_train), x_eval, torch.tensor(y_eval), X_normal_eval, X_attack_eval


def train_magnifier(model, df_normal_train, df_attack_eval, model_save_path, x_eval, y_eval,  df_normal_eval, TWO_D):
    """Train the autoencoder on benign data, report eval losses, then test it.

    Args:
        model: autoencoder whose forward pass returns ``(output, encoded)``.
        df_normal_train / df_normal_eval / df_attack_eval: labelled frames
            handed to ``train_data_processing``.
        model_save_path: accepted for interface compatibility; unused.
        x_eval, y_eval: evaluation features and labels.
        TWO_D: forwarded to ``train_data_processing``.

    Returns:
        The trained model (moved to the GPU when one is available).
    """
    lr = 1e-2
    weight_decay = 0.01
    epoches = 5

    X_train, y_train, x_eval, y_eval, X_normal_eval, X_attack_eval = train_data_processing(
        df_normal_train, df_attack_eval, x_eval, y_eval, df_normal_eval, TWO_D)
    # Use the TensorDataset / DataLoader names this file imports.
    train_loader = DataLoader(dataset=TensorDataset(X_train, y_train),
                              batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    test_loader = DataLoader(dataset=TensorDataset(x_eval, y_eval),
                             batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

    print("Total number of Epoch: ", epoches)

    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    def _mean_eval_loss(samples):
        """Mean reconstruction loss over consecutive chunks of 16 samples."""
        batch_losses = []
        for start in range(0, samples.shape[0], 16):
            b_x = samples[start:start + 16]
            if use_cuda:
                b_x = b_x.cuda()
            recon, _ = model(b_x)
            batch_losses.append(criterion(recon, b_x).item())
        # Average over the number of batches; NaN when there are none.
        return sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')

    for epoch in range(epoches):
        # NOTE(review): with epoches = 5 this list is [2.5, 5.0], which no
        # epoch index ever equals, so the decay never fires. Left unchanged
        # to keep training results reproducible -- confirm the schedule.
        if epoch in [epoches * 0.5, epoches * 1.0]:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
        model.train()
        total_loss = 0.0
        for b_x, _b_y in train_loader:
            # forward
            if use_cuda:
                b_x = b_x.cuda()
            output, _encoded = model(b_x)
            loss = torch.sqrt(criterion(b_x, output))
            # .item() keeps the running total off the autograd graph.
            total_loss += loss.item()
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            model.eval()
            print('the eval normal loss ', _mean_eval_loss(X_normal_eval))
            print('the eval attack loss ', _mean_eval_loss(X_attack_eval))

        print("epoch=", epoch, total_loss)
        print(len(X_train))
    test_magnifier(model, test_loader, y_eval)
    return model

def test_magnifier(test_model, test_loader, y_eval):
    """Run the autoencoder over *test_loader* and print classification metrics.

    Args:
        test_model: model whose forward pass returns ``(output, encoded)``
            and which exposes ``pred(output, data)``; its ``thr`` attribute
            is set to 0.4.
        test_loader: iterable of ``(data, target)`` tensor batches.
        y_eval: unused; the labels are taken from the loader so they stay
            aligned with the (shuffled) batches.
    """
    test_model.thr = 0.4
    use_cuda = torch.cuda.is_available()

    y_pred_eval = []
    eval_y = []
    with torch.no_grad():
        test_model.eval()
        for data, target in test_loader:
            if use_cuda:
                data = data.cuda()
            eval_output, _ = test_model(data)
            y_pred_eval.extend(test_model.pred(eval_output, data))
            # Store plain Python numbers rather than 0-d tensors.
            eval_y.extend(target.reshape(-1).tolist())

    temp_str = classification_report(y_true=eval_y, y_pred=y_pred_eval)
    print("Autoencoder")
    print(temp_str)
    print("Confusion Matrix")
    print(confusion_matrix(eval_y, y_pred_eval))
    print("\n F1 Score")
    print(f1_score(eval_y, y_pred_eval))
    print("\n roc-auc")
    print(roc_auc_score(eval_y, y_pred_eval))
    print("\n pr-auc")
    print(average_precision_score(eval_y, y_pred_eval))

def magnifier_trainer():
    """Seed PyTorch, build a ``Magnifier`` and train it on the notebook globals.

    Reads ``INPUTSIZE``, ``df_normal_train``, ``df_attack_eval``,
    ``model_save_path``, ``x_eval``, ``y_eval``, ``df_normal_eval`` and
    ``TWO_D`` from module scope and returns the trained model.
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(42)
    trained = train_magnifier(
        Magnifier(input_size=INPUTSIZE, seed=42),
        df_normal_train,
        df_attack_eval,
        model_save_path,
        x_eval,
        y_eval,
        df_normal_eval,
        TWO_D,
    )
    print("Autoencoder training completed\n")
    return trained

In [None]:
def sk_iforest():
    """Baseline: fit scikit-learn's IsolationForest and print its metrics.

    Uses the notebook globals ``x_train``, ``attack_x_train``, ``x_eval`` and
    ``y_eval``; the first column of every frame is left out of the model.
    """
    baseline = IsolationForest(n_estimators=50, max_samples=200, random_state=114514,
                               contamination=0.2, n_jobs=8)

    fit_frame = pd.concat([x_train, attack_x_train])
    fit_frame.fillna(0, inplace=True)
    baseline.fit(fit_frame.iloc[:, 1:])

    y_pred_eval = baseline.predict(x_eval.iloc[:, 1:])
    # Map 1 (inlier) to class 0 and -1 (outlier) to class 1.
    y_pred_eval[y_pred_eval == 1] = 0
    y_pred_eval[y_pred_eval == -1] = 1

    report = classification_report(y_true=y_eval, y_pred=y_pred_eval)
    print("sklearn Isolation Forest")
    print(report)
    print("Confusion Matrix")
    print(confusion_matrix(y_eval, y_pred_eval))
    metrics = (
        ("\n F1 Score", f1_score),
        ("\n roc-auc", roc_auc_score),
        ("\n pr-auc", average_precision_score),
    )
    for title, metric in metrics:
        print(title)
        print(metric(y_eval, y_pred_eval))

## Experiment 2: Fig. 5 and Appendix Fig. 8

In [None]:
# Use iGuard_control_plane to compare iGuard with Magnifier and iForest on F1 score, ROC-AUC and PR-AUC for the best configuration.
# Refer to your google doc

# Experiment 2 driver: for every attack, rebuild the train/eval globals and
# run the three detectors (sklearn iForest, Magnifier, iGuard) in turn.
device_list=['360_camera']
attack_list=['http_ddos','data_theft','keylogging','service_scan','tcp_ddos','mirai','os_scan','aidra','bashlite','mirai_router_filter','os_scan_router','port_scan_router','tcp_ddos_router','udp_ddos','udp_ddos_router']

# Benign data: the first file (begin=0, end=1) of each listed device.
df_normal_train_data = load_iot_data_seq(device_list=device_list, begin=0, end=1)
contamination = 0.4
for a in attack_list:

    df_attack_train = load_iot_attack_seq(a)
    # contamination == -1 disables the sub-sampling below.
    if contamination != -1:
      num = int((1000 / contamination - 1000) / 0.2)
      # NOTE(review): df_normal_train_data is overwritten here, so later
      # iterations sample from the previous sample -- confirm intended.
      df_normal_train_data = df_normal_train_data.sample(n=num, replace=False, random_state=20)
      df_attack_train = df_attack_train.sample(n=5000, replace=False, random_state=20)
    # 80/20 train/eval split of the benign rows.
    df_normal_train, df_normal_eval = train_test_split(df_normal_train_data, test_size=0.2, random_state=20)

    # 80/20 train/eval split of the attack rows.
    df_attack_train, df_attack_eval = train_test_split(df_attack_train, test_size=0.2, random_state=20)
    x_train, y_train = df_normal_train.drop(columns=['class']), df_normal_train['class']
    attack_x_train, attack_y_train = df_attack_train.drop(columns=['class']), df_attack_train['class']
    # Evaluation set = benign eval rows followed by attack eval rows.
    df_eval = pd.concat([df_normal_eval, df_attack_eval])
    x_eval, y_eval = df_eval.drop(columns=['class']), df_eval['class']
    # Global read by iGuard_control_plane (shadows the builtin eval()).
    eval = x_eval
    print("------------------------------Attack : ",a, "--------------------------------\n")
    sk_iforest()
    magnifier = magnifier_trainer()
    iGuard = iGuard_control_plane(magnifier)

# Train iGuard on data plane features using custom autoencoder.

In [None]:
# def sk_iforest(a):
#     sklearn_clf_model = IsolationForest(n_estimators=50, max_samples=200, random_state=114514,
#                              contamination=0.2,n_jobs=8)
#     tmp = pd.concat([x_train,attack_x_train])
#     tmp.fillna(0,inplace=True)
#     sklearn_clf_model.fit(tmp)
#     y_pred_eval = sklearn_clf_model.predict(x_eval)
#     eval_y = y_eval
#     eval_x = x_eval

#     y_pred_eval[y_pred_eval == 1] = 0
#     y_pred_eval[y_pred_eval == -1] = 1
#     temp_str = classification_report(y_true=eval_y, y_pred=y_pred_eval)
#     temp_list = temp_str.split()
#     print("sklearn Isolation Forest")
#     print(temp_str)
#     print("Confusion Matrix")
#     print(confusion_matrix(eval_y, y_pred_eval))
#     print("\n F1 Score")
#     print(f1_score(eval_y, y_pred_eval))
#     print("\n roc-auc")
#     print(roc_auc_score(eval_y, y_pred_eval))
#     print("\n pr-auc")
#     print(average_precision_score(eval_y, y_pred_eval))
#     name = 'iForest'+a+'.pkl'
#     with open(name, 'wb') as f:
#         pickle.dump(sklearn_clf_model, f)

In [None]:
class DilatedSeparableConv(nn.Module):
    """Depthwise-separable 1-D convolution with a dilated depthwise stage.

    A per-channel (``groups=in_channels``) kernel-3 convolution, padded by
    ``dilation`` so the length is preserved, is followed by a kernel-1
    convolution that maps ``in_channels`` to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, dilation):
        super().__init__()
        self.depthwise = nn.Conv1d(
            in_channels,
            in_channels,
            kernel_size=3,
            groups=in_channels,
            dilation=dilation,
            padding=dilation,
        )
        self.pointwise = nn.Conv1d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))

class Autoencoder(nn.Module):
    """1-D convolutional autoencoder built from dilated separable convolutions.

    The encoder halves the sequence length three times (``MaxPool1d(2)``)
    and projects to ``latent_dim`` channels. The decoder mirrors it with
    transposed convolutions and ends in a sigmoid. ``input_dim`` is accepted
    but not used by the layers. The constructor reseeds PyTorch with 42.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        torch.manual_seed(42)
        torch.cuda.manual_seed(42)

        # Encoder: three (separable conv -> ReLU -> pool) stages with
        # growing dilation, then a 1x1 projection to the latent channels.
        encoder_layers = []
        for in_ch, out_ch, dilation in ((1, 16, 1), (16, 32, 2), (32, 64, 4)):
            encoder_layers.append(DilatedSeparableConv(in_ch, out_ch, dilation=dilation))
            encoder_layers.append(nn.ReLU(True))
            encoder_layers.append(nn.MaxPool1d(2))
        encoder_layers.append(nn.Conv1d(64, latent_dim, kernel_size=1))
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: 1x1 expansion followed by three stride-2 up-sampling
        # stages; the last stage ends in a sigmoid instead of a ReLU.
        decoder_layers = [
            nn.ConvTranspose1d(latent_dim, 64, kernel_size=1),
            nn.ReLU(True),
            nn.ConvTranspose1d(64, 32, kernel_size=4, stride=2, padding=1),
            nn.ReLU(True),
            nn.ConvTranspose1d(32, 16, kernel_size=5, stride=2, padding=1),
            nn.ReLU(True),
            nn.ConvTranspose1d(16, 1, kernel_size=6, stride=2, padding=1),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        return self.decoder(self.encoder(x))

In [None]:
def train_custom_autoencoder(df_normal_train, df_normal_eval, df_attack_eval):
    """Train the convolutional ``Autoencoder`` on benign rows and report metrics.

    The module-level ``scaler`` is fitted on ``df_normal_train``. After
    training, the per-row reconstruction RMSE on the benign and attack
    evaluation rows is thresholded and the resulting confusion matrix,
    classification report and F1 score are printed.

    Args:
        df_normal_train: benign training features.
        df_normal_eval: benign evaluation features (labelled 0).
        df_attack_eval: attack evaluation features (labelled 1).

    Returns:
        The trained autoencoder, in eval mode, on the GPU when one is
        available and on the CPU otherwise.
    """
    X_train = df_normal_train.values
    X_test = pd.concat([df_normal_eval, df_attack_eval]).values
    actual = torch.cat([torch.zeros(df_normal_eval.shape[0]), torch.ones(df_attack_eval.shape[0])])
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    input_dim = X_train.shape[1]
    encoding_dim = 8
    num_epochs = 1000
    batch_size = 512

    # Fall back to the CPU instead of crashing on hosts without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    autoencoder = Autoencoder(input_dim, encoding_dim)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(autoencoder.parameters(), lr=1e-3)
    autoencoder = autoencoder.to(device)

    # Build the (N, 1, L) training tensor once rather than once per batch
    # per epoch; batches are moved to the device as they are used.
    train_tensor = torch.tensor(X_train, dtype=torch.float).unsqueeze(1)

    for epoch in range(num_epochs):
        total_loss = 0.0
        for start in range(0, len(train_tensor), batch_size):
            inputs = train_tensor[start:start + batch_size].to(device)
            outputs = autoencoder(inputs)
            loss = criterion(outputs, inputs)
            # .item() keeps the running total off the autograd graph.
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch+1) % 200 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}')

    autoencoder.eval()
    with torch.no_grad():
        test_tensor = torch.tensor(X_test, dtype=torch.float).unsqueeze(1).to(device)
        decoded_data = autoencoder(test_tensor).cpu().numpy()
    # Drop only the channel axis so a one-row test set keeps its batch axis.
    decoded_data_binary = decoded_data.squeeze(1)
    mse = np.mean(np.power(X_test - decoded_data_binary, 2), axis=1)
    rmse = np.sqrt(mse)

    for i in [0.001]:
        thres = i
        print("-------------i",i)
        predicted = pd.Series(np.where(rmse > thres, 1, 0),dtype="float64")
        f1 = f1_score(actual, predicted)
        print(confusion_matrix(actual, predicted))
        print("\n Classification report")
        print(classification_report(actual, predicted))
        print('F1 Score: ', f1)

    return autoencoder

In [None]:
def custom_autoencoder():
    """Seed PyTorch and train the custom autoencoder on the first 12 columns.

    Reads the notebook globals ``df_norm_train``, ``df_normal_eval`` and
    ``df_attack_eval`` and returns the trained model.
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(42)
    first_twelve = slice(None, 12)
    trained = train_custom_autoencoder(
        df_norm_train.iloc[:, first_twelve],
        df_normal_eval.iloc[:, first_twelve],
        df_attack_eval.iloc[:, first_twelve],
    )
    print("Autoencoder training completed\n")
    return trained

In [None]:
def iGuard_data_plane(auto,a):
    """Train, label, evaluate and pickle the custom isolation forest (data plane).

    Reads the notebook globals ``x_train``, ``attack_x_train``, ``eval`` and
    ``y_eval``. Unlike the control-plane variant no column is dropped; the
    columns are only renumbered from 0.

    Args:
        auto: trained autoencoder; moved to the CPU and used to label leaves.
        a: attack name; the forest is pickled to ``'iGuard' + a + '.pkl'``.
    """
    forest_input = pd.concat([x_train, attack_x_train])
    forest_input.fillna(0, inplace=True)
    forest_input.columns = range(len(forest_input.columns))

    clf_model = isolationForest(forest_input, n_trees=50, max_depth=8, subspace=200)
    clf_model.fit()
    print("Isolation Forest training completed\n")
    clf_model.label(auto.cpu())
    print("Isolation Forest labeling completed\n")

    # ``eval`` is the notebook-global evaluation frame (it shadows the
    # builtin); it is filled and renumbered in place.
    eval.fillna(0, inplace=True)
    eval.columns = range(len(eval.columns))

    # NOTE(review): ``pred`` is called as a free function, but the visible
    # code only defines ``isolationForest.pred`` -- confirm a module-level
    # ``pred`` exists or switch to ``clf_model.pred(eval, 1)``.
    y_pred_eval = np.array(pred(clf_model, eval, 1))

    # Map 1 (inlier) to class 0 and -1 (outlier) to class 1.
    y_pred_eval[y_pred_eval == 1] = 0
    y_pred_eval[y_pred_eval == -1] = 1

    print("Our Isolation Forest + Autoencoder")
    sections = (
        ("Confusion Matrix", confusion_matrix),
        ("\n Classification report", classification_report),
        ("\n F1 Score", f1_score),
        ("\n roc-auc", roc_auc_score),
        ("\n pr-auc", average_precision_score),
    )
    for title, metric in sections:
        print(title)
        print(metric(y_eval, y_pred_eval))

    with open('iGuard'+a+'.pkl', 'wb') as f:
        pickle.dump(clf_model, f)

In [None]:
# Data-plane driver: for every attack, load the data-plane CSVs, build the
# train/eval globals and run the sklearn baseline, the custom autoencoder
# and the data-plane iGuard in turn.
device_list=['360_camera']
attack_list=['http_ddos','data_theft','keylogging','service_scan','tcp_ddos','mirai','os_scan','aidra','bashlite','mirai_router_filter','os_scan_router','port_scan_router','tcp_ddos_router','udp_ddos','udp_ddos_router']
# Seed every random source used below.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
contamination = 0.2
for a in attack_list:

    # NOTE(review): absolute Google-Drive paths; adjust for your environment.
    normal_path = "/content/drive/MyDrive/HorusEye/DataSets/Dataplane/Normal/" + device_list[0] + ".csv"
    attack_path = "/content/drive/MyDrive/HorusEye/DataSets/Dataplane/Anomaly/" + a + ".csv"
    df_normal_train_data = pd.read_csv(normal_path)
    df_attack_train = pd.read_csv(attack_path)
    # These three columns are not used as features.
    df_normal_train_data = df_normal_train_data.drop(columns=['tcp_udp','dst_port','n_packets'])
    df_attack_train = df_attack_train.drop(columns=['tcp_udp','dst_port','n_packets'])
    # Cast every cell to int.
    df_normal_train_data = df_normal_train_data.applymap(int)
    df_attack_train = df_attack_train.applymap(int)
    datafetch = True
    n1 = 200
    n2 = 1000
    # Retry the sampling with halved sizes until both frames are big enough.
    # NOTE(review): when contamination == -1 nothing in the loop body runs
    # and datafetch never becomes False, so the loop would not terminate.
    while(datafetch):
        if contamination != -1:
            try:
                num = int((n1 / contamination - n1) / 0.2)
                temp = df_normal_train_data.sample(n=num, replace=False, random_state=20)
                # Benign rows outside the sample are kept for autoencoder training.
                df_norm_train = df_normal_train_data.drop(temp.index)

                df_attack_train = df_attack_train.sample(n=n2, replace=False, random_state=20)
                df_normal_train_data = temp
                datafetch = False
            except ValueError as e:
                if "Cannot take a larger sample than population when 'replace=False'" in str(e):
                    n1 = int(n1/2)
                    n2 = int(n2/2)
                else:
                    raise e

    # 80/20 train/eval split of the sampled benign rows.
    df_normal_train, df_normal_eval = train_test_split(df_normal_train_data, test_size=0.2, random_state=20)
    # Autoencoder training set = unsampled benign rows + benign training split.
    df_norm_train = pd.concat([df_norm_train, df_normal_train])
    df_attack_train, df_attack_eval = train_test_split(df_attack_train, test_size=0.2, random_state=20)
    x_train, y_train = df_normal_train.drop(columns=['class']), df_normal_train['class']
    attack_x_train, attack_y_train = df_attack_train.drop(columns=['class']), df_attack_train['class']
    df_eval = pd.concat([df_normal_eval, df_attack_eval])
    x_eval, y_eval = df_eval.drop(columns=['class']), df_eval['class']
    # Global read by iGuard_data_plane (shadows the builtin eval()).
    eval = x_eval
    print("------------------------------Attack : ",a, "--------------------------------\n")
    # NOTE(review): the sk_iforest defined earlier in this notebook takes no
    # argument; this call matches the commented-out sk_iforest(a) variant,
    # which presumably has to be enabled first -- confirm.
    sk_iforest(a)
    auto = custom_autoencoder()
    iGuard_data_plane(auto,a)

## Experiment 3: Fig. 6, Appendix Fig. 9, and Tables 2, 3 (Data Plane Simulation)

In [None]:
# Use iGuard_data_plane to compare iGuard with Magnifier and iForest on F1 score, ROC-AUC and PR-AUC for the best configuration.
# Refer to your google doc

## Experiment 4: Comparison with VAE (see Fig. 10 in the Appendix). Comparing Magnifier with the VAE is sufficient.

In [None]:
# Variational autoencoder: four fully connected encoder layers mirrored by the decoder
class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder maps the input through four ReLU layers and then through two
    parallel linear heads (``fc21`` for the latent mean, ``fc22`` for the
    latent log-variance). The decoder mirrors the encoder and ends in a
    sigmoid, so reconstructions lie in [0, 1].
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4, latent_dim):
        super().__init__()

        # Encoder stack (input -> hidden_dim4)
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        # Two heads on top of the encoder: latent mean and latent log-variance
        self.fc21 = nn.Linear(hidden_dim4, latent_dim)
        self.fc22 = nn.Linear(hidden_dim4, latent_dim)

        # Decoder stack (latent -> input)
        self.fc5 = nn.Linear(latent_dim, hidden_dim4)
        self.fc6 = nn.Linear(hidden_dim4, hidden_dim3)
        self.fc7 = nn.Linear(hidden_dim3, hidden_dim2)
        self.fc8 = nn.Linear(hidden_dim2, hidden_dim1)
        self.fc9 = nn.Linear(hidden_dim1, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = F.relu(layer(hidden))
        return self.fc21(hidden), self.fc22(hidden)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * sigma`` with ``eps ~ N(0, I)`` and ``sigma = exp(logvar / 2)``."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Map a latent code ``z`` back to input space; output is sigmoid-squashed."""
        hidden = z
        for layer in (self.fc5, self.fc6, self.fc7, self.fc8):
            hidden = F.relu(layer(hidden))
        return torch.sigmoid(self.fc9(hidden))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for ``x``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar

    def pred(self, x, threshold):
        """Return a float tensor with 1.0 where the per-sample reconstruction MSE exceeds ``threshold``, else 0.0.

        The latent code is sampled on every call, so repeated calls on the
        same input can differ.
        """
        reconstruction = self.forward(x)[0]
        elementwise_error = F.mse_loss(reconstruction, x, reduction='none')
        # Average the squared error over the feature axis -> one score per row
        per_sample_error = elementwise_error.mean(dim=1)
        return (per_sample_error > threshold).float()

def loss_function(recon_x, x, mu, logvar):
    """Return the VAE training loss: summed binary cross-entropy plus KL term.

    The reconstruction term is the binary cross-entropy between ``recon_x``
    and ``x`` summed over every element. The KL term is the closed-form
    ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``.
    """
    reconstruction_term = F.binary_cross_entropy(recon_x, x, reduction='sum')
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * kl_elements.sum()
    return reconstruction_term + kl_term

# Hyperparameters for the VAE experiment (model widths and optimisation settings)
# Feature count of the VAE input. Hard-coded: it has to equal the number of
# non-'class' columns of the loaded data, otherwise the first Linear layer fails.
input_dim = 103
# Encoder layer widths, from the input side down to the latent heads; the
# decoder uses the same widths in reverse order.
hidden_dim1 = 128
hidden_dim2 = 64
hidden_dim3 = 32
hidden_dim4 = 16
# Size of the latent mean / log-variance vectors.
# NOTE(review): the latent space (32) is wider than the last encoder layer (16),
# so it is not a bottleneck — confirm this is intentional.
latent_dim = 32
# Step size handed to the Adam optimiser.
learning_rate = 0.001
# Number of full passes over the training loader.
num_epochs = 50
# NOTE(review): not referenced in the visible code; the DataLoaders in the
# experiment cell use a batch size of 256 — confirm which value is intended.
batch_size = 32



In [None]:
# VAE baseline: for every attack type, train a VAE on benign traffic only and
# evaluate it on a held-out mix of benign and attack samples.
device_list = ['360_camera']
attack_list = [
    'http_ddos', 'data_theft', 'keylogging', 'service_scan', 'tcp_ddos',
    'mirai', 'os_scan', 'aidra', 'bashlite', 'mirai_router_filter',
    'os_scan_router', 'port_scan_router', 'tcp_ddos_router', 'udp_ddos',
    'udp_ddos_router',
]

# Reconstruction-error cut-off: a sample whose per-sample MSE exceeds it is predicted as class 1.
VAE_THRESHOLD = 0.1
# Mini-batch size used by every DataLoader in this cell.
VAE_LOADER_BATCH = 256


def _to_float_tensor(values):
    """Return *values* as a float32 torch tensor."""
    return torch.tensor(values, dtype=torch.float32)


def _shuffled_loader(dataset):
    """Wrap *dataset* in a single-process, shuffling DataLoader."""
    return DataLoader(dataset=dataset, batch_size=VAE_LOADER_BATCH, shuffle=True, num_workers=0)


def _summed_batch_mse(vae, loader):
    """Return the sum over batches of the batch-mean reconstruction MSE.

    Accumulates Python floats so that an empty loader yields 0.0 instead of
    failing on a tensor-only method call.
    """
    total = 0.0
    for features, _ in loader:
        reconstruction, _, _ = vae(features)
        total += criterion(reconstruction, features).item()
    return total


for a in attack_list:
    # ---- Data: benign traffic for training, benign + attack hold-out for evaluation ----
    df_normal_train_data = load_iot_data_seq(device_list=device_list, begin=0, end=1)
    df_attack_train = load_iot_attack_seq(a)

    df_normal_train, df_normal_eval = train_test_split(df_normal_train_data, test_size=0.2, random_state=20)
    df_attack_train, df_attack_eval = train_test_split(df_attack_train, test_size=0.2, random_state=20)

    df_eval = pd.concat([df_normal_eval, df_attack_eval], ignore_index=True)

    X_train, y_train = df_normal_train.drop(columns=['class']), df_normal_train['class']
    X_attack_test, y_attack_test = df_attack_train.drop(columns=['class']), df_attack_train['class']
    X_normal_test, y_normal_test = df_normal_eval.drop(columns=['class']), df_normal_eval['class']
    df_X_test, df_y_test = df_eval.drop(columns=['class']), df_eval['class']

    # The scaler is fitted on the benign training split only and then reused,
    # so no statistics of the evaluation data leak into training.
    X_train = _to_float_tensor(scaler.fit_transform(X_train.values))
    X_attack_test = _to_float_tensor(scaler.transform(X_attack_test.values))
    X_normal_test = _to_float_tensor(scaler.transform(X_normal_test.values))
    df_X_test = _to_float_tensor(scaler.transform(df_X_test.values))

    y_train = _to_float_tensor(y_train.values)
    y_attack_test = _to_float_tensor(y_attack_test.values)
    y_normal_test = _to_float_tensor(y_normal_test.values)
    df_y_test = _to_float_tensor(df_y_test.values)

    train_datasets = TensorDataset(X_train)
    train_loader = _shuffled_loader(train_datasets)
    normal_test_datasets = TensorDataset(X_normal_test, y_normal_test)
    normal_test_loader = _shuffled_loader(normal_test_datasets)
    attack_test_datasets = TensorDataset(X_attack_test, y_attack_test)
    attack_test_loader = _shuffled_loader(attack_test_datasets)
    test_datasets = TensorDataset(df_X_test, df_y_test)
    test_loader = _shuffled_loader(test_datasets)
    print("------------------------------Attack : ",a, "--------------------------------\n")

    # ---- Model: input width is taken from the data so it always matches the feature count ----
    model = VAE(X_train.shape[1], hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # ---- Training on benign samples only ----
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for (x,) in train_loader:
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(x)
            loss = loss_function(recon_batch, x, mu, logvar)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {total_loss}')

    print("Training complete!")

    # ---- Evaluation ----
    model.eval()
    with torch.no_grad():
        # Summed (not averaged) per-batch MSE, reported separately for benign and attack data.
        loss_normal = _summed_batch_mse(model, normal_test_loader)
        print('the eval normal loss ', loss_normal)
        loss_attack = _summed_batch_mse(model, attack_test_loader)
        print('the eval attack loss ', loss_attack)

        # One stochastic forward pass per batch gives the per-sample
        # reconstruction MSE. The same score is thresholded for the hard
        # predictions and kept as the continuous anomaly score.
        score_batches, pred_batches, label_batches = [], [], []
        for data, target in test_loader:
            reconstruction, _, _ = model(data)
            sample_mse = F.mse_loss(reconstruction, data, reduction='none').mean(dim=1)
            score_batches.append(sample_mse)
            pred_batches.append((sample_mse > VAE_THRESHOLD).float())
            label_batches.append(target)

    # Flat NumPy arrays instead of Python lists of 0-dim tensors.
    eval_scores = torch.cat(score_batches).numpy()
    y_pred_eval = torch.cat(pred_batches).numpy()
    eval_y = torch.cat(label_batches).numpy()

    temp_str = classification_report(y_true=eval_y, y_pred=y_pred_eval)
    print("VAE")
    print(temp_str)
    print("Confusion Matrix")
    print(confusion_matrix(eval_y, y_pred_eval))
    print("\n F1 Score")
    print(f1_score(eval_y, y_pred_eval))
    # Ranking metrics need the non-thresholded score: with hard 0/1 predictions
    # the ROC / PR curves collapse to a single operating point. Higher
    # reconstruction error means "more likely class 1", matching the threshold rule.
    print("\n roc-auc")
    print(roc_auc_score(eval_y, eval_scores))
    print("\n pr-auc")
    print(average_precision_score(eval_y, eval_scores))