In [17]:
import hashlib
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Feature Reduction

> https://github.com/avdhesh201/Intrusion-Detection-System/tree/main

Tweaked version of the codebase from the repository linked above:
> https://colab.research.google.com/drive/1mWcY6DGE9WnlUsxgYP6za7kGcf95RFzZ?usp=sharing

In [65]:
# Restore the pickled train/test arrays, stitch them back together, and read the CSV.
# NOTE: pickle.load can execute arbitrary code; only use it on files you produced yourself.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


X_train = _load_pickle('./data/X_train_fr.pickle')
X_test = _load_pickle('./data/X_test_fr.pickle')
y_train = _load_pickle('./data/y_train_fr.pickle')
y_test = _load_pickle('./data/y_test_fr.pickle')

# Train rows first, then test rows, stacked along the first axis.
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])

df = pd.read_csv("./data/CICIDS2017_fr.csv")
# Peek at two random rows of the CSV.
df.sample(2)

Unnamed: 0.1,Unnamed: 0,Init_Win_bytes_backward,min_seg_size_forward,Active Max,Max Packet Length,Init_Win_bytes_forward,Fwd Packet Length Max,Flow IAT Max,Total Length of Fwd Packets,Flow Duration,...,Fwd Packet Length Min,Packet Length Std,Subflow Bwd Packets,Avg Fwd Segment Size,Fwd PSH Flags,act_data_pkt_fwd,Fwd IAT Std,Packet Length Variance,Fwd Packet Length Mean,Label
333,333,0.007099,0.0,0.0,0.000257,0.017227,0.000257,2.881356e-07,2e-06,2.833447e-07,...,0.004364,0.0,0.000181,0.00101,0.0,0.0,0.0,0.0,0.00101,1
1,1,0.800006,0.6,0.0,0.0,0.445572,0.0,0.04336431,0.0,0.04265071,...,0.0,0.0,0.000181,0.0,0.0,0.0,0.05715,0.0,0.0,6


In [66]:
# Column labels of `df` with the first and the last column sliced off; evaluate_ensemble
# looks these up by position to label the tree's feature importances.
# NOTE(review): the sampled frame above shows two leading index columns
# ("Unnamed: 0.1" and "Unnamed: 0"), so this slice still begins with "Unnamed: 0".
# Confirm that these names line up one-to-one with the columns of X.
feature_names = df.columns[1:-1]

In [67]:
# Hash partitioning - with poisoning - robustness - feature-reduced data

def hash_partition_data(X, y, num_partitions=5, overlap=0, poison_fraction=0.1):
    """Build leave-one-group-out folds from hash-assigned row groups, with noisy training labels.

    Every row of ``X`` (all columns except the last) is rendered with ``str()``,
    suffixed with the round number, and mapped to one of ``num_partitions``
    groups. For fold ``i`` the test set is group ``i`` and the training set is
    the union of the other groups. A fraction of each training fold's labels is
    then overwritten with random class ids.

    Parameters
    ----------
    X : 2-D numpy array of samples; rows are indexed with integer index arrays.
    y : 1-D numpy array of integer labels aligned with the rows of ``X``.
    num_partitions : number of hash groups / folds (must be at least 2).
    overlap : non-negative int; number of additional hashing rounds (see NOTE below).
    poison_fraction : share of each training fold whose labels are replaced.

    Returns
    -------
    (X_partitions, y_partitions) : two lists of ``(train, test)`` tuples. Folds
    whose training or test side would be empty are omitted.

    Raises
    ------
    ValueError : if ``num_partitions`` is smaller than 2.

    Notes
    -----
    * Replacement labels are drawn uniformly from ``0 .. max(y)`` with NumPy's
      global RNG, so a replacement may coincide with the original label; seed
      ``np.random`` beforehand for repeatable poisoning.
    * ``y`` itself is never modified: the fancy-indexed folds are copies.
    """
    if num_partitions < 2:
        raise ValueError("num_partitions must be at least 2 so that every fold has training rows")

    def _bucket(text):
        # The built-in hash() of a str is salted per interpreter process, which would
        # reshuffle the groups on every kernel restart; a digest is stable across runs.
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big") % num_partitions

    # The last column is left out of the hashed text.
    pre_hash_data = X[:, :-1]
    idxgroup_final = []

    for time in range(overlap + 1):
        # Must be an ndarray: comparing a plain list with an int yields a single
        # False instead of an element-wise mask, which left every group empty.
        hash_data = np.array(
            [_bucket(str(row) + str(time)) for row in pre_hash_data], dtype=int
        )

        if time != overlap:
            group_count = num_partitions
        else:
            # NOTE(review): for overlap >= 1 this count is <= 0, so the final round adds
            # no groups, and only the first `num_partitions` groups are consumed below.
            # The result is therefore the same as for overlap == 0 - confirm the
            # intended overlap semantics.
            group_count = num_partitions - overlap * num_partitions

        idxgroup_final += [np.nonzero(hash_data == i)[0] for i in range(group_count)]

    X_partitions = []
    y_partitions = []

    for i in range(num_partitions):
        idx_train = np.concatenate([idxgroup_final[j] for j in range(num_partitions) if j != i])
        idx_test = idxgroup_final[i]

        # Skip folds that would have nothing to train on or nothing to test on.
        if len(idx_train) == 0 or len(idx_test) == 0:
            continue

        X_train_partition, X_test_partition = X[idx_train], X[idx_test]
        y_train_partition, y_test_partition = y[idx_train], y[idx_test]

        # Introduce poison into the training labels only; the test labels stay clean.
        n_poison_samples = int(len(X_train_partition) * poison_fraction)
        poison_indices = np.random.choice(len(X_train_partition), n_poison_samples, replace=False)
        y_train_partition[poison_indices] = np.random.randint(0, np.max(y) + 1, size=n_poison_samples)

        X_partitions.append((X_train_partition, X_test_partition))
        y_partitions.append((y_train_partition, y_test_partition))

    return X_partitions, y_partitions


# Modified partition_data function
def partition_data(X, y, num_partitions=6, overlap=0, poison_fraction=0.1):
    """Produce `num_partitions` (train, test) splits of X and y with poisoned training labels.

    With a positive `overlap` the work is handed to `hash_partition_data`.
    Otherwise each split is an independent 80/20 `train_test_split` of the full
    data, seeded with the split's index, after which
    `int(len(train) * poison_fraction)` training labels are overwritten with
    integers drawn uniformly from 0..max(y) using NumPy's global RNG.

    Returns two parallel lists: (X_train, X_test) tuples and (y_train, y_test) tuples.
    """
    # Guard clause: any positive overlap goes through the hash-based splitter.
    if overlap > 0:
        return hash_partition_data(X, y, num_partitions, overlap, poison_fraction)

    feature_splits, label_splits = [], []

    for seed in range(num_partitions):
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed)

        # Pick distinct training rows and overwrite their labels with random class ids.
        poison_count = int(len(X_tr) * poison_fraction)
        flipped_rows = np.random.choice(len(X_tr), poison_count, replace=False)
        y_tr[flipped_rows] = np.random.randint(0, np.max(y) + 1, size=poison_count)

        feature_splits.append((X_tr, X_te))
        label_splits.append((y_tr, y_te))

    return feature_splits, label_splits

In [70]:
# Train a decision tree per partition and report its metrics and impurity-based feature importances (no SHAP values are computed).
def evaluate_ensemble(X_partitions, y_partitions, robustness_samples=100):
    """Fit one decision tree per partition, print its feature importances, then print averages.

    Parameters
    ----------
    X_partitions : sequence of ``(X_train, X_test)`` pairs.
    y_partitions : sequence of ``(y_train, y_test)`` pairs, parallel to ``X_partitions``.
    robustness_samples : upper bound on how many leading test rows enter the
        robustness check; clamped to the size of the test fold.

    For every partition a ``DecisionTreeClassifier(random_state=0)`` is trained and
    scored on the test fold (accuracy plus weighted precision / recall / F1).
    Feature importances are printed in descending order, labelled via the
    module-level ``feature_names``. Averages over all partitions are printed at
    the end. Nothing is returned.
    """
    ensemble_scores = []
    precision_scores = []
    recall_scores = []
    fscore_scores = []
    robustness_accuracies = []

    for part_idx in range(len(X_partitions)):
        X_train, X_test = X_partitions[part_idx]
        y_train, y_test = y_partitions[part_idx]

        ensemble = DecisionTreeClassifier(random_state=0)
        ensemble.fit(X_train, y_train)

        # Predict the test fold once and reuse it; the classifier's score() is the
        # accuracy of exactly these predictions.
        y_predict = ensemble.predict(X_test)
        y_true = y_test

        ensemble_scores.append(accuracy_score(y_true, y_predict))
        precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_predict, average='weighted')
        precision_scores.append(precision)
        recall_scores.append(recall)
        fscore_scores.append(fscore)

        # Evaluate robustness.
        # Clamp the sample count: a test fold with fewer rows than `robustness_samples`
        # would otherwise give label/prediction arrays of different lengths, and
        # accuracy_score would raise.
        n_robust = min(robustness_samples, len(y_predict))
        # NOTE(review): predictions are compared with uniformly random labels, so the
        # expected value is roughly 1 / (max label + 1) whatever the model does -
        # confirm this is the intended robustness measure.
        y_poison = np.random.randint(0, np.max(y_true) + 1, size=n_robust)
        robustness_accuracies.append(accuracy_score(y_poison, y_predict[:n_robust]))

        # Rank features from most to least important for this tree.
        feature_importance = ensemble.feature_importances_
        sorted_indices = np.argsort(feature_importance)[::-1]
        sorted_feature_names = [feature_names[col] for col in sorted_indices]
        sorted_importance = feature_importance[sorted_indices]

        # Print feature importance
        print("Feature Importance:")
        for feature_name, importance in zip(sorted_feature_names, sorted_importance):
            print(f"{feature_name}: {importance}")
        print()

    print('Average Accuracy of Ensemble: ' + str(np.mean(ensemble_scores)))
    print('Average Precision of Ensemble: ' + str(np.mean(precision_scores)))
    print('Average Recall of Ensemble: ' + str(np.mean(recall_scores)))
    print('Average F1-score of Ensemble: ' + str(np.mean(fscore_scores)))

    average_robustness = np.mean(robustness_accuracies)
    print('Average Robustness Accuracy: ' + str(average_robustness))
    

In [71]:
# Label poisoning and the robustness labels are drawn from NumPy's global RNG;
# seed it so that Restart & Run All reproduces the printed metrics.
np.random.seed(0)

X_partitions, y_partitions = partition_data(X, y)
evaluate_ensemble(X_partitions, y_partitions)

Feature Importance:
Init_Win_bytes_backward: 0.253338722650734
Active Max: 0.1537901945447424
min_seg_size_forward: 0.12508021800301988
Flow IAT Max: 0.09134130613443636
Bwd Packet Length Min: 0.04247315924294891
Bwd IAT Mean: 0.04001696974800075
Bwd Packets/s: 0.0396455386443121
Avg Bwd Segment Size: 0.030145264725682016
Init_Win_bytes_forward: 0.026286558382413998
Fwd IAT Std: 0.015171772634014386
Flow IAT Mean: 0.014300503225974303
Flow IAT Std: 0.013931584345300183
Fwd PSH Flags: 0.013912480745082218
Fwd IAT Max: 0.01355077585354255
Fwd IAT Total: 0.01314742821787908
Fwd Header Length: 0.012535535778772915
Packet Length Std: 0.012223870580429935
Flow Duration: 0.012123919192075394
Fwd IAT Min: 0.011239067480553942
Max Packet Length: 0.009972196466179209
Fwd Packet Length Max: 0.009371892880646182
Packet Length Variance: 0.0066070618535190515
Bwd Packet Length Max: 0.006150202350645474
Fwd Packet Length Std: 0.005279473908480985
Packet Length Mean: 0.0042701090196574105
Bwd IAT Tota