In [1]:
# Global random seed; passed as random_state to the models trained below.
SEED = 1

In [2]:
import pandas as pd

In [3]:
# SCVIC IMPORT
# Read the training and testing splits of the SCVIC dataset from their CSV files.

training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/external/SCVIC.nosync/Training.csv",
        "../data/external/SCVIC.nosync/Testing.csv",
    )
)

In [4]:
# QUICK PREPROCESSING
# NOTE: this cell rebinds training_data / testing_data, so it is not re-runnable
# without re-executing the import cell first (the dropped columns would be missing).

# Identifier / endpoint columns removed from both splits before modelling.
columns_to_drop = ['Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Flow ID', 'Protocol', 'Timestamp']
training_data = training_data.drop(columns=columns_to_drop)
testing_data = testing_data.drop(columns=columns_to_drop)

# Turn +/-inf into NaN so that dropna() discards those rows together with
# any row that already had a missing value.
training_data = training_data.replace([float('inf'), -float('inf')], float('nan')).dropna()
testing_data = testing_data.replace([float('inf'), -float('inf')], float('nan')).dropna()

# One-hot encode 'Label' against a category list shared by both splits.
# Encoding each split on its own would silently produce different 'Label_*'
# columns whenever a class is absent from one of them, which misaligns the
# targets and the per-class scores computed later.
label_categories = sorted(set(training_data['Label']) | set(testing_data['Label']))
training_data = pd.get_dummies(
    training_data.assign(Label=pd.Categorical(training_data['Label'], categories=label_categories)),
    columns=['Label'],
)
testing_data = pd.get_dummies(
    testing_data.assign(Label=pd.Categorical(testing_data['Label'], categories=label_categories)),
    columns=['Label'],
)


In [5]:
#RANDOM FOREST TRAINING

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Separate the one-hot 'Label_*' target columns from the feature columns.
train_label_columns = [name for name in training_data.columns if 'Label_' in name]
test_label_columns = [name for name in testing_data.columns if 'Label_' in name]

X_train = training_data.drop(columns=train_label_columns)
y_train = training_data[train_label_columns]
X_test = testing_data.drop(columns=test_label_columns)
y_test = testing_data[test_label_columns]

# NOTE(review): the target is a one-hot indicator frame, so this is fit as a
# multi-output problem and a row may receive zero or several predicted labels.
# Confirm this is intended rather than a single multiclass target.
rf_model = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)

y_pred_user_defined = rf_model.predict(X_test)

# Per-label F1 scores, followed by their macro average.
f1_scores_user_defined = f1_score(y_test, y_pred_user_defined, average=None)
mean_f1_score_user_defined = f1_score(y_test, y_pred_user_defined, average='macro')

f1_scores_user_defined, mean_f1_score_user_defined

(array([0.21428571, 0.81818182, 0.82656827, 0.99949096, 0.88195387,
        0.74725275]),
 0.7479555628422453)

In [6]:
# Pair every label column name with its per-label F1 score.
category_f1_scores_user_defined = {
    label_name: score
    for label_name, score in zip(y_test.columns, f1_scores_user_defined)
}

category_f1_scores_user_defined, mean_f1_score_user_defined

({'Label_DataExfiltration': 0.2142857142857143,
  'Label_InitialCompromise': 0.8181818181818182,
  'Label_LateralMovement': 0.826568265682657,
  'Label_NormalTraffic': 0.9994909646220412,
  'Label_Pivoting': 0.8819538670284938,
  'Label_Reconnaissance': 0.7472527472527472},
 0.7479555628422453)

In [7]:
# XGBOOST TRAINING

from xgboost import XGBClassifier

# Fit on the same feature / one-hot target frames that are currently bound to
# X_train and y_train.
xgb = XGBClassifier(random_state=SEED)
xgb.fit(X_train, y_train)

y_pred_user_defined = xgb.predict(X_test)

# Per-label F1 scores (average=None) and their macro average, in that order.
f1_scores_user_defined, mean_f1_score_user_defined = (
    f1_score(y_test, y_pred_user_defined, average=averaging)
    for averaging in (None, 'macro')
)

f1_scores_user_defined, mean_f1_score_user_defined

(array([0.33707865, 0.83443709, 0.81226054, 0.99955454, 0.75423729,
        0.62121212]),
 0.7264633708215354)

In [8]:
# Map each label column name to the matching per-label F1 score.
category_f1_scores_user_defined = {
    label_name: score
    for label_name, score in zip(y_test.columns, f1_scores_user_defined)
}

category_f1_scores_user_defined, mean_f1_score_user_defined

({'Label_DataExfiltration': 0.3370786516853933,
  'Label_InitialCompromise': 0.8344370860927152,
  'Label_LateralMovement': 0.8122605363984673,
  'Label_NormalTraffic': 0.9995545414049218,
  'Label_Pivoting': 0.7542372881355932,
  'Label_Reconnaissance': 0.6212121212121212},
 0.7264633708215354)

In [9]:
# TabPFN is fed the raw 'Label' column instead of one-hot encoded targets, so
# the quickest route is to rebuild the frames from the CSV files.
import pandas as pd
SEED = 1

columns_to_drop = ['Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Flow ID', 'Protocol', 'Timestamp']


def _load_scvic_split(csv_path):
    """Read one SCVIC CSV, drop `columns_to_drop`, and discard rows holding inf or NaN."""
    frame = pd.read_csv(csv_path).drop(columns=columns_to_drop)
    return frame.replace([float('inf'), -float('inf')], float('nan')).dropna()


training_data = _load_scvic_split("../data/external/SCVIC.nosync/Training.csv")
testing_data = _load_scvic_split("../data/external/SCVIC.nosync/Testing.csv")

# Features are every remaining column except 'Label'; the target is 'Label' itself.
X_train, y_train = training_data.drop(columns='Label'), training_data['Label']
X_test, y_test = testing_data.drop(columns='Label'), testing_data['Label']

In [10]:
# Per-class row counts of the training labels, before NormalTraffic is downsampled.
y_train.value_counts()

Label
NormalTraffic        253028
Pivoting               2122
Reconnaissance          833
LateralMovement         728
DataExfiltration        527
InitialCompromise        73
Name: count, dtype: int64

In [16]:
# SAVING THE PROCESSED DATA
# NOTE(review): in document order this cell runs before the NormalTraffic
# downsampling, so the full cleaned splits are written. The recorded execution
# count suggests it was last run afterwards; confirm which version is wanted.
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
Path("../data/processed/SCVIC").mkdir(parents=True, exist_ok=True)

X_train.to_csv("../data/processed/SCVIC/X_train.csv", index=False)
y_train.to_csv("../data/processed/SCVIC/y_train.csv", index=False)
X_test.to_csv("../data/processed/SCVIC/X_test.csv", index=False)
y_test.to_csv("../data/processed/SCVIC/y_test.csv", index=False)

In [11]:

from sklearn.utils import resample

# Downsample the dominant NormalTraffic class and keep every row of the other
# classes. The rows are drawn once (so features and labels cannot drift apart),
# without replacement (undersampling should not introduce duplicate rows), and
# with the notebook-wide SEED instead of a separate hard-coded seed.
N_NORMAL_SAMPLES = 1000

normal_mask = y_train == 'NormalTraffic'
# Cap at the available rows so the cell also works if it is re-run on frames
# that were already downsampled.
n_normal_kept = min(N_NORMAL_SAMPLES, int(normal_mask.sum()))
normal_labels = y_train[normal_mask].sample(n_normal_kept, replace=False, random_state=SEED)

# Sampled NormalTraffic rows first, then all remaining classes, as before.
X_train = pd.concat([X_train.loc[normal_labels.index], X_train[~normal_mask]])
y_train = pd.concat([normal_labels, y_train[~normal_mask]])


In [12]:
# Per-class row counts of the training labels after NormalTraffic has been downsampled.
y_train.value_counts()

Label
Pivoting             2122
NormalTraffic        1000
Reconnaissance        833
LateralMovement       728
DataExfiltration      527
InitialCompromise      73
Name: count, dtype: int64

In [13]:
# TAB PFN

from tabpfn import TabPFNClassifier

# CPU inference with 32 ensemble configurations.
tabpfn = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)

# Trained on the downsampled training split with the raw string labels.
tabpfn.fit(X_train, y_train, overwrite_warning=True)

y_pred_user_defined = tabpfn.predict(X_test)

# Per-class F1 scores (average=None) and their macro average, in that order.
f1_scores_user_defined, mean_f1_score_user_defined = (
    f1_score(y_test, y_pred_user_defined, average=averaging)
    for averaging in (None, 'macro')
)

f1_scores_user_defined, mean_f1_score_user_defined

ModuleNotFoundError: No module named 'typing_extensions'