<a href="https://colab.research.google.com/github/kdemertzis/EKPA/blob/main/OnlineLearning_IPS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#εισαγωγή βιβλιοθηκών - πακέτων
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
import requests
from io import BytesIO
import numpy as np


In [6]:
# Load the packet-capture dataset into a DataFrame directly from the raw GitHub URL.
# pandas fetches and parses the CSV itself; there is no separate download or
# decompression step.
url = "https://raw.githubusercontent.com/kdemertzis/EKPA/main/Data/pcap_data.csv"
df = pd.read_csv(url)
# X and y are intentionally not created here: the cell that follows derives both
# from `df`, so building throw-away NumPy copies at this point would only be
# overwritten before any use.

In [236]:
# Build the feature table and a binary label: Normal -> 0, Attack -> 1.
# Class 1 has to be the attack class because the blocking cells further down read
# predict_proba(...)[:, 1] as the probability that traffic is malicious.
# Labels are normalised before comparison (surrounding whitespace, a trailing '.'
# and letter case are ignored) so that both "normal" and "normal." count as
# normal traffic.
X = df.drop(columns="target")
target_clean = df["target"].astype(str).str.strip().str.rstrip(".").str.lower()
y = (target_clean != "normal").astype(int)

In [237]:
# Sanity check: print how many samples fall into each class of y.
print("Κατανομή κλάσεων y:", y.value_counts(), sep="\n")

Κατανομή κλάσεων y:
target
0    14967
Name: count, dtype: int64


In [238]:
# Names of the categorical feature columns (these are the ones that get one-hot encoded).
categorical_features = [
    "protocol_type",
    "service",
    "flag",
]

In [239]:
# Separate numeric from categorical variables: every column of X that is not
# listed in categorical_features is treated as numeric.
# NOTE: Index.difference returns a pandas Index and, by default, tries to sort the
# result, so the order here may differ from the column order in X.
numeric_features = X.columns.difference(categorical_features)

In [240]:
# Preprocessing stage: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps the encoder from raising on
# categories that were not present when it was fitted.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [241]:
# Split the data into training and test sets: test_size=0.3 holds out 30% of the
# rows for testing, and random_state=42 makes the shuffle reproducible.
# NOTE(review): the split is not stratified on y — confirm that is intended for
# an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [242]:
# Training pipeline: oversample -> preprocess -> classify.
# SMOTENC handles mixed numeric/categorical data; its categorical_features are
# column positions in the raw layout of X. It therefore has to run BEFORE the
# ColumnTransformer, which scales / one-hot encodes the data and in doing so
# reorders and expands the columns (after which those positions would point at
# the wrong features). The sampler is only applied while fitting; prediction
# goes straight through preprocessor -> classifier.
pipeline = ImbPipeline([
    ('smote', SMOTENC(
        random_state=42,
        # Only columns that actually exist in X are converted to positions.
        categorical_features=[X.columns.get_loc(col) for col in categorical_features if col in X.columns],
    )),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [243]:
# Probability cut-off for blocking network traffic: test samples whose predicted
# class-1 probability exceeds this value are reported as malicious and blocked.
blocking_threshold = 1e-3

In [244]:

# Continuous / incremental learning loop: walk over the training set in
# fixed-size batches, repeated for several epochs.
# NOTE(review): the loop body only slices out the current batch; no model is
# fitted here. After the loop, `epoch`, `i`, `X_batch` and `y_batch` keep the
# values of the final iteration.
batch_size = 10000
for epoch in range(1, 10):  # 9 passes; change the range to alter the number of epochs
    for i in range(0, len(X_train), batch_size):
        batch_rows = slice(i, i + batch_size)
        X_batch = X_train.iloc[batch_rows]
        y_batch = y_train.iloc[batch_rows]


In [245]:
# Positions of the categorical columns in the raw training frame (columns that
# are missing from X_train are skipped). `categorical_features` is the list of
# column names defined in an earlier cell.
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_features if col in X_train.columns]

# Training pipeline: oversample -> preprocess -> classify.
# categorical_indices refer to the raw column layout, so SMOTENC has to run
# BEFORE the ColumnTransformer; once the data has been scaled / one-hot encoded
# the columns are reordered and expanded and those positions would no longer
# point at the categorical features. The sampler is only applied during fit.
pipeline = ImbPipeline([
    ('smote', SMOTENC(random_state=42, categorical_features=categorical_indices)),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [246]:
# Blocking check: count the test samples whose class-1 probability exceeds
# blocking_threshold.
# `i` is the start offset left over from the batch loop and is always a multiple
# of batch_size, so the former guard `i % batch_size == 1` could never be true;
# only the `i > 0` part (at least one full batch was processed) is meaningful.
if i > 0:
    # The pipeline has to be fitted before it can produce probabilities.
    pipeline.fit(X_train, y_train)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    # Positions (not index labels) of the test rows above the threshold.
    blocked_indices = np.where(y_pred_proba > blocking_threshold)[0]
    if len(blocked_indices) > 0:
        print(f"Blocking {len(blocked_indices)} malicious traffic instances.")

In [247]:
# Block suspicious test traffic, then evaluate the model on what remains.
# `i` is the start offset left over from the batch loop and is always a multiple
# of batch_size, so the former guard `i % batch_size == 1` could never be true;
# only the `i > 0` part (at least one full batch was processed) is meaningful.
# NOTE: this cell shrinks X_test / y_test in place, so re-running it operates on
# the already-reduced test set.
if i > 0:
    # Fit first: probabilities cannot be requested from an unfitted pipeline.
    pipeline.fit(X_train, y_train)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    # np.where yields row POSITIONS within X_test, not index labels.
    blocked_indices = np.where(y_pred_proba > blocking_threshold)[0]
    if len(blocked_indices) > 0:
        print(f"Blocking {len(blocked_indices)} malicious traffic instances.")
        # DataFrame.drop / Series.drop work on labels, and X_test keeps the shuffled
        # labels of the original frame, so translate positions to labels first.
        blocked_labels = X_test.index[blocked_indices]
        X_test = X_test.drop(blocked_labels)
        y_test = y_test.drop(blocked_labels)
        print("Updated test set size:", len(X_test))
        # Blocking only changes the test set, so the model fitted above is reused.
        if len(X_test) > 0:
            y_pred = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            print(f"Updated accuracy: {accuracy}")
            print(classification_report(y_test, y_pred))
        else:
            # Every test sample was blocked; there is nothing left to score.
            print("No test samples left to evaluate.")
        print("--------------------------------------------------")
        print("Epoch:", epoch, "Batch:", i // batch_size)
        print("--------------------------------------------------")



