In [3]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import json
import os

monday_path = "D:/Dissertation 2025/Results/Monday_clean.csv"
friday_path = "D:/Dissertation 2025/Results/Friday_clean.csv"


In [7]:
def load_and_get_top20(path, day_name):
    print(f"\nProcessing {day_name}...")

    # Load CSV
    df = pl.read_csv(path, ignore_errors=True).to_pandas()

    # Detect label column
    if "y_binary" in df.columns:
        y = df["y_binary"].astype(int)
    elif "Label" in df.columns:
        y = df["Label"].map({"BENIGN": 0, "ATTACK": 1})
    else:
        raise ValueError("No label column found.")

    # Keep only numeric columns
    X = df.select_dtypes(include=["number"]).copy()

    # FIX: Clean infinities and huge values
    X = X.replace([np.inf, -np.inf], np.nan)

    # Drop columns that are 100% NaN
    X = X.dropna(axis=1, how="all")

    # Fill NaN with median (best practice for CICIDS2017)
    X = X.fillna(X.median())

    # Train RF
    rf = RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X, y)

    # Top 20 features
    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1]
    top20 = X.columns[indices[:20]].tolist()

    print(f"Top 20 features for {day_name}:")
    print(top20)

    # Save JSON
    out_path = f"D:/Dissertation 2025/Results/top20_features_{day_name}.json"
    with open(out_path, "w") as f:
        json.dump(top20, f, indent=4)

    print("Saved:", out_path)
    return top20


In [5]:
top20_mon = load_and_get_top20(monday_path, "Monday")



Processing Monday...
Top 20 features for Monday:
['y_binary', ' Idle Min', ' Idle Max', ' Idle Std', 'Idle Mean', ' Active Min', ' Active Max', ' Active Std', 'Active Mean', ' min_seg_size_forward', ' act_data_pkt_fwd', ' Init_Win_bytes_backward', 'Init_Win_bytes_forward', ' Subflow Bwd Bytes', ' Subflow Bwd Packets', ' Subflow Fwd Bytes', 'Subflow Fwd Packets', 'Bwd Avg Bulk Rate', ' Bwd Avg Packets/Bulk', ' Bwd Avg Bytes/Bulk']
Saved: D:/Dissertation 2025/Results/top20_features_Monday.json


In [8]:
top20_fri = load_and_get_top20(friday_path, "Friday")



Processing Friday...
Top 20 features for Friday:
['y_binary', ' Subflow Fwd Bytes', ' Fwd Packet Length Max', 'Total Length of Fwd Packets', ' Fwd Packet Length Mean', ' Packet Length Mean', ' Average Packet Size', ' PSH Flag Count', ' Avg Fwd Segment Size', ' Flow IAT Max', ' Flow Duration', ' Bwd Packets/s', ' Max Packet Length', ' Fwd IAT Mean', ' Packet Length Variance', ' Packet Length Std', 'Flow Bytes/s', ' Total Fwd Packets', ' Avg Bwd Segment Size', 'Init_Win_bytes_forward']
Saved: D:/Dissertation 2025/Results/top20_features_Friday.json


In [10]:
import json

features_mon = json.load(open("D:/Dissertation 2025/Results/top20_features_Monday.json"))
features_fri = json.load(open("D:/Dissertation 2025/Results/top20_features_Friday.json"))


In [11]:
mon = pl.read_csv("D:/Dissertation 2025/Results/Monday_clean.csv", ignore_errors=True).to_pandas()
fri = pl.read_csv("D:/Dissertation 2025/Results/Friday_clean.csv", ignore_errors=True).to_pandas()


In [12]:
X_mon = mon[features_mon]
y_mon = mon["y_binary"].astype(int)
x_fri = fri[features_fri]
y_fri = fri["y_binary"].astype(int)

In [13]:
X_mon = X_mon.replace([np.inf, -np.inf], np.nan)
X_mon = X_mon.fillna(X_mon.median())
x_fri = x_fri.replace([np.info, -np.inf],np.nan)
y_fri = y_fri.fillna(x_fri.median())

In [15]:
print(X_mon.shape)
print(y_mon.shape)
print(x_fri.shape)
print(y_fri.shape)



(529918, 20)
(529918,)
(286467, 20)
(286467,)


In [17]:
print(y_mon.value_counts())


y_binary
0    529918
Name: count, dtype: int64


In [18]:
y_fri.value_counts()


y_binary
1    158930
0    127537
Name: count, dtype: int64