In [1]:
import polars as pl
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

MONDAY_FILE = r"D:/Dissertation 2025/Results/Monday_clean.csv"
FRIDAY_FILE = r"D:/Dissertation 2025/Results/Friday_clean.csv"

label_cols = {"Label", "y_binary", "y_family", "day"}


In [None]:
n_sample = 20000   

monday = pl.read_csv(
    MONDAY_FILE,
    n_rows=n_sample,
    infer_schema_length=0,
    ignore_errors=True
)

friday = pl.read_csv(
    FRIDAY_FILE,
    n_rows=n_sample,
    infer_schema_length=0,
    ignore_errors=True
)

print("Monday sample:", monday.shape)
print("Friday sample:", friday.shape)


Monday sample: (20000, 81)
Friday sample: (20000, 81)


In [3]:
def to_numeric(df: pl.DataFrame) -> pl.DataFrame:
    return df.with_columns([
        pl.col(c).cast(pl.Float64, strict=False)
        for c in df.columns
        if c not in label_cols
    ])

monday_num = to_numeric(monday)
friday_num = to_numeric(friday)

num_cols = [c for c in monday_num.columns if c not in label_cols]
print("Numeric feature count:", len(num_cols))


Numeric feature count: 77


In [None]:
ks_results = []

for col in num_cols:
    m_vals = monday_num[col].to_numpy()
    f_vals = friday_num[col].to_numpy()

    # drop NaNs
    m_vals = m_vals[~np.isnan(m_vals)]
    f_vals = f_vals[~np.isnan(f_vals)]

    if len(m_vals) < 50 or len(f_vals) < 50:
        continue  

    stat, pval = ks_2samp(m_vals, f_vals)
    ks_results.append({
        "feature": col,
        "ks_stat": stat,
        "p_value": pval
    })

ks_df = pd.DataFrame(ks_results).sort_values("ks_stat", ascending=False)
print("\nTop 15 features by KS drift:\n")
print(ks_df.head(15))



Top 15 features by KS drift:

                   feature  ks_stat  p_value
36           Bwd Packets/s  0.28105      0.0
19           Fwd IAT Total  0.27010      0.0
20            Fwd IAT Mean  0.26450      0.0
8    Fwd Packet Length Std  0.25840      0.0
22             Fwd IAT Max  0.25730      0.0
5    Fwd Packet Length Max  0.25040      0.0
27             Bwd IAT Max  0.24945      0.0
24           Bwd IAT Total  0.24925      0.0
35           Fwd Packets/s  0.24805      0.0
25            Bwd IAT Mean  0.24745      0.0
14          Flow Packets/s  0.24610      0.0
15           Flow IAT Mean  0.24555      0.0
0            Flow Duration  0.24500      0.0
10   Bwd Packet Length Min  0.24460      0.0
65  Init_Win_bytes_forward  0.22705      0.0


In [5]:
def psi_for_feature(monday_vals, friday_vals, n_bins=10, eps=1e-6):
    # remove NaNs
    monday_vals = monday_vals[~np.isnan(monday_vals)]
    friday_vals = friday_vals[~np.isnan(friday_vals)]

    if len(monday_vals) == 0 or len(friday_vals) == 0:
        return np.nan

    # create bins based on Monday quantiles
    quantiles = np.linspace(0, 1, n_bins+1)
    try:
        cuts = np.quantile(monday_vals, quantiles)
    except Exception:
        return np.nan

    # avoid duplicate bin edges
    cuts = np.unique(cuts)
    if len(cuts) <= 2:
        return np.nan

    # bin Monday & Friday
    monday_bins = np.digitize(monday_vals, cuts[1:-1], right=True)
    friday_bins = np.digitize(friday_vals, cuts[1:-1], right=True)

    # proportions
    m_counts = np.bincount(monday_bins, minlength=len(cuts)-1) / len(monday_vals)
    f_counts = np.bincount(friday_bins, minlength=len(cuts)-1) / len(friday_vals)

    # avoid zero
    m_counts = np.clip(m_counts, eps, 1)
    f_counts = np.clip(f_counts, eps, 1)

    psi = np.sum((f_counts - m_counts) * np.log(f_counts / m_counts))
    return psi


In [6]:
psi_results = []

for col in num_cols:
    m_vals = monday_num[col].to_numpy()
    f_vals = friday_num[col].to_numpy()

    psi_val = psi_for_feature(m_vals, f_vals, n_bins=10)
    psi_results.append({
        "feature": col,
        "psi": psi_val
    })

psi_df = pd.DataFrame(psi_results).dropna().sort_values("psi", ascending=False)

print("\nTop 15 features by PSI drift:\n")
print(psi_df.head(15))



Top 15 features by PSI drift:

                    feature       psi
36            Bwd Packets/s  0.465467
35            Fwd Packets/s  0.423357
15            Flow IAT Mean  0.422160
37        Min Packet Length  0.420435
14           Flow Packets/s  0.419386
6     Fwd Packet Length Min  0.413733
34        Bwd Header Length  0.411593
2    Total Backward Packets  0.406830
63      Subflow Bwd Packets  0.406830
0             Flow Duration  0.395436
17             Flow IAT Max  0.390661
5     Fwd Packet Length Max  0.369238
38        Max Packet Length  0.367789
22              Fwd IAT Max  0.350660
20             Fwd IAT Mean  0.333910


  diff_b_a = subtract(b, a)


In [7]:
out_base = r"D:/Dissertation 2025/Results"

ks_df.to_csv(out_base + "/monday_friday_ks_results.csv", index=False)
psi_df.to_csv(out_base + "/monday_friday_psi_results.csv", index=False)

print("\nSaved KS + PSI results to Results folder.")



Saved KS + PSI results to Results folder.
