In [None]:
import polars as pl

file = r"D:/Dissertation 2025/Results/Friday_clean.csv"

# Load SMALL sample 
sample_df = pl.read_csv(
    file,
    n_rows=30000,             
    infer_schema_length=0,    
    ignore_errors=True
)

print("Sample shape:", sample_df.shape)
print("Columns loaded:", len(sample_df.columns))


Sample shape: (30000, 81)
Columns loaded: 81


In [6]:
label_cols = {"Label", "y_binary", "y_family", "day"}

numeric_df = sample_df.with_columns([
    pl.col(col).cast(pl.Float64, strict=False)
    for col in sample_df.columns
    if col not in label_cols
])

print("Numeric DF shape:", numeric_df.shape)


Numeric DF shape: (30000, 81)


In [7]:
# Select numeric columns only
num_cols = [c for c in numeric_df.columns if c not in label_cols]

# Convert Polars → pandas safely
df_dict = numeric_df.select(num_cols).to_dict(as_series=True)

import pandas as pd
corr_df = pd.DataFrame(df_dict)

# Correlation matrix
corr_matrix = corr_df.corr()

print("\nCorrelation matrix:")
print(corr_matrix)



Correlation matrix:
                               Flow Duration   Total Fwd Packets  \
 Flow Duration                      1.000000            0.307298   
 Total Fwd Packets                  0.307298            1.000000   
 Total Backward Packets             0.243719            0.975712   
Total Length of Fwd Packets         0.225467            0.293829   
 Total Length of Bwd Packets        0.167233            0.957829   
...                                      ...                 ...   
 Active Min                         0.115810            0.144508   
Idle Mean                           0.651558            0.168449   
 Idle Std                           0.169020            0.063118   
 Idle Max                           0.655198            0.172893   
 Idle Min                           0.630911            0.160136   

                               Total Backward Packets  \
 Flow Duration                               0.243719   
 Total Fwd Packets                           0.9

In [None]:
# Mutual Information
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
import numpy as np

# Convert Polars → pandas safely
df_dict = numeric_df.select(num_cols).to_dict(as_series=True)
X = pd.DataFrame(df_dict)

# Target
y = numeric_df["y_binary"].to_pandas()

# CLEAN X 
X = X.replace([np.inf, -np.inf], np.nan)  # remove inf
X = X.clip(lower=-1e10, upper=1e10)       # clip extreme values
X = X.fillna(0)                           # final fill

# Compute MI
mi_scores = mutual_info_classif(X, y, discrete_features=False)

# Sorted MI values
mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

print("\nTop 20 MI features:\n", mi_series.head(20))



Top 20 MI features:
  min_seg_size_forward          0.004230
 Down/Up Ratio                 0.002988
 Init_Win_bytes_backward       0.001708
Init_Win_bytes_forward         0.001587
 ACK Flag Count                0.001541
 Total Backward Packets        0.001368
 PSH Flag Count                0.001261
 Fwd Packet Length Min         0.000416
 Flow Duration                 0.000373
Flow Bytes/s                   0.000351
 Min Packet Length             0.000329
 URG Flag Count                0.000270
 Bwd Packet Length Min         0.000252
Total Length of Fwd Packets    0.000223
 Packet Length Mean            0.000218
 Average Packet Size           0.000210
 Max Packet Length             0.000190
 Avg Fwd Segment Size          0.000190
 Fwd Packet Length Mean        0.000175
 Flow IAT Max                  0.000173
dtype: float64
