In [10]:
from pathlib import Path
import numpy as np
import pandas as pd


In [11]:
DATA_DIR = Path("Processed_UE_Datasets_unscaled")  # adjust if needed
LABEL_COL = "binary_label"

csv_files = sorted([p for p in DATA_DIR.glob("*.csv") if "test" not in p.name.lower()])
len(csv_files), csv_files[:]


(7,
 [PosixPath('Processed_UE_Datasets_unscaled/ue_3557821101183501_malicious_labeled.csv'),
  PosixPath('Processed_UE_Datasets_unscaled/ue_8609960468879057_normal_labeled.csv'),
  PosixPath('Processed_UE_Datasets_unscaled/ue_8609960480666910_normal_labeled.csv'),
  PosixPath('Processed_UE_Datasets_unscaled/ue_8609960480859058_normal_labeled.csv'),
  PosixPath('Processed_UE_Datasets_unscaled/ue_8642840401594200_malicious_labeled.csv'),
  PosixPath('Processed_UE_Datasets_unscaled/ue_8642840401612300_malicious_labeled.csv'),
  PosixPath('Processed_UE_Datasets_unscaled/ue_8677660403123800_malicious_labeled.csv')])

In [12]:
def pooled_stats_for_label(csv_files, label_col="binary_label", chunksize=None):
    # Accumulators per class: count, sum, sumsq (per feature)
    acc = {
        0: {"n": None, "sum": None, "sumsq": None, "cols": None},
        1: {"n": None, "sum": None, "sumsq": None, "cols": None},
    }

    def init_if_needed(cls, cols):
        if acc[cls]["cols"] is None:
            acc[cls]["cols"] = cols
            acc[cls]["n"] = pd.Series(0, index=cols, dtype="int64")
            acc[cls]["sum"] = pd.Series(0.0, index=cols, dtype="float64")
            acc[cls]["sumsq"] = pd.Series(0.0, index=cols, dtype="float64")

    def update(df):
        if label_col not in df.columns:
            raise KeyError(f"Missing label column '{label_col}' in df columns.")

        # Make labels numeric 0/1
        y = pd.to_numeric(df[label_col], errors="coerce")
        df = df.loc[y.notna()].copy()
        y = y.loc[y.notna()].astype(int)

        # Use numeric columns only (exclude label itself)
        Xnum = df.select_dtypes(include=[np.number]).copy()
        if label_col in Xnum.columns:
            Xnum = Xnum.drop(columns=[label_col])

        if Xnum.shape[1] == 0:
            return

        for cls in (0, 1):
            mask = (y == cls).to_numpy()
            if mask.sum() == 0:
                continue

            X = Xnum.loc[mask]

            init_if_needed(cls, X.columns)

            # align columns in case of mismatch across files
            X = X.reindex(columns=acc[cls]["cols"])

            n_add = X.notna().sum(axis=0).astype("int64")
            sum_add = X.sum(axis=0, skipna=True)
            sumsq_add = (X**2).sum(axis=0, skipna=True)

            acc[cls]["n"] = acc[cls]["n"].add(n_add, fill_value=0).astype("int64")
            acc[cls]["sum"] = acc[cls]["sum"].add(sum_add, fill_value=0.0)
            acc[cls]["sumsq"] = acc[cls]["sumsq"].add(sumsq_add, fill_value=0.0)

    for f in csv_files:
        if chunksize is None:
            df = pd.read_csv(f)
            update(df)
        else:
            for chunk in pd.read_csv(f, chunksize=chunksize):
                update(chunk)

    def finalize(cls):
        if acc[cls]["cols"] is None:
            return pd.Series(dtype=float), pd.Series(dtype=float)

        n = acc[cls]["n"].astype("float64")
        s = acc[cls]["sum"]
        ss = acc[cls]["sumsq"]

        mean = s / n
        var = (ss - n * (mean**2)) / (n - 1)   # sample variance
        std = np.sqrt(var.clip(lower=0))
        return mean, std

    mean0, std0 = finalize(0)
    mean1, std1 = finalize(1)

    features = sorted(set(mean0.index).union(set(mean1.index)))
    table = pd.DataFrame({
        "Feature": features,
        "Normal Mean": mean0.reindex(features).to_numpy(),
        "Normal Std": std0.reindex(features).to_numpy(),
        "Attack Mean": mean1.reindex(features).to_numpy(),
        "Attack Std": std1.reindex(features).to_numpy(),
    })

    return table


In [13]:
stats_table = pooled_stats_for_label(csv_files, label_col=LABEL_COL, chunksize=None)
stats_table.head(20)


Unnamed: 0,Feature,Normal Mean,Normal Std,Attack Mean,Attack Std
0,attack_number,0.126244,0.6129868,3.44415,1.498775
1,cqi,12.29242,3.09109,12.2349,3.68681
2,dl_bitrate,1116130.0,1570155.0,57793.58,40502.35
3,dl_err,0.02446472,0.2372337,0.07456528,0.3911832
4,dl_mcs,19.6121,7.190199,21.1074,8.799919
5,dl_retx,36.15998,59.74037,8.889626,12.31826
6,dl_tx,594.5355,828.2705,292.233,180.1266
7,epre,-100.0154,13.88332,-109.3514,14.87084
8,event,0.5846554,0.7692217,0.8255231,0.8738753
9,imeisv,-36741280000000.0,0.0,2685938000000000.0,0.0


In [14]:
stats_table_sorted_round = stats_table_sorted.copy()
for c in ["Normal Mean", "Normal Std", "Attack Mean", "Attack Std"]:
    stats_table_sorted_round[c] = stats_table_sorted_round[c].round(4)

stats_table_sorted_round.head(30)


Unnamed: 0,Feature,Normal Mean,Normal Std,Attack Mean,Attack Std
0,attack_number,0.1262,0.613,3.4441,1.4988
1,cqi,12.2924,3.0911,12.2349,3.6868
2,dl_bitrate,1116130.0,1570155.0,57793.58,40502.35
3,dl_err,0.0245,0.2372,0.0746,0.3912
4,dl_mcs,19.6121,7.1902,21.1074,8.7999
5,dl_retx,36.16,59.7404,8.8896,12.3183
6,dl_tx,594.5355,828.2705,292.233,180.1266
7,epre,-100.0154,13.8833,-109.3514,14.8708
8,event,0.5847,0.7692,0.8255,0.8739
9,imeisv,-36741280000000.0,0.0,2685938000000000.0,0.0


In [15]:
to_drop = {"imeisv", "event", "multiclass_label","attack_number"}

stats_table_filtered = stats_table_sorted_round[
    ~stats_table_sorted_round["Feature"].isin(to_drop)
].reset_index(drop=True)

stats_table_filtered


Unnamed: 0,Feature,Normal Mean,Normal Std,Attack Mean,Attack Std
0,cqi,12.2924,3.0911,12.2349,3.6868
1,dl_bitrate,1116130.0,1570155.0,57793.58,40502.35
2,dl_err,0.0245,0.2372,0.0746,0.3912
3,dl_mcs,19.6121,7.1902,21.1074,8.7999
4,dl_retx,36.16,59.7404,8.8896,12.3183
5,dl_tx,594.5355,828.2705,292.233,180.1266
6,epre,-100.0154,13.8833,-109.3514,14.8708
7,p_ue,-6.7007,12.6363,-7.8618,9.1074
8,pusch_snr,27.0695,8.5617,24.6243,5.6399
9,ul_bitrate,5558019.0,9065503.0,8460140.0,8050658.0


In [16]:
pretty = stats_table_filtered.copy()

fmt = {
    "Normal Mean": "{:.3f}",
    "Normal Std": "{:.3f}",
    "Attack Mean": "{:.3f}",
    "Attack Std": "{:.3f}",
}

for col, f in fmt.items():
    pretty[col] = pretty[col].map(lambda x: f.format(x) if pd.notna(x) else "")

print(pretty.to_string(index=False))


   Feature Normal Mean  Normal Std Attack Mean  Attack Std
       cqi      12.292       3.091      12.235       3.687
dl_bitrate 1116129.859 1570154.547   57793.580   40502.350
    dl_err       0.025       0.237       0.075       0.391
    dl_mcs      19.612       7.190      21.107       8.800
   dl_retx      36.160      59.740       8.890      12.318
     dl_tx     594.535     828.270     292.233     180.127
      epre    -100.015      13.883    -109.351      14.871
      p_ue      -6.701      12.636      -7.862       9.107
 pusch_snr      27.070       8.562      24.624       5.640
ul_bitrate 5558019.372 9065503.199 8460139.526 8050658.244
    ul_err       0.030       0.417       0.235       1.153
    ul_mcs      20.351       6.722      19.973       5.759
   ul_retx      53.517      74.083      63.016      31.417
     ul_tx     616.827     702.891     678.936     352.318


In [17]:
import json
import time

json_payload = {
    "status": "computed",
    "computed_at_unix": int(time.time()),
    "method": "statistical_summary",
    "label_definition": {
        "0": "normal",
        "1": "attack"
    },
    "feature_columns": stats_table_filtered["Feature"].tolist(),
    "global_feature_statistics": []
}

for _, row in stats_table_filtered.iterrows():
    json_payload["global_feature_statistics"].append({
        "feature": row["Feature"],
        "normal_mean": float(row["Normal Mean"]),
        "normal_std": float(row["Normal Std"]),
        "attack_mean": float(row["Attack Mean"]),
        "attack_std": float(row["Attack Std"]),
    })


In [18]:
out_path = "global_feature_statistics.json"

with open(out_path, "w") as f:
    json.dump(json_payload, f, indent=2)

out_path


'global_feature_statistics.json'