In [1]:
import pandas as pd
import os
import glob
import joblib
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Configuration: folders and column sets
# -----------------------------
input_folder = 'Per_UE_Datasets'
output_folder = 'Processed_UE_Datasets'

# Make sure the destination exists before anything is written into it.
os.makedirs(output_folder, exist_ok=True)

# The scaler bundle is stored next to the processed CSV files.
scaler_path = os.path.join(output_folder, 'feature_scaler.joblib')

# Numeric feature columns -- the only ones that get normalized.
numeric_columns = [
    'epre', 'pusch_snr', 'p_ue', 'ul_mcs', 'cqi', 'ul_bitrate',
    'dl_mcs', 'dl_retx', 'ul_tx', 'dl_tx', 'ul_retx', 'dl_bitrate',
    'dl_err', 'ul_err',
]

# Every column kept from the raw files: two leading identifier columns,
# the numeric features (same order as above), then the event/label columns.
relevant_columns = [
    '_time', 'imeisv',
    *numeric_columns,
    'attack_number', 'event', 'binary_label', 'multiclass_label',
]

# -----------------------------
# Load files
# -----------------------------
# glob returns paths in arbitrary, filesystem-dependent order. Sorting makes
# the processing order -- and therefore the row order of the data the scaler
# is fitted on -- reproducible from run to run and machine to machine.
csv_files = sorted(glob.glob(os.path.join(input_folder, '*.csv')))

# Fail early with a clear message instead of a cryptic error further down.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in input folder: {input_folder!r}"
    )

dataframes = {}          # file path -> DataFrame restricted to relevant_columns
train_numeric_data = []  # numeric feature frames of the training files only

# -----------------------------
# Read ONCE & collect train data
# -----------------------------
for file in csv_files:
    # usecols avoids parsing columns that would be discarded anyway. It
    # yields columns in file order, so re-select to restore the order of
    # relevant_columns.
    df = pd.read_csv(file, usecols=relevant_columns)[relevant_columns]
    dataframes[file] = df

    # Files whose name contains "test" (case-insensitive) are kept out of
    # the data used to fit the scaler.
    # NOTE(review): this is a substring match, so a name such as
    # "latest_ue.csv" would also be treated as a test file -- confirm the
    # file naming convention.
    if 'test' not in os.path.basename(file).lower():
        train_numeric_data.append(df[numeric_columns])

# -----------------------------
# Fit scaler (TRAIN ONLY)
# -----------------------------
# Without at least one training file pd.concat would fail with the unhelpful
# "No objects to concatenate"; say what is actually wrong instead.
if not train_numeric_data:
    raise ValueError(
        "No training files available to fit the scaler: no CSV files were "
        "loaded, or every file name contains 'test'."
    )

# Stack the numeric features of all training files into one frame so the
# scaler statistics are computed over the whole training set.
train_numeric_data = pd.concat(train_numeric_data, axis=0)

scaler = StandardScaler()
scaler.fit(train_numeric_data)

# -----------------------------
# SAVE SCALER + METADATA
# -----------------------------
# Bundle the fitted scaler with the column list it was fitted on, so a
# consumer can apply it to the same columns in the same order.
scaler_bundle = {
    "scaler": scaler,
    "numeric_columns": numeric_columns,
    "fitted_on": "train_UEs_only",
    "scaler_type": "StandardScaler"
}

joblib.dump(scaler_bundle, scaler_path)

# -----------------------------
# Transform ALL files
# -----------------------------
for file, df in dataframes.items():
    # Every loaded file is scaled with the one fitted scaler; the frames
    # held in `dataframes` are modified in place.
    df[numeric_columns] = scaler.transform(df[numeric_columns])

    # Write the result under the same file name in the output folder.
    output_file = os.path.join(output_folder, os.path.basename(file))
    df.to_csv(output_file, index=False)

# The emoji are written as escape sequences so the source stays pure ASCII
# and cannot be corrupted by a wrong file encoding:
#   \u2705 = check mark, \U0001F4E6 = package.
print("\u2705 Processing complete")
print(f"\U0001F4E6 Scaler bundle saved to: {scaler_path}")


✅ Processing complete
📦 Scaler bundle saved to: Processed_UE_Datasets/feature_scaler.joblib
