In [7]:
import pandas as pd

# Read the two raw CSV tables from the current working directory.
# `oran` and `ul` are the frame names every later cell works on.
oran = pd.read_csv(filepath_or_buffer="original_preprocess.csv")
ul = pd.read_csv(filepath_or_buffer="dataset_ul.csv")

In [14]:
# Report (rows, columns) of each frame as loaded, one labelled line per table.
for label, frame in (("ORAN raw shape:", oran), ("UL raw shape  :", ul)):
    print(label, frame.shape)

ORAN raw shape: (17501, 39)
UL raw shape  : (19892, 40)


In [8]:
# A notebook cell renders only its final expression, so a bare `oran.head()`
# or `oran.shape` earlier in the cell is evaluated and then thrown away.
# Push the preview and the shape through display() (the function IPython makes
# available in notebook kernels) so they are actually shown; the column index
# stays as the trailing expression and remains the cell's Out[] value.
display(oran.head(), oran.shape)
oran.columns

Index(['date', 'BW', 'TM', 'traffic_load_dl', 'traffic_load_ul', 'txgain_dl',
       'txgain_ul', 'selected_mcs_dl', 'selected_mcs_ul',
       'selected_airtime_dl', 'selected_airtime_ul', 'mean_used_mcs_dl',
       'mean_used_mcs_ul', 'bsr_dl', 'bsr_ul', 'gput_ul', 'mean_snr_ul',
       'turbodec_it', 'dec_time', 'nRBs_ul', 'num_ues', 'thr_dl', 'thr_ul',
       'bler_dl', 'bler_ul', 'tbs_dl', 'pm_power', 'pm_var', 'pm_median',
       'n_pm', 'rapl_power', 'rapl_var', 'n_rapl', 'clockspeed', 'airtime_dl',
       'airtime_ul', 'cqi_dl', 'cqi_ul', 'fixed_mcs_flag'],
      dtype='object')

In [11]:
# Show the per-column NaN counts *before* rows are removed. As a bare
# expression above an assignment this summary was never rendered.
display(oran.isna().sum())

# NOTE(review): dropna() without `subset` removes a row when ANY column is NaN,
# including columns that the later feature-selection cell discards — confirm
# that losing those rows is intended.
oran = oran.dropna()

# Keep only rows whose pm_power reading is strictly positive.
oran = oran[oran["pm_power"] > 0]

In [5]:
# Only the last expression of a cell is rendered, so a bare `ul.shape` above
# `ul.columns` produces no output. Display the shape explicitly (display() is
# available in IPython notebook kernels) and leave the column index as the
# trailing expression so it is still the cell's Out[] value.
display(ul.shape)
ul.columns

Index(['date', 'cpu_platform', 'BW', 'TM', 'UL/DL', 'traffic_load', 'txgain',
       'cpu_time', 'number_active_cores', 'pinning', 'cpu_config',
       'selected_mcs', 'selected_airtime', 'mean_used_mcs', 'bsr', 'num_ues',
       'thr', 'gput', 'mean_snr', 'var_snr', 'bler', 'turbodec_it',
       'rssi_mean', 'rssi_var', 'overflows', 'underflows', 'lates', 'dec_time',
       'pm_power', 'pm_var', 'n_pm', 'rapl_power', 'rapl_var', 'n_rapl',
       'clockspeed', 'nRBs', 'airtime', 'num_subsamples', 'fixed_mcs_flag',
       'failed_experiment'],
      dtype='object')

In [12]:
# Show the per-column NaN counts *before* rows are removed. As a bare
# expression above an assignment this summary was never rendered.
display(ul.isna().sum())

# NOTE(review): dropna() without `subset` removes a row when ANY of the columns
# is NaN, not just the ones kept for modelling — confirm this is intended.
ul = ul.dropna()

# NOTE(review): this filter is on rapl_power, whereas the ORAN frame is
# filtered on pm_power and TARGET is "pm_power". Behaviour is left unchanged;
# confirm rapl_power is really the column meant here.
ul = ul[ul["rapl_power"] > 0]

In [15]:
# Feature columns that both cleaned tables expose under identical names.
# Must stay a list: later cells build `COMMON_FEATURES + [TARGET]`.
COMMON_FEATURES = ["airtime", "selected_mcs", "mean_used_mcs", "mean_snr", "traffic_load"]

# Name of the target column; the cleaning cells keep only rows where it is > 0.
TARGET = "pm_power"

# Map the ORAN uplink column names onto the shared feature names.
# The target keeps its own name (identity entry) so it appears in the key list.
oran_colmap = {
    "airtime_ul": "airtime",
    "selected_mcs_ul": "selected_mcs",
    "mean_used_mcs_ul": "mean_used_mcs",
    "mean_snr_ul": "mean_snr",
    "traffic_load_ul": "traffic_load",
    TARGET: TARGET,
}

# Source-side column names; still defined in case other cells reference it.
oran_needed_cols = list(oran_colmap)

# Columns of the cleaned frame, in the same order as before.
oran_keep_cols = COMMON_FEATURES + [TARGET]

# Rename first, THEN select by the shared names. Selecting the `*_ul` names
# first made this cell fail with KeyError on a second run, because `oran` had
# already been renamed. rename() ignores mapping keys that are absent, so on a
# re-run it is a no-op and the selection below still succeeds.
oran = oran.rename(columns=oran_colmap)[oran_keep_cols].copy()

# Coerce every kept column to numeric; unparseable values become NaN.
oran = oran.assign(
    **{col: pd.to_numeric(oran[col], errors="coerce") for col in oran_keep_cols}
)

# Drop rows with a missing feature/target, then rows with a non-positive target.
oran = oran.dropna(subset=oran_keep_cols)
oran = oran[oran[TARGET] > 0]

print("✅ ORAN cleaned shape:", oran.shape)

✅ ORAN cleaned shape: (17501, 6)


In [16]:
# Work on an independent copy of the UL frame.
ul = ul.copy()

# Row filters run only while their column is still present, so the cell can be
# re-run after the columns have been dropped below.
if "UL/DL" in ul.columns:
    ul = ul.loc[ul["UL/DL"].eq("UL")]  # keep rows marked "UL"

if "failed_experiment" in ul.columns:
    ul = ul.loc[ul["failed_experiment"].eq(0)]  # keep rows where the flag equals 0

# Reduce to the shared features plus the target.
needed_ul_cols = [*COMMON_FEATURES, TARGET]
ul = ul[needed_ul_cols].copy()

# Coerce every kept column to numeric; unparseable values become NaN.
ul = ul.assign(
    **{col: pd.to_numeric(ul[col], errors="coerce") for col in needed_ul_cols}
)

# Drop rows with a missing feature/target, then rows with a non-positive target.
ul = ul.dropna(subset=needed_ul_cols)
ul = ul.loc[ul[TARGET].gt(0)]

print("✅ UL cleaned shape:", ul.shape)

✅ UL cleaned shape: (18909, 6)


In [18]:
# Print the column lists of both cleaned frames for a side-by-side check.
print("ORAN columns:", list(oran.columns))
print("UL columns  :", list(ul.columns))

# Write both cleaned frames to CSV in the working directory, without the index.
for frame, csv_path in ((oran, "clean_oran_stage1.csv"), (ul, "clean_ul_stage1.csv")):
    frame.to_csv(csv_path, index=False)

ORAN columns: ['airtime', 'selected_mcs', 'mean_used_mcs', 'mean_snr', 'traffic_load', 'pm_power']
UL columns  : ['airtime', 'selected_mcs', 'mean_used_mcs', 'mean_snr', 'traffic_load', 'pm_power']
