In [1]:
import pandas as pd

df_normal = pd.read_csv("Processed_UE_Datasets_unscaled/ue_8628490433231158_normal_labeled_test.csv")
df_malicious = pd.read_csv("Processed_UE_Datasets_unscaled/ue_8642840401624200_malicious_labeled_test.csv")

print("Normal shape:", df_normal.shape)
print("Malicious shape:", df_malicious.shape)

print("\nNormal columns:\n", df_normal.columns.tolist())
print("\nMalicious columns:\n", df_malicious.columns.tolist())

print("\nNormal _time head:\n", df_normal["_time"].head(5))
print("\nMalicious _time head:\n", df_malicious["_time"].head(5))


Normal shape: (76687, 20)
Malicious shape: (75304, 20)

Normal columns:
 ['_time', 'imeisv', 'epre', 'pusch_snr', 'p_ue', 'ul_mcs', 'cqi', 'ul_bitrate', 'dl_mcs', 'dl_retx', 'ul_tx', 'dl_tx', 'ul_retx', 'dl_bitrate', 'dl_err', 'ul_err', 'attack_number', 'event', 'binary_label', 'multiclass_label']

Malicious columns:
 ['_time', 'imeisv', 'epre', 'pusch_snr', 'p_ue', 'ul_mcs', 'cqi', 'ul_bitrate', 'dl_mcs', 'dl_retx', 'ul_tx', 'dl_tx', 'ul_retx', 'dl_bitrate', 'dl_err', 'ul_err', 'attack_number', 'event', 'binary_label', 'multiclass_label']

Normal _time head:
 0    17/08/2024 12:00:03.0
1    17/08/2024 12:00:08.1
2    17/08/2024 12:00:13.2
3    17/08/2024 12:00:18.4
4    17/08/2024 12:00:23.5
Name: _time, dtype: str

Malicious _time head:
 0    17/08/2024 12:00:01.7
1    17/08/2024 12:00:06.8
2    17/08/2024 12:00:12.0
3    17/08/2024 12:00:17.2
4    17/08/2024 12:00:22.3
Name: _time, dtype: str


In [3]:
def fix_time(df, time_col="_time"):
    # Force python strings (avoids Arrow large_string issues)
    s = df[time_col].astype("string").astype(object).astype(str)
    t = pd.to_datetime(s, errors="coerce", dayfirst=True)  # dayfirst=True for 17/08/2024
    df = df.copy()
    df[time_col] = t
    df = df.dropna(subset=[time_col]).sort_values(time_col).reset_index(drop=True)
    return df

df_normal = fix_time(df_normal, "_time")
df_malicious = fix_time(df_malicious, "_time")

print("Normal _time dtype:", df_normal["_time"].dtype)
print("Malicious _time dtype:", df_malicious["_time"].dtype)

print(df_normal["_time"].head(3))
print(df_malicious["_time"].head(3))


Normal _time dtype: datetime64[us]
Malicious _time dtype: datetime64[us]
0   2024-08-17 12:00:03.000
1   2024-08-17 12:00:08.100
2   2024-08-17 12:00:13.200
Name: _time, dtype: datetime64[us]
0   2024-08-17 12:00:01.700
1   2024-08-17 12:00:06.800
2   2024-08-17 12:00:12.000
Name: _time, dtype: datetime64[us]


  t = pd.to_datetime(s, errors="coerce", dayfirst=True)  # dayfirst=True for 17/08/2024
  t = pd.to_datetime(s, errors="coerce", dayfirst=True)  # dayfirst=True for 17/08/2024


In [4]:
for df, name in [(df_normal, "normal"), (df_malicious, "malicious")]:
    df["binary_label"] = pd.to_numeric(df["binary_label"], errors="coerce").fillna(0).astype(int)
    print(name, "binary_label counts:", df["binary_label"].value_counts().to_dict())


normal binary_label counts: {0: 76687}
malicious binary_label counts: {0: 70634, 1: 4670}


In [9]:
import joblib

MODEL_PATH = "threat_score_model.joblib"  # <-- update this
# -----------------------------
# Load models
# -----------------------------
bundle = joblib.load(MODEL_PATH)
model = bundle["model"]
feature_columns = bundle["feature_columns"]
label_column    = bundle["label_column"]



In [11]:
THRESHOLD = 0.5  # as you said

def add_model_predictions(df, model, feature_cols, label_col, threshold=0.5):
    df = df.copy()

    # Ensure feature columns exist
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing feature columns: {missing}")

    # Ensure label column exists (ground truth)
    if label_col not in df.columns:
        raise ValueError(f"Missing label column '{label_col}' in dataframe.")

    # Prepare X
    X = df[feature_cols]

    # Predict probability of malicious
    proba = model.predict_proba(X)[:, 1]

    # Store outputs
    df["threat_score"] = proba
    df["pred_label"] = (df["threat_score"] >= threshold).astype(int)

    # Make sure ground-truth label is int 0/1
    df[label_col] = df[label_col].astype(int)

    return df

df_normal = add_model_predictions(df_normal, model, feature_columns, label_column, threshold=THRESHOLD)
df_malicious = add_model_predictions(df_malicious, model, feature_columns, label_column, threshold=THRESHOLD)


In [14]:
print("Label column name:", label_column)

print("\nNormal prediction counts:", df_normal["pred_label"].value_counts().to_dict())
print("Normal ground-truth counts:", df_normal[label_column].value_counts().to_dict())

print("\nMalicious prediction counts:", df_malicious["pred_label"].value_counts().to_dict())
print("Malicious ground-truth counts:", df_malicious[label_column].value_counts().to_dict())

display(df_normal[["_time", label_column, "pred_label", "threat_score"]].head(5))
display(df_malicious[["_time", label_column, "pred_label", "threat_score"]].head(5))

Label column name: binary_label

Normal prediction counts: {0: 76685, 1: 2}
Normal ground-truth counts: {0: 76687}

Malicious prediction counts: {0: 70518, 1: 4786}
Malicious ground-truth counts: {0: 70634, 1: 4670}


Unnamed: 0,_time,binary_label,pred_label,threat_score
0,2024-08-17 12:00:03.000,0,0,0.00052
1,2024-08-17 12:00:08.100,0,0,0.00052
2,2024-08-17 12:00:13.200,0,0,0.00052
3,2024-08-17 12:00:18.400,0,0,0.00052
4,2024-08-17 12:00:23.500,0,0,0.00052


Unnamed: 0,_time,binary_label,pred_label,threat_score
0,2024-08-17 12:00:01.700,0,0,0.000569
1,2024-08-17 12:00:06.800,0,0,0.000569
2,2024-08-17 12:00:12.000,0,0,0.000569
3,2024-08-17 12:00:17.200,0,0,0.000569
4,2024-08-17 12:00:22.300,0,0,0.000569


In [15]:
# -----------------------------
# Step 5: Extract 10×12 windows
# -----------------------------
WINDOW_LEN = 12
MAX_GAP_SECONDS = 6
TIME_COL = "_time"
N_TN = 5
N_TP = 3
N_TR = 2

# Make sure time is sorted
df_normal = df_normal.sort_values(TIME_COL).reset_index(drop=True)
df_malicious = df_malicious.sort_values(TIME_COL).reset_index(drop=True)

def is_contiguous_window(w, time_col=TIME_COL, max_gap_s=MAX_GAP_SECONDS):
    gaps = w[time_col].diff().dt.total_seconds().fillna(0)
    return bool((gaps <= max_gap_s).all())

def extract_non_overlapping_windows(df, condition_fn, n_windows, window_len=WINDOW_LEN):
    """
    Scan left-to-right and collect the first n_windows matching condition_fn.
    Non-overlapping: after taking a window starting at i, skip ahead by window_len.
    """
    windows = []
    i = 0
    while i <= len(df) - window_len and len(windows) < n_windows:
        w = df.iloc[i:i+window_len]
        if is_contiguous_window(w) and condition_fn(w):
            windows.append(w.copy())
            i += window_len  # jump to avoid overlap
        else:
            i += 1
    return windows

# --- Define conditions using both GT and predictions ---
tn_condition = lambda w: ((w[label_column] == 0).all() and (w["pred_label"] == 0).all())
tp_condition = lambda w: ((w[label_column] == 1).all() and (w["pred_label"] == 1).all())

def transition_condition(w):
    half = WINDOW_LEN // 2
    return (
        (w[label_column].iloc[:half] == 0).all() and
        (w[label_column].iloc[half:] == 1).all() and
        (w["pred_label"].iloc[:half] == 0).all() and
        (w["pred_label"].iloc[half:] == 1).all()
    )

# --- Extract windows ---
tn_windows = extract_non_overlapping_windows(df_normal, tn_condition, N_TN)
tp_windows = extract_non_overlapping_windows(df_malicious, tp_condition, N_TP)
tr_windows = extract_non_overlapping_windows(df_malicious, transition_condition, N_TR)

print(f"TN windows found: {len(tn_windows)} / {N_TN}")
print(f"TP windows found: {len(tp_windows)} / {N_TP}")
print(f"Transition windows found: {len(tr_windows)} / {N_TR}")

# If any category is short, relax selection to GT-only as a fallback (still useful for plots)
if len(tp_windows) < N_TP:
    print("\n⚠️ Not enough TP windows under (GT=1 & pred=1). Trying GT-only TP windows...")
    tp_gt_only = lambda w: (w[label_column] == 1).all()
    tp_windows = extract_non_overlapping_windows(df_malicious, tp_gt_only, N_TP)
    print(f"GT-only TP windows found: {len(tp_windows)} / {N_TP}")

if len(tr_windows) < N_TR:
    print("\n⚠️ Not enough TRANSITION windows under (GT & pred). Trying GT-only transition windows...")
    def tr_gt_only(w):
        half = WINDOW_LEN // 2
        return (w[label_column].iloc[:half] == 0).all() and (w[label_column].iloc[half:] == 1).all()
    tr_windows = extract_non_overlapping_windows(df_malicious, tr_gt_only, N_TR)
    print(f"GT-only TRANSITION windows found: {len(tr_windows)} / {N_TR}")

# --- Combine and add metadata ---
all_windows = []
win_id = 0

def add_meta(w, win_type, source):
    nonlocal_vars = None  # no-op; just to keep notebook linters calm
    w = w.copy()
    w["window_id"] = win_id
    w["window_type"] = win_type  # TN / TP / TRANSITION
    w["source_trace"] = source   # normal / malicious
    return w

for w in tn_windows:
    all_windows.append(add_meta(w, "TN", "normal")); win_id += 1
for w in tp_windows:
    all_windows.append(add_meta(w, "TP", "malicious")); win_id += 1
for w in tr_windows:
    all_windows.append(add_meta(w, "TRANSITION", "malicious")); win_id += 1

eval_df = pd.concat(all_windows, ignore_index=True)
print("\nTotal rows in eval_df:", len(eval_df), "(expected 120 if 10 windows × 12 rows)")

# --- Save outputs ---
eval_df.to_csv("eval_dataset_10x12_windows.csv", index=False)
print("Saved: eval_dataset_10x12_windows.csv")

summary = (eval_df
           .groupby(["window_id","window_type","source_trace"])
           .agg(start_time=(TIME_COL,"min"),
                end_time=(TIME_COL,"max"),
                rows=(TIME_COL,"size"),
                gt_pos=(label_column,"sum"),
                pred_pos=("pred_label","sum"))
           .reset_index())

summary.to_csv("eval_dataset_windows_summary.csv", index=False)
print("Saved: eval_dataset_windows_summary.csv")

display(summary)
display(eval_df[[TIME_COL, "window_id", "window_type", label_column, "pred_label", "threat_score"]].head(20))


TN windows found: 5 / 5
TP windows found: 3 / 3
Transition windows found: 1 / 2

⚠️ Not enough TRANSITION windows under (GT & pred). Trying GT-only transition windows...
GT-only TRANSITION windows found: 2 / 2

Total rows in eval_df: 120 (expected 120 if 10 windows × 12 rows)
Saved: eval_dataset_10x12_windows.csv
Saved: eval_dataset_windows_summary.csv


Unnamed: 0,window_id,window_type,source_trace,start_time,end_time,rows,gt_pos,pred_pos
0,0,TN,normal,2024-08-17 12:00:03.000,2024-08-17 12:00:59.400,12,0,0
1,1,TN,normal,2024-08-17 12:01:04.500,2024-08-17 12:02:00.900,12,0,0
2,2,TN,normal,2024-08-17 12:02:06.000,2024-08-17 12:03:02.400,12,0,0
3,3,TN,normal,2024-08-17 12:03:07.600,2024-08-17 12:04:03.900,12,0,0
4,4,TN,normal,2024-08-17 12:04:09.100,2024-08-17 12:05:05.400,12,0,0
5,5,TP,malicious,2024-08-18 07:00:00.400,2024-08-18 07:00:57.100,12,12,12
6,6,TP,malicious,2024-08-18 07:01:02.200,2024-08-18 07:01:58.700,12,12,12
7,7,TP,malicious,2024-08-18 07:02:03.900,2024-08-18 07:03:00.400,12,12,12
8,8,TRANSITION,malicious,2024-08-18 06:59:29.600,2024-08-18 07:00:26.200,12,6,6
9,9,TRANSITION,malicious,2024-08-19 06:59:33.500,2024-08-19 07:00:30.100,12,6,8


Unnamed: 0,_time,window_id,window_type,binary_label,pred_label,threat_score
0,2024-08-17 12:00:03.000,0,TN,0,0,0.00052
1,2024-08-17 12:00:08.100,0,TN,0,0,0.00052
2,2024-08-17 12:00:13.200,0,TN,0,0,0.00052
3,2024-08-17 12:00:18.400,0,TN,0,0,0.00052
4,2024-08-17 12:00:23.500,0,TN,0,0,0.00052
5,2024-08-17 12:00:28.600,0,TN,0,0,0.00052
6,2024-08-17 12:00:33.800,0,TN,0,0,0.00052
7,2024-08-17 12:00:38.900,0,TN,0,0,0.00052
8,2024-08-17 12:00:44.000,0,TN,0,0,0.00052
9,2024-08-17 12:00:49.100,0,TN,0,0,0.00052


In [17]:
import pandas as pd
import numpy as np

MAX_GAP_SECONDS = 6
WINDOW_BEFORE = 6
WINDOW_AFTER = 5  # total rows = 6 + 5 + 1 = 12

N_FP_EXAMPLES = 2
N_FN_EXAMPLES = 2
RANDOM_SEED = 7
np.random.seed(RANDOM_SEED)

# Use the true label column name from your bundle
GT_COL = label_column
TIME_COL = "_time"

def mark_fp_fn(df):
    df = df.copy()
    df["gt_label"] = df[GT_COL].astype(int)
    df["is_fp"] = ((df["gt_label"] == 0) & (df["pred_label"] == 1)).astype(int)
    df["is_fn"] = ((df["gt_label"] == 1) & (df["pred_label"] == 0)).astype(int)
    return df

def extract_window(df, center_idx):
    start = max(0, center_idx - WINDOW_BEFORE)
    end = min(len(df) - 1, center_idx + WINDOW_AFTER)
    w = df.iloc[start:end+1].copy()

    # keep only if contiguous
    gaps = w[TIME_COL].diff().dt.total_seconds().fillna(0)
    if (gaps > MAX_GAP_SECONDS).any():
        return pd.DataFrame()
    return w

def sample_events(df, flag_col, n):
    """Return row indices of FP or FN events; sampled without replacement."""
    idxs = df.index[df[flag_col] == 1].tolist()
    if not idxs:
        return []
    if len(idxs) <= n:
        return idxs
    return list(np.random.choice(idxs, size=n, replace=False))

# Prepare dataframes (ensure time sorted)
df_normal2 = df_normal.sort_values(TIME_COL).reset_index(drop=True)
df_mal2 = df_malicious.sort_values(TIME_COL).reset_index(drop=True)

df_normal2 = mark_fp_fn(df_normal2)
df_mal2 = mark_fp_fn(df_mal2)

# Collect FP examples (typically from normal trace, but could exist in either)
fp_idxs = sample_events(df_normal2, "is_fp", N_FP_EXAMPLES) + sample_events(df_mal2, "is_fp", N_FP_EXAMPLES)
fp_idxs = fp_idxs[:N_FP_EXAMPLES]

# Collect FN examples (typically from malicious trace, but could exist in either)
fn_idxs = sample_events(df_mal2, "is_fn", N_FN_EXAMPLES) + sample_events(df_normal2, "is_fn", N_FN_EXAMPLES)
fn_idxs = fn_idxs[:N_FN_EXAMPLES]

print("FP indices selected:", fp_idxs)
print("FN indices selected:", fn_idxs)

windows = []
event_id = 0

def add_meta(w, source, center_type, center_idx):
    global event_id
    w = w.copy()
    w["source_trace"] = source
    w["center_type"] = center_type   # FP or FN
    w["center_index"] = center_idx
    w["event_id"] = event_id
    return w

for idx in fp_idxs:
    w = extract_window(df_normal2, idx)
    if not w.empty:
        windows.append(add_meta(w, "normal", "FP", idx))
        event_id += 1

for idx in fn_idxs:
    w = extract_window(df_mal2, idx)
    if not w.empty:
        windows.append(add_meta(w, "malicious", "FN", idx))
        event_id += 1

fpfn_df = pd.concat(windows, ignore_index=True) if windows else pd.DataFrame()
print("Total rows in fpfn_df:", len(fpfn_df))

# Save
out_fpfn = "fpfn_examples_windows.csv"
fpfn_df.to_csv(out_fpfn, index=False)
print("Saved:", out_fpfn)

# Preview
if not fpfn_df.empty:
    display(fpfn_df[[TIME_COL, "event_id", "source_trace", "center_type", "gt_label", "pred_label", "threat_score"]].head(30))


FP indices selected: [50473, 74561]
FN indices selected: [np.int64(66109), np.int64(65990)]
Total rows in fpfn_df: 48
Saved: fpfn_examples_windows.csv


Unnamed: 0,_time,event_id,source_trace,center_type,gt_label,pred_label,threat_score
0,2024-08-20 17:37:41.000,0,normal,FP,0,0,0.002657
1,2024-08-20 17:37:46.100,0,normal,FP,0,0,0.001049
2,2024-08-20 17:37:51.200,0,normal,FP,0,0,0.001407
3,2024-08-20 17:37:56.400,0,normal,FP,0,0,0.001285
4,2024-08-20 17:38:01.500,0,normal,FP,0,0,0.000836
5,2024-08-20 17:38:06.600,0,normal,FP,0,0,0.001381
6,2024-08-20 17:38:11.700,0,normal,FP,0,1,0.998936
7,2024-08-20 17:38:16.900,0,normal,FP,0,0,0.0014
8,2024-08-20 17:38:22.000,0,normal,FP,0,0,0.00262
9,2024-08-20 17:38:27.100,0,normal,FP,0,0,0.00175
