# Feature Importance

With test data

In [None]:
# Root directory of the captured traces; each scenario name is joined onto it
# and the result is handed to the extraction step.
trace = "../data/4_individual_traces/"

# Root directory where extracted feature files are written (one sub-directory
# per scenario) and later read back for classification.
leakage_path = "leakages/"

# Directory that receives one pickled results table per scenario.
out_path = "tops"

# Scenario sub-directory names to process; "" means the roots above are used as-is.
scenarios = [""]

With real data

In [None]:
#trace = "../data/traffic_captures"
#leakage_path = "leakages/"
#out_path = "tops"
#scenarios = ["configuration00_default"] # TODO add the scenario

### Extract information

In [2]:
import os
import rf
import pickle
import extract

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [6]:

# Extract features for every configured scenario.
for scenario in scenarios:
    trace_path = os.path.join(trace, scenario)
    # Use a dedicated name for the per-scenario feature directory: assigning to
    # `out_path` here would overwrite the results directory configured at the
    # top of the notebook, which the classification cell still reads.
    scenario_leakage_dir = os.path.join(leakage_path, scenario)
    # exist_ok avoids the check-then-create race and keeps the cell re-runnable.
    os.makedirs(scenario_leakage_dir, exist_ok=True)
    extract.main(trace_path, scenario_leakage_dir)

100%|█████████████████████████████████████████| 400/400 [00:21<00:00, 18.47it/s]


### Apply random forest

In [9]:
def load_features(path_to_features, tr_split):
    """
    Load the full feature matrix and label vector from ``path_to_features``.

    The work is delegated to ``rf.load_data``, which is called with ".features"
    and " " as its second and third arguments (presumably the file extension
    and the field delimiter -- confirm against ``rf.load_data``).

    ``tr_split`` is accepted for interface compatibility but is not used here:
    no train/test split is performed by this function.

    Returns the ``(X, Y)`` pair produced by ``rf.load_data``.
    """
    features, labels = rf.load_data(path_to_features, ".features", " ")
    return features, labels
    
# Feature groups: group name -> list of half-open (start, end) column ranges.
# Every range is applied as a column slice [start:end] of the feature matrix,
# and a group's slices are concatenated before training.
# NOTE(review): columns 765..2552 are not covered by any group -- confirm this
# gap is intentional.
feature_groups = {
    # Contiguous feature families.
    "pkt_count": [(0, 13)],
    "interarrival": [(13, 25)],
    "totaltime": [(25, 37)],
    "ngram": [(37, 161)],
    "transposition": [(161, 765)],
    "pkt_distribution": [(2553, 2778)],
    "burst": [(2778, 2789)],
    # Composite groups assembled from several slices of the first 37 columns.
    "input_related": [
        (2, 3), (4, 5), (7, 8), (9, 10), (12, 13), (21, 25), (33, 37),
    ],
    "output_related": [
        (1, 2), (3, 4), (6, 7), (8, 9), (11, 12), (17, 21), (29, 33),
    ],
    "together": [
        (1, 3), (3, 5), (6, 8), (8, 10), (11, 13), (17, 25), (29, 37),
    ],
}

def _top_k_accuracy(proba, classes, y_true, k):
    """
    Percentage of samples whose true label is among the k most probable classes.

    ``proba`` is the (n_samples, n_classes) array returned by ``predict_proba``,
    ``classes`` holds the label that belongs to each of its columns
    (``model.classes_``) and ``y_true`` holds the true labels.
    """
    # argsort is ascending, so the last k columns are the k highest probabilities.
    # Slicing with [-k:] keeps every class when k exceeds the number of classes.
    top_k_labels = classes[np.argsort(proba, axis=1)[:, -k:]]
    return np.mean((top_k_labels == y_true[:, None]).any(axis=1)) * 100


def train_and_evaluate_cv(X, Y, out, n_splits=5):
    """
    Perform stratified k-fold cross-validation and evaluate performance across feature groups.

    For every fold and every entry of the module-level ``feature_groups``
    mapping, a random forest is trained on that group's columns only, and its
    top-1, top-2 and top-5 accuracy on the held-out fold is recorded.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix (samples x features).
    Y : array-like, 1-D
        Class label of every sample.
    out : str
        Path of the pickle file the results table is written to.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One row per (fold, feature group) with columns "Fold", "Features",
        "Top-1", "Top-2" and "Top-5"; accuracies are strings such as "93.8%".
    """
    # The fold index arrays below are used for fancy indexing, which needs
    # NumPy arrays rather than plain lists.
    X = np.asarray(X)
    Y = np.asarray(Y)

    results = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, Y), start=1):
        print(f"Fold {fold}/{n_splits}")

        X_tr, Y_tr = X[train_idx], Y[train_idx]
        X_ts, Y_ts = X[test_idx], Y[test_idx]
        n_features = X_tr.shape[1]

        for name, ranges in feature_groups.items():
            print(f"Evaluating feature group: {name}")

            # Keep only the ranges that fit inside the available columns.
            valid_ranges = []
            for start, end in ranges:
                if end > n_features:
                    print(f"Skipping {name} range ({start}, {end}): Feature index out of bounds!")
                    continue
                valid_ranges.append((start, end))

            if not valid_ranges:
                print(f"Skipping {name}: Feature subset has zero features!")
                continue

            # Merge all column slices of this group into one matrix per split.
            X_tr_subset = np.hstack([X_tr[:, start:end] for start, end in valid_ranges])
            X_ts_subset = np.hstack([X_ts[:, start:end] for start, end in valid_ranges])

            # Note: oob_score is requested but model.oob_score_ is not reported here.
            model = RandomForestClassifier(n_jobs=-1, n_estimators=1000, oob_score=True, random_state=1)
            model.fit(X_tr_subset, Y_tr)

            pred = model.predict_proba(X_ts_subset)
            # Columns of `pred` follow model.classes_, so column indices must be
            # mapped back to labels before comparing with Y_ts. Comparing raw
            # indices is only correct when the labels happen to be 0..C-1.
            classes = model.classes_
            top_1 = np.mean(classes[np.argmax(pred, axis=1)] == Y_ts) * 100
            top_2 = _top_k_accuracy(pred, classes, Y_ts, 2)
            top_5 = _top_k_accuracy(pred, classes, Y_ts, 5)

            results.append([fold, name, f"{top_1:.1f}%", f"{top_2:.1f}%", f"{top_5:.1f}%"])

    # Save results
    results_df = pd.DataFrame(results, columns=["Fold", "Features", "Top-1", "Top-2", "Top-5"])
    with open(out, "wb") as f:
        pickle.dump(results_df, f)

    print(f"Results saved in {out}")
    return results_df

def classify(features, out, train=0.8, n_splits=5):
    """
    Load the features of one scenario and run the cross-validated evaluation.

    Parameters
    ----------
    features : str
        Directory containing the extracted feature files.
    out : str
        Path of the pickle file the results table is written to.
    train : float, default 0.8
        Forwarded to ``load_features`` as its ``tr_split`` argument.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    pandas.DataFrame
        The results table produced by ``train_and_evaluate_cv``; it is also
        printed in full.
    """
    X, Y = load_features(features, train)
    # Report the fold count actually used instead of a hardcoded "5".
    print(f"Performing {n_splits}-fold cross-validation...")

    results_df = train_and_evaluate_cv(X, Y, out, n_splits)
    print(results_df.to_string(index=False))
    return results_df


In [10]:
import os

# Create the results directory if needed; exist_ok keeps the cell safe to re-run.
os.makedirs(out_path, exist_ok=True)
for scenario in scenarios:
    print(scenario)
    # Build paths with os.path.join so a trailing separator in leakage_path does
    # not produce doubled slashes; the empty last component keeps the trailing
    # separator on the features directory.
    features = os.path.join(leakage_path, scenario, "")
    out = os.path.join(out_path, f"{scenario}.pkl")
    topn = classify(features, out, train=0.8)
    


Performing 5-fold cross-validation...
Fold 1/5
Evaluating feature group: pkt_count
Evaluating feature group: interarrival
Evaluating feature group: totaltime
Evaluating feature group: ngram
Evaluating feature group: transposition
Evaluating feature group: pkt_distribution
Evaluating feature group: burst
Evaluating feature group: input_related
Evaluating feature group: output_related
Evaluating feature group: together
Fold 2/5
Evaluating feature group: pkt_count
Evaluating feature group: interarrival
Evaluating feature group: totaltime
Evaluating feature group: ngram
Evaluating feature group: transposition
Evaluating feature group: pkt_distribution
Evaluating feature group: burst
Evaluating feature group: input_related
Evaluating feature group: output_related
Evaluating feature group: together
Fold 3/5
Evaluating feature group: pkt_count
Evaluating feature group: interarrival
Evaluating feature group: totaltime
Evaluating feature group: ngram
Evaluating feature group: transposition
Eva