# Feature Rank, Subset Search and Model Pruning

In this notebook, we leverage feature ranking algorithms to assign an importance score to each feature in the dataset.

Then, we use those scores to perform the *Stochastic Subset Search*: finding smaller, sub-optimal feature subsets by moving randomly (with weights) through the solution space and trying different combinations of features.

In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import threadpoolctl
import matplotlib as mpl
from sklearn.metrics import accuracy_score

from intellect.model.torch.model import Mlp
from intellect.model.torch.pruning import globally_unstructured_connections_l1
from intellect.ranking import rank_metric_permutation_sklearn, prune_and_subset_search, subset_search, prune_search, subset_search_and_prune
from intellect.io import load, dump, create_dir, TimeoutIterator
from intellect.inspect import set_seed
from intellect.dataset import portions_from_data, Dataset

# Cap the native thread pools managed by threadpoolctl at 2 threads.
# The trailing ';' keeps the notebook from echoing the returned object.
threadpoolctl.threadpool_limits(limits=2);
# Default DPI used when rendering matplotlib figures.
mpl.rcParams['figure.dpi']= 70
# When displaying DataFrames, show up to 100 columns but at most 10 rows.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 10)

In [2]:
# parameters

# dataset, previously trained model and new output directory
DATASET = "./dataset_shrinked.h5"
TRAIN_MODEL = "train_output/oracle.pt"
OUTPUT_DIR = "rank_prune_output/"

# dataset parameters; they should be equal to those used in the previous step (notebook).
# DATASET_PORTIONS holds four split ratios; this notebook only uses the third portion (finetune).
BENIGN_LABELS = ["BENIGN"]
DATASET_PORTIONS = (0.6, 0.1, 0.1, 0.2)

# traffic categories that only this client (organization) has
CLIENT_CATEGORIES = ["BENIGN", "DDoS"]

# sizes of the feature subsets to search, as ratios of the full feature set
TARGET_SUBSET_RATIOS = (0.1, 0.3, 0.5, 0.8)
# ratios of connections to be pruned from the network:
# 19 evenly spaced values from 0.05 to 0.95 (step 0.05), rounded to 2 decimals
PRUNE_RATIOS = np.linspace(0.05, 1, 19, endpoint=False).round(2).tolist()
# number of solutions explored for each pruning/subset ratio
EXPLORED_PER_RATIO = 100
# maximum accepted performance drop, as a ratio of the baseline score.
# In the previous notebook we trained a network that achieves 0.97 accuracy, so we can afford
# to search with a maximum drop of 25% in accuracy.
PERFORMANCE_DROP_ACCEPTED_RATIO = 0.25

Load the dataset. In this phase, we are interested in the finetune portion, so let's just forget the other ones.

In [3]:
def get_dataset():
    """Load DATASET and split it according to DATASET_PORTIONS.

    The seed is reset before loading (presumably so the split is
    reproducible across runs -- confirm against ``set_seed``). The data is
    requested normalized, with BENIGN_LABELS marking the benign classes.

    Returns:
        Whatever ``portions_from_data`` returns; the caller below unpacks
        it into four portions.
    """
    set_seed()
    portions = portions_from_data(
        DATASET,
        normalize=True,
        benign_labels=BENIGN_LABELS,
        ratios=DATASET_PORTIONS,
    )
    return portions


# Only the third portion (finetune) is needed in this notebook.
_, _, finetune, _ = get_dataset()

Now, remove unused categories from this portion, and balance samples among the remaining ones.

In [4]:
finetune_client = finetune.filter_categories(CLIENT_CATEGORIES).balance_categories()

In [5]:
# Load the model trained in the previous notebook; it is the reference ("oracle") for every search below.
oracle = Mlp.load(TRAIN_MODEL)
# Baseline accuracy of the oracle on the whole finetune portion and on the client-only portion.
baseline_all = accuracy_score(finetune.y, oracle.predict(finetune.X))
baseline_client = accuracy_score(finetune_client.y, oracle.predict(finetune_client.X))
# Print each baseline next to the value obtained after applying the maximum accepted drop ratio.
print("Only client traffic", baseline_client, "->", baseline_client * (1 - PERFORMANCE_DROP_ACCEPTED_RATIO))
print("All traffic", baseline_all, "->", baseline_all * (1 - PERFORMANCE_DROP_ACCEPTED_RATIO))

Only client traffic 0.980980980980981 -> 0.7357357357357357
All traffic 0.9707572027126258 -> 0.7280679020344694


Define a few utility functions for the Stochastic Subset Search and pruning experiments.

In [6]:
def only_pruning(ds_only_client_traffic: bool = True):
    """Prune the oracle at every ratio in PRUNE_RATIOS and save the accepted ones.

    Args:
        ds_only_client_traffic: evaluate on ``finetune_client`` when True,
            on the whole ``finetune`` portion otherwise. It also selects the
            "few_c" / "all_c" tag used in the output file name.

    Side effects:
        Writes ``traffic_<tag>_pruning_ratios_only.csv`` into OUTPUT_DIR, with
        one row per accepted prune ratio sorted by descending accuracy, and
        prints how many models were accepted.
    """
    if ds_only_client_traffic:
        dataset, traffic_tag = finetune_client, "few_c"
    else:
        dataset, traffic_tag = finetune, "all_c"

    set_seed()
    search = prune_search(
        oracle, dataset, globally_unstructured_connections_l1, PRUNE_RATIOS,
        metric=accuracy_score, performance_drop_ratio=PERFORMANCE_DROP_ACCEPTED_RATIO)

    # Keep only the results explicitly flagged as accepted.
    # NOTE(review): time_limit is presumably expressed in seconds -- confirm.
    accuracy_by_ratio = {}
    for ratio, accuracy, accepted in TimeoutIterator(search, time_limit=60):
        if accepted is True:
            accuracy_by_ratio[ratio] = accuracy

    results = pd.DataFrame(columns=["Accuracy", "Prune Ratio"])
    for ratio, accuracy in accuracy_by_ratio.items():
        results.loc[len(results)] = [accuracy, ratio]
    results = results.sort_values(by="Accuracy", ascending=False)

    dump(results, OUTPUT_DIR + f"traffic_{traffic_tag}_pruning_ratios_only.csv")
    print(f"Only Pruning for traffic {traffic_tag} found {len(results)} models")

def only_stochastic_search(rank_only_client_traffic: bool = True, ds_only_client_traffic: bool = True):
    """Run the stochastic subset search on the unpruned oracle.

    Args:
        rank_only_client_traffic: use the ranking stored in ``rank_few_c.csv``
            when True, ``rank_all_c.csv`` otherwise.
        ds_only_client_traffic: evaluate on ``finetune_client`` when True,
            on the whole ``finetune`` portion otherwise.

    Side effects:
        For every ratio in TARGET_SUBSET_RATIOS, writes one CSV into
        OUTPUT_DIR with a row per accepted subset (selected features set
        to 1, the others to NaN, plus the accuracy), sorted by descending
        accuracy, and prints the number of accepted subsets.
    """
    rank_tag = "few_c" if rank_only_client_traffic else "all_c"
    if ds_only_client_traffic:
        dataset, traffic_tag = finetune_client, "few_c"
    else:
        dataset, traffic_tag = finetune, "all_c"

    ranking_df: pd.DataFrame = load(OUTPUT_DIR + f"rank_{rank_tag}.csv", index_col=0)
    importance_by_feature = ranking_df["importance"].to_dict()
    feature_names = ranking_df.index.values.tolist()
    # Template row indexed by feature name; copied and overwritten for each subset.
    importance_row = ranking_df.T.loc["importance"]

    for subset_size_ratio in TARGET_SUBSET_RATIOS:
        set_seed()
        search = subset_search(
            oracle, dataset, subset_size_ratio, EXPLORED_PER_RATIO,
            rank=importance_by_feature, metric=accuracy_score,
            performance_drop_ratio=PERFORMANCE_DROP_ACCEPTED_RATIO)

        # Keep only the subsets explicitly flagged as accepted.
        # NOTE(review): time_limit is presumably expressed in seconds -- confirm.
        accuracy_by_subset = {}
        for subset, accuracy, accepted in TimeoutIterator(search, time_limit=60):
            if accepted is True:
                accuracy_by_subset[subset] = accuracy

        results = pd.DataFrame(columns=["Accuracy"] + feature_names)
        for subset, accuracy in accuracy_by_subset.items():
            row = importance_row.copy()
            in_subset = row.index.isin(subset)
            row[~in_subset] = np.nan
            row[in_subset] = 1
            row["Accuracy"] = accuracy
            results.loc[len(results)] = row
        results = results.sort_values(by="Accuracy", ascending=False)

        dump(results, OUTPUT_DIR + f"rank_{rank_tag}_traffic_{traffic_tag}_subsets_features_for_subsetsize_{subset_size_ratio}.csv")
        print(f"Only Stochastic with rank {rank_tag} and traffic {traffic_tag} and Subset size ratio {subset_size_ratio} found {len(results)} models")

def stochastic_search_then_pruning(rank_only_client_traffic: bool = True, ds_only_client_traffic: bool = True):
    """Run ``subset_search_and_prune`` for every target subset size.

    Args:
        rank_only_client_traffic: use the ranking stored in ``rank_few_c.csv``
            when True, ``rank_all_c.csv`` otherwise.
        ds_only_client_traffic: evaluate on ``finetune_client`` when True,
            on the whole ``finetune`` portion otherwise.

    Side effects:
        For every ratio in TARGET_SUBSET_RATIOS, writes one CSV into
        OUTPUT_DIR with a row per yielded (subset, accuracy, prune ratio)
        result, sorted by descending accuracy, and prints the number of
        rows per prune ratio.
    """
    rank_tag = "few_c" if rank_only_client_traffic else "all_c"
    if ds_only_client_traffic:
        dataset, traffic_tag = finetune_client, "few_c"
    else:
        dataset, traffic_tag = finetune, "all_c"

    ranking_df: pd.DataFrame = load(OUTPUT_DIR + f"rank_{rank_tag}.csv", index_col=0)
    importance_by_feature = ranking_df["importance"].to_dict()
    feature_names = ranking_df.index.values.tolist()
    # Template row indexed by feature name; copied and overwritten for each result.
    importance_row = ranking_df.T.loc["importance"]

    for subset_size_ratio in TARGET_SUBSET_RATIOS:
        set_seed()
        results = pd.DataFrame(columns=["Accuracy", "Prune Ratio"] + feature_names)
        search = subset_search_and_prune(
            oracle, globally_unstructured_connections_l1, dataset, PRUNE_RATIOS,
            subset_size_ratio, EXPLORED_PER_RATIO,
            rank=importance_by_feature, metric=accuracy_score,
            performance_drop_ratio=PERFORMANCE_DROP_ACCEPTED_RATIO)

        # Every yielded result is recorded; no acceptance flag is checked here.
        # NOTE(review): time_limit is presumably expressed in seconds -- confirm.
        for subset, accuracy, prune_ratio in TimeoutIterator(search, time_limit=60):
            row = importance_row.copy()
            in_subset = row.index.isin(subset)
            row[~in_subset] = np.nan
            row[in_subset] = 1
            row["Prune Ratio"] = prune_ratio
            row["Accuracy"] = accuracy
            results.loc[len(results)] = row
        results = results.sort_values(by="Accuracy", ascending=False)

        dump(results, OUTPUT_DIR + f"rank_{rank_tag}_traffic_{traffic_tag}_combo_subsets_features_pruned_models_for_subsetsize_{subset_size_ratio}.csv")
        accepted_per_ratio = results["Prune Ratio"].value_counts().sort_index().to_dict()
        print(f"Stochastic search and then Pruning with rank {rank_tag} and traffic {traffic_tag} and Subset Size {subset_size_ratio} found (%prune: #accepted models)", accepted_per_ratio)

def pruning_then_stochastic_search(rank_only_client_traffic: bool = True, ds_only_client_traffic: bool = True):
    """Run ``prune_and_subset_search`` for every target subset size.

    Args:
        rank_only_client_traffic: use the ranking stored in ``rank_few_c.csv``
            when True, ``rank_all_c.csv`` otherwise.
        ds_only_client_traffic: evaluate on ``finetune_client`` when True,
            on the whole ``finetune`` portion otherwise.

    Side effects:
        For every ratio in TARGET_SUBSET_RATIOS, writes one CSV into
        OUTPUT_DIR with a row per (prune ratio, subset) result, sorted by
        descending accuracy, and prints the number of rows per prune ratio.
    """
    rank_tag = "few_c" if rank_only_client_traffic else "all_c"
    if ds_only_client_traffic:
        dataset, traffic_tag = finetune_client, "few_c"
    else:
        dataset, traffic_tag = finetune, "all_c"

    ranking_df: pd.DataFrame = load(OUTPUT_DIR + f"rank_{rank_tag}.csv", index_col=0)
    importance_by_feature = ranking_df["importance"].to_dict()
    feature_names = ranking_df.index.values.tolist()
    # Template row indexed by feature name; copied and overwritten for each result.
    importance_row = ranking_df.T.loc["importance"]

    for subset_size_ratio in TARGET_SUBSET_RATIOS:
        set_seed()
        search = prune_and_subset_search(
            oracle, globally_unstructured_connections_l1, dataset, PRUNE_RATIOS,
            subset_size_ratio, EXPLORED_PER_RATIO,
            rank=importance_by_feature, metric=accuracy_score,
            performance_drop_ratio=PERFORMANCE_DROP_ACCEPTED_RATIO)

        # Drain the search first: each item pairs a prune ratio with a
        # mapping of feature subset -> accuracy.
        # NOTE(review): time_limit is presumably expressed in seconds -- confirm.
        subsets_by_prune_ratio = {}
        for prune_ratio, accepted_subsets in TimeoutIterator(search, time_limit=60):
            subsets_by_prune_ratio[prune_ratio] = accepted_subsets

        results = pd.DataFrame(columns=["Accuracy", "Prune Ratio"] + feature_names)
        for prune_ratio, accepted_subsets in subsets_by_prune_ratio.items():
            for subset, accuracy in accepted_subsets.items():
                row = importance_row.copy()
                in_subset = row.index.isin(subset)
                row[~in_subset] = np.nan
                row[in_subset] = 1
                row["Prune Ratio"] = prune_ratio
                row["Accuracy"] = accuracy
                results.loc[len(results)] = row
        results = results.sort_values(by="Accuracy", ascending=False)

        dump(results, OUTPUT_DIR + f"rank_{rank_tag}_traffic_{traffic_tag}_combo_pruned_models_subsets_features_for_subsetsize_{subset_size_ratio}.csv")
        accepted_per_ratio = results["Prune Ratio"].value_counts().sort_index().to_dict()
        print(f"Prune and the Stochastic search with rank {rank_tag} and traffic {traffic_tag} and Subset Size {subset_size_ratio} found  ->", accepted_per_ratio)

In [12]:
create_dir(OUTPUT_DIR)

## Feature ranking

Scenario in which the ranking process is performed only on the specific traffic categories of the organization.

In [8]:
# Reset the seed before ranking (presumably for reproducibility -- confirm against set_seed).
set_seed()
# Rank the features using only the client's traffic categories; the result is used as a
# feature -> importance mapping.
rank = rank_metric_permutation_sklearn(oracle, finetune_client)
# One row per feature, most important first.
df = pd.DataFrame({"importance": rank.values()}, index=rank.keys()).sort_values("importance", ascending=False)
# Persist the ranking; the search helpers load it back as rank_few_c.csv.
dump(df, OUTPUT_DIR + "rank_few_c.csv")
# Display the ranking as a single wide row.
df.transpose()

Unnamed: 0,Average Packet Size,Bwd Packet Length Std,Packet Length Variance,ACK Flag Count,Bwd Packet Length Min,PSH Flag Count,Avg Bwd Segment Size,Bwd Packet Length Mean,Packet Length Mean,Packet Length Std,Down/Up Ratio,Max Packet Length,Bwd Packet Length Max,Fwd IAT Max,Init_Win_bytes_forward,Bwd IAT Total,Idle Max,Idle Mean,Flow IAT Max,Flow IAT Std,Idle Min,Fwd IAT Total,Fwd Packet Length Std,URG Flag Count,Init_Win_bytes_backward,Fwd IAT Std,Idle Std,Fwd Packets/s,Fwd Packet Length Max,Fwd IAT Mean,Bwd IAT Max,SYN Flag Count,Bwd IAT Min,Fwd PSH Flags,Bwd Packets/s,Flow IAT Mean,Fwd Packet Length Min,Min Packet Length,Fwd IAT Min,Bwd IAT Std,Bwd IAT Mean,Fwd Packet Length Mean,FIN Flag Count,RST Flag Count,CWE Flag Count,Active Min,Active Max,Flow Packets/s,Flow Bytes/s,Active Std,Active Mean,Total Length of Bwd Packets,Total Length of Fwd Packets,min_seg_size_forward,ECE Flag Count,act_data_pkt_fwd,Subflow Bwd Bytes,Subflow Fwd Bytes,Fwd URG Flags,Flow IAT Min,Total Backward Packets,Bwd Header Length,Fwd Header Length,Total Fwd Packets
importance,0.220721,0.184885,0.173073,0.102402,0.076577,0.072172,0.071672,0.071371,0.064364,0.06036,0.059459,0.058358,0.054254,0.036837,0.036336,0.032833,0.02963,0.025926,0.025826,0.024525,0.020621,0.019219,0.018819,0.018619,0.014715,0.011912,0.011011,0.008208,0.008208,0.007708,0.007307,0.006406,0.004004,0.003504,0.003203,0.002703,0.002402,0.002302,0.002102,0.002002,0.001702,0.001401,0.001301,0.000601,0.0003,0.0002,0.0001,0.0001,0.0001,0.0001,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Scenario in which the ranking process includes all the traffic from all possible organizations. In a real scenario, the Federated Learning service provider could ask each
organization to compute the ranking on its own traffic and then aggregate the results. Another alternative would be to ask for a small anonymized portion of data and compute the ranking on it.

In [9]:
# Reset the seed before ranking (presumably for reproducibility -- confirm against set_seed).
set_seed()
# Rank the features using the whole finetune portion, i.e. all traffic categories; the result
# is used as a feature -> importance mapping.
rank = rank_metric_permutation_sklearn(oracle, finetune)
# One row per feature, most important first.
df = pd.DataFrame({"importance": rank.values()}, index=rank.keys()).sort_values("importance", ascending=False)
# Persist the ranking; the search helpers load it back as rank_all_c.csv.
dump(df, OUTPUT_DIR + "rank_all_c.csv")
# Display the ranking as a single wide row.
df.transpose()

Unnamed: 0,Average Packet Size,Init_Win_bytes_forward,PSH Flag Count,Bwd Packet Length Std,Min Packet Length,Bwd Packet Length Min,Fwd Packet Length Min,Bwd IAT Total,Down/Up Ratio,ACK Flag Count,URG Flag Count,Avg Bwd Segment Size,Bwd Packet Length Mean,Bwd IAT Max,Init_Win_bytes_backward,Fwd Packets/s,Packet Length Mean,Flow IAT Std,Packet Length Variance,Flow IAT Max,Idle Max,Fwd Packet Length Max,Fwd Packet Length Mean,Flow Bytes/s,Idle Min,Fwd IAT Min,Fwd Packet Length Std,Bwd IAT Std,Fwd IAT Max,Idle Mean,Packet Length Std,Fwd PSH Flags,Bwd IAT Mean,Fwd IAT Total,Fwd IAT Std,SYN Flag Count,Fwd IAT Mean,Max Packet Length,Flow IAT Mean,Bwd Packet Length Max,Flow IAT Min,Bwd IAT Min,FIN Flag Count,Bwd Packets/s,RST Flag Count,Idle Std,Flow Packets/s,CWE Flag Count,Subflow Fwd Bytes,Total Length of Fwd Packets,Active Std,Active Max,Active Min,Active Mean,ECE Flag Count,min_seg_size_forward,Total Backward Packets,Subflow Bwd Bytes,Total Length of Bwd Packets,act_data_pkt_fwd,Fwd URG Flags,Total Fwd Packets,Fwd Header Length,Bwd Header Length
importance,0.240292,0.094748,0.0728,0.064027,0.05811,0.05589,0.051099,0.047475,0.037221,0.033666,0.032582,0.026962,0.026699,0.023088,0.021692,0.020889,0.020263,0.01844,0.018064,0.017045,0.016794,0.015338,0.014911,0.014695,0.013688,0.012868,0.012747,0.012578,0.012181,0.011943,0.009658,0.009196,0.007848,0.007753,0.007676,0.007658,0.007594,0.007002,0.006579,0.005991,0.005754,0.005261,0.004522,0.002739,0.00146,0.000713,0.000691,0.000657,0.000397,0.000397,0.000393,0.000289,0.000276,9.5e-05,7.3e-05,6.9e-05,3e-05,1.7e-05,1.7e-05,1.3e-05,9e-06,4e-06,0.0,0.0


We notice that the feature importance changes significantly, as in the first case the importance refers to only the BENIGN and DDoS traffic, while in the latter all the categories are kept in the loop.

## Stochastic Subset Search on Client Categories with ranking computed on **Client Categories**

First prune then stochastic search.

In [23]:
pruning_then_stochastic_search(rank_only_client_traffic=True, ds_only_client_traffic=True)

Prune and the Stochastic search with rank few_c and traffic few_c and Subset Size 0.1 found  -> {0.05: 18, 0.1: 22, 0.15: 21, 0.2: 18, 0.25: 22, 0.3: 11, 0.35: 11, 0.4: 23, 0.45: 20, 0.5: 31, 0.55: 33, 0.6: 28, 0.65: 38, 0.7: 46, 0.75: 47, 0.8: 31}
Prune and the Stochastic search with rank few_c and traffic few_c and Subset Size 0.3 found  -> {0.05: 100, 0.1: 96, 0.15: 97, 0.2: 84, 0.25: 88, 0.3: 83, 0.35: 69, 0.4: 90, 0.45: 91, 0.5: 96, 0.55: 99, 0.6: 96, 0.65: 96, 0.7: 99, 0.75: 91, 0.8: 92}
Prune and the Stochastic search with rank few_c and traffic few_c and Subset Size 0.5 found  -> {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100, 0.65: 100, 0.7: 100, 0.75: 100, 0.8: 100}
Prune and the Stochastic search with rank few_c and traffic few_c and Subset Size 0.8 found  -> {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100, 0.65: 100, 0.

First stochastic search then pruning.

In [7]:
stochastic_search_then_pruning(rank_only_client_traffic=True, ds_only_client_traffic=True)

Stochastic search and then Pruning with rank few_c and traffic few_c and Subset Size 0.1 found (%prune: #accepted models) {0.05: 18, 0.1: 18, 0.15: 18, 0.2: 18, 0.25: 18, 0.3: 18, 0.35: 18, 0.4: 18, 0.45: 18, 0.5: 18, 0.55: 18, 0.6: 18, 0.65: 18, 0.7: 18, 0.75: 18, 0.8: 18}
Stochastic search and then Pruning with rank few_c and traffic few_c and Subset Size 0.3 found (%prune: #accepted models) {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100, 0.65: 100, 0.7: 100, 0.75: 100, 0.8: 100}
Stochastic search and then Pruning with rank few_c and traffic few_c and Subset Size 0.5 found (%prune: #accepted models) {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100, 0.65: 100, 0.7: 100, 0.75: 100, 0.8: 100}
Stochastic search and then Pruning with rank few_c and traffic few_c and Subset Size 0.8 found (%prune: #accepted models) {0.05: 100, 0.1: 100,

Only stochastic search.

In [8]:
only_stochastic_search(rank_only_client_traffic=True, ds_only_client_traffic=True)

Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.1 found 18 models
Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.3 found 100 models
Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.5 found 100 models
Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.8 found 100 models


Only pruning.

In [25]:
only_pruning(ds_only_client_traffic=True)

Only Pruning for traffic few_c found 16 models


## Stochastic Subset Search on Client Categories with ranking computed on All Traffic Categories

First prune then stochastic search.

In [12]:
pruning_then_stochastic_search(rank_only_client_traffic=False, ds_only_client_traffic=True)

Prune and the Stochastic search with rank all_c and traffic few_c and Subset Size 0.1 found  -> {0.05: 12, 0.1: 13, 0.15: 13, 0.2: 5, 0.25: 11, 0.3: 9, 0.35: 7, 0.4: 11, 0.45: 12, 0.5: 9, 0.55: 11, 0.6: 15, 0.65: 12, 0.7: 13, 0.75: 23, 0.8: 17}
Prune and the Stochastic search with rank all_c and traffic few_c and Subset Size 0.3 found  -> {0.05: 80, 0.1: 84, 0.15: 84, 0.2: 80, 0.25: 76, 0.3: 81, 0.35: 61, 0.4: 70, 0.45: 77, 0.5: 73, 0.55: 85, 0.6: 79, 0.65: 73, 0.7: 81, 0.75: 78, 0.8: 67}
Prune and the Stochastic search with rank all_c and traffic few_c and Subset Size 0.5 found  -> {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 99, 0.3: 99, 0.35: 99, 0.4: 99, 0.45: 98, 0.5: 100, 0.55: 100, 0.6: 99, 0.65: 100, 0.7: 98, 0.75: 93, 0.8: 92}
Prune and the Stochastic search with rank all_c and traffic few_c and Subset Size 0.8 found  -> {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100, 0.65: 100, 0.7: 100, 0.75: 

First stochastic search then pruning.

In [11]:
stochastic_search_then_pruning(rank_only_client_traffic=False, ds_only_client_traffic=True)

Stochastic search and then Pruning with rank all_c and traffic few_c and Subset Size 0.1 found (%prune: #accepted models) {0.05: 12, 0.1: 12, 0.15: 12, 0.2: 12, 0.25: 12, 0.3: 12, 0.35: 12, 0.4: 12, 0.45: 12, 0.5: 12, 0.55: 12, 0.6: 12, 0.65: 12, 0.7: 12, 0.75: 12, 0.8: 12}
Stochastic search and then Pruning with rank all_c and traffic few_c and Subset Size 0.3 found (%prune: #accepted models) {0.05: 80, 0.1: 80, 0.15: 80, 0.2: 80, 0.25: 80, 0.3: 80, 0.35: 80, 0.4: 80, 0.45: 80, 0.5: 80, 0.55: 80, 0.6: 80, 0.65: 80, 0.7: 80, 0.75: 80, 0.8: 80}
Stochastic search and then Pruning with rank all_c and traffic few_c and Subset Size 0.5 found (%prune: #accepted models) {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100, 0.65: 100, 0.7: 100, 0.75: 100, 0.8: 100}
Stochastic search and then Pruning with rank all_c and traffic few_c and Subset Size 0.8 found (%prune: #accepted models) {0.05: 100, 0.1: 100, 0.15: 100, 0.2:

Only stochastic search.

In [10]:
only_stochastic_search(rank_only_client_traffic=False, ds_only_client_traffic=True)

Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.1 found 12 models
Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.3 found 80 models
Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.5 found 100 models
Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.8 found 100 models


Only pruning.

In [9]:
only_pruning(ds_only_client_traffic=True)

Only Pruning for traffic few_c found 16 models


In case you want to test a scenario in which the organization has access to all the traffic, just repeat the whole notebook setting **ds_only_client_traffic=False**.

Right now, for our scenario it is not needed.