# Feature Rank, Subset Search and Model Pruning

In this notebook, we leverage feature ranking algorithms to assign an importance score to each feature in the dataset.

Then, we'll use that score to perform the *Stochastic Subset Search*: find smaller, sub-optimal feature subsets by moving randomly (weighted) through the solution space, trying different combinations of features.

In [2]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

import matplotlib as mpl
from sklearn.metrics import accuracy_score

from intellect.model.sklearn.model import EnhancedMlpRegressor
from intellect.model.sklearn.pruning import globally_unstructured_connections_random
from intellect.ranking import rank_metric_permutation_sklearn, prune_and_subset_search, subset_search, prune_search, subset_search_and_prune, sequential_backward_elimination
from intellect.io import load, dump, create_dir, TimeoutIterator
from intellect.inspect import set_seed
import os

import config

# Lower the figure DPI to decrease the size of the notebook
mpl.rcParams['figure.dpi']= 70
# Show up to 100 columns but at most 10 rows whenever a DataFrame is displayed
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 10)

# Add these lines when working on a local machine with few cores.
# Only a quarter of the available cores is used. os.cpu_count() may return None and
# a quarter of fewer than 4 cores truncates to 0, so the limit is clamped to at least 1.
import threadpoolctl
threadpoolctl.threadpool_limits(limits=max(1, (os.cpu_count() or 1) // 4));

In [3]:
# config.get_dataset() is unpacked into three values; only the second one
# (bound to `validation`) is used in this notebook.
# NOTE(review): presumably the other two are the train and test portions - confirm.
_, validation, _ = config.get_dataset()

Now, remove unused categories from this portion, and balance samples among the remaining ones.

In [4]:
# Keep only the categories listed in config.CLIENT_CATEGORIES, then balance the
# samples among the remaining categories.
validation_client = validation.filter_categories(config.CLIENT_CATEGORIES).balance_categories()

In [None]:
# Directory into which every CSV produced by the cells below is dumped.
create_dir(config.RANK_PRUNE_OUTPUT_DIR)

In [5]:
# Load the trained model and measure its accuracy on the full validation set and
# on the client-only validation set.
oracle = EnhancedMlpRegressor.load(config.TRAINING_OUTPUT_MODEL)
baseline_all = accuracy_score(validation.y, oracle.predict(validation.X))
baseline_client = accuracy_score(validation_client.y, oracle.predict(validation_client.X))

# "Worst Accuracy" is each baseline reduced by config.PERFORMANCE_DROP_ACCEPTED_RATIO.
originals = [baseline_client, baseline_all]
df = pd.DataFrame(
    {
        "Original Accuracy": originals,
        "Worst Accuracy": [acc * (1 - config.PERFORMANCE_DROP_ACCEPTED_RATIO) for acc in originals],
    },
    index=["Client Traffic", "All Traffic"],
)
dump(df, config.RANK_PRUNE_OUTPUT_DIR + "baseline.csv")
df

Unnamed: 0,Original Accuracy,Worst Accuracy
Client Traffic,0.981,0.63765
All Traffic,0.977578,0.635426


Define a few utility functions for Stochastic Subset Search.

In [6]:
def only_pruning(ds_only_client_traffic: bool = True, prune_method=config.PRUNE_METHOD):
    """Prune a clone of the oracle with every ratio in ``config.PRUNE_RATIOS`` and dump the kept results.

    Args:
        ds_only_client_traffic: when True the client-only validation set is used
            (file tag ``few_c``), otherwise the full validation set (``all_c``).
        prune_method: pruning function handed to ``prune_search``; its ``__name__``
            is part of the output file name and of the printed summary.

    The kept (accuracy, prune ratio) pairs are sorted by decreasing accuracy and
    dumped as CSV into ``config.RANK_PRUNE_OUTPUT_DIR``.
    """
    if ds_only_client_traffic:
        ds, traffic_prefix = validation_client, "few_c"
    else:
        ds, traffic_prefix = validation, "all_c"

    new_df_comb = pd.DataFrame(columns=["Accuracy", "Prune Ratio"])
    set_seed()
    candidates = prune_search(
        oracle.clone(init=False), ds, prune_method, config.PRUNE_RATIOS,
        metric=accuracy_score, performance_drop_ratio=config.PERFORMANCE_DROP_ACCEPTED_RATIO)

    # Each item is (prune ratio, accuracy, flag); items with a falsy flag are discarded.
    # NOTE(review): the flag presumably tells whether the model was accepted - confirm.
    for prune_ratio, accuracy, keep in TimeoutIterator(candidates, time_limit=config.TIME_LIMIT):
        if keep:
            new_df_comb.loc[len(new_df_comb)] = [accuracy, prune_ratio]

    new_df_comb = new_df_comb.sort_values(by="Accuracy", ascending=False)
    out_file = config.RANK_PRUNE_OUTPUT_DIR + f"traffic_{traffic_prefix}_pruning_ratios_only_{prune_method.__name__}.csv"
    dump(new_df_comb, out_file)
    print(f"Only Pruning for traffic {traffic_prefix} found {len(new_df_comb)} models with {prune_method.__name__}")

def only_stochastic_search(rank_only_client_traffic: bool = True, ds_only_client_traffic: bool = True):
    """Run the stochastic subset search for every ratio in ``config.TARGET_SUBSET_RATIOS``.

    Args:
        rank_only_client_traffic: selects the rank file to load, ``rank_few_c.csv``
            when True and ``rank_all_c.csv`` otherwise.
        ds_only_client_traffic: when True the client-only validation set is used
            (file tag ``few_c``), otherwise the full validation set (``all_c``).

    For each subset-size ratio one CSV is dumped into ``config.RANK_PRUNE_OUTPUT_DIR``.
    Every row holds the accuracy of a kept subset plus one column per feature,
    set to 1 when the feature belongs to the subset and NaN otherwise. Rows are
    sorted by decreasing accuracy.
    """
    ds = validation_client if ds_only_client_traffic else validation
    rank_prefix = "few_c" if rank_only_client_traffic else "all_c"
    traffic_prefix = "few_c" if ds_only_client_traffic else "all_c"
    df_rank: pd.DataFrame = load(config.RANK_PRUNE_OUTPUT_DIR + f"rank_{rank_prefix}.csv", index_col=0)
    # The rank column is looked up by the number of features of the dataset.
    rank = df_rank[str(ds.n_features)].to_dict()

    # Loop-invariant: feature labels and the output column layout.
    features = df_rank.index
    columns = ["Accuracy"] + features.values.tolist()

    for subset_size_ratio in config.TARGET_SUBSET_RATIOS:
        # Re-seed before each ratio so every search starts from the same random state.
        set_seed()
        rows = []
        for k, v, j in TimeoutIterator(subset_search(
            oracle.clone(init=False), ds, subset_size_ratio, config.EXPLORED_PER_RATIO,
            rank=rank, metric=accuracy_score, performance_drop_ratio=config.PERFORMANCE_DROP_ACCEPTED_RATIO), time_limit=config.TIME_LIMIT):
            # Items with a falsy third element are discarded.
            if not j:
                continue
            # 1 for the features in the subset `k`, NaN for all the others.
            row = dict(zip(features, np.where(features.isin(k), 1.0, np.nan)))
            row["Accuracy"] = v
            rows.append(row)
        # Build the frame once instead of growing it row by row.
        new_df_comb = pd.DataFrame(rows, columns=columns).sort_values(by="Accuracy", ascending=False)
        dump(new_df_comb, config.RANK_PRUNE_OUTPUT_DIR + f"rank_{rank_prefix}_traffic_{traffic_prefix}_subsets_features_for_subsetsize_{subset_size_ratio}.csv")
        print(f"Only Stochastic with rank {rank_prefix} and traffic {traffic_prefix} and Subset size ratio {subset_size_ratio} found {len(new_df_comb)} models")

def stochastic_search_then_pruning(rank_only_client_traffic: bool = True, ds_only_client_traffic: bool = True, prune_method=config.PRUNE_METHOD):
    """Run ``subset_search_and_prune`` for every ratio in ``config.TARGET_SUBSET_RATIOS``.

    Args:
        rank_only_client_traffic: selects the rank file to load, ``rank_few_c.csv``
            when True and ``rank_all_c.csv`` otherwise.
        ds_only_client_traffic: when True the client-only validation set is used
            (file tag ``few_c``), otherwise the full validation set (``all_c``).
        prune_method: pruning function handed to ``subset_search_and_prune``; its
            ``__name__`` is part of the output file name and of the printed summary.

    For each subset-size ratio one CSV is dumped into ``config.RANK_PRUNE_OUTPUT_DIR``.
    Every row holds the accuracy, the prune ratio and one column per feature,
    set to 1 when the feature belongs to the subset and NaN otherwise. Rows are
    sorted by decreasing accuracy. The number of rows per prune ratio is printed.
    """
    ds = validation_client if ds_only_client_traffic else validation
    rank_prefix = "few_c" if rank_only_client_traffic else "all_c"
    traffic_prefix = "few_c" if ds_only_client_traffic else "all_c"
    df_rank: pd.DataFrame = load(config.RANK_PRUNE_OUTPUT_DIR + f"rank_{rank_prefix}.csv", index_col=0)
    # The rank column is looked up by the number of features of the dataset.
    rank = df_rank[str(ds.n_features)].to_dict()

    # Loop-invariant: feature labels and the output column layout.
    features = df_rank.index
    columns = ["Accuracy", "Prune Ratio"] + features.values.tolist()

    for subset_size_ratio in config.TARGET_SUBSET_RATIOS:
        # Re-seed before each ratio so every search starts from the same random state.
        set_seed()
        rows = []
        for k, v, prune_ratio in TimeoutIterator(subset_search_and_prune(
                oracle.clone(init=False),prune_method, ds, config.PRUNE_RATIOS, subset_size_ratio, config.EXPLORED_PER_RATIO,
                rank=rank, metric=accuracy_score, performance_drop_ratio=config.PERFORMANCE_DROP_ACCEPTED_RATIO), time_limit=config.TIME_LIMIT):
            # 1 for the features in the subset `k`, NaN for all the others.
            row = dict(zip(features, np.where(features.isin(k), 1.0, np.nan)))
            row["Prune Ratio"] = prune_ratio
            row["Accuracy"] = v
            rows.append(row)
        # Build the frame once instead of growing it row by row.
        new_df_comb = pd.DataFrame(rows, columns=columns).sort_values(by="Accuracy", ascending=False)
        dump(new_df_comb, config.RANK_PRUNE_OUTPUT_DIR + f"rank_{rank_prefix}_traffic_{traffic_prefix}_combo_subsets_features_pruned_models_for_subsetsize_{subset_size_ratio}_{prune_method.__name__}.csv")
        print(f"Stochastic search and then Pruning with algorithm {prune_method.__name__} rank {rank_prefix} and traffic {traffic_prefix} and Subset Size {subset_size_ratio} found (%prune: #accepted models)", new_df_comb["Prune Ratio"].value_counts().sort_index().to_dict())

def pruning_then_stochastic_search(rank_only_client_traffic: bool = True, ds_only_client_traffic: bool = True, prune_method=config.PRUNE_METHOD):
    """Run ``prune_and_subset_search`` for every ratio in ``config.TARGET_SUBSET_RATIOS``.

    Args:
        rank_only_client_traffic: selects the rank file to load, ``rank_few_c.csv``
            when True and ``rank_all_c.csv`` otherwise.
        ds_only_client_traffic: when True the client-only validation set is used
            (file tag ``few_c``), otherwise the full validation set (``all_c``).
        prune_method: pruning function handed to ``prune_and_subset_search``; its
            ``__name__`` is part of the output file name and of the printed summary.

    For each subset-size ratio one CSV is dumped into ``config.RANK_PRUNE_OUTPUT_DIR``.
    Every row holds the accuracy, the prune ratio and one column per feature,
    set to 1 when the feature belongs to the subset and NaN otherwise. Rows are
    sorted by decreasing accuracy. The number of rows per prune ratio is printed.
    """
    ds = validation_client if ds_only_client_traffic else validation
    rank_prefix = "few_c" if rank_only_client_traffic else "all_c"
    traffic_prefix = "few_c" if ds_only_client_traffic else "all_c"
    df_rank: pd.DataFrame = load(config.RANK_PRUNE_OUTPUT_DIR + f"rank_{rank_prefix}.csv", index_col=0)
    # The rank column is looked up by the number of features of the dataset.
    rank = df_rank[str(ds.n_features)].to_dict()

    # Loop-invariant: feature labels and the output column layout.
    features = df_rank.index
    columns = ["Accuracy", "Prune Ratio"] + features.values.tolist()

    for subset_size_ratio in config.TARGET_SUBSET_RATIOS:
        # Re-seed before each ratio so every search starts from the same random state.
        set_seed()
        rows = []
        # Note the item order: the prune ratio comes first for this search.
        for prune_ratio, k, v in TimeoutIterator(prune_and_subset_search(
                oracle.clone(init=False),prune_method, ds, config.PRUNE_RATIOS, subset_size_ratio, config.EXPLORED_PER_RATIO,
                rank=rank, metric=accuracy_score, performance_drop_ratio=config.PERFORMANCE_DROP_ACCEPTED_RATIO), time_limit=config.TIME_LIMIT):
            # 1 for the features in the subset `k`, NaN for all the others.
            row = dict(zip(features, np.where(features.isin(k), 1.0, np.nan)))
            row["Prune Ratio"] = prune_ratio
            row["Accuracy"] = v
            rows.append(row)
        # Build the frame once instead of growing it row by row.
        new_df_comb = pd.DataFrame(rows, columns=columns).sort_values(by="Accuracy", ascending=False)
        dump(new_df_comb, config.RANK_PRUNE_OUTPUT_DIR + f"rank_{rank_prefix}_traffic_{traffic_prefix}_combo_pruned_models_subsets_features_for_subsetsize_{subset_size_ratio}_{prune_method.__name__}.csv")
        print(f"Prune and the Stochastic search with algorithm {prune_method.__name__} with rank {rank_prefix} and traffic {traffic_prefix} and Subset Size {subset_size_ratio} found  ->", new_df_comb["Prune Ratio"].value_counts().sort_index().to_dict())

## Feature Ranking and Recursive Subset Search

In [48]:
def recursive_ss(ds_only_client_traffic=False, fixed_rank=False, remove_zero_first=True):
    """Run ``sequential_backward_elimination`` on the oracle and dump one row per step.

    Args:
        ds_only_client_traffic: when True the client-only validation set is used
            (file tag ``few_c``), otherwise the full validation set (``all_c``).
        fixed_rank: when True a rank is computed once with
            ``rank_metric_permutation_sklearn`` and passed as ``fixed_rank``;
            otherwise ``config.RANK_METHOD`` is passed as ``rank_algorithm``.
        remove_zero_first: forwarded unchanged to ``sequential_backward_elimination``.

    The result is dumped to ``sequential_<traffic>_<rank mode>_<zero mode>.csv`` in
    ``config.RANK_PRUNE_OUTPUT_DIR``. Rows are indexed by the number of entries in
    the per-step score mapping (index name ``#Features``).
    """
    ds = validation_client if ds_only_client_traffic else validation
    traffic_tag = "few_c" if ds_only_client_traffic else "all_c"
    rank_tag = "fixed_rank" if fixed_rank else "iterative_rank"
    zero_tag = "zero_first" if remove_zero_first else "zero_not_first"
    out_name = config.RANK_PRUNE_OUTPUT_DIR + f"sequential_{traffic_tag}_{rank_tag}_{zero_tag}.csv"

    # Seed like the other search helpers of this notebook, so that a re-run of the
    # ranking/elimination starts from the same random state.
    set_seed()

    if fixed_rank:
        kwargs = {"fixed_rank": rank_metric_permutation_sklearn(oracle, ds)}
    else:
        kwargs = {"rank_algorithm": config.RANK_METHOD}

    results = pd.DataFrame(columns=["Accuracy"] + ds.features, index=pd.Index([], name="#Features"))
    for score, _, scores in sequential_backward_elimination(oracle, ds, remove_zero_first=remove_zero_first, **kwargs):
        # Columns missing from `scores` are left as NaN in the new row.
        results.loc[len(scores)] = {"Accuracy": score, **scores}
    dump(results, out_name)

In [41]:
recursive_ss(ds_only_client_traffic=True, fixed_rank=True, remove_zero_first=True)

In [42]:
recursive_ss(ds_only_client_traffic=True, fixed_rank=True, remove_zero_first=False)

In [45]:
recursive_ss(ds_only_client_traffic=False, fixed_rank=True, remove_zero_first=True)

In [44]:
recursive_ss(ds_only_client_traffic=False, fixed_rank=True, remove_zero_first=False)

In [49]:
recursive_ss(ds_only_client_traffic=True, fixed_rank=False, remove_zero_first=True)

In [50]:
recursive_ss(ds_only_client_traffic=True, fixed_rank=False, remove_zero_first=False)

In [51]:
recursive_ss(ds_only_client_traffic=False, fixed_rank=False, remove_zero_first=True)

In [52]:
recursive_ss(ds_only_client_traffic=False, fixed_rank=False, remove_zero_first=False)

## Proof of Awareness while Choosing the Pruning Algorithm

Now, we just want to show the effect of choosing a wrong pruning algorithm. While for the L1-norm use case it is possible to further prune the model with a quite high pruning ratio, choosing a random pruning method will immediately make the resulting model unusable.

In [53]:
# Compare the configured pruning method against random connection pruning,
# first on the full validation set and then on the client-only one.
only_pruning(ds_only_client_traffic=False, prune_method=config.PRUNE_METHOD)
only_pruning(ds_only_client_traffic=False, prune_method=globally_unstructured_connections_random)
only_pruning(ds_only_client_traffic=True, prune_method=config.PRUNE_METHOD)
only_pruning(ds_only_client_traffic=True, prune_method=globally_unstructured_connections_random)

Only Pruning for traffic all_c found 12 models with globally_unstructured_connections_l1
Only Pruning for traffic all_c found 2 models with globally_unstructured_connections_random
Only Pruning for traffic few_c found 12 models with globally_unstructured_connections_l1
Only Pruning for traffic few_c found 2 models with globally_unstructured_connections_random


Look at the visualization notebook (2_1_interpret_pruning_outcome).

As a result, not all pruning algorithms are the same, nor do they lead to the same result: we should choose them wisely.

## Stochastic Subset Search on Client Categories with ranking computed on **Client Categories**

First prune then stochastic search.

In [59]:
pruning_then_stochastic_search(rank_only_client_traffic=True, ds_only_client_traffic=True)

Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank few_c and traffic few_c and Subset Size 0.1 found  -> {0.05: 60, 0.1: 57, 0.15: 59, 0.2: 53, 0.25: 68, 0.3: 51, 0.35: 45, 0.4: 51, 0.45: 48, 0.5: 44, 0.55: 36, 0.6: 28}
Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank few_c and traffic few_c and Subset Size 0.3 found  -> {0.05: 100, 0.1: 97, 0.15: 99, 0.2: 99, 0.25: 97, 0.3: 92, 0.35: 88, 0.4: 95, 0.45: 97, 0.5: 95, 0.55: 92, 0.6: 67}
Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank few_c and traffic few_c and Subset Size 0.5 found  -> {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100}
Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank few_c and traffic few_c and Subset Size 0.8 found  -> {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 1

First stochastic search then pruning.

In [10]:
stochastic_search_then_pruning(rank_only_client_traffic=True, ds_only_client_traffic=True)

Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank few_c and traffic few_c and Subset Size 0.1 found (%prune: #accepted models) {0.05: 61, 0.1: 61, 0.15: 61, 0.2: 61, 0.25: 61, 0.3: 61, 0.35: 61, 0.4: 61, 0.45: 61, 0.5: 61, 0.55: 61, 0.6: 61}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank few_c and traffic few_c and Subset Size 0.3 found (%prune: #accepted models) {0.05: 99, 0.1: 99, 0.15: 99, 0.2: 99, 0.25: 99, 0.3: 99, 0.35: 99, 0.4: 99, 0.45: 99, 0.5: 99, 0.55: 99, 0.6: 99}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank few_c and traffic few_c and Subset Size 0.5 found (%prune: #accepted models) {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank few_c and traffic few_c and Subset Size 0.8 

Only stochastic search.

In [57]:
only_stochastic_search(rank_only_client_traffic=True, ds_only_client_traffic=True)

Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.1 found 61 models


Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.3 found 99 models
Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.5 found 100 models
Only Stochastic with rank few_c and traffic few_c and Subset size ratio 0.8 found 100 models


Only pruning.

In [56]:
only_pruning(ds_only_client_traffic=True)

Only Pruning for traffic few_c found 12 models with globally_unstructured_connections_l1


## Stochastic Subset Search on Client Categories with ranking computed on All Traffic Categories

First prune then stochastic search.

In [60]:
pruning_then_stochastic_search(rank_only_client_traffic=False, ds_only_client_traffic=True)

Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank all_c and traffic few_c and Subset Size 0.1 found  -> {0.05: 34, 0.1: 31, 0.15: 33, 0.2: 29, 0.25: 32, 0.3: 36, 0.35: 35, 0.4: 27, 0.45: 24, 0.5: 22, 0.55: 21, 0.6: 16}
Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank all_c and traffic few_c and Subset Size 0.3 found  -> {0.05: 82, 0.1: 79, 0.15: 82, 0.2: 81, 0.25: 71, 0.3: 67, 0.35: 57, 0.4: 77, 0.45: 66, 0.5: 67, 0.55: 46, 0.6: 27}
Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank all_c and traffic few_c and Subset Size 0.5 found  -> {0.05: 100, 0.1: 99, 0.15: 100, 0.2: 100, 0.25: 99, 0.3: 99, 0.35: 100, 0.4: 100, 0.45: 96, 0.5: 97, 0.55: 84, 0.6: 49}
Prune and the Stochastic search with algorithm globally_unstructured_connections_l1 with rank all_c and traffic few_c and Subset Size 0.8 found  -> {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3:

First stochastic search then pruning.

In [11]:
stochastic_search_then_pruning(rank_only_client_traffic=False, ds_only_client_traffic=True)

Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic few_c and Subset Size 0.1 found (%prune: #accepted models) {0.05: 35, 0.1: 35, 0.15: 35, 0.2: 35, 0.25: 35, 0.3: 35, 0.35: 35, 0.4: 35, 0.45: 35, 0.5: 35, 0.55: 35, 0.6: 35}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic few_c and Subset Size 0.3 found (%prune: #accepted models) {0.05: 83, 0.1: 83, 0.15: 83, 0.2: 83, 0.25: 83, 0.3: 83, 0.35: 83, 0.4: 83, 0.45: 83, 0.5: 83, 0.55: 83, 0.6: 83}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic few_c and Subset Size 0.5 found (%prune: #accepted models) {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic few_c and Subset Size 0.8 

Only stochastic search.

In [62]:
only_stochastic_search(rank_only_client_traffic=False, ds_only_client_traffic=True)

Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.1 found 35 models


Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.3 found 83 models
Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.5 found 100 models
Only Stochastic with rank all_c and traffic few_c and Subset size ratio 0.8 found 100 models


Only pruning.

In [63]:
only_pruning(ds_only_client_traffic=True)

Only Pruning for traffic few_c found 12 models with globally_unstructured_connections_l1


In case you want to test a scenario in which the organization has access to all the traffic, just repeat the whole notebook setting **ds_only_client_traffic=False**.

Right now, for our scenario it is not needed.

## Stochastic Subset Search on All Categories with ranking computed on All Traffic Categories

First prune then stochastic search.

In [None]:
pruning_then_stochastic_search(rank_only_client_traffic=False, ds_only_client_traffic=False)

First stochastic search then pruning.

In [12]:
stochastic_search_then_pruning(rank_only_client_traffic=False, ds_only_client_traffic=False)

Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic all_c and Subset Size 0.1 found (%prune: #accepted models) {0.05: 27, 0.1: 27, 0.15: 27, 0.2: 27, 0.25: 27, 0.3: 27, 0.35: 27, 0.4: 27, 0.45: 27, 0.5: 27, 0.55: 27, 0.6: 27}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic all_c and Subset Size 0.3 found (%prune: #accepted models) {0.05: 94, 0.1: 94, 0.15: 94, 0.2: 94, 0.25: 94, 0.3: 94, 0.35: 94, 0.4: 94, 0.45: 94, 0.5: 94, 0.55: 94, 0.6: 94}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic all_c and Subset Size 0.5 found (%prune: #accepted models) {0.05: 100, 0.1: 100, 0.15: 100, 0.2: 100, 0.25: 100, 0.3: 100, 0.35: 100, 0.4: 100, 0.45: 100, 0.5: 100, 0.55: 100, 0.6: 100}
Stochastic search and then Pruning with algorithm globally_unstructured_connections_l1 rank all_c and traffic all_c and Subset Size 0.8 

Only stochastic search.

In [65]:
only_stochastic_search(rank_only_client_traffic=False, ds_only_client_traffic=False)

Only Stochastic with rank all_c and traffic all_c and Subset size ratio 0.1 found 27 models
Only Stochastic with rank all_c and traffic all_c and Subset size ratio 0.3 found 94 models
Only Stochastic with rank all_c and traffic all_c and Subset size ratio 0.5 found 100 models
Only Stochastic with rank all_c and traffic all_c and Subset size ratio 0.8 found 100 models


Only pruning.

In [64]:
only_pruning(ds_only_client_traffic=False)

Only Pruning for traffic all_c found 12 models with globally_unstructured_connections_l1
