In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=7500, n_features=10, n_informative=2, n_redundant=0)
X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(10)])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42)
X_train

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
3151,-0.252584,-0.013591,-0.447815,-1.109033,1.572335,1.064175,0.152980,1.518396,-1.100784,-0.352549
4019,-0.610962,-0.305639,-1.018141,1.263229,-1.182806,1.140426,0.077562,-1.240648,0.112109,1.922564
4194,-0.813215,0.381716,2.930360,-0.554095,-0.673227,1.118500,0.218240,-1.665309,-2.543525,0.957143
1825,0.245663,-1.441893,1.603998,0.009513,0.658603,-2.659180,0.273546,1.301552,-0.336510,-1.377055
7363,-0.711018,-0.228599,-1.145449,-0.664356,0.711679,0.860489,-0.378886,-0.011911,-0.080381,-0.949823
...,...,...,...,...,...,...,...,...,...,...
5191,-1.172275,0.752273,0.609110,0.693750,-1.556221,0.240467,0.499236,0.324666,-0.255090,0.110493
5226,0.467865,-0.550061,0.556859,-0.033589,1.098075,0.227501,-1.005434,0.553411,-1.128212,0.359070
5390,0.880421,-0.110519,-0.453920,-0.362355,-1.292338,0.657810,-1.348000,-1.949395,-0.749178,0.109829
860,-1.416170,-0.428702,-0.803707,0.950967,1.849135,-2.161063,-0.311295,0.822379,0.253339,0.463723


## One by one test

In [3]:
import sys
sys.path.append("../powershap")

from powershap import PowerSHAP


from catboost import CatBoostClassifier

selector = PowerSHAP(
    model = CatBoostClassifier(verbose=0, n_estimators=250),
    power_iterations=10,
    # automatic=True,
    # limit_automatic=100,
)

In [4]:
selector.fit(X_train, y_train)

100%|██████████| 10/10 [00:09<00:00,  1.10it/s]

Failed to converge on a solution.

Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


PowerSHAP(model=<catboost.core.CatBoostClassifier object at 0x7fa1085e7850>)

In [5]:
selector.transform(X_test)

array([[ 1.31012519, -1.87059037],
       [-1.47462311, -0.5517394 ],
       [ 0.08827212, -0.18375405],
       ...,
       [ 0.79281959, -1.1191799 ],
       [-0.28081656, -1.28401896],
       [-1.19901384,  0.63098049]])

In [6]:
selector._processed_shaps_df

Unnamed: 0,impact,p_value,effect_size,power_0.01_alpha,0.95_power_its_req
col_2,4.201717,0.0,50.274433,1.0,[10.0]
col_5,0.444193,0.0,12.536038,1.0,2.52145
col_1,0.153131,0.2,1.294119,0.869836,12.317492
col_4,0.125569,0.6,0.293936,0.063345,185.251543
random_uniform_feature,0.11522,0.6,0.0,0.01,0
col_3,0.103866,0.6,0.365781,0.091208,120.597478
col_7,0.101289,0.8,0.477504,0.150837,71.908094
col_9,0.10012,0.5,0.567166,0.214,51.782326
col_0,0.096808,0.8,0.776227,0.406844,28.968916
col_6,0.093308,0.8,0.829579,0.462458,25.721338


## sklearn pipeline test

In [7]:
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline(
    [
        (
            "selector",
            PowerSHAP(
                CatBoostClassifier(
                    verbose=0, n_estimators=250,
                ), automatic=True, limit_automatic=100,
            ),
        ),
        ("knn", KNeighborsClassifier()),
    ]
)

pipe.fit(X_train, y_train)


from sklearn.metrics import accuracy_score

print("Baseline", accuracy_score(KNeighborsClassifier().fit(X_train, y_train).predict(X_test), y_test))

print("PowerShap feature selection:", accuracy_score(pipe.predict(X_test), y_test))


100%|██████████| 10/10 [00:09<00:00,  1.11it/s]

Failed to converge on a solution.

Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Baseline 0.9458585858585858
PowerShap feature selection: 0.9705050505050505


In [8]:
pipe[0]._processed_shaps_df

Unnamed: 0,impact,p_value,effect_size,power_0.01_alpha,0.95_power_its_req
col_2,4.201717,0.0,50.274433,1.0,[10.0]
col_5,0.444193,0.0,12.536038,1.0,2.52145
col_1,0.153131,0.2,1.294119,0.869836,12.317492
col_4,0.125569,0.6,0.293936,0.063345,185.251543
random_uniform_feature,0.11522,0.6,0.0,0.01,0
col_3,0.103866,0.6,0.365781,0.091208,120.597478
col_7,0.101289,0.8,0.477504,0.150837,71.908094
col_9,0.10012,0.5,0.567166,0.214,51.782326
col_0,0.096808,0.8,0.776227,0.406844,28.968916
col_6,0.093308,0.8,0.829579,0.462458,25.721338
