In [1]:
# Reload imported modules before each cell runs so edits to local source files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic binary classification task: 10 features, of which 2 are informative
# and 1 is a redundant combination of them. The generator is seeded so the
# dataset is identical on every Restart & Run All (the split below was already
# seeded, but the data itself was not).
X, y = make_classification(
    n_samples=750,
    n_classes=2,
    n_features=10,
    n_informative=2,
    n_redundant=1,
    random_state=42,
)
# Name the columns col_0..col_{n-1}; derive the count from the array so it
# cannot drift from n_features above.
X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(X.shape[1])])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42)
X_train

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
244,1.854454,0.833443,0.969051,1.370854,0.423943,-0.235240,-1.105026,1.271731,-0.209493,-1.352031
327,-0.285814,1.811977,0.584032,1.061074,-0.123379,0.020261,-0.373572,1.019402,-0.246834,0.751600
351,-1.185310,-0.801171,-0.895684,0.739357,-0.843258,-0.030773,-0.836259,0.985317,0.005659,0.054130
482,0.144783,-0.331805,1.578867,2.202363,1.840226,-1.053736,-2.590670,2.038466,-0.401613,1.507447
104,0.160092,0.097259,0.600112,0.172067,-0.893744,-1.144397,-1.276102,0.058615,2.091017,-0.068174
...,...,...,...,...,...,...,...,...,...,...
71,0.730310,-0.578648,1.498378,2.133957,-0.184468,2.347809,-0.991208,1.981790,0.200552,0.013600
106,0.532439,-0.019824,-2.073706,-1.123932,-0.176514,1.495766,1.191645,-0.772614,-0.968567,0.987894
270,0.050650,0.369259,-0.396380,0.716700,0.139321,0.213857,-1.325164,0.855510,-1.111407,-0.346818
435,0.386958,-0.098715,-0.784881,0.635646,0.414078,0.033558,0.145170,0.850236,0.074846,-0.789599


## Step-by-step test (fit, transform, inspect)

In [3]:
import sys
sys.path.append("../powershap")

from powershap import PowerSHAP


from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier

# Feature selector wrapping a gradient-boosting model. A CatBoost model, e.g.
# CatBoostClassifier(verbose=0, n_estimators=250), can be swapped in instead.
# NOTE(review): automatic=True presumably lets PowerSHAP choose its own
# iteration count, with limit_automatic as the upper bound — confirm against
# the PowerSHAP documentation.
selector = PowerSHAP(
    model=GradientBoostingClassifier(),
    automatic=True,
    limit_automatic=100,
)

In [4]:
# Fit the selector on the training split only; as the last expression in the
# cell, the fitted estimator's repr is displayed.
selector.fit(X_train, y_train)

100%|██████████| 10/10 [00:02<00:00,  4.64it/s]


PowerSHAP(automatic=True, limit_automatic=100,
          model=GradientBoostingClassifier())

In [5]:
# Reduce the held-out features to the columns the fitted selector kept.
selector.transform(X_test)

Unnamed: 0,col_0,col_3,col_7
0,0.489224,-1.454365,-1.969659
1,0.720337,0.677359,0.438831
2,-0.960611,0.946587,1.224851
3,-1.126992,1.338850,1.671198
4,-0.653274,-0.407324,-0.546027
...,...,...,...
243,0.597183,-0.110746,-0.124813
244,-0.852425,-1.264962,-1.687096
245,1.777342,0.436618,0.605911
246,-0.765865,-2.365015,-3.098215


In [6]:
# Per-feature statistics from the fit (impact, p-value, effect size, power).
# NOTE(review): this reads a private attribute, which may change between
# versions — confirm whether a public accessor exists.
selector._processed_shaps_df

Unnamed: 0,impact,p_value,effect_size,power_0.01_alpha,0.95_power_its_req
col_3,4.088944,0.0,24.646242,1.0,2.187711
col_7,0.51927,0.0,2.312397,0.999906,5.957355
col_2,0.268713,0.1,0.0,0.0,0.0
col_1,0.207467,0.2,0.0,0.0,0.0
col_0,0.193726,0.0,1.573534,0.967838,9.317529
col_5,0.134823,0.5,0.0,0.0,0.0
col_4,0.114595,0.5,0.0,0.0,0.0
random_uniform_feature,0.110816,0.6,0.0,0.0,0.0
col_8,0.105243,0.4,0.0,0.0,0.0
col_9,0.083472,0.7,0.0,0.0,0.0


## sklearn pipeline test

In [7]:
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier

# Chain PowerSHAP feature selection (driven by a logistic-regression model)
# with a k-nearest-neighbours classifier that is trained on the selected
# columns only.
pipe = Pipeline(
    steps=[
        ("selector", PowerSHAP(model=LogisticRegressionCV(), automatic=True, limit_automatic=100)),
        ("knn", KNeighborsClassifier()),
    ]
)

pipe.fit(X_train, y_train)


from sklearn.metrics import accuracy_score

# Compare a plain KNN trained on all features against the pipeline that runs
# PowerSHAP feature selection first. accuracy_score's signature is
# (y_true, y_pred), so the ground-truth labels are passed first.
print("Baseline", accuracy_score(y_test, KNeighborsClassifier().fit(X_train, y_train).predict(X_test)))

print("PowerShap feature selection:", accuracy_score(y_test, pipe.predict(X_test)))


100%|██████████| 10/10 [00:01<00:00,  6.17it/s]
100%|██████████| 2/2 [00:00<00:00,  5.95it/s]


Baseline 0.9193548387096774
PowerShap feature selection: 0.9233870967741935


In [8]:
# Inspect the per-feature statistics of the pipeline's fitted selector step,
# looked up by step name rather than by position.
pipe.named_steps["selector"]._processed_shaps_df

Unnamed: 0,impact,p_value,effect_size,power_0.01_alpha,0.95_power_its_req
col_7,0.839674,0.0,3.095136,1.0,4.636912
col_3,0.757481,0.0,2.992045,1.0,4.757554
col_2,0.192849,0.0,2.178695,0.999985,6.326421
col_0,0.088903,0.0,1.770004,0.998557,8.007366
col_1,0.088158,0.0,1.865502,0.999441,7.514943
col_8,0.068528,0.0,1.767823,0.998526,8.019552
col_4,0.05327,0.0,1.583373,0.992485,9.24005
col_6,0.043305,0.083333,0.0,0.0,0.0
col_5,0.041054,0.083333,0.0,0.0,0.0
col_9,0.038898,0.083333,0.0,0.0,0.0
