In [1]:
import numpy as np
import pandas as pd

# Cap the number of columns pandas renders when displaying wide frames.
pd.set_option("display.max_columns", 10)

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# Load the scikit-learn breast-cancer dataset and pull out its raw arrays.
data = load_breast_cancer()
data_X = data.data
data_y = data.target

# Wrap the arrays in pandas containers, keeping the dataset's feature names.
y = pd.Series(data_y)
X = pd.DataFrame(data_X, columns=data.feature_names)

# Synthetic categorical feature: "A" when mean smoothness is at most 0.1,
# "B" otherwise.
X["category"] = np.where(X["mean smoothness"].le(0.1), "A", "B")

In [2]:
# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [3]:
from scorepyo.binarizers import EBMBinarizer


# Fit the binarizer on the training split only, then apply the fitted
# transformation to both splits.
binarizer = EBMBinarizer(
    max_number_binaries_by_features=3,
    keep_negative=True,
)
binarizer.fit(
    X_train,
    y_train,
    categorical_features="auto",
    to_exclude_features=None,
)

X_train_binarized = binarizer.transform(X_train)
X_test_binarized = binarizer.transform(X_test)

In [4]:
from mrmr import mrmr_classif

# Run mRMR feature selection on the binarized training features, asking for K=10.
selected_features = mrmr_classif(
    X=X_train_binarized,
    y=y_train,
    K=10,
)

100%|██████████| 10/10 [00:00<00:00, 14.21it/s]


In [5]:
# IPython-only help lookup: shows the signature and docstring of mrmr_classif.
# NOTE(review): interactive introspection left over from development; consider
# removing this cell before sharing the notebook.
?mrmr_classif

Signature:
mrmr_classif(
    X,
    y,
    K,
    relevance='f',
    redundancy='c',
    denominator='mean',
    cat_features=None,
    cat_encoding='leave_one_out',
    only_same_domain=False,
    return_scores=False,
    n_jobs=-1,
    show_progress=True,
)
Docstring:
MRMR feature selection for a classification task
Parameters
----------


In [10]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import (
    average_precision_score,
    precision_recall_curve,
    precision_score,
)
from sklearn.model_selection import train_test_split

from scorepyo._utils import fast_numba_auc
from scorepyo.calibration import VanillaCalibrator
from scorepyo.models import EBMRiskScore
from scorepyo.ranking import LogOddsDensity, MRMRRank


def test_end_2_end():
    """Smoke-test the scorepyo pipeline on the breast-cancer dataset.

    Steps:
      1. Load the scikit-learn breast-cancer data and add a synthetic
         ``category`` column ("A"/"B") derived from ``mean smoothness``.
      2. Split 75/25 with ``random_state=0``.
      3. Fit an ``EBMRiskScore`` using an ``MRMRRank`` ranker,
         ``fast_numba_auc`` as the enumeration metric and a
         ``VanillaCalibrator``; ``X_calib``/``y_calib`` are passed as ``None``.
      4. Call ``summary()`` on the fitted model.

    The function takes no arguments, returns ``None`` and makes no
    assertions: it only checks that the pipeline runs without raising.
    """
    data = load_breast_cancer()
    data_X, data_y = data.data, data.target

    X = pd.DataFrame(data=data_X, columns=data.feature_names)
    X["category"] = np.where(X["mean smoothness"] <= 0.1, "A", "B")
    y = pd.Series(data_y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Give the held-out rows a category value ("C") that never occurs in the
    # training split. ``assign`` builds a new frame instead of writing into the
    # object returned by ``train_test_split``, which avoids pandas'
    # SettingWithCopyWarning.
    X_test = X_test.assign(category="C")

    min_point_value = -2
    max_point_value = 3
    nb_max_features = 4

    # Only one ranker can be passed to the model; MRMRRank is the one used.
    ranker = MRMRRank()

    optim_method = fast_numba_auc

    scorepyo_model = EBMRiskScore(
        min_point_value=min_point_value,
        max_point_value=max_point_value,
        nb_max_features=nb_max_features,
        nb_additional_features=6,
        enumeration_maximization_metric=optim_method,
        ranker=ranker,
        calibrator=VanillaCalibrator(),
    )

    scorepyo_model.fit(
        X_train,
        y_train,
        X_calib=None,
        y_calib=None,
        categorical_features=["category"],
    )

    scorepyo_model.summary()

    # TODO: evaluation on the held-out split is currently disabled. When it is
    # re-enabled, score ``scorepyo_model.predict_proba(X_test)[:, 1]`` against
    # ``y_test.astype(int)`` with ``average_precision_score`` and with
    # ``precision_score`` at a 0.5 threshold, and assert on the results.

In [11]:
# Run the end-to-end check; the fitted model's summary() output appears below.
test_end_2_end()

| FEATURE-POINT CARD |
| Feature              | Description                  | Point(s)   |       |
|:---------------------|:-----------------------------|:-----------|:------|
| worst concave points | worst concave points >= 0.14 | -2         | ...   |
| worst radius         | worst radius >= 16.66        | -2         | + ... |
| area error           | area error >= 33.35          | -1         | + ... |
| mean concavity       | mean concavity >= 0.1        | -1         | + ... |
|                      |                              | SCORE=     | ...   |


|     SCORE CARD     |
| SCORE   | -6    | -5    | -4    | -3     | -2     | -1     | 0      |
|:--------|:------|:------|:------|:-------|:-------|:-------|:-------|
| RISK    | 0.00% | 5.00% | 9.09% | 25.00% | 63.16% | 78.57% | 98.68% |
