In [1]:
import pprint

import numpy as np
import pandas as pd
from attrs import asdict

from skrough.algorithms import hooks
from skrough.algorithms.key_names import (
    CONFIG_CHAOS_FUN,
    CONFIG_EPSILON,
    CONFIG_SELECT_ATTRS_CHAOS_SCORE_BASED_MAX_COUNT,
    CONFIG_SELECT_RANDOM_MAX_COUNT,
    INPUT_DATA_X,
    INPUT_DATA_Y,
    VALUES_CHAOS_SCORE_APPROX_THRESHOLD,
    VALUES_X,
    VALUES_X_COUNTS,
    VALUES_Y,
    VALUES_Y_COUNT,
)
from skrough.algorithms.meta import describe, processing, stage
from skrough.chaos_measures import entropy
from skrough.checks import check_if_approx_reduct
from skrough.dataprep import (
    prepare_factorized_array,
    prepare_factorized_data,
    prepare_factorized_vector,
)
from skrough.structs.attrs_subset import AttrsSubset

## Dataset

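Let's prepare a sample dataset: the "Play Golf" dataset. Note that the cells that follow load a discretized microarray dataset and rebind `x`, `x_counts`, `y` and `y_count` to it, so the rest of the notebook runs on the microarray data rather than on this toy table.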

In [2]:
# Toy "Play Golf" table. Each row holds the values for the columns listed in
# `golf_columns`; the last column ("Play") is the target.
golf_rows = [
    ["sunny", "hot", "high", "weak", "no"],
    ["sunny", "hot", "high", "strong", "no"],
    ["overcast", "hot", "high", "weak", "yes"],
    ["rain", "mild", "high", "weak", "yes"],
    ["rain", "cool", "normal", "weak", "yes"],
    ["rain", "cool", "normal", "strong", "no"],
    ["overcast", "cool", "normal", "strong", "yes"],
    ["sunny", "mild", "high", "weak", "no"],
    ["sunny", "cool", "normal", "weak", "yes"],
    ["rain", "mild", "normal", "weak", "yes"],
    ["sunny", "mild", "normal", "strong", "yes"],
    ["overcast", "mild", "high", "strong", "yes"],
    ["overcast", "hot", "normal", "weak", "yes"],
    ["rain", "mild", "high", "strong", "no"],
]
golf_columns = ["Outlook", "Temperature", "Humidity", "Wind", "Play"]
TARGET_COLUMN = "Play"

# Keep the cells as generic Python objects (dtype=object) when building the frame.
golf_values = np.array(golf_rows, dtype=object)
df = pd.DataFrame(golf_values, columns=golf_columns)

# Factorize the frame, treating TARGET_COLUMN as the target.
x, x_counts, y, y_count = prepare_factorized_data(df, TARGET_COLUMN)

In [3]:
# NOTE(review): this import would be easier to find in the imports cell at the
# top of the notebook.
from sklearn.preprocessing import KBinsDiscretizer

# Microarray data, read from a path relative to the notebook's working
# directory; the first CSV column becomes the row index.
microarray_csv = (
    "../../../workspace/data/microarray/anthracyclineTaxaneChemotherapy_processed.csv"
)
df = pd.read_csv(microarray_csv, index_col=0)

# Split the "target" column off; `df` keeps only the remaining columns.
# This rebinds `df`, so the "Play Golf" frame built earlier is no longer reachable.
df_dec = df.pop("target")

# Discretize the remaining columns into 3 quantile-based bins with ordinal codes.
# After this line `df` holds the transformer's output rather than the loaded
# frame (presumably a NumPy array under scikit-learn's default output settings
# - confirm).
est = KBinsDiscretizer(strategy="quantile", encode="ordinal", n_bins=3)
df = est.fit_transform(df)

In [4]:
# Factorize the discretized attribute values and the popped "target" column
# separately. These assignments rebind x, x_counts, y and y_count, replacing
# the values computed earlier from the "Play Golf" frame.
x, x_counts = prepare_factorized_array(df)
y, y_count = prepare_factorized_vector(df_dec)

In [5]:
# Show the dimensions of the factorized attribute array returned by
# prepare_factorized_array.
x.shape

(160, 61359)

In [6]:
# Candidates hook produced by the random-choice factory, parameterized with the
# CONFIG_SELECT_RANDOM_MAX_COUNT key name. The previously tried alternative,
# hooks.common.process_elements.process_elements_hook_pass_everything, is
# currently not used.
random_choice_candidates_hook = (
    hooks.common.process_elements.create_process_elements_hook_random_choice(
        CONFIG_SELECT_RANDOM_MAX_COUNT
    )
)

# Single "grow" stage, assembled from library hooks. Hook slots that are not
# needed are passed explicitly as None.
grow_stage = stage.Stage.from_hooks(
    # Outer loop
    init_hooks=None,
    stop_hooks=[hooks.stop_hooks.stop_hook_approx_threshold],
    pre_candidates_hooks=[
        hooks.pre_candidates_hooks.pre_candidates_hook_remaining_attrs
    ],
    candidates_hooks=[random_choice_candidates_hook],
    select_hooks=[hooks.select_hooks.select_hook_attrs_chaos_score_based],
    filter_hooks=None,
    # Inner loop
    inner_init_hooks=None,
    inner_stop_hooks=hooks.inner_stop_hooks.inner_stop_hook_empty,
    inner_process_hooks=hooks.inner_process_hooks.inner_process_hook_add_first_attr,
    # Wrap-up
    finalize_hooks=None,
)
from skrough.structs.state import ProcessingState


def xy(
    state: ProcessingState,
) -> None:
    """Populate ``state.values`` from the notebook-level factorized data.

    Stores the notebook globals ``x``, ``x_counts``, ``y`` and ``y_count``
    under their ``VALUES_*`` keys and sets the chaos-score approximation
    threshold entry to the fixed value ``0.0001``.

    Args:
        state: Processing state whose ``values`` mapping is filled in place.
    """
    preset_values = (
        (VALUES_X, x),
        (VALUES_X_COUNTS, x_counts),
        (VALUES_Y, y),
        (VALUES_Y_COUNT, y_count),
        # Hard-coded threshold; it is not derived from the run configuration.
        (VALUES_CHAOS_SCORE_APPROX_THRESHOLD, 0.0001),
    )
    for key, value in preset_values:
        state.values[key] = value


# Initialization hooks for the multi-stage pipeline. `xy` writes the data
# arrays and the approximation threshold into the state itself, and
# hooks.init_hooks.init_hook_approx_threshold is not registered here.
multi_stage_init_hooks = [
    xy,
    hooks.init_hooks.init_hook_single_group_index,
    hooks.init_hooks.init_hook_result_attrs_empty,
]

# Pipeline with a single stage (grow_stage); the result is prepared by the
# attrs-subset hook.
get_approx_reduct = processing.ProcessingMultiStage.from_hooks(
    init_multi_stage_hooks=multi_stage_init_hooks,
    stages=[grow_stage],
    finalize_hooks=None,
    prepare_result_fun=hooks.prepare_result_hooks.prepare_result_hook_attrs_subset,
)

In [20]:
# Run the pipeline assembled above and show the selected attributes.
#
# The former `importlib.reload(hooks)` call was removed: the pipeline objects
# were built earlier and keep their references to the hook functions, so
# reloading the `hooks` package here could not change this run.
#
# NOTE(review): CONFIG_EPSILON is passed below, but the approximation threshold
# is hard-coded to 0.0001 by the `xy` init hook and
# hooks.init_hooks.init_hook_approx_threshold is not registered, so `eps`
# presumably does not influence this run - confirm.
# NOTE(review): the candidates hook is built by a random-choice factory, and no
# seed is supplied anywhere in this notebook; the result is presumably not
# reproducible between runs - confirm how the pipeline can be seeded.
eps = 0.0
chaos_measure = entropy
result: AttrsSubset = get_approx_reduct(
    # No input data is passed; the `xy` init hook stores the notebook-level
    # arrays in the processing state instead.
    input_data={},
    config={
        CONFIG_CHAOS_FUN: chaos_measure,
        CONFIG_EPSILON: eps,
        # Value stored under the chaos-score-based selection max-count key.
        CONFIG_SELECT_ATTRS_CHAOS_SCORE_BASED_MAX_COUNT: 1,
        # Value stored under the key the random-choice candidates hook was created with.
        CONFIG_SELECT_RANDOM_MAX_COUNT: 10000,
    },
)
result

AttrsSubset(attrs=[17678, 1629, 37496, 40556, 49921, 49933])

In [57]:
# Validate the selected attributes against the same chaos measure and epsilon
# that were configured for the run above.
# NOTE(review): the recorded output is False. The pipeline consists of a single
# grow stage with no reduction step, so the subset may simply not be minimal -
# presumably the check also requires minimality; confirm.
check_if_approx_reduct(
    x,
    x_counts,
    y,
    y_count,
    attrs=result.attrs,
    chaos_fun=chaos_measure,
    epsilon=eps,
)

False

In [8]:
# List the configuration keys reported by the assembled pipeline.
# NOTE(review): the recorded output contains only 'config_chaos_fun' although
# the run above passes four config entries; the output may be stale (this
# cell's execution counter is lower than the preceding cells') - re-run to confirm.
get_approx_reduct.get_config_keys()

['config_chaos_fun']