In [1]:
import numpy as np
import pandas as pd

In [2]:
# Name of the decision column that is split off from the condition attributes.
TARGET_COLUMN = "Play"

# Small weather table: 14 observations, four categorical condition attributes
# and the decision column as the last field of every row. Values are stored as
# Python objects (dtype=object) rather than fixed-width NumPy strings.
df = pd.DataFrame(
    data=np.array(
        [
            ("sunny", "hot", "high", "weak", "no"),
            ("sunny", "hot", "high", "strong", "no"),
            ("overcast", "hot", "high", "weak", "yes"),
            ("rain", "mild", "high", "weak", "yes"),
            ("rain", "cool", "normal", "weak", "yes"),
            ("rain", "cool", "normal", "strong", "no"),
            ("overcast", "cool", "normal", "strong", "yes"),
            ("sunny", "mild", "high", "weak", "no"),
            ("sunny", "cool", "normal", "weak", "yes"),
            ("rain", "mild", "normal", "weak", "yes"),
            ("sunny", "mild", "normal", "strong", "yes"),
            ("overcast", "mild", "high", "strong", "yes"),
            ("overcast", "hot", "normal", "weak", "yes"),
            ("rain", "mild", "high", "strong", "no"),
        ],
        dtype=object,
    ),
    columns=["Outlook", "Temperature", "Humidity", "Wind", "Play"],
)

# Detach the decision column: afterwards `df` holds only the condition
# attributes and `df_dec` is the decision Series.
df_dec = df.pop(TARGET_COLUMN)

In [15]:
from __future__ import annotations

import skrough.typing as rght
from skrough.algorithms import hooks
from skrough.algorithms.key_names import (
    CONFIG_CANDIDATES_SELECT_RANDOM_MAX_COUNT,
    CONFIG_CHAOS_FUN,
    CONFIG_CONSECUTIVE_EMPTY_ITERATIONS_MAX_COUNT,
    CONFIG_DAAR_ALLOWED_RANDOMNESS,
    CONFIG_DAAR_PROBES_COUNT,
    CONFIG_SELECT_ATTRS_CHAOS_SCORE_BASED_MAX_COUNT,
    CONFIG_SET_APPROX_THRESHOLD_TO_CURRENT,
    INPUT_DATA_X,
    INPUT_DATA_X_COUNTS,
    INPUT_DATA_Y,
    INPUT_DATA_Y_COUNT,
)
from skrough.algorithms.meta import processing
from skrough.algorithms.reusables.attrs_daar import daar_stage
from skrough.algorithms.reusables.attrs_reduction import reduction_stage
from skrough.chaos_measures import entropy
from skrough.dataprep import prepare_factorized_array, prepare_factorized_vector

# Module-private processing object assembled with ProcessingMultiStage.from_hooks.
# The public wrapper get_approx_reduct_daar_heuristic() defined right after it
# supplies the input data and config and runs it through call_parallel().
_get_approx_reduct_daar_heuristic = processing.ProcessingMultiStage.from_hooks(
    # Initialisation hooks, in the order they are handed to from_hooks.
    init_multi_stage_hooks=[
        hooks.init_hooks.init_hook_pass_data,
        hooks.init_hooks.init_hook_single_group_index,
        hooks.init_hooks.init_hook_result_attrs_empty,
    ],
    # Two stages: daar_stage, then reduction_stage.
    # NOTE(review): presumably executed in list order — confirm against
    # ProcessingMultiStage.
    stages=[daar_stage, reduction_stage],
    # No finalize hooks are registered.
    finalize_hooks=None,
    # The result is produced by the attrs-subset hook; the executed cell output
    # in this notebook shows AttrsSubset objects coming back.
    prepare_result_fun=hooks.prepare_result_hooks.prepare_result_hook_attrs_subset,
)


def get_approx_reduct_daar_heuristic(
    x,
    y,
    chaos_measure: rght.ChaosMeasure,
    candidates_count: int | None = None,
    consecutive_daar_reps: int = 1,
    allowed_randomness: float | None = None,
    probes_count: int | None = None,
    n_reducts: int = 1,
    seed: rght.Seed = None,
    n_jobs: int | None = None,
):
    """Run the DAAR-based approximate reduct pipeline on ``x`` / ``y``.

    ``x`` and ``y`` are first passed through ``prepare_factorized_array`` and
    ``prepare_factorized_vector``; the prepared data and the config assembled
    here are then handed to ``_get_approx_reduct_daar_heuristic.call_parallel``.

    Args:
        x: Condition data; must yield a 2-D array after preparation, because
            its second dimension is used as the number of attributes.
        y: Decision values.
        chaos_measure: Stored in the config under ``CONFIG_CHAOS_FUN``.
        candidates_count: Stored under
            ``CONFIG_CANDIDATES_SELECT_RANDOM_MAX_COUNT``.
        consecutive_daar_reps: Stored under
            ``CONFIG_CONSECUTIVE_EMPTY_ITERATIONS_MAX_COUNT``.
        allowed_randomness: Stored under ``CONFIG_DAAR_ALLOWED_RANDOMNESS``.
            When ``None`` and there is at least one attribute it defaults to
            ``1 / n_attrs``; with zero attributes it stays ``None``.
        probes_count: Stored under ``CONFIG_DAAR_PROBES_COUNT``. When ``None``
            it defaults to ``max(n_attrs, 100)``.
        n_reducts: Forwarded to ``call_parallel`` as ``n_times``.
        seed: Forwarded to ``call_parallel`` unchanged.
        n_jobs: Forwarded to ``call_parallel`` unchanged.

    Returns:
        Whatever ``call_parallel`` returns, unmodified.
    """
    x_prepared, x_counts = prepare_factorized_array(x)
    y_prepared, y_count = prepare_factorized_vector(y)

    # Both defaults below are derived from the number of attributes (columns).
    attrs_count = x_prepared.shape[1]
    if allowed_randomness is None:
        if attrs_count > 0:
            allowed_randomness = 1 / attrs_count
    probes_count = max(attrs_count, 100) if probes_count is None else probes_count

    pipeline_input = {
        INPUT_DATA_X: x_prepared,
        INPUT_DATA_X_COUNTS: x_counts,
        INPUT_DATA_Y: y_prepared,
        INPUT_DATA_Y_COUNT: y_count,
    }
    pipeline_config = {
        CONFIG_CHAOS_FUN: chaos_measure,
        CONFIG_SELECT_ATTRS_CHAOS_SCORE_BASED_MAX_COUNT: 1,
        CONFIG_CANDIDATES_SELECT_RANDOM_MAX_COUNT: candidates_count,
        CONFIG_DAAR_PROBES_COUNT: probes_count,
        CONFIG_DAAR_ALLOWED_RANDOMNESS: allowed_randomness,
        CONFIG_CONSECUTIVE_EMPTY_ITERATIONS_MAX_COUNT: consecutive_daar_reps,
        CONFIG_SET_APPROX_THRESHOLD_TO_CURRENT: True,
    }

    return _get_approx_reduct_daar_heuristic.call_parallel(
        n_times=n_reducts,
        input_data=pipeline_input,
        config=pipeline_config,
        seed=seed,
        n_jobs=n_jobs,
    )


# Single run on the weather table with entropy as the chaos measure and an
# explicit allowed_randomness of 0.2; every other option keeps its default.
get_approx_reduct_daar_heuristic(
    x=df.to_numpy(),
    y=df_dec,
    chaos_measure=entropy,
    allowed_randomness=0.2,
)

[0]
[0, 1, 2, 3]


[AttrsSubset(attrs=[0])]

In [10]:
# from skrough.algorithms.reducts import get_approx_reduct_greedy_heuristic
# from skrough.chaos_measures import entropy

# Options are bound by keyword. With the signature of
# get_approx_reduct_daar_heuristic defined in this notebook, the third
# positional slot is `chaos_measure`, so the former positional call
# (..., 0.1, 4, entropy) passed 0.1 as the chaos measure and `entropy` as
# `consecutive_daar_reps`.
# NOTE(review): 0.1 is assumed to be the intended allowed randomness and 4 the
# candidates count — confirm against the intent of the original call.
get_approx_reduct_daar_heuristic(
    df.to_numpy(),
    df_dec,
    chaos_measure=entropy,
    candidates_count=4,
    allowed_randomness=0.1,
)
# get_approx_reduct_greedy_heuristic(df, df_dec, 0.0, 100, entropy, n_reducts=1, n_jobs=1)

[AttrsSubset(attrs=[22058, 27331, 8231, 57977, 26353, 17737, 10200])]

In [4]:
# Display the config keys reported by the assembled processing object via
# get_config_keys(); as the last expression of the cell its value is shown.
_get_approx_reduct_daar_heuristic.get_config_keys()

['config_consecutive_empty_iterations_max_count', 'config_chaos_fun']

In [16]:
from skrough.algorithms.bireducts import get_bireduct_greedy_heuristic
from skrough.chaos_measures import entropy

# Run the greedy bireduct heuristic on the same frame and decision Series,
# requesting n_bireducts=10 with n_jobs=5, and entropy as the fifth argument.
# NOTE(review): 0.99 and 100 are passed positionally and their meaning is not
# visible in this notebook — confirm against the signature of
# get_bireduct_greedy_heuristic and consider passing them by keyword.
res = get_bireduct_greedy_heuristic(
    df, df_dec, 0.99, 100, entropy, n_bireducts=10, n_jobs=5
)
# Bare expression so the notebook displays the full result list.
res

[ObjsAttrsSubset(objs=[1, 2, 3, 4, 5, 6, 7, 11, 12, 14, 15, 18, 19, 21, 23, 24, 26, 27, 28, 32, 33, 34, 35, 36, 37, 39, 42, 44, 45, 46, 47, 48, 50, 51, 53, 55, 58, 59, 60, 63, 64, 65, 70, 73, 75, 77, 79, 80, 81, 82, 84, 85, 88, 90, 94, 96, 97, 98, 100, 101, 103, 104, 105, 107, 108, 109, 110, 111, 112, 114, 115, 122, 124, 125, 127, 128, 129, 130, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142, 143, 145, 146, 151, 154, 155, 157, 158], attrs=[58041]),
 ObjsAttrsSubset(objs=[0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 18, 20, 21, 22, 26, 29, 30, 31, 33, 35, 36, 38, 39, 40, 41, 42, 47, 50, 51, 52, 54, 56, 57, 59, 61, 63, 67, 68, 69, 70, 72, 74, 76, 77, 78, 79, 80, 81, 82, 85, 86, 87, 91, 93, 95, 97, 99, 104, 105, 107, 108, 109, 112, 113, 115, 116, 117, 118, 120, 123, 125, 126, 127, 129, 131, 133, 134, 135, 138, 139, 141, 144, 145, 146, 148, 149, 151, 153, 154, 155, 156, 157, 158], attrs=[57365]),
 ObjsAttrsSubset(objs=[0, 3, 6, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25, 26, 27, 28, 30, 