In [None]:
import json
import pathlib

import attr
import config
import numpy as np
import pandas as pd
import tqdm.notebook

import skrough.reducts.greedy_heuristic_reduct

# from skrough.bireducts.sampling_heuristic_bireduct import SamplingHeuristicBireduct
from skrough.bireducts.greedy_heuristic_bireduct import GreedyHeuristicBireduct

In [None]:
# Locations come from the project-local ``config`` module; wrapping them in
# ``Path`` lets the rest of the notebook build paths with the ``/`` operator.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)

# Field separator of the input CSV and the number of bireducts to draw per run.
SEP = ";"
N_BIREDUCTS = 1000

# Earlier input/output configuration, kept for reference:
# FILEPATH_IN = DATA_DIR / 'train_utf.csv'
# FILENAME_OUT_TEMPLATE = 'bireducts_n_{n_bireducts}_candidate_attrs_{candidate_n_attrs}_max_attrs_{max_n_attrs}.json'

# Active input file and the output filename pattern. The placeholders are
# filled in with ``str.format`` using the run's parameters.
FILEPATH_IN = DATA_DIR / "train_utf_with_reordered_cols.csv"
FILENAME_OUT_TEMPLATE = (
    "bireducts_redphase_reordered_cols"
    "_n_{n_bireducts}"
    "_candidate_attrs_{candidate_n_attrs}"
    "_max_attrs_{max_n_attrs}"
    "_eps_{epsilon}"
    ".json"
)

In [None]:
def compute_bireducts(
    filepath,
    results_dir,
    filename_out_template,
    n_bireducts,
    candidate_n_attrs,
    max_n_attrs,
    epsilon,
    sep=SEP,
):
    """Fit a greedy heuristic bireduct model on a CSV file and dump bireducts to JSON.

    The CSV is read with pandas, the ``"target"`` column is split off as the
    decision attribute, and every remaining column is converted to its integer
    category codes. ``n_bireducts`` bireducts are then drawn from the fitted
    ``GreedyHeuristicBireduct`` and written as a JSON list.

    Parameters
    ----------
    filepath : str or path-like
        Input CSV file; it must contain a ``"target"`` column.
    results_dir : str or pathlib.Path
        Directory for the output file. It is created (with parents) if it
        does not exist yet.
    filename_out_template : str
        ``str.format`` template for the output filename. It is formatted with
        the keyword arguments ``n_bireducts``, ``candidate_n_attrs``,
        ``max_n_attrs`` and ``epsilon``.
    n_bireducts : int
        Number of times ``get_bireduct()`` is called on the fitted model.
    candidate_n_attrs, max_n_attrs, epsilon
        Forwarded unchanged to the ``GreedyHeuristicBireduct`` constructor.
    sep : str, optional
        Field separator passed to ``pandas.read_csv``.

    Returns
    -------
    None
        The result is written to disk only.
    """
    # Resolve and create the destination before any expensive work, so that a
    # missing directory or a malformed template fails immediately instead of
    # after the whole fit/sampling loop has already run.
    results_dir = pathlib.Path(results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)
    filepath_out = results_dir / filename_out_template.format(
        n_bireducts=n_bireducts,
        candidate_n_attrs=candidate_n_attrs,
        max_n_attrs=max_n_attrs,
        epsilon=epsilon,
    )

    df = pd.read_csv(filepath, sep=sep)
    # ``pop`` removes the decision column from ``df`` and returns it.
    df_dec = df.pop("target")
    # Encode every conditional attribute as integer category codes.
    df = df.astype("category").apply(lambda col: col.cat.codes)

    # A SamplingHeuristicBireduct with the same constructor arguments was used
    # here in an earlier version of this experiment.
    ghr = GreedyHeuristicBireduct(
        candidate_n_attrs=candidate_n_attrs, max_n_attrs=max_n_attrs, epsilon=epsilon
    )
    ghr.fit(df, df_dec, check_data_consistency=False)

    # ``attr.asdict`` converts each result to a plain dict for ``json.dump``
    # (presumably ``get_bireduct`` returns an attrs instance - verify in skrough).
    bireducts = [
        attr.asdict(ghr.get_bireduct()) for _ in tqdm.notebook.tnrange(n_bireducts)
    ]

    with filepath_out.open("w") as f:
        json.dump(bireducts, f)

In [None]:
# Parameter sweep over (candidate_n_attrs, epsilon). Only one configuration is
# active at a time; the others are left commented out so they can be re-enabled
# by hand.
# compute_bireducts(FILEPATH_IN, TMP_DIR, FILENAME_OUT_TEMPLATE,
#                   n_bireducts=N_BIREDUCTS, candidate_n_attrs=50, max_n_attrs=30, epsilon=0.0)
# compute_bireducts(FILEPATH_IN, TMP_DIR, FILENAME_OUT_TEMPLATE,
#                   n_bireducts=N_BIREDUCTS, candidate_n_attrs=50, max_n_attrs=30, epsilon=0.2)
compute_bireducts(
    filepath=FILEPATH_IN,
    results_dir=TMP_DIR,
    filename_out_template=FILENAME_OUT_TEMPLATE,
    n_bireducts=N_BIREDUCTS,
    candidate_n_attrs=100,
    max_n_attrs=30,
    epsilon=0.2,
)
# compute_bireducts(FILEPATH_IN, TMP_DIR, FILENAME_OUT_TEMPLATE,
#                   n_bireducts=N_BIREDUCTS, candidate_n_attrs=100, max_n_attrs=30, epsilon=0.4)