In [None]:
import itertools
import json
import pathlib
import pickle

import attr
import config
import numpy as np
import pandas as pd
import sklearn.utils
import tqdm.notebook

import skrough
from skrough.metrics.gini_impurity import gini_impurity
from skrough.utils.group_index import (
    compute_dec_distribution,
    compute_homogeneity,
    split_groups,
)

In [None]:
# Root directories come from the project-local `config` module; wrapping them in
# pathlib.Path lets the setups below build file paths with the `/` operator.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
# Field delimiter handed to pd.read_csv when the data files are loaded.
SEP = ";"

# Evaluation setups. Each setup is a dict with two keys:
#   "data_filepath"      - CSV data file under DATA_DIR
#   "bireducts_filepath" - JSON file with bireducts under TMP_DIR
# NOTE(review): the numbers embedded in the bireduct file names presumably
# encode the parameters used when the bireducts were generated - confirm
# against the notebook/script that produced them.

# Setups 1-5: original column order (train_utf.csv).
EVAL_SETUP_1 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_10_10000.json",
}

EVAL_SETUP_2 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_20_10000.json",
}

EVAL_SETUP_3 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_30_10000.json",
}

EVAL_SETUP_4 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_50_10000.json",
}

EVAL_SETUP_5 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_daar.json",
}

# Setups 6-8: same kind of bireduct files, but for the data file with
# reordered columns (train_utf_with_reordered_cols.csv).
EVAL_SETUP_6 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_reordered_cols_10_10000.json",
}

EVAL_SETUP_7 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_reordered_cols_20_10000.json",
}

EVAL_SETUP_8 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_reordered_cols_30_10000.json",
}

# Setups 100-101: "draw_objects_mk2" bireduct files, reordered-columns data.
EVAL_SETUP_100 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_10_10000_draw_objects_mk2.json",
}

EVAL_SETUP_101 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_20_10000_draw_objects_mk2.json",
}

# Setups 115-116: "sample_attrs" bireduct files, reordered-columns data.
EVAL_SETUP_115 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_10.json",
}

EVAL_SETUP_116 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_20.json",
}

# Setups 125-126: "sample_attrs" bireduct files, original column order.
EVAL_SETUP_125 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_n_10000_sample_attrs_100_max_attrs_10.json",
}

EVAL_SETUP_126 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_n_10000_sample_attrs_100_max_attrs_20.json",
}


# Setup 200: "daar" + "draw_objects_mk2" bireduct file, original column order.
EVAL_SETUP_200 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepath": TMP_DIR / "bireducts_daar_draw_objects_mk2.json",
}

# Setups 210-213: "candidate_attrs" bireduct files, reordered-columns data.
EVAL_SETUP_210 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_candidate_attrs_100_max_attrs_10.json",
}

EVAL_SETUP_211 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_10.json",
}

EVAL_SETUP_212 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_20.json",
}

EVAL_SETUP_213 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_candidate_attrs_400_max_attrs_20.json",
}


# Setups 250-251: "candidate_attrs" bireduct files whose names carry an
# additional "eps_0.3" suffix, reordered-columns data.
EVAL_SETUP_250 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_20_eps_0.3.json",
}


EVAL_SETUP_251 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_reordered_cols_n_10000_candidate_attrs_500_max_attrs_5_eps_0.3.json",
}


# Setups 300-303: "redphase" bireduct files, reordered-columns data.
EVAL_SETUP_300 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_50_max_attrs_30_eps_0.0.json",
}
EVAL_SETUP_301 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_50_max_attrs_30_eps_0.2.json",
}
EVAL_SETUP_302 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_100_max_attrs_30_eps_0.2.json",
}
EVAL_SETUP_303 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_100_max_attrs_30_eps_0.4.json",
}

# Setup 400: "daab" bireduct file, reordered-columns data.
EVAL_SETUP_400 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json",
}


# NOTE(review): EVAL_SETUP_500 is byte-for-byte identical to EVAL_SETUP_400
# (same data file, same bireducts file), so evaluating it overwrites the
# results of setup 400. This looks like a copy-paste leftover - confirm which
# bireducts file setup 500 was meant to point at.
EVAL_SETUP_500 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepath": TMP_DIR
    / "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json",
}

In [None]:
def __prepare_values(values):
    """Encode ``values`` as integer codes and count the distinct values.

    Args:
        values: 1-D array-like accepted by ``pd.factorize``.

    Returns:
        A pair ``(codes, n_distinct)``: the integer codes produced by
        ``pd.factorize`` and the number of unique values it reported.
    """
    codes, distinct_values = pd.factorize(values)
    return codes, len(distinct_values)


def prepare_df(filepath=None, df=None):
    """Load (or take) a data frame and turn it into factorized numpy arrays.

    The frame must contain a ``"target"`` column, which is used as the
    decision; every other column is treated as a conditional attribute.

    Args:
        filepath: Path of a CSV file read with ``SEP`` as the delimiter. Only
            used when ``df`` is None.
        df: Already loaded data frame. It is not modified.

    Returns:
        A tuple ``(x, x_count_distinct, y, y_count_distinct)`` where ``x`` is
        the 2-D array of per-column factorized attribute values,
        ``x_count_distinct`` holds the number of distinct values of each
        column, ``y`` is the factorized decision and ``y_count_distinct`` is
        its number of distinct values.

    Raises:
        ValueError: If neither ``filepath`` nor ``df`` is given.
        KeyError: If the frame has no ``"target"`` column.
    """
    if df is None:
        if filepath is None:
            raise ValueError("either `filepath` or `df` must be provided")
        df = pd.read_csv(filepath, sep=SEP)

    # Select/drop instead of `pop` so that a caller-supplied frame keeps its
    # "target" column.
    df_dec = df["target"]
    df_cond = df.drop(columns="target")
    df_cond = df_cond.astype("category")
    df_cond = df_cond.apply(lambda col: col.cat.codes)

    x, y = sklearn.utils.check_X_y(df_cond, df_dec, multi_output=False)

    # Factorize column by column. `np.apply_along_axis` cannot be used here:
    # the helper returns a ragged `(codes, count)` pair, and NumPy >= 1.24
    # refuses to build an array from such a sequence.
    encoded_columns = [__prepare_values(x[:, col]) for col in range(x.shape[1])]
    x = np.vstack([codes for codes, _ in encoded_columns]).T
    x_count_distinct = np.array(
        [n_distinct for _, n_distinct in encoded_columns], dtype=np.int_
    )
    y, y_count_distinct = __prepare_values(y)

    return x, x_count_distinct, y, y_count_distinct


def _compute_chaos_score(group_index, n_groups, xx, yy, yy_count_distinct):
    """Return the Gini impurity of the decisions within the given grouping.

    The decision distribution is obtained from ``compute_dec_distribution``
    and handed to ``gini_impurity`` together with the number of rows of ``xx``.
    """
    dec_distribution = compute_dec_distribution(
        group_index, n_groups, yy, yy_count_distinct
    )
    n_objects = len(xx)
    return gini_impurity(dec_distribution, n_objects)


def get_chaos_score(xx, xx_count_distinct, yy, yy_count_distinct, attrs):
    """Compute the chaos score of ``yy`` after grouping rows by ``attrs``.

    Starting from a single group containing every row of ``xx``, the grouping
    is refined with ``split_groups`` once per attribute in ``attrs``; the
    resulting grouping is then scored by ``_compute_chaos_score``.

    Args:
        xx: 2-D array of factorized attribute values.
        xx_count_distinct: Per-column number of distinct values of ``xx``.
        yy: Factorized decision values.
        yy_count_distinct: Number of distinct decision values.
        attrs: Iterable of column indices of ``xx`` to group by.
    """
    # Local name deliberately differs from the imported `attr` module.
    current_index = np.zeros(len(xx), dtype=np.int_)
    current_n_groups = 1
    for attr_idx in attrs:
        current_index, current_n_groups = split_groups(
            current_index,
            current_n_groups,
            xx[:, attr_idx],
            xx_count_distinct[attr_idx],
            compress_group_index=True,
        )
    return _compute_chaos_score(
        current_index, current_n_groups, xx, yy, yy_count_distinct
    )


def get_results(eval_setup, local_scope=False, sep=SEP):
    """Score every attribute by how much removing it from bireducts adds chaos.

    For each bireduct (an entry with ``"objects"`` and ``"attributes"`` index
    lists) and each of its attributes, the chaos score of the bireduct's
    attribute set without that attribute is compared with the chaos score of
    the full attribute set; the differences are accumulated per attribute.

    Args:
        eval_setup: Dict with ``"data_filepath"`` (CSV file) and
            ``"bireducts_filepath"`` (JSON file with a list of bireducts).
        local_scope: When true, chaos scores are computed only on the rows
            listed in the bireduct's ``"objects"``; otherwise on all rows.
        sep: Delimiter of the CSV data file.

    Returns:
        A dict with a single entry, keyed by the bireducts file name, whose
        value is a DataFrame sorted by ``score`` (descending) with columns:
        ``column`` (attribute name), ``score`` (sum of differences),
        ``score2`` (sum of differences weighted by the fraction of rows the
        bireduct covers), ``count`` (number of bireducts containing the
        attribute), ``avg`` and ``avg2`` (``score``/``score2`` divided by
        ``count``, 0 where ``count`` is 0).
    """
    data_filepath = eval_setup["data_filepath"]
    bireducts_filepath = eval_setup["bireducts_filepath"]

    # Read the data once with the requested delimiter and pass the frame on,
    # so that `sep` applies to the data itself and not only to the header.
    data = pd.read_csv(data_filepath, sep=sep)
    # Drop the decision column by name - exactly what `prepare_df` does - so
    # labels stay aligned with array columns wherever "target" is located.
    column_names = data.columns.drop("target")
    xx, xx_count_distinct, yy, yy_count_distinct = prepare_df(df=data)

    with bireducts_filepath.open("r") as f:
        bireducts = json.load(f)

    n_objects, n_attrs = xx.shape
    counts = np.zeros(n_attrs)
    scores = np.zeros(n_attrs)
    scores_2 = np.zeros(n_attrs)
    for bireduct in tqdm.notebook.tqdm(bireducts):
        bireduct_objects = bireduct["objects"]
        bireduct_attrs = bireduct["attributes"]
        bireduct_all_attrs = set(bireduct_attrs)
        if local_scope:
            xxx = xx[bireduct_objects]
            yyy = yy[bireduct_objects]
        else:
            xxx = xx
            yyy = yy
        starting_chaos_score = get_chaos_score(
            xxx, xx_count_distinct, yyy, yy_count_distinct, bireduct_all_attrs
        )
        counts[bireduct_attrs] += 1
        for attr_idx in bireduct_attrs:
            attrs_to_check = bireduct_all_attrs.difference([attr_idx])
            current_chaos_score = get_chaos_score(
                xxx, xx_count_distinct, yyy, yy_count_distinct, attrs_to_check
            )
            score_val = current_chaos_score - starting_chaos_score
            scores[attr_idx] += score_val
            # Weight by the share of all rows covered by this bireduct.
            scores_2[attr_idx] += score_val * len(bireduct_objects) / n_objects

    # Attributes that never occur in any bireduct get an average of 0.
    has_count = counts > 0
    avg = np.divide(scores, counts, out=np.zeros_like(scores), where=has_count)
    avg_2 = np.divide(scores_2, counts, out=np.zeros_like(scores_2), where=has_count)

    summary = pd.DataFrame(
        {
            "column": column_names,
            "score": scores,
            "score2": scores_2,
            "count": counts,
            "avg": avg,
            "avg2": avg_2,
        }
    ).sort_values(["score"], ascending=False)
    return {bireducts_filepath.name: summary}


def do_eval(eval_setup, scope):
    """Run ``get_results`` for a setup and pickle the outcome.

    The pickle is written into the directory of the bireducts file, named
    ``<bireducts file stem>_eval_scope_<scope>.pkl``.

    Args:
        eval_setup: Dict with ``"data_filepath"`` and ``"bireducts_filepath"``.
        scope: Either ``"global"`` or ``"local"``; ``"local"`` enables
            ``local_scope`` in ``get_results``.
    """
    assert scope in ("global", "local")
    bireducts_path = eval_setup["bireducts_filepath"]
    output_path = bireducts_path.parent / f"{bireducts_path.stem}_eval_scope_{scope}.pkl"
    results = get_results(eval_setup, local_scope=(scope == "local"))
    with output_path.open("wb") as out_file:
        pickle.dump(results, out_file)

In [None]:
# do_eval(EVAL_SETUP_1, 'local')
# do_eval(EVAL_SETUP_1, 'global')
# do_eval(EVAL_SETUP_2, 'local')
# do_eval(EVAL_SETUP_2, 'global')
# do_eval(EVAL_SETUP_3, 'local')
# do_eval(EVAL_SETUP_3, 'global')
# do_eval(EVAL_SETUP_4, 'local')
# do_eval(EVAL_SETUP_4, 'global')
# do_eval(EVAL_SETUP_5, 'local')
# do_eval(EVAL_SETUP_5, 'global')
# do_eval(EVAL_SETUP_6, 'local')
# do_eval(EVAL_SETUP_6, 'global')
# do_eval(EVAL_SETUP_7, 'local')
# do_eval(EVAL_SETUP_7, 'global')
# do_eval(EVAL_SETUP_8, 'local')
# do_eval(EVAL_SETUP_8, 'global')

# do_eval(EVAL_SETUP_100, 'local')
# do_eval(EVAL_SETUP_100, 'global')
# do_eval(EVAL_SETUP_101, 'local')
# do_eval(EVAL_SETUP_101, 'global')
# do_eval(EVAL_SETUP_115, 'local')
# do_eval(EVAL_SETUP_115, 'global')
# do_eval(EVAL_SETUP_116, 'local')
# do_eval(EVAL_SETUP_116, 'global')
# do_eval(EVAL_SETUP_125, 'local')
# do_eval(EVAL_SETUP_125, 'global')
# do_eval(EVAL_SETUP_126, 'local')
# do_eval(EVAL_SETUP_126, 'global')
# do_eval(EVAL_SETUP_200, 'local')
# do_eval(EVAL_SETUP_200, 'global')

# do_eval(EVAL_SETUP_210, 'local')
# do_eval(EVAL_SETUP_210, 'global')
# do_eval(EVAL_SETUP_211, 'local')
# do_eval(EVAL_SETUP_211, 'global')
# do_eval(EVAL_SETUP_212, 'local')
# do_eval(EVAL_SETUP_212, 'global')
# do_eval(EVAL_SETUP_213, 'local')
# do_eval(EVAL_SETUP_213, 'global')

# do_eval(EVAL_SETUP_250, 'local')
# do_eval(EVAL_SETUP_250, 'global')
# do_eval(EVAL_SETUP_251, 'local')
# do_eval(EVAL_SETUP_251, 'global')

# do_eval(EVAL_SETUP_300, 'local')
# do_eval(EVAL_SETUP_300, 'global')
# do_eval(EVAL_SETUP_301, 'local')
# do_eval(EVAL_SETUP_301, 'global')
# do_eval(EVAL_SETUP_302, 'local')
# do_eval(EVAL_SETUP_302, 'global')
# do_eval(EVAL_SETUP_303, 'local')
# do_eval(EVAL_SETUP_303, 'global')

# do_eval(EVAL_SETUP_400, 'local')
# do_eval(EVAL_SETUP_400, 'global')

# Evaluate setup 500 in both scopes; each run writes its own pickle file.
for eval_scope in ("local", "global"):
    do_eval(EVAL_SETUP_500, eval_scope)