In [None]:
import json
import pathlib

import attr
import config
import numpy as np
import pandas as pd
import tqdm.notebook
from numpy import random

import skrough

In [None]:
# Base directories are taken from the project-level `config` module.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
# Field separator used when reading the CSV datasets.
SEP = ";"


def _eval_setup(data_filename, bireducts_filenames):
    """Build one evaluation setup dict.

    ``data_filename`` is resolved against ``DATA_DIR``; each entry of
    ``bireducts_filenames`` is resolved against ``TMP_DIR``.
    """
    return {
        "data_filepath": DATA_DIR / data_filename,
        "bireducts_filepaths": [TMP_DIR / name for name in bireducts_filenames],
    }


EVAL_SETUP_1 = _eval_setup(
    "train_utf.csv",
    [
        "bireducts_10_10000.json",
        "bireducts_20_10000.json",
        "bireducts_30_10000.json",
        "bireducts_50_10000.json",
        "bireducts_daar.json",
    ],
)

EVAL_SETUP_2 = _eval_setup(
    "train_utf_with_reordered_cols.csv",
    [
        "bireducts_reordered_cols_10_10000.json",
        "bireducts_reordered_cols_20_10000.json",
        "bireducts_reordered_cols_30_10000.json",
    ],
)


EVAL_SETUP_3 = _eval_setup(
    "train_utf_with_reordered_cols.csv",
    [
        "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_10.json",
        "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_20.json",
    ],
)

EVAL_SETUP_4 = _eval_setup(
    "train_utf_with_reordered_cols.csv",
    [
        "bireducts_reordered_cols_n_10000_candidate_attrs_100_max_attrs_10.json",
        "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_10.json",
        "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_20.json",
        "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_20_eps_0.3.json",
        "bireducts_reordered_cols_n_10000_candidate_attrs_400_max_attrs_20.json",
        "bireducts_reordered_cols_n_10000_candidate_attrs_500_max_attrs_5_eps_0.3.json",
    ],
)


EVAL_SETUP_5 = _eval_setup(
    "train_utf_with_reordered_cols.csv",
    [
        "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_50_max_attrs_30_eps_0.0.json",
        "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_50_max_attrs_30_eps_0.2.json",
        "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_100_max_attrs_30_eps_0.2.json",
        "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_100_max_attrs_30_eps_0.4.json",
    ],
)

EVAL_SETUP_6 = _eval_setup(
    "train_utf_with_reordered_cols.csv",
    [
        "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json",
    ],
)

In [None]:
def _score_bireducts(bireducts, n_columns, n_rows):
    """Accumulate per-column usage statistics over one list of bireducts.

    Each bireduct is a mapping with an ``"attributes"`` list (used as positional
    column indices) and an ``"objects"`` list (only its length is used).

    Every occurrence of a column in a bireduct is credited with
    ``size_factor * position_factor`` where

    * ``size_factor = 1 - (len(attributes) / longest_attributes_len) / 2``
      (between 0.5 and 1; shorter bireducts weigh more), and
    * ``position_factor = 1 - i / len(attributes)`` for the ``i``-th attribute
      (earlier positions weigh more).

    Returns:
        ``(counts, scores, scores_2)`` - three float arrays of length
        ``n_columns``: number of occurrences, summed credit, and summed credit
        additionally scaled by ``len(objects) / n_rows``.
    """
    counts = np.zeros(n_columns)
    scores = np.zeros(n_columns)
    scores_2 = np.zeros(n_columns)

    # default=0 keeps an empty bireduct list from raising ValueError in max().
    max_attrs_len = max((len(b["attributes"]) for b in bireducts), default=0)

    for bireduct in tqdm.notebook.tqdm(bireducts):
        attributes = bireduct["attributes"]
        attrs_len = len(attributes)
        if attrs_len == 0:
            # Nothing to credit; skipping also avoids 0 / 0 when every bireduct
            # in the file has an empty attribute list (max_attrs_len == 0).
            continue
        n_objects = len(bireduct["objects"])
        size_factor = 1 - (attrs_len / max_attrs_len) / 2
        for i, attribute in enumerate(attributes):
            score_val = size_factor * (1 - i / attrs_len)
            counts[attribute] += 1
            scores[attribute] += score_val
            # Same evaluation order as before: (score_val * n_objects) / n_rows.
            scores_2[attribute] += score_val * n_objects / n_rows
    return counts, scores, scores_2


def get_results(eval_setup, sep=SEP):
    """Score dataset columns by how they are used in stored bireducts.

    Args:
        eval_setup: mapping with ``"data_filepath"`` (CSV dataset containing a
            ``"target"`` column) and ``"bireducts_filepaths"`` (iterable of
            ``pathlib.Path`` objects pointing at JSON files, each holding a list
            of bireducts).
        sep: field separator passed to ``pd.read_csv``.

    Returns:
        dict mapping each bireduct file name (``filepath.name``) to a DataFrame
        with one row per non-target column and the fields ``column``, ``score``,
        ``score2``, ``count``, ``avg`` (= score / count) and ``avg2``
        (= score2 / count), sorted by ``score`` in descending order. Averages
        are 0 for columns that never occur. The dict is empty when no bireduct
        files are listed.

    Raises:
        KeyError: if the dataset has no ``"target"`` column.
    """
    df = pd.read_csv(eval_setup["data_filepath"], sep=sep)
    # Only the column labels and the row count of the non-target part are
    # needed below, so the values themselves are not re-encoded.
    df = df.drop(columns="target")
    columns = df.columns
    n_rows = len(df)

    results = {}
    for filepath in eval_setup["bireducts_filepaths"]:
        with filepath.open("r") as f:
            bireducts = json.load(f)

        counts, scores, scores_2 = _score_bireducts(bireducts, len(columns), n_rows)

        # `where=counts > 0` leaves the pre-filled zeros for unused columns
        # instead of producing NaN / division warnings.
        avg = np.divide(scores, counts, out=np.zeros_like(scores), where=counts > 0)
        avg_2 = np.divide(
            scores_2, counts, out=np.zeros_like(scores_2), where=counts > 0
        )
        results[filepath.name] = pd.DataFrame(
            {
                "column": columns,
                "score": scores,
                "score2": scores_2,
                "count": counts,
                "avg": avg,
                "avg2": avg_2,
            }
        ).sort_values(["score"], ascending=False)

    return results

In [None]:
# Only EVAL_SETUP_6 is evaluated here; the commented-out calls below run the
# same scoring for the other setups when re-enabled.
# results_1 = get_results(EVAL_SETUP_1)
# results_2 = get_results(EVAL_SETUP_2)
# results_3 = get_results(EVAL_SETUP_3)
# results_4 = get_results(EVAL_SETUP_4)
# results_5 = get_results(EVAL_SETUP_5)
results_6 = get_results(EVAL_SETUP_6)

In [None]:
# Show the per-column score table for the single bireduct file of EVAL_SETUP_6
# (results are keyed by file name; rows are sorted by "score", best first).
results_6[
    "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
]

In [None]:
# Order the columns by how many bireducts used them (position 1 = most used)
# and take the mean position of the columns whose name contains "reordered_".
x = results_6[
    "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
].sort_values(["count"], ascending=False)
seq = np.arange(1, len(x) + 1)  # 1-based positions in the sorted order
tmp = x["column"].str.contains("reordered_")  # mask in the same row order as x
seq[tmp].mean()

In [None]:
# Mean position of the remaining columns (names without "reordered_").
# Relies on `seq` and `tmp` as left by the previous cell.
seq[~tmp].mean()

In [None]:
results = results_6
sort_column = "score3"
normal = "normal = "
reordered = "reordered = "

for k, res in results.items():
    # Written onto the frame held in `results`, so the column persists after the loop.
    res["score3"] = res["score2"] * res["avg2"]

    # Best `sort_column` first, then keep only columns that occur in some bireduct.
    # (An earlier variant kept the unused columns, appended in shuffled order.)
    res = res.sort_values([sort_column], ascending=False).loc[
        lambda frame: frame["count"] != 0
    ]

    seq = np.arange(1, len(res) + 1)  # 1-based rank in the sorted order
    tmp = res["column"].str.contains("reordered_")
    mean_rank_normal = seq[~tmp].mean()
    mean_rank_reordered = seq[tmp].mean()
    print(
        f"{k}\nmean column rank\n"
        f"{normal:<15}{mean_rank_normal}\n"
        f"{reordered:<15}{mean_rank_reordered}\n"
    )

In [None]:
x = results_6[
    "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
]
# Adds (or overwrites) the combined score on the frame stored in results_6.
x["score3"] = x["score2"] * x["avg2"]
# NOTE(review): the table is ordered by "score", not the "score3" computed just
# above - confirm this is the intended ranking.
x.sort_values("score", ascending=False).head(15)

In [None]:
# Show the whole table, including the "score3" column added in the previous cell.
x