In [1]:
import json
import pathlib

import attr
import config
import numpy as np
import pandas as pd
import tqdm.notebook

import skrough

In [7]:
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
SEP = ";"

# Every EVAL_SETUP_* pairs one input table ("data_filepath") with the list of
# bireduct JSON dumps ("bireducts_filepaths") that get_results() reads for it.

# Original column order.
EVAL_SETUP_1 = dict(
    data_filepath=DATA_DIR / "train_utf.csv",
    bireducts_filepaths=[
        TMP_DIR / name
        for name in (
            "bireducts_10_10000.json",
            "bireducts_20_10000.json",
            "bireducts_30_10000.json",
            "bireducts_50_10000.json",
            "bireducts_daar.json",
        )
    ],
)

# Table with reordered columns.
EVAL_SETUP_2 = dict(
    data_filepath=DATA_DIR / "train_utf_with_reordered_cols.csv",
    bireducts_filepaths=[
        TMP_DIR / name
        for name in (
            "bireducts_reordered_cols_10_10000.json",
            "bireducts_reordered_cols_20_10000.json",
            "bireducts_reordered_cols_30_10000.json",
        )
    ],
)

# Same table, dumps whose names carry sample_attrs / max_attrs settings.
EVAL_SETUP_3 = dict(
    data_filepath=DATA_DIR / "train_utf_with_reordered_cols.csv",
    bireducts_filepaths=[
        TMP_DIR / name
        for name in (
            "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_10.json",
            "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_20.json",
        )
    ],
)

# Same table, single "daab" dump.
EVAL_SETUP_4 = dict(
    data_filepath=DATA_DIR / "train_utf_with_reordered_cols.csv",
    bireducts_filepaths=[
        TMP_DIR.joinpath(
            "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
        ),
    ],
)

In [8]:
def get_results(eval_setup, sep=SEP):
    """Aggregate per-column usage statistics over stored bireducts.

    Parameters
    ----------
    eval_setup : dict
        Must provide ``"data_filepath"`` (a CSV table that contains a
        ``"target"`` column) and ``"bireducts_filepaths"`` (an iterable of
        path objects exposing ``.open()`` and ``.name``, e.g. ``pathlib.Path``,
        each pointing at a JSON file). Every JSON file is iterated as a
        sequence of records with an ``"attributes"`` entry (integer positions
        into the table's columns once ``"target"`` is removed) and an
        ``"objects"`` entry (only its length is used).
    sep : str
        Field separator passed to ``pandas.read_csv``.

    Returns
    -------
    dict
        Maps each bireducts file name to a DataFrame with one row per data
        column and the fields ``column``, ``weight`` (sum, over the bireducts
        listing the column, of ``len(objects) / n_rows``), ``count`` (number
        of bireducts listing the column) and ``average_weight``
        (``weight / count``, 0 where the column never occurs). Rows are sorted
        by ``weight`` in descending order. The dict is empty when no bireducts
        files are listed.

    Raises
    ------
    KeyError
        If the table has no ``"target"`` column.
    ZeroDivisionError
        If the table has no rows and at least one bireduct is processed.
    """
    table = pd.read_csv(eval_setup["data_filepath"], sep=sep)
    # Only the column labels (without the decision column) and the row count
    # are needed below; the cell values are never read, so no re-encoding of
    # the table is performed and the table is released right away.
    columns = table.columns.drop("target")
    n_rows = len(table)
    del table

    results = {}
    for filepath in eval_setup["bireducts_filepaths"]:
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with filepath.open("r", encoding="utf-8") as f:
            bireducts = json.load(f)

        counts = np.zeros(len(columns))
        weights = np.zeros(len(columns))
        for bireduct in tqdm.notebook.tqdm(bireducts, desc=filepath.name):
            attrs = bireduct["attributes"]
            # Fancy-indexed "+=" updates each listed position once, even if a
            # position were repeated, so a bireduct contributes at most once
            # per column.
            counts[attrs] += 1
            weights[attrs] += len(bireduct["objects"]) / n_rows

        # Columns that never occur keep average_weight == 0 instead of NaN.
        average_weight = np.divide(
            weights, counts, out=np.zeros_like(weights), where=counts > 0
        )
        results[filepath.name] = pd.DataFrame(
            {
                "column": columns,
                "weight": weights,
                "count": counts,
                "average_weight": average_weight,
            }
        ).sort_values(["weight"], ascending=False)

    return results

In [9]:
# results_1 = get_results(EVAL_SETUP_1)
# results_2 = get_results(EVAL_SETUP_2)
# results_3 is read by a later cell, so it has to be computed on a fresh kernel.
results_3 = get_results(EVAL_SETUP_3)
results_4 = get_results(EVAL_SETUP_4)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [31]:
# Per-column statistics for the single DAAB bireducts file; show the 15 columns
# with the largest accumulated weight.
x = results_4[
    "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
]
x.sort_values("weight", ascending=False).head(15)

Unnamed: 0,column,weight,count,average_weight
96,number_of_company_following,111.891516,128.0,0.874152
74,recruitment_developer_in_role_name,97.250212,112.0,0.868305
81,developer_in_title,89.775384,102.0,0.880151
190,country_continent..EU,85.446233,97.0,0.880889
191,country_continent..Other,84.174924,96.0,0.876822
218,country_developers_count..NA,79.167625,92.0,0.860518
120,country_developers_count,71.512723,82.0,0.872106
207,country_cluster..Other,69.880718,79.0,0.884566
474,skill_programming_platforms,69.061741,79.0,0.874199
36,developer_in_any_job_title,65.219667,74.0,0.881347


In [55]:
# get_results() keys its output by file name, so take the key from the setup
# instead of retyping it (the previously hard-coded name did not match the
# file listed in EVAL_SETUP_3). Requires results_3 = get_results(EVAL_SETUP_3).
results_3_key = EVAL_SETUP_3["bireducts_filepaths"][0].name
x = results_3[results_3_key]
x = x.sort_values(["count"], ascending=False)
# seq[i] is the 1-based position of row i in the descending-count ordering.
seq = np.arange(1, len(x) + 1)
# Boolean mask of columns whose name contains "reordered_".
tmp = x["column"].str.contains("reordered_")
# Mean position of the "reordered_" columns.
seq[tmp].mean()

908.3329015544041

In [51]:
# Mean position (in the descending-count ordering held in `seq`) of the columns
# whose name does NOT contain "reordered_", i.e. the complement of mask `tmp`.
seq[~tmp].mean()

25.5