In [3]:
import json
import pathlib

import attr
import config
import numpy as np
import pandas as pd
import tqdm.notebook
from numpy import random

import skrough

In [4]:
# Base directories come from the project-local `config` module.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
# Field delimiter used as the default `sep` when the data CSV is read.
SEP = ";"

# Each EVAL_SETUP_* pairs one data CSV ("data_filepath") with the bireduct JSON
# files ("bireducts_filepaths") that are scored against it by get_results.
# NOTE(review): the file names presumably encode the parameters the bireducts
# were generated with (counts, candidate attrs, eps, ...) — confirm against the
# notebook/script that produced them.
EVAL_SETUP_1 = {
    "data_filepath": DATA_DIR / "train_utf.csv",
    "bireducts_filepaths": [
        TMP_DIR / "bireducts_10_10000.json",
        TMP_DIR / "bireducts_20_10000.json",
        TMP_DIR / "bireducts_30_10000.json",
        TMP_DIR / "bireducts_50_10000.json",
        TMP_DIR / "bireducts_daar.json",
    ],
}

# Setups 2-6 all use the "reordered cols" variant of the training data.
EVAL_SETUP_2 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepaths": [
        TMP_DIR / "bireducts_reordered_cols_10_10000.json",
        TMP_DIR / "bireducts_reordered_cols_20_10000.json",
        TMP_DIR / "bireducts_reordered_cols_30_10000.json",
    ],
}


EVAL_SETUP_3 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepaths": [
        TMP_DIR / "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_10.json",
        TMP_DIR / "bireducts_reordered_cols_n_10000_sample_attrs_100_max_attrs_20.json",
    ],
}

EVAL_SETUP_4 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepaths": [
        TMP_DIR
        / "bireducts_reordered_cols_n_10000_candidate_attrs_100_max_attrs_10.json",
        TMP_DIR
        / "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_10.json",
        TMP_DIR
        / "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_20.json",
        TMP_DIR
        / "bireducts_reordered_cols_n_10000_candidate_attrs_200_max_attrs_20_eps_0.3.json",
        TMP_DIR
        / "bireducts_reordered_cols_n_10000_candidate_attrs_400_max_attrs_20.json",
        TMP_DIR
        / "bireducts_reordered_cols_n_10000_candidate_attrs_500_max_attrs_5_eps_0.3.json",
    ],
}


EVAL_SETUP_5 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepaths": [
        TMP_DIR
        / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_50_max_attrs_30_eps_0.0.json",
        TMP_DIR
        / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_50_max_attrs_30_eps_0.2.json",
        TMP_DIR
        / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_100_max_attrs_30_eps_0.2.json",
        TMP_DIR
        / "bireducts_redphase_reordered_cols_n_1000_candidate_attrs_100_max_attrs_30_eps_0.4.json",
    ],
}

# The only setup that is actually evaluated further down in this notebook.
EVAL_SETUP_6 = {
    "data_filepath": DATA_DIR / "train_utf_with_reordered_cols.csv",
    "bireducts_filepaths": [
        TMP_DIR
        / "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json",
    ],
}

In [5]:
def _accumulate_attribute_scores(bireducts, n_columns, n_rows):
    """Accumulate per-column usage counts and position-weighted scores.

    Parameters
    ----------
    bireducts : sequence of dict
        Each item must provide "attributes" (column positions, in the order
        stored in the file) and "objects" (only its length is used).
    n_columns : int
        Number of data columns; size of the returned arrays.
    n_rows : int
        Number of data rows; used to scale ``scores_2`` by the fraction of
        rows a bireduct lists under "objects".

    Returns
    -------
    (counts, scores, scores_2) : tuple of numpy.ndarray
        ``counts[c]``   - number of bireducts that contain column ``c``.
        ``scores[c]``   - sum of position-weighted scores for column ``c``.
        ``scores_2[c]`` - the same sum, each term multiplied by
        ``len(objects) / n_rows``.
    """
    counts = np.zeros(n_columns)
    scores = np.zeros(n_columns)
    scores_2 = np.zeros(n_columns)

    # default=0 keeps an empty bireduct list from raising ValueError.
    max_attrs_len = max((len(b["attributes"]) for b in bireducts), default=0)

    for bireduct in tqdm.notebook.tqdm(bireducts):
        attributes = bireduct["attributes"]
        attrs_len = len(attributes)
        if attrs_len == 0:
            # Nothing to credit; skipping also avoids 0 / 0 when every
            # bireduct in the file has an empty attribute list.
            continue

        # Bireduct-level weight: close to 1 for short attribute lists, going
        # down to 0.5 for the longest list in the file.
        bireduct_attrs_score = 1 - (attrs_len / max_attrs_len) / 2
        n_objects = len(bireduct["objects"])

        for i, attribute in enumerate(attributes):
            counts[attribute] += 1
            # Earlier positions in the attribute list receive a larger share.
            score_val = bireduct_attrs_score * (1 - i / attrs_len)
            scores[attribute] += score_val
            if n_rows:
                scores_2[attribute] += score_val * n_objects / n_rows

    return counts, scores, scores_2


def get_results(eval_setup, sep=SEP):
    """Score every data column for each bireducts file of an evaluation setup.

    Parameters
    ----------
    eval_setup : dict
        Must contain "data_filepath" (CSV readable by ``pd.read_csv``, with a
        "target" column) and "bireducts_filepaths" (iterable of path objects
        pointing at JSON files holding a list of bireducts).
    sep : str
        Field delimiter of the data CSV.

    Returns
    -------
    dict
        Maps each bireducts file name to a DataFrame with the columns
        "column", "score", "score2", "count", "avg" and "avg2", sorted by
        "score" in descending order.  Empty when no bireduct files are listed.

    Raises
    ------
    KeyError
        If the data file has no "target" column or a bireduct lacks the
        "attributes"/"objects" keys.
    """
    df = pd.read_csv(eval_setup["data_filepath"], sep=sep)
    # Remove the decision column: bireduct attribute values are used as
    # positions into the remaining columns.
    df = df.drop(columns=["target"])
    # Only the column labels and the row count are needed for scoring, so the
    # values themselves are not re-encoded.
    columns = df.columns
    n_rows = len(df)

    results = {}
    for filepath in eval_setup["bireducts_filepaths"]:
        with filepath.open("r") as f:
            bireducts = json.load(f)

        counts, scores, scores_2 = _accumulate_attribute_scores(
            bireducts, len(columns), n_rows
        )

        # Per-occurrence averages; columns that were never used stay at 0.
        avg = np.divide(scores, counts, out=np.zeros_like(scores), where=counts > 0)
        avg_2 = np.divide(
            scores_2, counts, out=np.zeros_like(scores_2), where=counts > 0
        )
        results[filepath.name] = pd.DataFrame(
            {
                "column": columns,
                "score": scores,
                "score2": scores_2,
                "count": counts,
                "avg": avg,
                "avg2": avg_2,
            }
        ).sort_values(["score"], ascending=False)

    return results

In [6]:
# Only setup 6 is evaluated here; the calls for the other setups are kept
# commented out so that they can be re-enabled one at a time.
# results_1 = get_results(EVAL_SETUP_1)
# results_2 = get_results(EVAL_SETUP_2)
# results_3 = get_results(EVAL_SETUP_3)
# results_4 = get_results(EVAL_SETUP_4)
# results_5 = get_results(EVAL_SETUP_5)
results_6 = get_results(EVAL_SETUP_6)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# Show the per-column score table for the single bireducts file of setup 6.
results_6[
    "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
]

Unnamed: 0,column,score,score2,count,avg,avg2
81,developer_in_title,50.316955,44.208428,102.0,0.493303,0.433416
96,number_of_company_following,49.047835,42.840713,128.0,0.383186,0.334693
74,recruitment_developer_in_role_name,44.283081,38.580763,112.0,0.395385,0.344471
191,country_continent..Other,42.001190,37.095647,96.0,0.437512,0.386413
190,country_continent..EU,41.381926,36.475921,97.0,0.426618,0.376040
...,...,...,...,...,...,...
663,skill_GNU,0.000000,0.000000,0.0,0.000000,0.000000
662,skill_SQL_Azure,0.000000,0.000000,0.0,0.000000,0.000000
659,skill_Symbian,0.000000,0.000000,0.0,0.000000,0.000000
657,skill_Opera,0.000000,0.000000,0.0,0.000000,0.000000


In [11]:
# Mean rank (1 = most frequently used) of the columns whose name contains
# "reordered_", when all columns are ordered by how many bireducts use them.
x = results_6[
    "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
]
x = x.sort_values(["count"], ascending=False)
# NOTE(review): columns with count == 0 are still included here, so they all
# share the bottom ranks and their order among themselves is not meaningful.
seq = np.arange(1, len(x) + 1)  # 1-based rank for each row of the sorted frame
tmp = x["column"].str.contains("reordered_")  # True for "reordered_" columns
seq[tmp].mean()

1018.5725388601036

In [12]:
# Mean rank of the remaining columns (names without "reordered_"); relies on
# `seq` and `tmp` still holding the values from the cell that defined them.
seq[~tmp].mean()

526.4274611398964

In [14]:
results = results_6
sort_column = "score3"
normal = "normal = "
reordered = "reordered = "

for k, res in results.items():
    # Assigned onto the frame held in `results`, so the new column is still
    # there after this cell has run.
    res["score3"] = res["score2"] * res["avg2"]

    # Order by the selected score first, then keep only the columns that occur
    # in at least one bireduct.  (An earlier variant appended the unused
    # columns in shuffled order instead of dropping them.)
    res = res.sort_values([sort_column], ascending=False).loc[
        lambda ranked: ranked["count"] != 0
    ]

    # 1-based rank of every remaining column, best first, and a mask marking
    # the columns whose name contains "reordered_".
    seq = np.arange(1, len(res) + 1)
    tmp = res["column"].str.contains("reordered_")

    print(
        f"{k}\nmean column rank\n"
        f"{normal:<15}{seq[~tmp].mean()}\n"
        f"{reordered:<15}{seq[tmp].mean()}\n"
    )

bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json
mean column rank
normal =       163.0035335689046
reordered =    327.0807453416149



In [29]:
x = results_6[
    "bireducts_daab_reordered_cols_n_1000_candidate_attrs_30_allowed_randomness_0.1.json"
]
# `x` is the frame stored in results_6 (no copy is taken), so this adds the
# "score3" column to that stored frame as well.
x["score3"] = x["score2"] * x["avg2"]
# First 15 rows when ordered by the plain "score" column, highest first.
x.sort_values("score", ascending=False)[:15]

Unnamed: 0,column,score,score2,count,avg,avg2,score3
81,developer_in_title,50.316955,44.208428,102.0,0.493303,0.433416,19.160638
96,number_of_company_following,49.047835,42.840713,128.0,0.383186,0.334693,14.33849
74,recruitment_developer_in_role_name,44.283081,38.580763,112.0,0.395385,0.344471,13.289958
191,country_continent..Other,42.00119,37.095647,96.0,0.437512,0.386413,14.33424
190,country_continent..EU,41.381926,36.475921,97.0,0.426618,0.37604,13.716421
218,country_developers_count..NA,40.490332,34.838892,92.0,0.440112,0.378684,13.192917
474,skill_programming_platforms,38.073593,33.223942,79.0,0.481944,0.420556,13.972535
207,country_cluster..Other,35.348016,31.083302,79.0,0.447443,0.39346,12.230021
149,role_category_cluster..Other,34.209668,29.23714,67.0,0.510592,0.436375,12.758363
143,recruitment_role_category..PRODUCT,32.970996,28.383134,75.0,0.439613,0.378442,10.741364


In [150]:
# Display whatever frame is currently bound to `x`.
# NOTE(review): the saved output below has no "score3" column and shows
# different counts than the earlier tables, so it was presumably produced with
# a different `x` than the one assigned above — re-run the notebook to confirm.
x

Unnamed: 0,column,score,score2,count,avg,avg2
96,number_of_company_following,194.800150,190.172753,461.0,0.422560,0.412522
74,recruitment_developer_in_role_name,119.549784,116.712970,243.0,0.491974,0.480300
190,country_continent..EU,77.175075,75.326913,157.0,0.491561,0.479789
120,country_developers_count,76.611888,74.806721,153.0,0.500731,0.488933
36,developer_in_any_job_title,73.309441,71.563059,142.0,0.516264,0.503965
...,...,...,...,...,...,...
594,skill_SPICE,0.000000,0.000000,0.0,0.000000,0.000000
593,skill_SNMP,0.000000,0.000000,0.0,0.000000,0.000000
592,skill_SMTP,0.000000,0.000000,0.0,0.000000,0.000000
591,skill_SIP,0.000000,0.000000,0.0,0.000000,0.000000
