In [None]:
import contextlib
from dataclasses import replace

import numpy as np
from aim import Repo, Run, Text
from aim.sdk.types import QueryReportMode
from data_sources import (
    get_discretized_prepared,
    get_microarray_data_shuffled,
    get_synthetic_data_shuffled,
    get_toolbox_data_shuffled,
)
from ranks_model_scores import (
    BireductsHParams,
    CorrelationHParams,
    HParamsBase,
    XGBoostHParams,
    get_bireducts_scores,
    get_correlation_scores,
    get_xgboost_scores,
)

In [None]:
# Tag added to a run as the last statement of each experiment loop below;
# aim_run_present() only counts runs that carry it.
FINISHED_TAG = "finished"
# Experiment name passed to every Run(...) and to every aim_run_present() call.
EXPERIMENT_NAME = "exp1"

# Repository handle used both for querying existing runs and for creating new
# ones.
# NOTE(review): "." presumably resolves to the Aim repo in the current working
# directory — confirm the notebook is launched from the intended location.
repo = Repo(".")


def aim_run_present(repo: Repo, experiment, hparams: HParamsBase):
    """Tell whether a finished run with these hyper-parameters already exists.

    A run matches when it was logged under ``experiment``, satisfies the query
    fragment returned by ``hparams.asquery()`` and carries the
    ``FINISHED_TAG`` tag.

    Args:
        repo: Aim repository to search.
        experiment: Experiment name the run must have been logged under.
        hparams: Hyper-parameter object; the string returned by its
            ``asquery()`` method is spliced into the Aim query unchanged.

    Returns:
        ``True`` if at least one matching run is found, ``False`` otherwise.
    """
    conditions = [
        f'run.experiment == "{experiment}"',
        hparams.asquery(),
        f"run.tags.count('{FINISHED_TAG}')",
    ]
    query = " and ".join(conditions)
    matching_runs = repo.query_runs(
        query,
        report_mode=QueryReportMode.DISABLED,
    ).iter_runs()
    # Only existence matters: stop at the first hit instead of materializing
    # every matching run into a list.
    return any(True for _ in matching_runs)

In [None]:
# Datasets processed by the experiment loops below. Each entry is a
# (filename, loader) tuple; the loader is called with the filename and its
# result is unpacked as `df, df_dec`.
# NOTE(review): the filenames are runtime strings that are handed to the
# loaders and stored in the hparams, so their spelling (e.g.
# "gingivalPeriodontits", "heartFailurFactors") presumably mirrors the files on
# disk and the already-logged runs — do not "correct" it here in isolation.

data_files = [
    ("acuteLymphoblasticLeukemia_processed.csv", get_microarray_data_shuffled),
    ("anthracyclineTaxaneChemotherapy_processed.csv", get_microarray_data_shuffled),
    ("brainTumour_processed.csv", get_microarray_data_shuffled),
    ("BurkittLymphoma_processed.csv", get_microarray_data_shuffled),
    ("gingivalPeriodontits_processed.csv", get_microarray_data_shuffled),
    ("heartFailurFactors_processed.csv", get_microarray_data_shuffled),
    ("hepatitisC_processed.csv", get_microarray_data_shuffled),
    ("humanGlioma_processed.csv", get_microarray_data_shuffled),
    ("ovarianTumour_processed.csv", get_microarray_data_shuffled),
    ("septicShock_processed.csv", get_microarray_data_shuffled),
    ("skinPsoriatic_processed.csv", get_microarray_data_shuffled),
    ("recruitment_data.csv", get_toolbox_data_shuffled),
    ("synthetic1.csv", get_synthetic_data_shuffled),
    ("synthetic2.csv", get_synthetic_data_shuffled),
    ("synthetic3.csv", get_synthetic_data_shuffled),
]

# Bireducts

In [None]:
# Template for the bireducts experiments. The sweep cell below derives one
# concrete configuration per (attrs_max_count, dataset) pair from it with
# dataclasses.replace(); all other fields are shared by every run.
bireducts_hparams_base = BireductsHParams(
    filename="_changeme",  # placeholder, replaced per dataset by the sweep
    n_bins=3,  # also forwarded to get_discretized_prepared(n_bins=...)
    chaos_fun="gini_impurity",
    epsilon=0.0,
    attrs_max_count=-1,  # placeholder, replaced by the swept value
    candidates_count=100,
    selected_count=1,
    consecutive_daar_reps=1,
    allowed_randomness=0.05,
    probes_count=100,
    n_bireducts=1000,
)

In [None]:
# Grid over attrs_max_count x dataset. A configuration for which
# aim_run_present() already finds a finished run is skipped; every other one
# is computed and logged to a fresh Aim run.
for attrs_max_count in [3, 7, 15, 31]:
    for filename, load_shuffled in data_files:
        bireducts_hparams = replace(
            bireducts_hparams_base,
            attrs_max_count=attrs_max_count,
            filename=filename,
        )
        already_logged = aim_run_present(
            repo, experiment=EXPERIMENT_NAME, hparams=bireducts_hparams
        )
        if already_logged:
            continue
        print(bireducts_hparams.filename)
        with contextlib.closing(Run(repo=repo, experiment=EXPERIMENT_NAME)) as run:
            run.add_tag("bireducts")

            # Load and discretize; column names are captured before the
            # discretization call, exactly as they come from the loader.
            df, df_dec = load_shuffled(bireducts_hparams.filename)
            column_names = df.columns
            x, x_counts, y, y_count = get_discretized_prepared(
                df, df_dec, n_bins=bireducts_hparams.n_bins
            )

            bireducts_scores, bireducts = get_bireducts_scores(
                x,
                x_counts,
                y,
                y_count,
                column_names=column_names,
                hparams=bireducts_hparams,
                seed=None,
                n_jobs=6,
            )

            # hparams and scores are stored only after the computation returned.
            run["hparams"] = bireducts_hparams.asdict()
            scores_json = bireducts_scores.to_json()
            run.track(Text(scores_json), name="scores")

            # Per-bireduct sizes: one tracked value per bireduct, objects first.
            objs_sizes = [len(bireduct.objs) for bireduct in bireducts]
            attrs_sizes = [len(bireduct.attrs) for bireduct in bireducts]
            per_bireduct = (("objs_size", objs_sizes), ("attrs_size", attrs_sizes))
            for metric_name, sizes in per_bireduct:
                for size in sizes:
                    run.track(size, name=metric_name)

            # Summary statistics, tracked in the same order as the dict.
            summary = {
                "mean_objs_size": np.mean(objs_sizes),
                "median_objs_size": np.median(objs_sizes),
                "mean_attrs_size": np.mean(attrs_sizes),
                "median_attrs_size": np.median(attrs_sizes),
            }
            for metric_name, value in summary.items():
                run.track(value, name=metric_name)

            # Must stay last: aim_run_present() treats this tag as "complete".
            run.add_tag(FINISHED_TAG)

# XGBoost

In [None]:
# Template for the XGBoost experiments. The sweep cell below derives one
# concrete configuration per (max_depth, dataset) pair from it with
# dataclasses.replace(); the remaining fields are shared by every run.
xgboost_hparams_base = XGBoostHParams(
    filename="_changeme",  # placeholder, replaced per dataset by the sweep
    num_boost_round=1000,
    learning_rate=0.001,
    max_depth=-1,  # placeholder, replaced by the swept value
    objective="multi:softmax",
)

In [None]:
# Grid over max_depth x dataset. A configuration for which aim_run_present()
# already finds a finished run is skipped; every other one is computed and
# logged to a fresh Aim run.
for max_depth in [2, 3, 4, 5, 10]:
    for filename, load_shuffled in data_files:
        xgboost_hparams = replace(
            xgboost_hparams_base,
            max_depth=max_depth,
            filename=filename,
        )
        already_logged = aim_run_present(
            repo, experiment=EXPERIMENT_NAME, hparams=xgboost_hparams
        )
        if already_logged:
            continue
        print(xgboost_hparams.filename)
        with contextlib.closing(Run(repo=repo, experiment=EXPERIMENT_NAME)) as run:
            run.add_tag("xgboost")

            df, df_dec = load_shuffled(xgboost_hparams.filename)
            xgboost_scores = get_xgboost_scores(df, df_dec, xgboost_hparams, n_jobs=6)

            # hparams and scores are stored only after the computation returned.
            run["hparams"] = xgboost_hparams.asdict()
            scores_json = xgboost_scores.to_json()
            run.track(Text(scores_json), name="scores")

            # Must stay last: aim_run_present() treats this tag as "complete".
            run.add_tag(FINISHED_TAG)

# Correlation

In [None]:
# Template for the correlation experiments; the loop below fills in the
# dataset filename with dataclasses.replace().
correlation_hparams_base = CorrelationHParams(
    filename="_changeme",  # placeholder, replaced per dataset by the loop
)

In [None]:
# One run per dataset. A dataset for which aim_run_present() already finds a
# finished run is skipped; every other one is computed and logged to a fresh
# Aim run.
for filename, load_shuffled in data_files:
    correlation_hparams = replace(correlation_hparams_base, filename=filename)
    already_logged = aim_run_present(
        repo, experiment=EXPERIMENT_NAME, hparams=correlation_hparams
    )
    if already_logged:
        continue
    print(correlation_hparams.filename)
    with contextlib.closing(Run(repo=repo, experiment=EXPERIMENT_NAME)) as run:
        run.add_tag("correlation")

        df, df_dec = load_shuffled(correlation_hparams.filename)
        correlation_scores = get_correlation_scores(df, df_dec)

        # hparams and scores are stored only after the computation returned.
        run["hparams"] = correlation_hparams.asdict()
        scores_json = correlation_scores.to_json()
        run.track(Text(scores_json), name="scores")

        # Must stay last: aim_run_present() treats this tag as "complete".
        run.add_tag(FINISHED_TAG)