In [1]:
import contextlib
from dataclasses import replace

from aim import Repo, Run, Text
from aim.sdk.types import QueryReportMode
from microarrays_data import get_discretized_prepared, get_microarray_data_shuffled
from ranks_model_scores import (
    BireductsHParams,
    CorrelationHParams,
    HParamsBase,
    XGBoostHParams,
    get_bireducts_scores,
    get_correlation_scores,
    get_xgboost_scores,
)

from skrough.ranks import compare_ranks

In [2]:
# Tag attached to a run as the last step of each experiment loop below; the
# lookup in `aim_run_present` only matches runs that carry it.
FINISHED_TAG = "finished"
# Experiment name passed to every `Run(...)` created in this notebook and used
# when querying for existing runs.
EXPERIMENT_NAME = "exp1"

# Aim repository opened at path "." (relative to the working directory).
repo = Repo(".")


def aim_run_present(repo: Repo, experiment, hparams: HParamsBase):
    """Return True when a finished run with these hyper-parameters exists.

    The Aim query is the conjunction of three conditions: the run belongs to
    ``experiment``, it satisfies the expression returned by
    ``hparams.asquery()``, and its tags contain ``FINISHED_TAG``.

    Args:
        repo: Aim repository to search.
        experiment: Experiment name the run must belong to; interpolated
            into the query inside double quotes.
        hparams: Hyper-parameter set; the string returned by its
            ``asquery()`` method is embedded verbatim in the query.

    Returns:
        bool: True if at least one run matches the query, False otherwise.
    """
    conditions = [
        f'run.experiment == "{experiment}"',
        hparams.asquery(),
        f"run.tags.count('{FINISHED_TAG}')",
    ]
    matching_runs = repo.query_runs(
        " and ".join(conditions),
        report_mode=QueryReportMode.DISABLED,
    ).iter_runs()
    # Only existence matters, so stop at the first hit instead of
    # materialising every matching run into a list.
    return any(True for _ in matching_runs)

In [3]:
# Microarray dataset files processed by every experiment loop in this
# notebook. Each name is stored as `filename` in the run's hyper-parameters and
# passed to `get_microarray_data_shuffled`.
# NOTE(review): some names look misspelled ("Periodontits", "heartFailur");
# presumably they mirror the files on disk — verify before renaming anything.
microarray_files = [
    "acuteLymphoblasticLeukemia_processed.csv",
    "anthracyclineTaxaneChemotherapy_processed.csv",
    "brainTumour_processed.csv",
    "BurkittLymphoma_processed.csv",
    "gingivalPeriodontits_processed.csv",
    "heartFailurFactors_processed.csv",
    "hepatitisC_processed.csv",
    "humanGlioma_processed.csv",
    "ovarianTumour_processed.csv",
    "septicShock_processed.csv",
    "skinPsoriatic_processed.csv",
]

# Bireducts

In [4]:
# Template for all bireducts runs. The sweep cell derives one concrete
# configuration per (attrs_max_count, file) pair from it via
# `dataclasses.replace`, so the two placeholder fields are never used as-is.
bireducts_hparams_base = BireductsHParams(
    filename="_changeme",  # placeholder; replaced with each microarray file
    chaos_fun="gini_impurity",
    epsilon=0.0,
    attrs_max_count=-1,  # placeholder; replaced with each swept value
    candidates_count=100,
    selected_count=1,
    consecutive_daar_reps=1,
    allowed_randomness=0.05,
    probes_count=100,
    # NOTE(review): the commented-out alternative is n_bireducts=1000, so 3 is
    # presumably a quick-run setting — confirm before a real experiment.
    # n_bireducts=1000,
    n_bireducts=3,
)

In [15]:
# Bireducts sweep: one Aim run per (attrs_max_count, microarray file) pair.
# Pairs that already have a finished run in the repository are skipped, so
# the cell can be re-executed to resume an interrupted sweep.
for attrs_max_count in (3, 7, 15, 31):
    for microarray_filename in microarray_files:
        bireducts_hparams = replace(
            bireducts_hparams_base,
            attrs_max_count=attrs_max_count,
            filename=microarray_filename,
        )
        already_finished = aim_run_present(
            repo, EXPERIMENT_NAME, bireducts_hparams
        )
        if already_finished:
            continue

        # Progress output: the file about to be processed.
        print(bireducts_hparams.filename)
        with contextlib.closing(
            Run(repo=repo, experiment=EXPERIMENT_NAME)
        ) as run:
            run.add_tag("bireducts")

            df, df_dec = get_microarray_data_shuffled(bireducts_hparams.filename)
            column_names = df.columns
            x, x_counts, y, y_count = get_discretized_prepared(df, df_dec)
            bireducts_scores = get_bireducts_scores(
                x,
                x_counts,
                y,
                y_count,
                column_names=column_names,
                hparams=bireducts_hparams,
                seed=None,
                n_jobs=3,
            )

            # Results are written only after scoring succeeded; the finished
            # tag goes last so a partially written run is never matched by
            # the skip check above.
            run["hparams"] = bireducts_hparams.asdict()
            run.track(Text(bireducts_scores.to_json()), name="scores")
            run.add_tag(FINISHED_TAG)

# XGBoost

In [4]:
# Template for all XGBoost runs. The sweep cell derives one concrete
# configuration per (max_depth, file) pair from it via `dataclasses.replace`,
# so the two placeholder fields are never used as-is.
xgboost_hparams_base = XGBoostHParams(
    filename="_changeme",  # placeholder; replaced with each microarray file
    # NOTE(review): the commented-out alternative is num_boost_round=1000, so
    # 3 is presumably a quick-run setting — confirm before a real experiment.
    # num_boost_round=1000,
    num_boost_round=3,
    learning_rate=0.001,
    max_depth=-1,  # placeholder; replaced with each swept value
    objective="multi:softmax",
)

In [8]:
# XGBoost sweep: one Aim run per (max_depth, microarray file) pair. Pairs that
# already have a finished run in the repository are skipped.
for max_depth in (2, 3, 4, 5, 10):
    for microarray_filename in microarray_files:
        xgboost_hparams = replace(
            xgboost_hparams_base,
            max_depth=max_depth,
            filename=microarray_filename,
        )
        already_finished = aim_run_present(
            repo, EXPERIMENT_NAME, xgboost_hparams
        )
        if already_finished:
            continue

        # Progress output: the file about to be processed.
        print(xgboost_hparams.filename)
        with contextlib.closing(
            Run(repo=repo, experiment=EXPERIMENT_NAME)
        ) as run:
            run.add_tag("xgboost")

            df, df_dec = get_microarray_data_shuffled(xgboost_hparams.filename)
            xgboost_scores = get_xgboost_scores(df, df_dec, xgboost_hparams)

            # The finished tag goes last so a partially written run is never
            # matched by the skip check above.
            run["hparams"] = xgboost_hparams.asdict()
            run.track(Text(xgboost_scores.to_json()), name="scores")
            run.add_tag(FINISHED_TAG)

acuteLymphoblasticLeukemia_processed.csv


# Correlation

In [10]:
# Template for the correlation runs; the loop cell replaces the placeholder
# `filename` with each microarray file via `dataclasses.replace`.
correlation_hparams_base = CorrelationHParams(
    filename="_changeme",
)

In [11]:
# Correlation scores: one Aim run per microarray file. Files that already
# have a finished run in the repository are skipped.
for microarray_filename in microarray_files:
    correlation_hparams = replace(
        correlation_hparams_base,
        filename=microarray_filename,
    )
    already_finished = aim_run_present(
        repo, EXPERIMENT_NAME, correlation_hparams
    )
    if already_finished:
        continue

    # Progress output: the file about to be processed.
    print(correlation_hparams.filename)
    with contextlib.closing(
        Run(repo=repo, experiment=EXPERIMENT_NAME)
    ) as run:
        run.add_tag("correlation")

        df, df_dec = get_microarray_data_shuffled(correlation_hparams.filename)
        correlation_scores = get_correlation_scores(df, df_dec)

        # The finished tag goes last so a partially written run is never
        # matched by the skip check above.
        run["hparams"] = correlation_hparams.asdict()
        run.track(Text(correlation_scores.to_json()), name="scores")
        run.add_tag(FINISHED_TAG)

acuteLymphoblasticLeukemia_processed.csv
