In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sdv.tabular import CTGAN, GaussianCopula
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import sys
sys.path.insert(0, "../")

# import logging
# logger = logging.getLogger()
# logger.setLevel(logging.INFO)


In [3]:
from data_iq.dataiq_class import *
from src.utils.data_loader import *
from src.utils.utils import *


In [4]:
def compare_datasets(dataset, nest=20, n_repeats=11, sample_frac=0.1, conf_thresh=0.5):
    """Compare GaussianCopula and CTGAN synthetic data for one dataset and print a summary.

    Each repetition reloads the dataset and, for each generator: fits it on the
    real training frame (features plus a ``"y"`` label column), samples a
    synthetic training set, trains an ``XGBClassifier`` on the synthetic
    sample, scores it on the real test split, and runs Data-IQ over the
    synthetic sample to measure the proportion of "easy" examples
    (confidence >= ``conf_thresh`` and aleatoric uncertainty at or below the
    midpoint of its observed range).

    After all repetitions the function prints which generator most often had
    the highest easy-proportion, the mean and standard deviation of the
    easy-proportion for that generator and for the other one, and which
    generator most often had the highest test accuracy.

    Args:
        dataset: Dataset name forwarded to ``load_dataset``. The value
            ``"covid"`` is special-cased: the ``"Race"`` and ``"SG_UF_NOT"``
            columns are dropped from the training frame and ``nest`` is
            forced to 100 regardless of the argument passed in.
        nest: ``n_estimators`` for the classifier; also the exclusive upper
            bound of the iteration numbers (starting at 1) fed to Data-IQ.
        n_repeats: Number of independent repetitions. With two generators an
            odd count guarantees the "most frequent winner" cannot tie.
        sample_frac: Size of the synthetic sample as a fraction of the number
            of real training rows.
        conf_thresh: Confidence threshold used to call an example "easy".

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``n_repeats`` is smaller than 1.
    """
    # Imported locally so these names are explicit and cannot be shadowed by
    # the wildcard imports used elsewhere in the notebook.
    from copy import deepcopy
    from statistics import mode

    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    model_names = ["GaussianCopula()", "CTGAN()"]

    # One row per repetition, one column per generator (same order as model_names).
    rawaccs = []
    rawranks = []

    # repeat multiple times and average
    for _ in tqdm(range(n_repeats)):

        acc_ranks = []
        easy_ranks = []

        # Fresh, unfitted generators for every repetition.
        models = [GaussianCopula(), CTGAN()]

        # The dataset is reloaded on every repetition; only the test split and
        # the pandas training split are used below.
        (
            train_loader,
            train_data,
            X_train,
            y_train,
            X_test,
            y_test,
            X_train_pd,
            y_train_pd,
            X_test_pd,
            y_test_pd,
            nlabels,
            corr_vals,
            column_ids,
            df,
        ) = load_dataset(dataset)

        if dataset == "covid":
            # NOTE(review): only the training frame is reduced here; X_test is
            # used as returned by load_dataset. Confirm the two stay
            # column-compatible for this dataset.
            X_train_pd = X_train_pd.drop(columns=["Race", "SG_UF_NOT"])
            nest = 100

        for model in models:
            # Copy so that adding the label column never touches the loaded split.
            train_df = deepcopy(X_train_pd)
            train_df["y"] = deepcopy(y_train_pd)

            # Fit synthetic data model used to generate the comparison dataset
            model.fit(train_df)

            # Sample data to train
            nrows = int(len(train_df) * sample_frac)
            sampled_data = model.sample(num_rows=nrows)
            xt = sampled_data.drop(columns=["y"])
            yt = sampled_data["y"]

            # TRAIN ON SYNTHETIC
            clf = xgb.XGBClassifier(n_estimators=nest)
            clf.fit(xt, yt)

            # TEST ON REAL
            preds = clf.predict(X_test)
            accuracy = accuracy_score(y_test, preds)

            # ASSESS DATA-IQ on the synthetic training sample
            dataiq = DataIQ_SKLearn(X=xt, y=yt)
            for iteration in range(1, nest):
                dataiq.on_epoch_end(clf=clf, iteration=iteration)

            aleatoric_train = dataiq.aleatoric
            confidence_train = dataiq.confidence

            # Midpoint of the observed aleatoric-uncertainty range.
            aleatoric_min = np.min(aleatoric_train)
            mid_val = ((np.max(aleatoric_train) - aleatoric_min) / 2) + aleatoric_min

            easy_train = np.where(
                (confidence_train >= conf_thresh) & (aleatoric_train <= mid_val)
            )[0]

            acc_ranks.append(accuracy)
            easy_ranks.append(len(easy_train) / len(aleatoric_train))

        # appends the raw values
        rawranks.append(easy_ranks)
        rawaccs.append(acc_ranks)

    easy_matrix = np.array(rawranks)

    # Generator that most often had the highest proportion of easy examples.
    idx = mode(np.argmax(easy_matrix, axis=1))
    # Exactly two generators are compared, so the other index is 1 - idx.
    other_idx = 1 - idx
    best_easy = easy_matrix[:, idx]
    worst_easy = easy_matrix[:, other_idx]

    print("#########################################################################")
    print("RESULTS FOR ", dataset)
    print("Highest rank model: ", model_names[idx])
    print(
        f"Proportion EASY Examples Best: {np.mean(best_easy)}+-{np.std(best_easy)}"
    )
    print(
        f"Proportion EASY Examples Worst: {np.mean(worst_easy)}+-{np.std(worst_easy)}"
    )
    print("Highest acc model: ", model_names[mode(np.argmax(rawaccs, axis=1))])


In [5]:
import warnings

# Silence every warning category from this point on in the kernel session.
warnings.filterwarnings(action="ignore")

In [6]:
# Fetal dataset, with nest=10 instead of the default 20 (nest is the classifier's n_estimators).
compare_datasets(dataset="fetal", nest=10)

100%|██████████| 11/11 [09:17<00:00, 50.65s/it]

#########################################################################
RESULTS FOR  fetal
Highest rank model:  GaussianCopula()
Proportion EASY Examples Best: 0.5245700245700246+-0.09002072752333024
Proportion EASY Examples Worst: 0.40909090909090917+-0.1345615479361666
Highest acc model:  GaussianCopula()





In [9]:
# Covid dataset; compare_datasets forces nest=100 for this dataset, so the default is not used.
compare_datasets(dataset="covid")

100%|██████████| 11/11 [42:17<00:00, 230.72s/it]

#########################################################################
RESULTS FOR  covid
Highest rank model:  CTGAN()
Proportion EASY Examples Best: 0.7098411297440423+-0.04322549263190073
Proportion EASY Examples Worst: 0.6315092674315976+-0.029506530345211793
Highest acc model:  CTGAN()





In [22]:
# Support dataset with the default nest=20.
compare_datasets(dataset="support")

100%|██████████| 11/11 [1:47:38<00:00, 587.10s/it]

#########################################################################
RESULTS FOR  support
Highest rank model:  CTGAN()
Proportion EASY Examples Best: 0.5847485847485847+-0.20108960676156154
Proportion EASY Examples Worst: 0.3684648684648684+-0.03455510295324102
Highest acc model:  CTGAN()





In [16]:
# Prostate dataset with the default nest=20.
compare_datasets(dataset="prostate")

100%|██████████| 11/11 [1:51:51<00:00, 610.17s/it]

#########################################################################
RESULTS FOR  prostate
Highest rank model:  CTGAN()
Proportion EASY Examples Best: 0.6215249266862171+-0.059004623262449485
Proportion EASY Examples Worst: 0.30158357771260996+-0.030938847268059504
Highest acc model:  CTGAN()



