In [1]:
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


from src.dips_selector import *
from src.data_loader import *
from src.baseline_functions import *
from src.data_loader import * 

import traceback

# Setup

In [2]:

# --- Accumulators: the run cell appends one entry per trial to each list ---
overall_result_dicts, overall_data_dicts, overall_model_dicts = [], [], []

# --- DIPS settings ---
# Selects which DIPS_selector attribute is used as the uncertainty score:
# 'aleatoric', 'epistemic', 'entropy' or 'mi'.
dips_metric = 'aleatoric'
# Passed to get_groups(...) and to every run_* baseline as `dips_ythresh`.
dips_ythresh = 0.2

# --- Algorithms ---
# The run cell only tests this list for 'Pseudo_Labeling', 'CSA', 'SLA',
# 'UPS' and 'FlexMatch'; the supervised XGBoost model is always trained.
algorithm_list = [
    'Fully Supervised',
    'Supervised_Learning',
    'Pseudo_Labeling',
]  # Others: UPS, CSA, FlexMatch, SLA

# --- Data ---
dataset_name = 'compas'  # forwarded to get_data(dataset=...)
prop_data = 1            # forwarded to get_data(prop=...)
prop_lab = 0.1           # train_size of the labeled/unlabeled split

# --- Training / experiment control ---
seed = 42             # incremented by the run cell before each trial
nest = 100            # XGBoost n_estimators; also bounds the DIPS iteration loop
num_XGB_models = 5    # forwarded to run_CSA / run_SLA / run_UPS
numTrials = 1         # number of trials executed by the run cell
numIters = 5          # forwarded to every run_* baseline
upper_threshold = 0.8 # forwarded to run_pseudo / run_FlexMatch
verbose = False       # forwarded to every run_* baseline

# Run

In [3]:
def _store_outcome(results, data, models, key, acc_vanilla, acc_dips_full, artifacts):
    """Record one algorithm's accuracies and artifacts under ``key``.

    Args:
        results: per-trial dict receiving ``{"vanilla": ..., "dips_full": ...}``
            accuracies.
        data: per-trial dict receiving the ``"data"`` entries of ``artifacts``.
        models: per-trial dict receiving the ``"models"`` entries of ``artifacts``.
        key: short algorithm identifier used as the dictionary key
            (e.g. ``"pseudo"``).
        acc_vanilla: accuracy value(s) returned by the runner for the vanilla variant.
        acc_dips_full: accuracy value(s) returned by the runner for the
            DIPS "full" variant.
        artifacts: mapping returned by the runner; it is indexed as
            ``artifacts["vanilla"|"full"]["data"|"models"]``.

    Raises:
        KeyError: if ``artifacts`` lacks one of the indexed entries.
    """
    results[key] = {
        "vanilla": acc_vanilla,
        "dips_full": acc_dips_full,
    }
    data[key] = {
        "vanilla": artifacts["vanilla"]["data"],
        "dips_full": artifacts["full"]["data"],
    }
    models[key] = {
        "vanilla": artifacts["vanilla"]["models"],
        "dips_full": artifacts["full"]["models"],
    }


try:
    for i in tqdm(range(numTrials)):

        # NOTE: this mutates the notebook-level `seed`, so re-running this cell
        # without re-running the setup cell continues from the last seed used.
        seed += 1
        print(f"Trial {i+1}/{numTrials}")
        results = {}
        data = {}
        models = {}

        df_feat, df_label, df = get_data(dataset=dataset_name, prop=prop_data)

        # Hold out 20% of the data as the test set.
        x_train, x_test, y_train, y_test = train_test_split(
            df_feat, df_label, test_size=0.2, random_state=seed
        )

        # Keep a `prop_lab` fraction of the remaining data as labeled training
        # data; the rest is treated as unlabeled (its labels are not used below).
        x_train, x_unlabeled, y_train, y_unlabeled = train_test_split(
            x_train, y_train, train_size=prop_lab, random_state=seed
        )

        x_unlabeled = np.asarray(x_unlabeled)
        x_test = np.asarray(x_test)
        y_test = np.asarray(y_test)
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)

        datasize = x_train.shape

        total_samples = len(x_train) + len(x_test) + len(x_unlabeled)

        print(f"# total samples = {total_samples} ({prop_data} - prop)")

        print(f"# training points = {y_train.shape[0]}")

        print(f"# test points = {y_test.shape[0]}")

        print(f"# unlabelled points = {x_unlabeled.shape[0]}")

        # Supervised learning - train an XGBoost model on the labeled split only.
        # NOTE(review): `silent` looks like the legacy quiet flag that newer
        # XGBoost releases replaced with `verbosity` - confirm it is still needed.
        param = {
            "booster": "gbtree",
            "objective": "binary:logistic",
            "verbosity": 0,
            "n_estimators": nest,
            "silent": 1,
            "seed": seed,
        }

        print("Training Supervised model...")
        xgb = XGBClassifier(**param)

        xgb.fit(x_train, y_train)

        # Evaluate on the test set; accuracy_score expects (y_true, y_pred).
        y_test_pred = xgb.predict(x_test)
        supervised_learning_accuracy = np.round(
            accuracy_score(y_test, y_test_pred) * 100, 2
        )  # round to 2 digits xx.yy %

        results["supervised_learning_accuracy"] = supervised_learning_accuracy

        # Run dips
        dips_xgb = DIPS_selector(X=x_train, y=y_train)

        # A dedicated loop variable keeps the trial index `i` intact.
        for iteration in range(1, nest):
            # *** Characterize with dips [LINE 2] ***
            dips_xgb.on_epoch_end(clf=xgb, iteration=iteration)

        # *** Access metrics ***
        if dips_metric == "aleatoric":
            dips_xmetric = dips_xgb.aleatoric
        elif dips_metric == "epistemic":
            dips_xmetric = dips_xgb.variability
        elif dips_metric == "entropy":
            dips_xmetric = dips_xgb.entropy
        elif dips_metric == "mi":
            dips_xmetric = dips_xgb.mi
        else:
            # Fail loudly instead of continuing with an undefined (or stale,
            # left over from a previous run of this cell) `dips_xmetric`.
            raise ValueError(
                f"Unsupported dips_metric {dips_metric!r}; expected one of "
                "'aleatoric', 'epistemic', 'entropy', 'mi'"
            )

        confidence = dips_xgb.confidence

        assert len(confidence) == len(y_train)

        # adaptive threshold: 75% of the observed range of the chosen metric
        dips_xthresh = 0.75 * (np.max(dips_xmetric) - np.min(dips_xmetric))

        easy_train, ambig_train, hard_train = get_groups(
            confidence=confidence,
            aleatoric_uncertainty=dips_xmetric,
            dips_xthresh=dips_xthresh,
            dips_ythresh=dips_ythresh,
        )

        # Keyword arguments accepted by every semi-supervised runner below.
        shared_kwargs = dict(
            x_unlabeled=x_unlabeled,
            x_test=x_test,
            y_test=y_test,
            x_train=x_train,
            y_train=y_train,
            numIters=numIters,
            nest=nest,
            seed=seed,
            easy_train=easy_train,
            dips_metric=dips_metric,
            dips_xthresh=dips_xthresh,
            dips_ythresh=dips_ythresh,
            verbose=verbose,
        )
        # Runners that additionally take a confidence threshold (pseudo, FlexMatch).
        threshold_kwargs = dict(shared_kwargs, upper_threshold=upper_threshold)
        # Runners that additionally take an ensemble size (CSA, SLA, UPS).
        ensemble_kwargs = dict(shared_kwargs, num_XGB_models=num_XGB_models)

        if 'Pseudo_Labeling' in algorithm_list:

            print("Running Pseudo Labeling...")

            (
                pseudo_labeling_acc_vanilla,
                pseudo_labeling_acc_dips_begin,
                pseudo_labeling_acc_dips_full,
                pseudo_labeling_acc_dips_partial,
                artifacts,
            ) = run_pseudo(**threshold_kwargs)

            _store_outcome(
                results, data, models, "pseudo",
                pseudo_labeling_acc_vanilla,
                pseudo_labeling_acc_dips_full,
                artifacts,
            )

        if 'CSA' in algorithm_list:
            print("Running CSA...")
            (
                csa_acc_vanilla,
                csa_acc_dips_begin,
                csa_acc_dips_full,
                csa_acc_dips_partial,
                artifacts,
            ) = run_CSA(**ensemble_kwargs)

            _store_outcome(
                results, data, models, "csa",
                csa_acc_vanilla,
                csa_acc_dips_full,
                artifacts,
            )

        if 'SLA' in algorithm_list:
            print("Running SLA...")
            (
                sla_acc_vanilla,
                sla_acc_dips_begin,
                sla_acc_dips_full,
                sla_acc_dips_partial,
                artifacts,
            ) = run_SLA(**ensemble_kwargs)

            _store_outcome(
                results, data, models, "sla",
                sla_acc_vanilla,
                sla_acc_dips_full,
                artifacts,
            )

        if 'UPS' in algorithm_list:
            print("Running UPS...")
            (
                ups_acc_vanilla,
                ups_acc_dips_begin,
                ups_acc_dips_full,
                ups_acc_dips_partial,
                artifacts,
            ) = run_UPS(**ensemble_kwargs)

            _store_outcome(
                results, data, models, "ups",
                ups_acc_vanilla,
                ups_acc_dips_full,
                artifacts,
            )

        if 'FlexMatch' in algorithm_list:
            print("Running Flex match...")
            (
                flex_acc_vanilla,
                flex_acc_dips_begin,
                flex_acc_dips_full,
                flex_acc_dips_partial,
                artifacts,
            ) = run_FlexMatch(**threshold_kwargs)

            _store_outcome(
                results, data, models, "flex",
                flex_acc_vanilla,
                flex_acc_dips_full,
                artifacts,
            )

        overall_result_dicts.append(results)
        overall_data_dicts.append(data)
        overall_model_dicts.append(models)

except Exception:
    # Best-effort run: report the failure without aborting the notebook, but
    # let KeyboardInterrupt / SystemExit propagate so the cell can be stopped.
    print(traceback.format_exc())


  0%|          | 0/1 [00:00<?, ?it/s]

  X, y, categorical_indicator, attribute_names = dataset.get_data(


Trial 1/1
# total samples = 5278 (1 - prop)
# training points = 422
# test points = 1056
# unlabelled points = 3800
Training Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[230 192]
n iterations 5
iteration  0
iteration  1
iteration  2




iteration  3
iteration  4




===== Pseudo_Labeling
[116  88]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[116  88]
n iterations 5
iteration  0




iteration  1
iteration  2
iteration  3
iteration  4




===== Pseudo_Labeling
[230 192]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4


100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


In [4]:
# Display the accumulated results: one dict per trial, appended by the run cell.
overall_result_dicts

[{'supervised_learning_accuracy': 60.7,
  'pseudo': {'vanilla': [60.7, 60.98, 62.31, 61.36, 61.84],
   'dips_full': [64.49, 66.0, 64.96, 65.25, 64.87]}}]