In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))

from tqdm import tqdm
from src.runner import run_baselines
from src.utils import process_results
from src.data_loader import *
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from random import sample
from copy import deepcopy

from src.dips_selector import *
from src.data_loader import *
from src.baseline_functions import *

import torch
import torch.nn as nn

import pandas as pd
from sklearn import cluster
import math
import scipy.stats as stats

from sklearn.mixture import GaussianMixture as GMM

from datagnosis.plugins.core.datahandler import DataHandler
from datagnosis.plugins.core.models.simple_mlp import SimpleMLP
from datagnosis.plugins import Plugins


# Utility functions

In [2]:

def fit_mixture(scores, labels, p_threshold=0.5):
    '''
    Assume the distribution of scores: bimodal gaussian mixture model
    
    return clean labels
    that belongs to the clean cluster by fitting the score distribution to GMM
    '''
    
    clean_labels = []
    indexes = np.array(range(len(scores)))
    for cls in np.unique(labels):
        cls_index = indexes[labels==cls]
        feats = scores[labels==cls]
        feats_ = np.ravel(feats).astype(np.float).reshape(-1, 1)
        gmm = GMM(n_components=2, covariance_type='full', tol=1e-6, max_iter=100)
        
        gmm.fit(feats_)
        prob = gmm.predict_proba(feats_)
        prob = prob[:,gmm.means_.argmax()]
        clean_labels = prob > p_threshold 
        clean_labels = np.where(clean_labels==1)[0]
    
    return clean_labels


def get_mean_vector(features, labels):
    mean_vector_dict = {}
    with tqdm(total=len(np.unique(labels))) as pbar:
        for index in np.unique(labels):
            v = np.mean(features[labels==index], axis=0)
            mean_vector_dict[index] = v
            pbar.update(1)
            
    return mean_vector_dict
            
def get_singular_vector(features, labels):
    '''
    To get top1 sigular vector in class-wise manner by using SVD of hidden feature vectors
    features: hidden feature vectors of data (numpy)
    labels: correspoding label list
    '''
    
    singular_vector_dict = {}
    with tqdm(total=len(np.unique(labels))) as pbar:
        for index in np.unique(labels):
            _, _, v = np.linalg.svd(features[labels==index])
            singular_vector_dict[index] = v[0]
            pbar.update(1)

    return singular_vector_dict


def get_score(singular_vector_dict, features, labels, normalization=True):
    '''
    Calculate the score providing the degree of showing whether the data is clean or not.
    '''
    if normalization:
        scores = [np.abs(np.inner(singular_vector_dict[labels[indx]], feat/np.linalg.norm(feat))) for indx, feat in enumerate(tqdm(features))]
    else:
        scores = [np.abs(np.inner(singular_vector_dict[labels[indx]], feat)) for indx, feat in enumerate(tqdm(features))]
        
    return np.array(scores)

def extract_topk(scores, labels, k):
    '''
    k: ratio to extract topk scores in class-wise manner
    To obtain the most prominsing clean data in each classes
    
    return selected labels 
    which contains top k data
    '''
    
    indexes = torch.tensor(range(len(labels)))
    selected_labels = []
    for cls in np.unique(labels):
        num = int(p * np.sum(labels==cls))
        _, sorted_idx = torch.sort(scores[labels==cls], descending=True)
        selected_labels += indexes[labels==cls][sorted_idx[:num]].numpy().tolist()
        
    return torch.tensor(selected_labels, dtype=torch.int64)


def fine(current_features, current_labels, fit='kmeans', prev_features=None, prev_labels=None, p_threshold=0.5, norm=True, eigen=True):
    '''
    prev_features, prev_labels: data from the previous round
    current_features, current_labels: current round's data
    
    return clean labels
    
    if you insert the prev_features and prev_labels to None,
    the algorthm divides the data based on the current labels and current features
    
    '''
    if eigen is True:
        if prev_features is not None and prev_labels is not None:
            vector_dict = get_singular_vector(prev_features, prev_labels)
        else:
            vector_dict = get_singular_vector(current_features, current_labels)
    else:
        if prev_features is not None and prev_labels is not None:
            vector_dict = get_mean_vector(prev_features, prev_labels)
        else:
            vector_dict = get_mean_vector(current_features, current_labels)

    scores = get_score(vector_dict, features = current_features, labels = current_labels, normalization=norm)
    

    clean_labels = fit_mixture(scores, current_labels, p_threshold=p_threshold)

    return clean_labels




class UCI_MLP(nn.Module):
    def __init__(self, num_features, num_outputs, dropout=0, batch_norm=False):
        super(UCI_MLP, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        self.batch_norm = batch_norm
        d = num_features + 1
        self.fc1 = nn.Linear(num_features, d)
        self.bn1 = nn.BatchNorm1d(d)
        self.relu1 = nn.ReLU(inplace=False)
        self.fc2 = nn.Linear(d, d)
        self.bn2 = nn.BatchNorm1d(d)
        self.relu2 = nn.ReLU(inplace=False)
        self.fc3 = nn.Linear(d, num_outputs)

    def forward(self, x):
        batch_size = x.shape[0]
        out = self.fc1(x)
        if self.batch_norm and batch_size > 1:
          out = self.bn1(out)
        out = self.relu1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        if self.batch_norm and batch_size > 1:
          out = self.bn2(out)
        h_output = self.relu2(out)
        h_output = self.dropout(h_output)
        out = self.fc3(h_output)
        return out


def selector(X, y, method='loss', epochs=100, n_classes=2):

            y = y.reshape(-1)
            n_classes = len(np.unique(y))
            datahander = DataHandler(X, y, batch_size=len(y))

            # creating our model object, which we both want to use downstream, but also we will use to judge the hardness of the data points
            model = UCI_MLP(num_features=X.shape[1], num_outputs=n_classes)
            
            #model = SimpleMLP(input_dim = X.shape[1], output_dim=2)

            # creating our optimizer and loss function objects
            learning_rate = 0.01
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

            if method=='dips':
                y = y.reshape(-1)
                datahander = DataHandler(X, y, batch_size=32)

                hcm = Plugins().get(
                    "dips",
                    model=model,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr=learning_rate,
                    epochs=epochs,
                    num_classes=n_classes,
                    logging_interval=1,
                )

                #Now plot the distribution of dips_xmetric
                dips_xthresh = 0.15
            elif method=='loss':
                hcm = Plugins().get(
                    "large_loss",
                    model=model,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr=learning_rate,
                    epochs=epochs,
                    num_classes=n_classes,
                    logging_interval=1,
                )
                print('LOSS...')

            

            elif method=='filter':
                hcm = Plugins().get(
                    "filtering",
                    model=model,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr=learning_rate,
                    epochs=epochs,
                    num_classes=n_classes,
                    logging_interval=1,
                    total_samples = len(y)
                )
                print('Forgetting...')

            elif method=='basicfilter':
                hcm = Plugins().get(
                    "basicfilter",
                    model=model,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr=learning_rate,
                    epochs=epochs,
                    num_classes=n_classes,
                    logging_interval=1,
                    total_samples = len(y)
                )
                print('Forgetting...')

            if method!='fine':
                hcm.fit(
                    datahandler=datahander,
                    use_caches_if_exist=False,
                )

    
            if method=='dips':
                confidence, dips_xmetric = hcm.scores
                dips_ythresh=0.2
                easy_train, ambig_train, hard_train = get_groups(
                    confidence=confidence,
                    aleatoric_uncertainty=dips_xmetric,
                    dips_xthresh=dips_xthresh,
                    dips_ythresh=dips_ythresh,
                )
            elif method=='loss':
                scores = hcm.scores
                threshold = np.percentile(scores,99)
                easy_train = np.where(scores<threshold)[0]
            elif method=='filter' or method=='basicfilter':
                scores = hcm.scores
                easy_train = np.where(scores==1)[0]

            if method == 'fine':
                try:
                    easy_train = fine(current_features=X, current_labels=y)
                except:
                    easy_train = np.arange(len(y))

            if len(np.unique(y[easy_train])) != len(np.unique(y)):
                # find one id of each unique label and append to easy_train
                for label in np.unique(y):
                    easy_train = np.append(easy_train, np.where(y==label)[0][0])
                
                # remove duplicates in easy_train
                easy_train = np.unique(easy_train)

            if len(easy_train)<2*len(np.unique(y)):
                easy_train = np.arange(len(y))

            return easy_train



In [3]:
import traceback

methods = ['filter','basicfilter','loss','fine']

#dataset_tuples = [('seer', '1.0'), ('adult', '0.66'), ('cutract', '1.0'), ('covid', '0.66'),   ('maggic', '0.66'), ("compas", "1.0"), ("agaricus-lepiota", '1.0'), ('German-credit', '1.0'), ("higgs", "0.1"),   ('drug', '0.1'), ("blog", "0.2"),("credit", "1.0")]

dataset_tuples = [('seer', '1.0'), ('adult', '0.66'), ('cutract', '1.0'), ('covid', '0.66'),   ('maggic', '0.66'), ("compas", "1.0"), ("agaricus-lepiota", '1.0'), ('German-credit', '1.0'), ("higgs", "0.1"),   ('drug', '0.1'), ("blog", "0.2"),("credit", "1.0")]
dataset_tuples = [('seer', '1.0')]

from src.utils import (
    append_acc_early_termination,
    get_train_test_unlabeled,)

for dataset_tuple in dataset_tuples:
    results_store = {}
    dataset = dataset_tuple[0]
    prop_data= float(dataset_tuple[1])
    for method in methods:
        
        try:

            print(f"running dataset {dataset} with {method}")
            overall_result_dicts = []
            overall_data_dicts = []
            overall_model_dicts = []

            
            dips_metric = 'aleatoric'
            dips_ythresh = 0.2

            algorithm_list=['Fully Supervised', 'Supervised_Learning','Pseudo_Labeling']
                    
            seed=0
            nest=100

            num_XGB_models=5
            numTrials=3
            numIters=5
            upper_threshold=0.8
            verbose=False
            loss=True
            epochs = 20


            for i in tqdm(range(numTrials)):
                try:
                    split_prop = 1

                    if dataset in ["seer", "cutract", "covid", "support", "adult", "bank", "drug","credit", "metabric", "fraud", "maggic", 'bank', 'cover', 'higgs', 'contraceptive', 'blog', "telescope", "bio", "eye", "compas", "marketing",]:
                        df_feat, df_label, df = get_data(dataset=dataset, prop=prop_data)

                        x_train, x_test, y_train, y_test = train_test_split(
                        df_feat, df_label, test_size=0.2, random_state=seed
                        )

                        x_train, x_unlabeled, y_train, y_unlabeled = train_test_split(
                            x_train, y_train, train_size=0.1, random_state=seed
                        )
         
                    else:
                        prop_lab = 0.1
                        path_to_file = "./data/all_data.pickle"
                        (
                            x_train,
                            y_train,
                            x_test,
                            y_test,
                            x_unlabeled,
                            y_unlabeled,
                        ) = get_train_test_unlabeled(dataset, prop_lab, path_to_file, random_state=seed)

                    
                    seed+=1

                    print(f"Trial {i+1}/{numTrials}")
                    results = {}
                    data = {}
                    models = {}


                    x_unlabeled, x_test, y_test, x_train, y_train = (
                        np.asarray(x_unlabeled),
                        np.asarray(x_test),
                        np.asarray(y_test),
                        np.asarray(x_train),
                        np.asarray(y_train),
                    )
                        

                    datasize = x_train.shape

                    total_samples = len(x_train) + len(x_test) + len(x_unlabeled)

                    print(f"# total samples = {total_samples} ({prop_data} - prop)")

                    print(f"# training points = {y_train.shape[0]}")

                    print(f"# test points = {y_test.shape[0]}")

                    print(f"# unlabelled points = {x_unlabeled.shape[0]}")



                    # # Supervised learning - Train an XGBoost model
                    param = {}
                    param["booster"] = "gbtree"
                    param["objective"] = "binary:logistic"
                    param["verbosity"] = 0
                    param["n_estimators"] = nest
                    param["silent"] = 1
                    param["seed"] = seed

                    print("Training Fully Supervised model...")
                    # create XGBoost instance with default hyper-parameters
                    xgb = XGBClassifier(**param)
                    all_x = np.concatenate((x_train, x_unlabeled))
                    all_y = np.concatenate((y_train, y_unlabeled))
                    
                    xgb.fit(all_x, all_y)

                    # evaluate the performance on the test set
                    y_test_pred = xgb.predict(x_test)
                    fully_supervised_learning_accuracy = np.round(
                        accuracy_score(y_test_pred, y_test) * 100, 2
                    )  # round to 2 digits xx.yy %

                    results["fully_supervised_learning_accuracy"] = fully_supervised_learning_accuracy


                    print("Training Supervised model...")
                    # create XGBoost instance with default hyper-parameters
                    xgb = XGBClassifier(**param)

                    xgb.fit(x_train, y_train)

                    # evaluate the performance on the test set
                    y_test_pred = xgb.predict(x_test)
                    supervised_learning_accuracy = np.round(
                        accuracy_score(y_test_pred, y_test) * 100, 2
                    )  # round to 2 digits xx.yy %

                    results["supervised_learning_accuracy"] = supervised_learning_accuracy

                    # RUN DCAI
                
                    easy_train = selector(X=x_train, y=y_train, method=method, epochs=epochs)


                    print("Training Preprocess + Supervised model...")
                    # create XGBoost instance with default hyper-parameters
                    xgb = XGBClassifier(**param)

                    xgb.fit(x_train[easy_train], y_train[easy_train])

                    # evaluate the performance on the test set
                    y_test_pred = xgb.predict(x_test)
                    supervised_learning_accuracy_easy = np.round(
                        accuracy_score(y_test_pred, y_test) * 100, 2
                    )  # round to 2 digits xx.yy %

                    results["supervised_learning_accuracy_easy"] = supervised_learning_accuracy_easy
                    
                    dips_xthresh= 0.15
                    if 'Pseudo_Labeling' in algorithm_list:

                        print("Running Pseudo Labeling...")

                        (
                            pseudo_labeling_acc_vanilla,
                            pseudo_labeling_acc_dips_begin,
                            pseudo_labeling_acc_dips_full,
                            pseudo_labeling_acc_dips_full2,
                            artifacts

                            
                        ) = run_pseudo(
                            x_unlabeled=x_unlabeled,
                            x_test=x_test,
                            y_test=y_test,
                            x_train=x_train,
                            y_train=y_train,
                            numIters=numIters,
                            upper_threshold=upper_threshold,
                            nest=nest,
                            seed=seed,
                            easy_train=easy_train,
                            dips_metric=dips_metric,
                            dips_xthresh=dips_xthresh,
                            dips_ythresh=dips_ythresh,
                            verbose=verbose,
                            method=method,
                            epochs=epochs,
                        )

                        results["pseudo"] = {
                            "vanilla": pseudo_labeling_acc_vanilla,
                            "dips_begin": pseudo_labeling_acc_dips_begin,
                            "dips_full": pseudo_labeling_acc_dips_full,
                            "dips_full2": pseudo_labeling_acc_dips_full2,
                        }

                        # data['pseudo'] = {'vanilla':artifacts['vanilla']['data'], 
                        #     'dips_begin':artifacts['begin']['data'], 
                        #     'dips_full':artifacts['full1']['data'], 
                        #     'dips_full2':artifacts['full2']['data']}
                        
                        # models['pseudo'] = {'vanilla':artifacts['vanilla']['models'], 
                        #     'dips_begin':artifacts['begin']['models'], 
                        #     'dips_full':artifacts['full1']['models'], 
                        #     'dips_full2':artifacts['full2']['models']}

            
                    overall_result_dicts.append(results)
                #     overall_data_dicts.append(data)
                #     overall_model_dicts.append(models)

                # overall_result_dicts, overall_data_dicts, overall_model_dicts, datasize

                except Exception as e:
                    import traceback
                    print(traceback.format_exc())
                    print(e)
                    continue

            from src.utils import process_results
            results = process_results(results_list=overall_result_dicts,numIters=numIters, end_score=True)

            results_store[method] = results
            results_store['dataset'] = dataset
                        
            

        except Exception as e:
            import traceback
            print(traceback.format_exc())
            print(e)
            continue
    
    # save results_store to pickle
    import pickle
    filename = f"../results/lnl_{dataset}.pickle"

    with open(filename, 'wb') as f:
        pickle.dump(results_store, f)

            

running dataset seer with filter


  0%|          | 0/3 [00:00<?, ?it/s]

Trial 1/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
Forgetting...
(1600,) 1600
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[793 803]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[793 803]
n iterations 5
iteration  0
(1596,) 1596
iteration  1
(6397,) 6397
iteration  2
(8957,) 8957
iteration  3
(10366,) 10366
iteration  4
(11117,) 11117
===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0
(1600,) 1600
iteration  1
(6400,) 6400
iteration  2
(8961,) 8961
iteration  3
(10370,) 10370
iteration  4
(11122,) 11122


 33%|███▎      | 1/3 [01:56<03:53, 116.71s/it]

Trial 2/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
Forgetting...
(1600,) 1600
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[784 810]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[784 810]
n iterations 5
iteration  0
(1594,) 1594
iteration  1
(6395,) 6395
iteration  2
(8955,) 8955
iteration  3
(10364,) 10364
iteration  4
(11116,) 11116
===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0
(1600,) 1600
iteration  1
(6400,) 6400
iteration  2
(8961,) 8961
iteration  3
(10370,) 10370
iteration  4
(11121,) 11121


 67%|██████▋   | 2/3 [04:12<02:07, 127.89s/it]

Trial 3/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
Forgetting...
(1600,) 1600
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[768 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[768 831]
n iterations 5
iteration  0
(1599,) 1599
iteration  1
(6400,) 6400
iteration  2
(8961,) 8961
iteration  3
(10370,) 10370
iteration  4
(11122,) 11122
===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0
(1600,) 1600
iteration  1
(6400,) 6400
iteration  2
(8961,) 8961
iteration  3
(10370,) 10370
iteration  4
(11121,) 11121


100%|██████████| 3/3 [06:27<00:00, 129.09s/it]


running dataset seer with basicfilter


  0%|          | 0/3 [00:00<?, ?it/s]

Trial 1/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
Forgetting...
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[745 553]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[745 553]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4


 33%|███▎      | 1/3 [02:24<04:49, 144.88s/it]

Trial 2/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
Forgetting...
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[738 560]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[738 560]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4


 67%|██████▋   | 2/3 [04:41<02:19, 139.96s/it]

Trial 3/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
Forgetting...
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[698 434]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[698 434]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4


100%|██████████| 3/3 [06:52<00:00, 137.37s/it]


running dataset seer with loss


  0%|          | 0/3 [00:00<?, ?it/s]

Trial 1/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
LOSS...
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[780 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[780 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4


 33%|███▎      | 1/3 [03:01<06:03, 181.64s/it]

Trial 2/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
LOSS...
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[772 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[772 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4


 67%|██████▋   | 2/3 [06:18<03:10, 190.85s/it]

Trial 3/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...
LOSS...
Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[753 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[753 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4


100%|██████████| 3/3 [10:14<00:00, 204.93s/it]


running dataset seer with fine


  0%|          | 0/3 [00:00<?, ?it/s]

Trial 1/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...


100%|██████████| 2/2 [00:00<00:00, 48.73it/s]
100%|██████████| 1600/1600 [00:00<00:00, 40653.56it/s]


Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[225 218]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[225 218]
n iterations 5
iteration  0


100%|██████████| 2/2 [00:00<00:00, 181.72it/s]
100%|██████████| 443/443 [00:00<00:00, 59712.59it/s]


iteration  1


100%|██████████| 2/2 [00:00<00:00,  5.91it/s]
100%|██████████| 5244/5244 [00:00<00:00, 53546.13it/s]


iteration  2


100%|██████████| 2/2 [00:00<00:00,  2.20it/s]
100%|██████████| 7805/7805 [00:00<00:00, 47048.38it/s]


iteration  3


100%|██████████| 2/2 [00:01<00:00,  1.19it/s]
100%|██████████| 9213/9213 [00:00<00:00, 41739.53it/s]


iteration  4


100%|██████████| 2/2 [00:02<00:00,  1.08s/it]
100%|██████████| 9772/9772 [00:00<00:00, 33438.74it/s]


===== Pseudo_Labeling
[796 804]
n iterations 5
iteration  0


100%|██████████| 2/2 [00:00<00:00, 35.94it/s]
100%|██████████| 1600/1600 [00:00<00:00, 32965.18it/s]


iteration  1


100%|██████████| 2/2 [00:01<00:00,  1.57it/s]
100%|██████████| 6400/6400 [00:00<00:00, 29494.74it/s]


iteration  2


100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
100%|██████████| 8961/8961 [00:00<00:00, 22291.06it/s]


iteration  3


100%|██████████| 2/2 [00:02<00:00,  1.47s/it]
100%|██████████| 10370/10370 [00:00<00:00, 31631.27it/s]


iteration  4


100%|██████████| 2/2 [00:02<00:00,  1.39s/it]
100%|██████████| 11122/11122 [00:00<00:00, 35566.60it/s]
 33%|███▎      | 1/3 [01:15<02:31, 75.92s/it]

Trial 2/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...


100%|██████████| 2/2 [00:00<00:00, 42.90it/s]
100%|██████████| 1600/1600 [00:00<00:00, 36204.02it/s]


Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[194 210]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[194 210]
n iterations 5
iteration  0


100%|██████████| 2/2 [00:00<00:00, 217.58it/s]
100%|██████████| 404/404 [00:00<00:00, 22467.80it/s]


iteration  1


100%|██████████| 2/2 [00:00<00:00,  3.85it/s]
100%|██████████| 5205/5205 [00:00<00:00, 43588.26it/s]


iteration  2


100%|██████████| 2/2 [00:01<00:00,  1.32it/s]
100%|██████████| 7766/7766 [00:00<00:00, 38070.72it/s]


iteration  3


100%|██████████| 2/2 [00:01<00:00,  1.01it/s]
100%|██████████| 9174/9174 [00:00<00:00, 38750.73it/s]


iteration  4


100%|██████████| 2/2 [00:02<00:00,  1.29s/it]
100%|██████████| 9926/9926 [00:00<00:00, 33960.56it/s]


===== Pseudo_Labeling
[788 812]
n iterations 5
iteration  0


100%|██████████| 2/2 [00:00<00:00, 39.96it/s]
100%|██████████| 1600/1600 [00:00<00:00, 37065.25it/s]


iteration  1


100%|██████████| 2/2 [00:00<00:00,  2.64it/s]
100%|██████████| 6400/6400 [00:00<00:00, 38524.81it/s]


iteration  2


100%|██████████| 2/2 [00:01<00:00,  1.37it/s]
100%|██████████| 8961/8961 [00:00<00:00, 52637.11it/s]


iteration  3


100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
100%|██████████| 10370/10370 [00:00<00:00, 52306.86it/s]


iteration  4


100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
100%|██████████| 11121/11121 [00:00<00:00, 46713.15it/s]
 67%|██████▋   | 2/3 [02:30<01:15, 75.26s/it]

Trial 3/3
# total samples = 20000 (1.0 - prop)
# training points = 1600
# test points = 4000
# unlabelled points = 14400
Training Fully Supervised model...
Training Supervised model...


100%|██████████| 2/2 [00:00<00:00, 42.62it/s]
100%|██████████| 1600/1600 [00:00<00:00, 27641.05it/s]


Training Preprocess + Supervised model...
Running Pseudo Labeling...
===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[195 209]
n iterations 5
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
===== Pseudo_Labeling
[195 209]
n iterations 5
iteration  0


100%|██████████| 2/2 [00:00<00:00, 260.99it/s]
100%|██████████| 404/404 [00:00<00:00, 42321.21it/s]


iteration  1


100%|██████████| 2/2 [00:00<00:00,  4.93it/s]
100%|██████████| 5205/5205 [00:00<00:00, 53323.80it/s]


iteration  2


100%|██████████| 2/2 [00:00<00:00,  2.01it/s]
100%|██████████| 7766/7766 [00:00<00:00, 61478.14it/s]


iteration  3


100%|██████████| 2/2 [00:01<00:00,  1.25it/s]
100%|██████████| 9175/9175 [00:00<00:00, 47427.99it/s]


iteration  4


100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
100%|██████████| 9927/9927 [00:00<00:00, 40385.04it/s]


===== Pseudo_Labeling
[769 831]
n iterations 5
iteration  0


100%|██████████| 2/2 [00:00<00:00, 32.56it/s]
100%|██████████| 1600/1600 [00:00<00:00, 33578.78it/s]


iteration  1


100%|██████████| 2/2 [00:00<00:00,  2.46it/s]
100%|██████████| 6400/6400 [00:00<00:00, 42077.03it/s]


iteration  2


100%|██████████| 2/2 [00:01<00:00,  1.03it/s]
100%|██████████| 8961/8961 [00:00<00:00, 31196.05it/s]


iteration  3


100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
100%|██████████| 10370/10370 [00:00<00:00, 20841.72it/s]


iteration  4


100%|██████████| 2/2 [00:03<00:00,  1.53s/it]
100%|██████████| 11121/11121 [00:00<00:00, 38202.74it/s]
100%|██████████| 3/3 [03:49<00:00, 76.55s/it]


In [4]:
results_store

{'filter': {'fully_supervised_learning_accuracy': {'acc_mean': 84.71,
   'acc_se': 0.3099999999999993},
  'supervised_learning_accuracy': {'acc_mean': 81.51333333333334,
   'acc_se': 0.33348329959851386},
  'supervised_learning_accuracy_easy': {'acc_mean': 81.04,
   'acc_se': 0.3686461718233367},
  'pseudo': {'vanilla_mean': 81.92666666666666,
   'vanilla_se': 0.32793969635352765,
   'dips_begin_mean': 82.24,
   'dips_begin_se': 0.2666458325194688,
   'dips_full_mean': 81.80333333333333,
   'dips_full_se': 0.3888587289892192}},
 'dataset': 'seer',
 'basicfilter': {'fully_supervised_learning_accuracy': {'acc_mean': 84.71,
   'acc_se': 0.3099999999999993},
  'supervised_learning_accuracy': {'acc_mean': 81.51333333333334,
   'acc_se': 0.33348329959851386},
  'supervised_learning_accuracy_easy': {'acc_mean': 77.61,
   'acc_se': 1.818635019274987},
  'pseudo': {'vanilla_mean': 81.92666666666666,
   'vanilla_se': 0.32793969635352765,
   'dips_begin_mean': 78.21666666666667,
   'dips_begin_se