## Pool-based Active Learning - Evaluation Study

The main purpose of this tutorial is to show how a realistic comparison study can be realized using `scikit-activeml`. In this tutorial, we use the self-supervised learning model DINOv2 from [1] to create image embeddings that serve as the feature representation of the data set (to be continued).

In [1]:
#!pip install -U matplotlib
#!pip install -U scikit-learn
#!pip install iteration_utilities

In [2]:
import sys

# Make the local scikit-activeml checkout importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again and again.
# NOTE(review): this is a machine-specific absolute path — adjust it to your own checkout.
skactiveml_root = "/mnt/stud/home/jcheng/scikit-activeml/"
if skactiveml_root not in sys.path:
    sys.path.append(skactiveml_root)
print(sys.path)

['/mnt/stud/home/jcheng/scikit-activeml/tutorials', '/mnt/stud/home/jcheng/miniconda3/envs/scikit-activeml/lib/python310.zip', '/mnt/stud/home/jcheng/miniconda3/envs/scikit-activeml/lib/python3.10', '/mnt/stud/home/jcheng/miniconda3/envs/scikit-activeml/lib/python3.10/lib-dynload', '', '/mnt/stud/home/jcheng/miniconda3/envs/scikit-activeml/lib/python3.10/site-packages', '/mnt/stud/home/jcheng/scikit-activeml/']


In [3]:
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from skactiveml.classifier import SklearnClassifier
from skactiveml.pool import UncertaintySampling, RandomSampling, DiscriminativeAL, CoreSet, TypiClust, Badge
from skactiveml.utils import call_func, MISSING_LABEL

import warnings
# Render all figures on a white background.
mlp.rcParams.update({"figure.facecolor": "white"})
# Suppress every warning so that the notebook output stays compact.
warnings.simplefilter("ignore")

## Data Set Generation

Introduction to DINOv2, which is used to compute the embedding data set. (To be continued)

In [4]:
#!pip3 install torch torchvision torchaudio
#!pip install tqdm

In [5]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tqdm import tqdm

In [6]:
# Access the torchvision transforms module through an alias. The original
# statement `transforms = transforms.Compose(...)` overwrote the module with the
# pipeline object, so running this cell a second time raised an AttributeError.
from torchvision import transforms as tv_transforms

# Preprocessing pipeline: resize, center-crop to 224x224, convert to a tensor
# and normalize each channel with the given mean / standard deviation.
# The result is still bound to the name `transforms`, which later cells pass
# as the `transform` argument of the data sets.
transforms = tv_transforms.Compose(
    [
        tv_transforms.Resize(256),
        tv_transforms.CenterCrop(224),
        tv_transforms.ToTensor(),
        tv_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)

# Number of images per batch used by the data loaders.
batch_size = 4

Download the corresponding data set (CIFAR-10).

In [7]:
# CIFAR-10 training split, downloaded to ./data if necessary and preprocessed
# with the pipeline bound to `transforms`.
cifar10_trainset = datasets.CIFAR10(root="./data", train=True, download=True, transform=transforms)
# The loaders are only used for feature extraction, not for training, so the
# data is NOT shuffled: this keeps the order of the extracted embeddings (and
# therefore the sample indices used later on) identical between runs.
cifar10_trainloader = torch.utils.data.DataLoader(
    cifar10_trainset, batch_size=batch_size, shuffle=False, num_workers=2
)

# CIFAR-10 test split with the same preprocessing.
cifar10_testset = datasets.CIFAR10(root="./data", train=False, download=True, transform=transforms)
cifar10_testloader = torch.utils.data.DataLoader(
    cifar10_testset, batch_size=batch_size, shuffle=False, num_workers=2
)

# Human-readable class names; their count defines the number of classes
# used by the classifiers below.
cifar10_classes = ['plane', 'car', 'bird', 'cat',
                   'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

Files already downloaded and verified
Files already downloaded and verified


Compute the embeddings of the images with DINOv2.

In [8]:
# Load the DINOv2 ViT-S/14 backbone via torch.hub.
dinov2_vits14 = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
# The model is only used for inference here, so switch it to evaluation mode
# explicitly (`eval()` returns the module itself).
dinov2_vits14 = dinov2_vits14.eval()

Using cache found in /mnt/stud/home/jcheng/.cache/torch/hub/facebookresearch_dinov2_main


In [9]:
# Per-batch buffers for the model outputs and the matching labels,
# one pair for the training split and one pair for the test split.
cifar10_train_embedding_list, cifar10_train_label_list = [], []
cifar10_test_embedding_list, cifar10_test_label_list = [], []

In [12]:
def embed_dataset(model, loader, desc):
    """Run `model` on every batch of `loader` and collect outputs and labels.

    Parameters
    ----------
    model : callable
        Called with one image batch at a time; its return value is stored.
    loader : iterable
        Yields `(image, label)` batches and supports `len`.
    desc : str
        Description shown next to the progress bar.

    Returns
    -------
    embedding_batches : list
        One model output per batch, in iteration order.
    label_batches : list
        The label batch belonging to each model output.
    """
    embedding_batches, label_batches = [], []
    # No gradients are needed for feature extraction.
    with torch.no_grad():
        for image, label in tqdm(loader, total=len(loader), desc=desc):
            embedding_batches.append(model(image))
            label_batches.append(label)
    return embedding_batches, label_batches


# The lists are rebound to fresh results on every execution, so re-running this
# cell no longer appends a second copy of the data to previously filled lists.
cifar10_train_embedding_list, cifar10_train_label_list = embed_dataset(
    dinov2_vits14, cifar10_trainloader, desc="Train"
)
cifar10_test_embedding_list, cifar10_test_label_list = embed_dataset(
    dinov2_vits14, cifar10_testloader, desc="Test"
)

# Concatenate the per-batch tensors along the sample axis and convert to NumPy.
cifar10_X_Train = torch.cat(cifar10_train_embedding_list, dim=0).numpy()
cifar10_y_Train_true = torch.cat(cifar10_train_label_list, dim=0).numpy()
cifar10_X_Test = torch.cat(cifar10_test_embedding_list, dim=0).numpy()
cifar10_y_Test = torch.cat(cifar10_test_label_list, dim=0).numpy()
    

Train: 100%|██████████| 12500/12500 [29:26<00:00,  7.07it/s] 
Train: 100%|██████████| 2500/2500 [05:58<00:00,  6.97it/s]


Save the embedding features in separate files. They are stored in a folder named `embedding_data`, which must exist (create it if necessary).

In [41]:
from pathlib import Path

# Target folder for the cached embeddings.
# NOTE(review): this is a machine-specific absolute path — adjust it to your own setup.
embedding_dir = Path('/mnt/stud/home/jcheng/scikit-activeml/tutorials/embedding_data')
# np.save does not create missing directories, so make sure the folder exists.
embedding_dir.mkdir(parents=True, exist_ok=True)

# Store features and labels of both splits as separate .npy files.
np.save(embedding_dir / 'cifar10_dinov2_X_train.npy', cifar10_X_Train)
np.save(embedding_dir / 'cifar10_dinov2_y_train.npy', cifar10_y_Train_true)
np.save(embedding_dir / 'cifar10_dinov2_X_test.npy', cifar10_X_Test)
np.save(embedding_dir / 'cifar10_dinov2_y_test.npy', cifar10_y_Test)

If you have already completed these steps before, you can load your data here.

In [36]:
# Read the cached arrays back in the order: train features, train labels,
# test features, test labels.
cifar10_X_Train, cifar10_y_Train_true, cifar10_X_Test, cifar10_y_Test = (
    np.load(npy_path)
    for npy_path in (
        '/mnt/stud/home/jcheng/scikit-activeml/tutorials/embedding_data/cifar10_dinov2_X_train.npy',
        '/mnt/stud/home/jcheng/scikit-activeml/tutorials/embedding_data/cifar10_dinov2_y_train.npy',
        '/mnt/stud/home/jcheng/scikit-activeml/tutorials/embedding_data/cifar10_dinov2_X_test.npy',
        '/mnt/stud/home/jcheng/scikit-activeml/tutorials/embedding_data/cifar10_dinov2_y_test.npy',
    )
)

## Random Seed Management

In [21]:
# Single source of randomness from which all other seeds are derived.
master_random_state = np.random.RandomState(seed=0)


def gen_seed(random_state: np.random.RandomState):
    """Draw an integer seed in [0, 2**31) from `random_state`.

    Calling this function advances the state of `random_state`.
    """
    seed = random_state.randint(low=0, high=2**31)
    return seed


def gen_random_state(random_state: np.random.RandomState):
    """Create a new `RandomState` seeded with a value drawn from `random_state`."""
    derived_seed = gen_seed(random_state)
    return np.random.RandomState(derived_seed)

## Classification Models and Query Strategies

In [25]:
def _make_logistic_regression(classes, random_state):
    """Wrap a default `LogisticRegression` in a `SklearnClassifier`.

    `classes` is forwarded to the wrapper and the wrapper's seed is drawn
    from `random_state`.
    """
    return SklearnClassifier(
        LogisticRegression(),
        classes=classes,
        random_state=gen_seed(random_state),
    )


# Registry: classifier name -> factory taking (classes, random_state).
classifier_factory_functions = {
    'LogisticRegression': _make_logistic_regression,
}

In [26]:
def _seeded_factory(strategy_cls):
    """Return a factory that builds `strategy_cls` with a seed drawn from a random state."""

    def build(random_state):
        return strategy_cls(random_state=gen_seed(random_state))

    return build


# Registry: query strategy name -> factory taking a random state.
query_strategy_factory_functions = {
    'RandomSampling': _seeded_factory(RandomSampling),
    'UncertaintySampling': _seeded_factory(UncertaintySampling),
    'DiscriminativeAL': _seeded_factory(DiscriminativeAL),
    'CoreSet': _seeded_factory(CoreSet),
    'TypiClust': _seeded_factory(TypiClust),
    'Badge': _seeded_factory(Badge),
}

In [27]:
def create_classifier(name, classes, random_state):
    """Build the classifier registered under `name`.

    Looks up the factory in `classifier_factory_functions` and calls it with
    `classes` and `random_state`. An unregistered `name` raises `KeyError`.
    """
    factory = classifier_factory_functions[name]
    return factory(classes, random_state)

def create_query_strategy(name, random_state):
    """Build the query strategy registered under `name`.

    Looks up the factory in `query_strategy_factory_functions` and calls it
    with `random_state`. An unregistered `name` raises `KeyError`.
    """
    factory = query_strategy_factory_functions[name]
    return factory(random_state)

## Experiment Parameters

In [45]:
# Names of all registered classifiers and query strategies to be compared.
classifier_names = classifier_factory_functions.keys()
query_strategy_names = query_strategy_factory_functions.keys()

# Number of samples in the training pool.
n_training_dataset = len(cifar10_X_Train)

# Budget: independent repetitions per combination and query cycles per repetition.
# A pool-proportional alternative for the cycles is int(0.5 * n_training_dataset).
n_reps, n_cycles = 1, 500

## Experiment Loop

In [None]:
# Maps (classifier name, query strategy name) -> accuracy array of shape (n_reps, n_cycles).
results = {}

for clf_name in classifier_names:
    for qs_name in query_strategy_names:
        # One row per repetition, one column per cycle; NaN marks entries not evaluated yet.
        accuracies = np.full((n_reps, n_cycles), np.nan)
        for i_rep in range(n_reps):
            # Every repetition starts with a training pool in which all labels are missing.
            cifar10_y_Train = np.full(shape=cifar10_y_Train_true.shape, fill_value=MISSING_LABEL)
            
            # Classifier and query strategy each receive their own random state derived from
            # the master one; the order of these two calls decides which seed goes where.
            clf = create_classifier(clf_name, classes=np.arange(len(cifar10_classes)), random_state=gen_random_state(master_random_state))
            qs = create_query_strategy(qs_name, random_state=gen_random_state(master_random_state))
            # Initial fit on the pool before any label has been acquired.
            clf.fit(cifar10_X_Train, cifar10_y_Train)
            
            for c in tqdm(range(n_cycles), desc=f'Repeat {i_rep + 1} in {clf_name} with {qs_name}'):
                # Ask the strategy for one sample per cycle. The same keyword set is passed for
                # every strategy; presumably call_func forwards only the arguments the given
                # query method accepts — TODO confirm.
                # NOTE(review): some strategies may refit `clf` internally — check whether that
                # duplicates the explicit fit below.
                query_idx = call_func(qs.query, X=cifar10_X_Train, y=cifar10_y_Train, batch_size=1, clf=clf, discriminator=clf)
                # Reveal the true label of the queried sample, retrain, and evaluate on the test set.
                cifar10_y_Train[query_idx] = cifar10_y_Train_true[query_idx]
                clf.fit(cifar10_X_Train, cifar10_y_Train)
                accuracies[i_rep, c] = clf.score(cifar10_X_Test, cifar10_y_Test)
        
        # Results are stored only after all repetitions of this combination have finished.
        results[(clf_name, qs_name)] = accuracies

Repeat 1 in LogisticRegression with RandomSampling: 100%|██████████| 500/500 [01:55<00:00,  4.34it/s]
Repeat 1 in LogisticRegression with UncertaintySampling: 100%|██████████| 500/500 [04:21<00:00,  1.91it/s]
Repeat 1 in LogisticRegression with DiscriminativeAL:  23%|██▎       | 117/500 [05:27<18:51,  2.95s/it]

## Result Plotting

In [None]:
# One figure per classifier, one learning curve per query strategy.
for clf_name in classifier_names:
    fig, ax = plt.subplots()
    for qs_name in query_strategy_names:
        key = (clf_name, qs_name)
        # Combinations without stored results (e.g. because the experiment loop
        # was interrupted) are skipped instead of raising a KeyError.
        if key not in results:
            print(f"No results for {key}, skipping.")
            continue
        result = results[key]
        # Take the number of cycles from the stored array (n_reps, n_cycles) so that
        # the plot does not depend on the current value of the global `n_cycles`.
        n_recorded_cycles = result.shape[1]
        # Mean and standard deviation over the repetitions for every cycle.
        errorbar_mean = np.mean(result, axis=0)
        errorbar_std = np.std(result, axis=0)
        # The legend shows the accuracy averaged over all cycles in front of the strategy name.
        ax.errorbar(
            np.arange(n_recorded_cycles),
            errorbar_mean,
            errorbar_std,
            label=f"({np.mean(errorbar_mean):.4f}) {qs_name}",
            alpha=0.5,
        )
    ax.set_title(clf_name)
    ax.legend(loc='lower right')
    ax.set_xlabel('cycle')
    ax.set_ylabel('accuracy')
    plt.show()