# MNIST Active Learning POC

General idea: Achieve the highest possible accuracy, with the lowest amount of train data using (supervised learning).

We start from a pool of unlabeled data and a test set. Then, we itiretaively query new samples to be annotated, and re-train the model.

In [1]:
import tensorflow as tf

import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

from scipy import stats

import collections

2022-04-22 11:50:43.826818: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-22 11:50:43.826838: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## MNIST

In [2]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

x_train, x_test = x_train[..., np.newaxis]/255.0, x_test[..., np.newaxis]/255.0

print("Number of original training examples:", len(x_train))
print("Number of original test examples:", len(x_test))

Number of original training examples: 60000
Number of original test examples: 10000


## Baseline model

In [3]:
# the batch size will be used as the number of new images to annotate
BATCH_SIZE = 128

baseline_model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10)
])
baseline_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

baseline_model.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=6,
    validation_split=0.2,
)

2022-04-22 11:50:51.155696: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-22 11:50:51.155754: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-22 11:50:51.155778: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (eoc-001261l): /proc/driver/nvidia/version does not exist
2022-04-22 11:50:51.156092: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-22 11:50:51.325205: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Op

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa86659daf0>

In [4]:
_, baseline_metrics = baseline_model.evaluate(
    x=x_test,
    y=y_test,
)

print("SparseCategoricalAccuracy", baseline_metrics)

SparseCategoricalAccuracy 0.9724000096321106


## Randomly Added Images

In [105]:
INIT_SIZE = int(len(x_train) * 0.01)
print("Initial number of annotated images", INIT_SIZE, "out of", len(x_train))



Initial number of annotated images 600 out of 60000


In [106]:
annotated_ix = list()

annotated_ix = annotated_ix + np.random.choice(range(len(x_train)), 
                                               size=INIT_SIZE, replace=False).tolist()

x_train_annotated = x_train[annotated_ix]    
y_train_annotated = y_train[annotated_ix]

print("Size of initial set of annotated images", len(x_train_annotated))

random_model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10)
])
random_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

random_model.fit(
    x=x_train_annotated,
    y=y_train_annotated,
    batch_size=BATCH_SIZE,
    epochs=6,
    validation_split=0.2,
)

Size of initial set of annotated images 600
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa7fcf6a490>

In [107]:
_, metrics = random_model.evaluate(
    x=x_test,
    y=y_test,
)

print("SparseCategoricalAccuracy", metrics)

random_metrics = [metrics]

SparseCategoricalAccuracy 0.8216999769210815


In [109]:
for i in range(3):
    ix_pool = np.delete(range(len(x_train)), annotated_ix)

    annotated_ix = annotated_ix + np.random.choice(ix_pool, 
                                                   size=BATCH_SIZE, replace=False).tolist()

    x_train_annotated = x_train[annotated_ix]    
    y_train_annotated = y_train[annotated_ix]

    print("Current size of set of annotated images", len(x_train_annotated))

    random_model = tf.keras.models.Sequential([
      tf.keras.layers.Flatten(input_shape=(28, 28)),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dense(10)
    ])
    random_model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    random_model.fit(
        x=x_train_annotated,
        y=y_train_annotated,
        batch_size=BATCH_SIZE,
        epochs=6,
        validation_split=0.2,
    )

    _, metrics = random_model.evaluate(
        x=x_test,
        y=y_test,
    )

    print("SparseCategoricalAccuracy", metrics)

    random_metrics.append(metrics)

Current size of set of annotated images 856
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
SparseCategoricalAccuracy 0.8374000191688538
Current size of set of annotated images 984
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
SparseCategoricalAccuracy 0.8549000024795532
Current size of set of annotated images 1112
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
SparseCategoricalAccuracy 0.8598999977111816


## Clustering

In [67]:
x_train_flat = np.reshape(
    x_train, 
    (len(x_train), x_train.shape[1] * x_train.shape[2] * x_train.shape[3])
)

# TODO: the number of cluster should be adjusted using internal/external validation metrics
kmeans = KMeans(n_clusters=10, random_state=0).fit(x_train_flat)


Initial number of annotated images 600 out of 60000


In [98]:
def norm_ratios(
    labels,
    sum_up_to):
    """ Given an array of classes (labels), returns an array of
    elements per class (ratio) that sum up to a given number.
    In other words, the sum of all returned elements is equal to
    'sum_up_to'.
    """
    labels_counts = collections.Counter(labels)
    # print("Number of elements per cluster:", dict(labels_counts))

    # select a repersentative initial set of images to be annotated
    counts = list()
    for i in range(len(labels_counts)):
        counts.append(labels_counts[i])

    counts = np.array(counts)
    _min = counts.min() if counts.min() > 0 else 1
    
    counts = counts / _min
    counts = counts / counts.sum()
    counts = counts * sum_up_to
    counts = counts.astype(int)

    # to avoid smalls clusters to be left out, we add at least one element per cluster
    counts[counts == 0] = 1

    while counts.sum() != sum_up_to:
        if counts.sum() > sum_up_to:
            counts[counts.argmax()] = counts[counts.argmax()] - 1
        elif counts.sum() < sum_up_to:
            counts[counts.argmin()] = counts[counts.argmin()] + 1
            
    return counts

In [99]:
# we will use the number of elements per cluster to select the initial batch
counts = norm_ratios(kmeans.labels_, INIT_SIZE)
print("Images by class adjusted to sum up to the initial amount of annotated data",
      counts_active)

set_ix = list()
for i in range(len(counts)):
    pos = np.where(kmeans.labels_ == i)[0]
    set_ix = set_ix + np.random.choice(pos, size=counts[i], replace=False).tolist()
    
annotated_ix = set_ix.copy()

x_train_annotated = x_train[annotated_ix]    
y_train_annotated = y_train[annotated_ix]

print("Size of initial set of annotated images", len(x_train_annotated))

Images by class adjusted to sum up to the initial amount of annotated data [14 19 19 10 10 19 10  9  9  9]
Size of initial set of annotated images 600


In [100]:
active_model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10)
])
active_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

active_model.fit(
    x=x_train_annotated,
    y=y_train_annotated,
    batch_size=BATCH_SIZE,
    epochs=6,
    validation_split=0.2,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa7fe1115e0>

In [101]:
_, metrics = active_model.evaluate(
    x=x_test,
    y=y_test,
)

print("SparseCategoricalAccuracy", metrics)

active_metrics = [metrics]


SparseCategoricalAccuracy 0.7160999774932861


**NOTE:** Should we use the test data to check instead?

In [102]:
for i in range(3):
    preds = active_model.predict(x_train_annotated)
    preds_class = np.argmax(preds, axis=1)
    misclassified_relative = np.where((preds_class - y_train_annotated)!=0)
    misclassified_pos = np.take(annotated_ix, misclassified_relative)
    misclassified_cluster_no = np.take(kmeans.labels_, misclassified_pos)

    # compute ratio of misclassified classes
    counts_active = norm_ratios(misclassified_cluster_no.ravel(), BATCH_SIZE)
    print("Misclasification by class adjusted to sum up to a batch", counts_active)

    # select new images to annotate with the given ratios
    ix_pool = np.delete(kmeans.labels_, annotated_ix)

    set_ix = list()
    for i in range(len(counts_active)):
        pos = np.where(ix_pool == i)[0]
        set_ix = set_ix + np.random.choice(pos, size=counts_active[i], replace=False).tolist()

    print("Adding", len(set_ix), "annotated images")

    annotated_ix = annotated_ix + set_ix

    x_train_annotated = x_train[annotated_ix]    
    y_train_annotated = y_train[annotated_ix]

    print("Current size of set of annotated images", len(x_train_annotated))
    
    active_model.fit(
        x=x_train_annotated,
        y=y_train_annotated,
        batch_size=BATCH_SIZE,
        epochs=6,
        validation_split=0.2,
    )

    _, metrics = active_model.evaluate(
        x=x_test,
        y=y_test,
    )

    print("SparseCategoricalAccuracy", metrics)

    active_metrics.append(metrics)


Misclasification by class adjusted to sum up to a batch [ 7 11  6 10  9  9  6  9  6 55]
Adding 128 annotated images
Current size of set of annotated images 728
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
SparseCategoricalAccuracy 0.8434000015258789
Misclasification by class adjusted to sum up to a batch [15 17  7 19 17 11  8  8  7 19]
Adding 128 annotated images
Current size of set of annotated images 856
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
SparseCategoricalAccuracy 0.873199999332428
Misclasification by class adjusted to sum up to a batch [ 8 33  3 24 20  8 16 12  4]
Adding 128 annotated images
Current size of set of annotated images 984
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
SparseCategoricalAccuracy 0.8841999769210815


## Results Compared