# Pool-based sampling with CIFAR10

In [None]:
pip install modal

In [187]:
import numpy as np
import math

from modAL.models import ActiveLearner, Committee
from modAL.uncertainty import uncertainty_sampling
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from functools import partial
from modAL.batch import uncertainty_batch_sampling
from modAL.multilabel import *

import tensorflow as tf
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.wrappers.scikit_learn import KerasClassifier
from modAL.models import ActiveLearner


from IPython import display
from matplotlib import pyplot as plt
%matplotlib inline

## Functions

In [67]:
# imports the cifar files
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [68]:
# returns a image rgb array, label and label's name of the image with index im_idx
def cifar10_img_info(data, meta, im_idx=0):
    im = data[b'data'][im_idx, :]

    im_r = im[0:1024].reshape(32, 32)
    im_g = im[1024:2048].reshape(32, 32)
    im_b = im[2048:].reshape(32, 32)

    img = np.dstack((im_r, im_g, im_b))
    label = data[b'labels'][im_idx]
    category = meta[b'label_names'][data[b'labels'][im_idx]]

    return img, label, category

In [69]:
# returns a image rgb array
def img_reshape(im):
    im_r = im[0:1024].reshape(32, 32)
    im_g = im[1024:2048].reshape(32, 32)
    im_b = im[2048:].reshape(32, 32)

    img = np.dstack((im_r, im_g, im_b))
    
    return img

In [70]:
# returns X and y of the dataset, X.shape = (nº of samples x features) and y.shape = (nº of samples)
def batch_to_xy(batch):
    X = batch[b"data"]
    y = batch[b"labels"]
    y = np.array(y)
    y.reshape(y.shape[0])
    return X, y

In [176]:
from keras.optimizers import SGD
def create_keras_model():
	model = Sequential()
	model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=(32, 32, 3)))
	model.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
	model.add(MaxPooling2D((2, 2)))
	model.add(Dropout(0.2))
	model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
	model.add(Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
	model.add(MaxPooling2D((2, 2)))
	model.add(Dropout(0.2))
	model.add(Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
	model.add(Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
	model.add(MaxPooling2D((2, 2)))
	model.add(Dropout(0.2))
	model.add(Flatten())
	model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
	model.add(Dropout(0.2))
	model.add(Dense(10, activation='softmax'))
	# compile model
	opt = SGD(lr=0.001, momentum=0.9)
	model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
	return model

## The dataset

In [7]:
# batch with 10000 examples
batch1 = unpickle("data_batch_1")
# meta has the labels' names
meta = unpickle("batches.meta")

## Principal component analysis

In [84]:
X_raw, Y_raw = batch_to_xy(batch1)

#APPLY PCA
#https://towardsdatascience.com/integration-of-dimension-reduction-methods-and-neural-network-for-image-classification-96281963fe24
#99% -> 658
#95% -> 217
#90% -> 99
#80% -> 21
RANDOM_STATE_SEED = 123
np.random.seed(RANDOM_STATE_SEED)
pca = PCA(n_components=21, random_state=RANDOM_STATE_SEED)
tf_X = pca.fit_transform(X=X_raw)
print(tf_X.shape)

(10000, 21)


In [85]:
# Isolate our examples for our labeled dataset.
n_labeled = tf_X.shape[0]   #Number of samples
training_indices = np.random.choice(n_labeled, size=1500)

X_train = tf_X[training_indices]
y_train = Y_raw[training_indices]

# Isolate the non-training examples we'll be querying.
X_pool = np.delete(tf_X, training_indices, axis=0)
y_pool = np.delete(Y_raw, training_indices, axis=0)

X_train.shape

(1500, 21)

## Ranked Bacth Mode

In [10]:
# Specify our core estimator.
knn = KNeighborsClassifier(n_neighbors=3)
# Pre-set our batch sampling to retrieve 3 samples at a time.
BATCH_SIZE = 30
preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)

In [11]:
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train,
    y_training=y_train,
    query_strategy=preset_batch
)

In [12]:
# Record our learner's score on the raw data.
unqueried_score = learner.score(tf_X, Y_raw)
print('Ranked Batch Mode Approach Score: {acc:0.4f}'.format(acc=unqueried_score*100))

Ranked Batch Mode Approach Score: 44.4600


In [13]:
X_train.shape

(1500, 21)

## Multilabel SVM

In [14]:
learner = ActiveLearner(
    estimator=OneVsRestClassifier(SVC(probability=True, gamma='auto')),
    X_training=X_train,
    y_training = y_train,
    query_strategy=avg_score
)

In [15]:
# Record our learner's score on the raw data.
unqueried_score2 = learner.score(tf_X, Y_raw)
print('Multilabel SVM Approach Score: {acc:0.4f}'.format(acc=unqueried_score2*100))

Multilabel SVM Approach Score: 22.7200


## Query by comittee

In [16]:
# initializing Committee members
n_members = 5
learner_list = list()

qbc_X_pool = tf_X;
qbc_y_pool = Y_raw;

for member_idx in range(n_members):
    train_idx = np.random.choice(qbc_X_pool.shape[0], size=300, replace=False)
    qbc_X_train = qbc_X_pool[train_idx]
    qbc_y_train = qbc_y_pool[train_idx]

    # creating a reduced copy of the data with the known instances removed
    qbc_X_pool = np.delete(qbc_X_pool, train_idx, axis=0)
    qbc_y_pool = np.delete(qbc_y_pool, train_idx)

    # initializing learner
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=qbc_X_train, y_training=qbc_y_train
    )
    learner_list.append(learner)

# assembling the committee
committee = Committee(learner_list=learner_list)


In [17]:
# Record our learner's score on the raw data.
unqueried_score3 = committee.score(tf_X,Y_raw)
print('Query by Committee Approach Score: {acc:0.4f}'.format(acc=unqueried_score3*100))

Query by Committee Approach Score: 45.5500


## Deep Bayesian - CNN

In [189]:
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.wrappers.scikit_learn import KerasClassifier
# create the classifier
classifier = KerasClassifier(create_keras_model)


a = X_raw[training_indices]
b = Y_raw[training_indices]

Xk_train = tf.reshape(a, [-1,32,32,3])
Xk_test = tf.reshape(X_raw, [-1,32,32,3])

Yk_train = keras.utils.to_categorical(y_train, 10)
Yk_test = keras.utils.to_categorical(Y_raw, 10)


print(Xk_test.shape)
# initialize ActiveLearner
keras = ActiveLearner(
    estimator=classifier,
    X_training=Xk_train, 
    y_training=Yk_train
)

unqueried_score4 = keras.score(Xk_test, Yk_test)
print('Query by Keras Approach Score: {acc:0.4f}'.format(acc=unqueried_score4*100))

(10000, 32, 32, 3)
Query by Keras Approach Score: 10.0500


## Results

In [190]:
print('Ranked Batch Mode Approach Score: {acc:0.4f}'.format(acc=unqueried_score*100))
print('Multilabel SVM Approach Score: {acc:0.4f}'.format(acc=unqueried_score2*100))
print('Query by Committee Approach Score: {acc:0.4f}'.format(acc=unqueried_score3*100))
print('CNN Approach Score: {acc:0.4f}'.format(acc=unqueried_score4*100))

Ranked Batch Mode Approach Score: 44.4600
Multilabel SVM Approach Score: 22.7200
Query by Committee Approach Score: 45.5500
CNN Approach Score: 10.0500
