<a href="https://colab.research.google.com/github/saimasharleen/Active-Learning-Strategies-Across-Diverse-Machine-Learning-Models/blob/main/MNIST_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install git+https://github.com/modAL-python/modAL.git

Collecting git+https://github.com/modAL-python/modAL.git
  Cloning https://github.com/modAL-python/modAL.git to /tmp/pip-req-build-26i6ndwz
  Running command git clone --filter=blob:none --quiet https://github.com/modAL-python/modAL.git /tmp/pip-req-build-26i6ndwz
  Resolved https://github.com/modAL-python/modAL.git to commit bba6f6fd00dbb862b1e09259b78caf6cffa2e755
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [12]:
import numpy as np
import joblib
import collections
from sklearn.svm import SVC
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from torchvision.datasets import MNIST
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling


In [13]:
# Load MNIST data
mnist_data = MNIST('.', download=True, transform=ToTensor())
dataloader = DataLoader(mnist_data, shuffle=True, batch_size=60000)
x, y = next(iter(dataloader))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 64348326.65it/s]


Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 63789201.59it/s]


Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 33310012.41it/s]


Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 10744799.08it/s]

Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw






In [14]:
# Preprocessing
x = x.detach().cpu().numpy().reshape(x.size(0), -1)  # Flatten the images
y = y.detach().cpu().numpy()

In [15]:
# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=10000, random_state=0, stratify=y)

In [16]:
# Define confusion matrix print function
def CF_Print(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    CF = confusion_matrix(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    print('Confusion Matrix:')
    print(CF)
    print('Accuracy: ', accuracy)
    print('Macro F1-score: ', macro_f1)
    print('Micro F1-score: ', micro_f1)
    print('Weighted F1-score: ', weighted_f1)
    print("---------------------------------")

In [18]:
# Active Learning with SVM
def AL_Retrain(n_initial, n_queries, instances):
    x_initial, x_pool, y_initial, y_pool = train_test_split(x_train, y_train, train_size=n_initial, random_state=0, stratify=y_train)

    # Initialize the learner
    learner = ActiveLearner(
        estimator=SVC(probability=True, kernel='linear'),
        query_strategy=uncertainty_sampling,
        X_training=x_initial, y_training=y_initial
    )

    print("Number of initial data: ", n_initial)
    y_pred = learner.predict(x_test)
    CF_Print(y_test, y_pred)

    # Active Learning Loop
    for idx in range(n_queries):
        print('--- Query no: ', idx+1, ' ----')
        query_idx, query_instance = learner.query(x_pool, n_instances=instances)
        learner.teach(X=x_pool[query_idx], y=y_pool[query_idx])
        print('Instance', y_pool[query_idx])
        y_pred = learner.predict(x_test)
        CF_Print(y_test, y_pred)

        # Update the pool
        x_pool = np.delete(x_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx, axis=0)

In [19]:
AL_Retrain(30, 10, 20)

Number of initial data:  30
Confusion Matrix:
[[781   0   4   0   1 104  56   6   1  34]
 [  0 906   0   1   2   5   0  23 186   1]
 [ 21  55 401   7  66 157  50  85 119  32]
 [ 70  43 120 375   3 227   6  66  48  64]
 [  5  24   0   0 730  59   5  39   2 110]
 [ 49  32   0  41  53 523  12  54  62  77]
 [ 39  24  18   0 210 330 330  29   4   2]
 [ 14  18   4   2  42   0   0 901   6  57]
 [ 13  61  20  14  76 235  14  20 446  76]
 [ 11  14   1  20 135  26   1 124   7 653]]
Accuracy:  0.6046
Macro F1-score:  0.594523041365415
Micro F1-score:  0.6046
Weighted F1-score:  0.5993992049943314
---------------------------------
--- Query no:  1  ----
Instance [2 7 6 8 6 8 2 6 6 6 7 6 6 6 2 7 6 2 6 2]
Confusion Matrix:
[[782   0  37   0   0  83  65   5   3  12]
 [  0 879  14   0   1   6   4  27 193   0]
 [ 19  38 665   4  20  79  60  69  31   8]
 [ 49  28 245 325   1 157  30  64  94  29]
 [  5  19   5   0 720  46  33  42   3 101]
 [ 42  20   9  34  47 423 127  72  70  59]
 [ 33   4  50   0 181 1

In [20]:
AL_Retrain(30, 10, 30)

Number of initial data:  30
Confusion Matrix:
[[781   0   4   0   1 104  56   6   1  34]
 [  0 906   0   1   2   5   0  23 186   1]
 [ 21  55 401   7  66 157  50  85 119  32]
 [ 70  43 120 375   3 227   6  66  48  64]
 [  5  24   0   0 730  59   5  39   2 110]
 [ 49  32   0  41  53 523  12  54  62  77]
 [ 39  24  18   0 210 330 330  29   4   2]
 [ 14  18   4   2  42   0   0 901   6  57]
 [ 13  61  20  14  76 235  14  20 446  76]
 [ 11  14   1  20 135  26   1 124   7 653]]
Accuracy:  0.6046
Macro F1-score:  0.594523041365415
Micro F1-score:  0.6046
Weighted F1-score:  0.5993992049943314
---------------------------------
--- Query no:  1  ----
Instance [5 5 5 3 2 3 6 3 3 2 5 5 5 2 5 5 3 2 6 5 2 2 6 3 2 4 6 3 2 2]
Confusion Matrix:
[[730   0   2  12   1 217  10   7   1   7]
 [  0 894   2   3   3  23   0  18 180   1]
 [  3  23 513  23  41 263   6  45  70   6]
 [ 17  12  47 657  10 216   2  31   8  22]
 [  6  11   4   0 811  67   1  24   1  49]
 [ 14   8   4 112  56 636  12  15  19  27]
 [ 

In [21]:
AL_Retrain(50, 10, 20)

Number of initial data:  50
Confusion Matrix:
[[ 898    1    5    1   17   41   12    3    2    7]
 [   0 1064    3    2    0    3    2   11   38    1]
 [  28   82  458   57  157   53   62   44   44    8]
 [  46   45  116  516    8  154    6   28   76   27]
 [   1   28    1    6  704   54   19   48    2  111]
 [ 121   46    3   67   35  356   18   37  134   86]
 [  25   31    6    2   48  171  686    6   11    0]
 [  12   79    4    2   28    1    6  836    1   75]
 [  24   86   44   85   33  100    9   14  513   67]
 [   6   41    2   18  130   36    2   99   11  647]]
Accuracy:  0.6678
Macro F1-score:  0.6564409386014295
Micro F1-score:  0.6678
Weighted F1-score:  0.6612747161436812
---------------------------------
--- Query no:  1  ----
Instance [0 3 8 2 4 2 5 8 5 9 8 2 2 5 8 8 1 3 5 5]
Confusion Matrix:
[[ 869    0   16    3   11   42   20    0    8   18]
 [   0 1046    4    0    0   21    0    2   50    1]
 [  18   70  587   77   98   39   41   23   36    4]
 [  53   47   53  562

In [22]:
AL_Retrain(50, 10, 30)

Number of initial data:  50
Confusion Matrix:
[[ 898    1    5    1   17   41   12    3    2    7]
 [   0 1064    3    2    0    3    2   11   38    1]
 [  28   82  458   57  157   53   62   44   44    8]
 [  46   45  116  516    8  154    6   28   76   27]
 [   1   28    1    6  704   54   19   48    2  111]
 [ 121   46    3   67   35  356   18   37  134   86]
 [  25   31    6    2   48  171  686    6   11    0]
 [  12   79    4    2   28    1    6  836    1   75]
 [  24   86   44   85   33  100    9   14  513   67]
 [   6   41    2   18  130   36    2   99   11  647]]
Accuracy:  0.6678
Macro F1-score:  0.6564409386014295
Micro F1-score:  0.6678
Weighted F1-score:  0.6612747161436812
---------------------------------
--- Query no:  1  ----
Instance [8 8 9 6 2 6 5 5 2 3 8 8 6 3 8 2 0 5 5 1 0 2 5 3 3 3 5 4 2 2]
Confusion Matrix:
[[ 870    0   19   12   10   60   12    0    4    0]
 [   0 1033    3    3    0   39   10    3   32    1]
 [  10   43  702   46   55   27   53   12   40    5]
 

In [23]:
AL_Retrain(100, 10, 20)

Number of initial data:  100
Confusion Matrix:
[[ 886    1    4    0   13   38   30    0    8    7]
 [   0 1098    6    1    2    4    0    4    9    0]
 [  27   54  616   21  116   11   41   42   55   10]
 [  18   34   38  764    7   66    8   20   54   13]
 [   1   23   20    0  750   15   16   26    5  118]
 [  27   30    4  138   20  587   19    4   65    9]
 [   9   28   24    4   42   71  790    1   17    0]
 [   9   39    7    0   18    2    1  899   13   56]
 [   9   51   19  114   31   67   21    8  628   27]
 [   8   39   26   14   96   15    0  126   13  655]]
Accuracy:  0.7673
Macro F1-score:  0.7616505418980798
Micro F1-score:  0.7673
Weighted F1-score:  0.7643518452322596
---------------------------------
--- Query no:  1  ----
Instance [2 8 2 3 2 9 2 5 1 5 1 3 8 8 2 2 3 5 6 8]
Confusion Matrix:
[[ 867    0   26    6    9   47   17    0    3   12]
 [   0 1085    6    1    0    1    1    3   27    0]
 [   7   43  731   36   48   21   34   31   38    4]
 [  13   34   34  79

In [24]:
AL_Retrain(300, 10, 20)

Number of initial data:  300
Confusion Matrix:
[[ 905    1    8    9   19   26    7    1   10    1]
 [   0 1104    7    2    5    2    0    2    1    1]
 [  16   43  811   18   26    4   23   21   28    3]
 [   7   31   48  806   12   48    4    6   43   17]
 [   4   13   10    2  850    3    4   10    0   78]
 [  10   32    8   90   34  666    7    3   43   10]
 [  26   13   19   11   29   21  860    1    6    0]
 [   2   34   14    0   34    1    1  915    8   35]
 [   7   56   30   77   17   52    5    5  704   22]
 [   6   17   14   17   73    9    1   54    7  794]]
Accuracy:  0.8415
Macro F1-score:  0.8391061546157041
Micro F1-score:  0.8415
Weighted F1-score:  0.8406312017169725
---------------------------------
--- Query no:  1  ----
Instance [5 5 1 8 8 3 5 0 5 0 0 0 6 0 9 6 5 7 7 5]
Confusion Matrix:
[[ 931    1    7    4   12   19   10    0    3    0]
 [   0 1099    6    1    7    6    1    2    2    0]
 [  15   41  805   19   23   11   24   28   27    0]
 [  10   40   27  77

In [25]:

AL_Retrain(300, 10, 30)

Number of initial data:  300
Confusion Matrix:
[[ 905    1    8    9   19   26    7    1   10    1]
 [   0 1104    7    2    5    2    0    2    1    1]
 [  16   43  811   18   26    4   23   21   28    3]
 [   7   31   48  806   12   48    4    6   43   17]
 [   4   13   10    2  850    3    4   10    0   78]
 [  10   32    8   90   34  666    7    3   43   10]
 [  26   13   19   11   29   21  860    1    6    0]
 [   2   34   14    0   34    1    1  915    8   35]
 [   7   56   30   77   17   52    5    5  704   22]
 [   6   17   14   17   73    9    1   54    7  794]]
Accuracy:  0.8415
Macro F1-score:  0.8391061546157041
Micro F1-score:  0.8415
Weighted F1-score:  0.8406312017169725
---------------------------------
--- Query no:  1  ----
Instance [5 3 8 0 5 3 6 5 2 0 8 0 8 8 5 5 5 8 3 7 8 0 0 2 7 9 6 1 2 0]
Confusion Matrix:
[[ 929    1    7    4    9   27    6    1    2    1]
 [   0 1099   10    1    6    4    1    2    1    0]
 [  13   30  835   25   16   10   18   14   30    2]
