### Load data & default imports

In [9]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
import scipy.stats as sstats
import multiprocessing as mp
import httpimport
from os.path import join
import os.path
from PIL import Image

import tensorflow as tf

import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten

import xgboost

from tqdm.auto import tqdm

from matplotlib import animation, pyplot, rc
import matplotlib.pyplot as plt
from matplotlib import animation, pyplot, rc

from scipy.optimize import curve_fit
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction import image
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
import sklearn.linear_model

from FeaturesExtractor import FeaturesExtractor
from helpers import plot_history

with np.load('cifar.npz') as data:
    X_trn = data['train_data']
    y_trn = data['train_labels']
    X_tst = data['test_data']
    y_tst = data['test_labels']

In [None]:
""" 
Helper function.
"""


def plot_history(history):
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    epochs = range(1, len(loss) + 1)

    fig, axs = plt.subplots(1, 2, figsize=(15,5))

    axs[0].plot(epochs, loss, 'r', label='Training loss')
    axs[0].plot(epochs, val_loss, 'b', label='Validation loss')
    axs[0].set_title('Training and validation loss')
    axs[0].set_xlabel('Epochs')
    axs[0].set_ylabel('Loss')
    axs[0].legend()

    axs[1].plot(epochs, acc, 'r', label='Training accuracy')
    axs[1].plot(epochs, val_acc, 'b', label='Validation accuracy')
    axs[1].set_title('Training and validation accuracy')
    axs[1].set_xlabel('Epochs')
    axs[1].set_ylabel('Accuracy')
    axs[1].legend()

    plt.show()

### Wrapper functions for easier testing

In [10]:
# All in one place ...

def preprocess(X_trn, y_trn, X_tst, y_tst, PATCH_SIZE, PATCH_NUM, STRIDE, K):

    extractor = FeaturesExtractor(PATCH_SIZE, PATCH_NUM, STRIDE, K)

    kmeans_counts = extractor.fit(X_trn)

    X_trn_features = extractor.extract(X_trn)
    X_tst_features = extractor.extract(X_tst)
    
    return X_trn_features, y_trn, X_tst_features, y_tst

def test_model(X_trn_features, y_trn_categorical, X_tst_features, y_tst_categorical, estimators, depth, subsample, v=True):
    import xgboost as xgb

    model = xgb.XGBClassifier(
        n_estimators = estimators,
        num_class = 10,
        max_depth = depth,
        seed = 69,
        eval_metric = 'merror',
        subsample=subsample
    )

    model.fit(X_trn_features, y_trn, verbose = v, eval_set = [(X_tst_features, y_tst)])

    # Save the model's weights
    model.save_model('xgboost_220_10_new_patches.bin')

    accuracy = model.score(X_tst_features, y_tst)

    if v:
        print('Accuracy:', accuracy)
    return accuracy, model

    

### Extract features and prepare training data

In [12]:
PATCH_SIZE = 16
PATCH_NUM  = 500000
STRIDE     = 16
K          = 32
X_trn_features, y_trn_categorical, X_tst_features, y_tst_categorical = preprocess(X_trn, y_trn, X_tst, y_tst, PATCH_SIZE, PATCH_NUM, STRIDE, K)

Started fitting extractor:


  0%|          | 0/1731 [00:00<?, ?it/s]

Started extracting


  0%|          | 0/50000 [00:00<?, ?it/s]

Started extracting


  0%|          | 0/10000 [00:00<?, ?it/s]

In [14]:
ESTIMATORS = 250
DEPTH = 5
SUBSAMPLE = 1.0
accuracy, model = test_model(X_trn_features, y_trn_categorical, X_tst_features, y_tst_categorical, ESTIMATORS, DEPTH, SUBSAMPLE)
print(accuracy)

[0]	validation_0-merror:0.70580
[1]	validation_0-merror:0.68530
[2]	validation_0-merror:0.66670
[3]	validation_0-merror:0.65760
[4]	validation_0-merror:0.64640
[5]	validation_0-merror:0.63760
[6]	validation_0-merror:0.63290


KeyboardInterrupt: 

In [15]:
plt.hist(kmeans_counts, bins=K)
plt.xlabel('Number of elements')
plt.ylabel('Count')
plt.title('Distribution of number of elements for centroids')
plt.show()

NameError: name 'kmeans_counts' is not defined

### Trying neural network instead of XGBoost

In [17]:
tf.keras.utils.set_random_seed(69)

M = X_trn_features.shape[1]
DROPOUT = 0.3

model = Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(M,)),
    keras.layers.Dropout(DROPOUT),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(DROPOUT),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(DROPOUT),
    keras.layers.Dense(10, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_trn_features, y_trn_categorical, batch_size=200, epochs=150, verbose=1, validation_data=(X_tst_features, y_tst_categorical))

loss, accuracy = model.evaluate(X_tst_features, y_tst_categorical, verbose=0)
print('Accuracy:', accuracy)
plot_history(history)

Epoch 1/150


2023-04-24 00:58:52.458661: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ValueError: in user code:

    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/engine/training.py", line 1024, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/engine/training.py", line 1082, in compute_loss
        return self.compiled_loss(
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/losses.py", line 284, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/losses.py", line 2004, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/opt/homebrew/Caskroom/miniconda/base/envs/ml/lib/python3.9/site-packages/keras/backend.py", line 5532, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (200, 1) and (200, 10) are incompatible


Explanations for the results table in the README:

* More features logreg (1)
```
PATCH_SIZE = 4
PATCH_NUM  = 1000000
STRIDE     = 2
K          = 256
```

* Slightly more features logreg (2)
```
PATCH_SIZE = 6
PATCH_NUM  = 100000
STRIDE     = 8
K          = 64
```
* Simple neural network (3)
```
PATCH_SIZE = 4
PATCH_NUM  = 1000000
STRIDE     = 2
K          = 64

Neural network achitecture:

dense 1024 -> dense 512 -> dense 10

activation : relu
loss       : categorical_crossentropy
optimizer  : adam
```
* A bit tuned neural network (4)
```
PATCH_SIZE = 12
PATCH_NUM  = 1000000
STRIDE     = 6
K          = 256

Neural network achitecture:

dense 1024 -> dense 1024 -> dense 1024 -> dense 10

dropout    : 0.2
activation : relu
loss       : categorical_crossentropy
optimizer  : adam
```
* Random Forest (5)
```
PATCH_SIZE = 7
PATCH_NUM  = 2000000
STRIDE     = 5
K          = 256

default
```
* XGBoost (6)
```
PATCH_SIZE = 8
PATCH_NUM  = 1000000
STRIDE     = 6
K          = 32

max depth  : 15
estimators : 60
```
* XGBoost (7)
```
max depth  : 6
estimators : 100
```
* XGBoost (8)
```
PATCH_SIZE = 10
PATCH_NUM  = 4000000
STRIDE     = 5
K          = 128