# Load libraries and define functions

In [50]:
# Library imports
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from skimage.io import imread
from skimage.transform import resize
from sklearn.metrics import accuracy_score
import re

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [12]:
# model/training variables. batch size must be divisor of number of training, validation, and test records.
BATCH_SIZE = 30
IMG_HEIGHT = 224
IMG_WIDTH = 224

In [45]:
# https://medium.com/@mrgarg.rajat/training-on-large-datasets-that-dont-fit-in-memory-in-keras-60a974785d71
# This generator has to be used because the dataset is too large to keep in memory so we have to load it in in batches.

class My_Custom_Generator(tf.keras.utils.Sequence) :
  
  def __init__(self, image_filenames, labels, batch_size, IMG_HEIGHT, IMG_WIDTH) :
    self.image_filenames = image_filenames
    self.labels = labels
    self.batch_size = batch_size
    self.IMG_HEIGHT = IMG_HEIGHT
    self.IMG_WIDTH = IMG_WIDTH
    
    
  def __len__(self) :
    return (np.ceil(len(self.image_filenames) / float(self.batch_size))).astype(np.int)
  
  
  def __getitem__(self, idx) :
    batch_x = self.image_filenames[idx * self.batch_size : (idx+1) * self.batch_size]
    batch_y = self.labels[idx * self.batch_size : (idx+1) * self.batch_size]
    
    return np.array([
            np.resize(imread(str(file_name)), (self.IMG_HEIGHT, self.IMG_WIDTH, 3))
               for file_name in batch_x])/255.0, np.array(batch_y)

In [4]:
def prep_data_and_model(dataset, skip_load = False):
    """
    Function to load all the relevant data, train the model with early stopping, saving best validation accuracy snapshots, and 
        save training results to a file for later visualisation. 
    arg:
        dataset - name of the dataset files to work from
        skip_load - should the load from drive into colab be skipped
    """
    print('Loading Data details')
    # Prep datasets
    with open('/content/drive/My Drive/Data/final-book30-labels-train.csv', mode='r', encoding='utf-8', errors='ignore') as f:
        train_labels = pd.read_csv(f, delimiter=",", header=None, names=['record', 'Filename', 'Category ID'])

    with open('/content/drive/My Drive/Data/final-book30-labels-valid.csv', mode='r', encoding='utf-8', errors='ignore') as f:
        valid_labels = pd.read_csv(f, delimiter=",", header=None, names=['record', 'Filename', 'Category ID'])

    train_labels = train_labels.assign(Full_Filename = f'/content/{dataset}/'+ train_labels["Filename"])
    valid_labels = valid_labels.assign(Full_Filename = f'/content/valid_{dataset}/{dataset}/'+ valid_labels["Filename"])

    print('Loading data')
    if not skip_load:
        # Load actual data
        zip_path = f'/content/drive/My Drive/images/Train/{dataset}.zip'
        !cp "{zip_path}" .
        !unzip -q "{dataset}.zip" 
        !rm "{dataset}.zip" 

        zip_path = f'/content/drive/My Drive/images/Valid/{dataset}.zip'
        !cp "{zip_path}" .
        !unzip -q "{dataset}.zip"  -d "valid_{dataset}" 
        !rm "{dataset}.zip" 

    print('Prepping model')
    my_training_batch_generator = My_Custom_Generator(train_labels["Full_Filename"], train_labels["Category ID"], BATCH_SIZE, IMG_HEIGHT, IMG_WIDTH)
    my_validation_batch_generator = My_Custom_Generator(valid_labels["Full_Filename"], valid_labels["Category ID"], BATCH_SIZE, IMG_HEIGHT, IMG_WIDTH)

    base_model = tf.keras.applications.MobileNetV2(input_shape=(224, 224, 3),
                                               include_top=False,
                                               weights='imagenet')

    base_model.trainable = False

    model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(30, activation='softmax')
    ])

    model.compile(optimizer='Adam',
              loss='SparseCategoricalCrossentropy',
              metrics=['accuracy', tf.keras.metrics.SparseTopKCategoricalAccuracy(3)])
    
      #early stopping and checkpoints
    es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', verbose=1, patience=15)
    mc = tf.keras.callbacks.ModelCheckpoint(f'/content/drive/My Drive/Models/data_explore/{dataset}' + '_{epoch:02d}.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True, save_weights_only=True)
    cl = tf.keras.callbacks.CSVLogger(f"/content/drive/My Drive/Models/data_explore/{dataset}_model_history_log.csv", append = True)

    print('Training model')
    history = model.fit_generator(generator=my_training_batch_generator, 
                              validation_data = my_validation_batch_generator,
                              steps_per_epoch = int(len(train_labels) // BATCH_SIZE),
                              validation_steps = int(5130 // BATCH_SIZE),
                              epochs = 30,
                              verbose = 1,
                              callbacks = [es, mc, cl])
    
    return history


# Train models

In [None]:
prep_data_and_model('padded', True)

Loading Data details
Loading data
Prepping model
Training model
Epoch 1/30
Epoch 00001: val_accuracy improved from -inf to 0.14113, saving model to /content/drive/My Drive/Models/data_explore/padded_01.h5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.14113 to 0.14172, saving model to /content/drive/My Drive/Models/data_explore/padded_02.h5
Epoch 3/30
Epoch 00003: val_accuracy improved from 0.14172 to 0.15712, saving model to /content/drive/My Drive/Models/data_explore/padded_03.h5
Epoch 4/30
Epoch 00004: val_accuracy did not improve from 0.15712
Epoch 5/30
Epoch 00005: val_accuracy improved from 0.15712 to 0.17076, saving model to /content/drive/My Drive/Models/data_explore/padded_05.h5
Epoch 6/30
Epoch 00006: val_accuracy did not improve from 0.17076
Epoch 7/30
Epoch 00007: val_accuracy did not improve from 0.17076
Epoch 8/30
Epoch 00008: val_accuracy improved from 0.17076 to 0.17407, saving model to /content/drive/My Drive/Models/data_explore/padded_08.h5
Epoch 9/30
Epoch 0000

<tensorflow.python.keras.callbacks.History at 0x7f0f2c6bfcc0>

In [None]:
!rm -r padded
!rm -r valid_padded

In [None]:
prep_data_and_model('noprep')

Loading Data details
Loading data
Prepping model
Training model
Epoch 1/30
Epoch 00001: val_accuracy improved from -inf to 0.13080, saving model to /content/drive/My Drive/Models/data_explore/noprep_01.h5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.13080 to 0.14191, saving model to /content/drive/My Drive/Models/data_explore/noprep_02.h5
Epoch 3/30
Epoch 00003: val_accuracy improved from 0.14191 to 0.14444, saving model to /content/drive/My Drive/Models/data_explore/noprep_03.h5
Epoch 4/30
Epoch 00004: val_accuracy did not improve from 0.14444
Epoch 5/30
Epoch 00005: val_accuracy did not improve from 0.14444
Epoch 6/30
Epoch 00006: val_accuracy improved from 0.14444 to 0.14756, saving model to /content/drive/My Drive/Models/data_explore/noprep_06.h5
Epoch 7/30
Epoch 00007: val_accuracy did not improve from 0.14756
Epoch 8/30
Epoch 00008: val_accuracy did not improve from 0.14756
Epoch 9/30
Epoch 00009: val_accuracy did not improve from 0.14756
Epoch 10/30
Epoch 00010: val_accu

<tensorflow.python.keras.callbacks.History at 0x7f0f15768dd8>

In [None]:
!rm -r noprep
!rm -r valid_noprep

In [None]:
prep_data_and_model('cropped')

Loading Data details
Loading data
Prepping model
Training model
Epoch 1/30
Epoch 00001: val_accuracy improved from -inf to 0.11404, saving model to /content/drive/My Drive/Models/data_explore/cropped_01.h5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.11404 to 0.12027, saving model to /content/drive/My Drive/Models/data_explore/cropped_02.h5
Epoch 3/30
Epoch 00003: val_accuracy did not improve from 0.12027
Epoch 4/30
Epoch 00004: val_accuracy improved from 0.12027 to 0.13567, saving model to /content/drive/My Drive/Models/data_explore/cropped_04.h5
Epoch 5/30
Epoch 00005: val_accuracy improved from 0.13567 to 0.14016, saving model to /content/drive/My Drive/Models/data_explore/cropped_05.h5
Epoch 6/30
Epoch 00006: val_accuracy did not improve from 0.14016
Epoch 7/30
Epoch 00007: val_accuracy did not improve from 0.14016
Epoch 8/30
Epoch 00008: val_accuracy did not improve from 0.14016
Epoch 9/30
Epoch 00009: val_accuracy did not improve from 0.14016
Epoch 10/30
Epoch 00010: val_

<tensorflow.python.keras.callbacks.History at 0x7f0f128730b8>

In [None]:
!rm -r cropped
!rm -r valid_cropped

# Evaluate Models


In [42]:
def test_data_pred(dataset, skip_load = False):
    print('Loading Data details')
    # Prep datasets
    with open('/content/drive/My Drive/Data/final-book30-labels-test.csv', mode='r', encoding='utf-8', errors='ignore') as f:
        test_labels = pd.read_csv(f, delimiter=",", header=None, names=['record', 'Filename', 'Category ID'])

    test_labels = test_labels.assign(Full_Filename = f'/content/test_{dataset}/{dataset}/'+ test_labels["Filename"])

    print('Loading data')
    if not skip_load:
        # Load actual data

        zip_path = f'/content/drive/My Drive/images/Test/{dataset}.zip'
        !cp "{zip_path}" .
        !unzip -q "{dataset}.zip"  -d "test_{dataset}" 
        !rm "{dataset}.zip" 

    my_test_batch_generator = My_Custom_Generator(test_labels["Full_Filename"], test_labels["Category ID"], BATCH_SIZE, IMG_HEIGHT, IMG_WIDTH)
    print('Loading model')

    base_model = tf.keras.applications.MobileNetV2(input_shape=(224, 224, 3),
                                               include_top=False,
                                               weights='imagenet')

    base_model.trainable = False

    model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(30, activation='softmax')
    ])

    model.compile(optimizer='Adam',
              loss='SparseCategoricalCrossentropy',
              metrics=['accuracy', tf.keras.metrics.SparseTopKCategoricalAccuracy(3)])
    
    files = os.listdir('/content/drive/My Drive/Models/data_explore/')
    pat = re.compile(f'^.*{dataset}.*\.h5$')
    files_cut = [i for i in files if pat.match(i) ]
    model_weights = max(files_cut)

    model.load_weights(f'/content/drive/My Drive/Models/data_explore/{model_weights}')

    print('Making predition')

    y_pred = np.argmax(model.predict(my_test_batch_generator, steps = int(len(test_labels) // BATCH_SIZE)), axis=-1)


    return y_pred, test_labels["Category ID"]




In [54]:
padded_pred, y_true = test_data_pred('padded')
!rm -r test_padded
noprep_pred, y_true = test_data_pred('noprep')
!rm -r test_noprep
cropped_pred, y_true = test_data_pred('cropped')
!rm -r test_cropped

Loading Data details
Loading data
Loading model
Making predition
Loading Data details
Loading data
Loading model
Making predition
Loading Data details
Loading data
Loading model
Making predition


In [57]:
# best method based on overall top 1 accuracy is padded.
print('Padded: ', accuracy_score(padded_pred, y_true), '\n',
      'NoPrep: ',accuracy_score(noprep_pred, y_true), '\n',
      'Cropped: ',accuracy_score(cropped_pred, y_true), sep = '')


Padded: 0.1580701754385965
NoPrep: 0.14385964912280702
Cropped: 0.1287719298245614
