# Importing all the necessary libraries 

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications.vgg16 import VGG16

In [None]:
import cv2
import matplotlib.pyplot as plt

In [None]:
from albumentations import (
    Compose, RandomBrightness, JpegCompression, HueSaturationValue, RandomContrast, HorizontalFlip, Rotate, GaussianBlur, Cutout
)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Seed every random number generator the notebook may draw from, not only
# TensorFlow's: the data generator shuffles its indices with np.random.shuffle,
# and the augmentation library presumably draws from Python's `random` and/or
# NumPy (confirm for the installed albumentations version).
import random

random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)

# 1. Read the dataset

In [None]:
# Load the image index of the dataset and display summary statistics for it
index_csv_path = '../input/lego-minifigures-classification/index.csv'
df = pd.read_csv(index_csv_path)
df.describe()

In [None]:
df.head()

# 2. Splitting the dataset into training and validation sets
The index file has the following columns:
* path - relative path to the image
* class_id - class of the image
* train-valid - indicates whether the image belongs to the training or the validation data

We use the train-valid column to split the dataset into training and validation sets.

In [None]:
# Select the rows of each subset by comparing the 'train-valid' column once per split
split_column = df['train-valid']
train_set = df[split_column == 'train']
valid_set = df[split_column == 'valid']

An overview of the training set

In [None]:
train_set

An overview of the validation set

In [None]:
valid_set

Creating the train paths list to hold all the image paths to training images

In [None]:
# Prefix every training image path from the index with the dataset directory
dataset_root = '../input/lego-minifigures-classification/'
train_paths = [
    os.path.join(dataset_root, relative_path)
    for relative_path in train_set['path'].values
]
train_paths

Creating the valid paths list to hold all the image paths to validation images

In [None]:
# Prefix every validation image path from the index with the dataset directory
dataset_root = '../input/lego-minifigures-classification/'
valid_paths = [
    os.path.join(dataset_root, relative_path)
    for relative_path in valid_set['path'].values
]
valid_paths

Creating the train labels list to hold all the corresponding classes of training images

In [None]:
# Class ids of the training images as a NumPy array, in the same order as train_set
train_labels = train_set['class_id'].to_numpy()
train_labels

Creating the valid labels list to hold all the corresponding classes of validation images

In [None]:
# Class ids of the validation images as a NumPy array, in the same order as valid_set
valid_labels = valid_set['class_id'].to_numpy()
valid_labels

Getting the number of classes of images from the metadata.csv file

In [None]:
# The number of classes is taken to be the number of rows in the metadata file
metadata_csv_path = '../input/lego-minifigures-classification/metadata.csv'
dfmeta = pd.read_csv(metadata_csv_path)
no_of_classes = len(dfmeta)
no_of_classes

# 3. Creating a custom data generator for our dataset.
For more information and a detailed explanation of the concept, do check out these amazing blog posts:
* https://towardsdatascience.com/implementing-custom-data-generators-in-keras-de56f013581c 
* https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

Image augmentations are also performed by the data generator using the albumentations package.
* https://albumentations.ai/docs/getting_started/image_augmentation/

We'll be using augmentations.transforms for performing augmentations such as flip, rotation, blur, contrast and brightness changes
* https://albumentations.ai/docs/api_reference/augmentations/transforms/

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads images from disk one batch at a time.

    Every image is read with OpenCV, resized to ``image_size``, converted from
    BGR to RGB, optionally augmented and finally scaled by ``1 / 255``.

    Args:
        paths: Sequence of image file paths.
        labels: Sequence of class ids aligned with ``paths``. One is
            subtracted from every id when a batch is built (the ids are
            presumably 1-based -- confirm against the dataset). When ``None``,
            batches contain only the images.
        image_size: Target size handed to ``cv2.resize``.
        batch_size: Number of samples per batch; the last batch of an epoch
            may be smaller.
        num_classes: Number of classes. Stored on the instance but not used
            by the generator itself.
        shuffle: If true, the sample order is reshuffled in ``on_epoch_end``.
        transforms: Callable invoked as ``transforms(image=img)['image']``,
            or a falsy value to disable augmentation.

    Raises:
        ValueError: If ``batch_size`` is smaller than one or ``labels`` does
            not have the same length as ``paths``.
    """

    def __init__(self, paths, labels = None, image_size = (512,512), batch_size = 32, num_classes = None, shuffle = False, transforms = False):
        # Let the Keras base class initialise its own state.
        super().__init__()
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))
        if labels is not None and len(labels) != len(paths):
            raise ValueError('paths and labels must have the same length')
        self.paths = paths
        self.labels = labels
        self.image_size = image_size
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.shuffle = shuffle
        self.transforms = transforms
        # Build the initial (optionally shuffled) index order.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, counting a final partial batch."""
        # Ceiling division so that trailing samples are not silently dropped.
        return (len(self.paths) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index``.

        Returns:
            ``(X, y)`` when labels were supplied, otherwise only ``X``.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``. This also
                terminates plain ``for`` iteration over the generator.
        """
        if not 0 <= index < len(self):
            raise IndexError('batch index {} is out of range'.format(index))
        indices = self.indices[index * self.batch_size : (index + 1) * self.batch_size]
        return self.__get_data(indices)

    def __load_image(self, path):
        """Read one image and return it as a float RGB array scaled by 1/255."""
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: {}'.format(path))
        img = cv2.resize(img, self.image_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transforms:
            img = self.transforms(image = img)['image']
        return img/255.0

    def __get_data(self, indices):
        """Load the images (and labels, if any) for the given sample indices."""
        # Iterate over the indices actually present so that a short final
        # batch works as well as a full one.
        images = np.array([self.__load_image(self.paths[k]) for k in indices])
        if self.labels is None:
            return images
        labels = np.array([self.labels[k] - 1 for k in indices])
        return images, labels

    # this function is called at the end of every epoch
    def on_epoch_end(self):
        """Reset the sample order and reshuffle it when ``shuffle`` is enabled."""
        self.indices = np.arange(len(self.paths))
        if self.shuffle:
            np.random.shuffle(self.indices)
            
# builds the image augmentation pipeline
def transforms():
    """Return an albumentations ``Compose`` holding the augmentation steps.

    The steps are, in order: rotation (limit 40), horizontal flip, brightness
    and contrast changes, JPEG compression, hue/saturation/value shift,
    Gaussian blur and cutout. All steps except the rotation are given an
    explicit probability of 0.5.
    """
    augmentation_steps = [
        Rotate(limit=40),
        HorizontalFlip(p=0.5),
        RandomBrightness(limit=0.2,p=0.5),
        RandomContrast(limit=0.2, p=0.5),
        JpegCompression(quality_lower=85, quality_upper=100, p=0.5),
        HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.5),
        GaussianBlur(blur_limit=(3, 7), always_apply=False, p=0.5),
        Cutout(num_holes=8, max_h_size=8, max_w_size=8, fill_value=0, always_apply=False, p=0.5),
    ]
    return Compose(augmentation_steps)
    

In [None]:
# Size every image is resized to; passed to both data generators as image_size
IMAGE_SIZE = (512,512)

# 4. Specifying the train and validation data generators

In [None]:
# Training generator: batches of 4, shuffled between epochs, with augmentation enabled
train_generator = DataGenerator(
    paths=train_paths,
    labels=train_labels,
    image_size=IMAGE_SIZE,
    batch_size=4,
    num_classes=no_of_classes,
    shuffle=True,
    transforms=transforms(),
)

In [None]:
# Validation generator: one image per batch, fixed order, no augmentation
valid_generator = DataGenerator(
    paths=valid_paths,
    labels=valid_labels,
    image_size=IMAGE_SIZE,
    batch_size=1,
    num_classes=no_of_classes,
    shuffle=False,
)

# 5. Plotting images from the train and validation sets

Plotting a few images from the training set

In [None]:
# Draw a 4x4 grid: each grid row shows the first four images of one training batch,
# titled with their labels
plt.figure(figsize=(16, 16))
for batch_number in range(4):
    batch_images, batch_labels = train_generator[batch_number]
    first_cell = batch_number * 4 + 1
    for offset in range(4):
        plt.subplot(4, 4, first_cell + offset)
        plt.imshow(batch_images[offset])
        plt.title(batch_labels[offset])

Plotting a few images from the validation set

In [None]:
# Draw the first image of each of the first 16 validation batches in a 4x4 grid,
# titled with its label
plt.figure(figsize=(16, 16))
for cell in range(1, 17):
    image, label = valid_generator[cell - 1]
    plt.subplot(4, 4, cell)
    plt.imshow(image[0])
    plt.title(label[0])

# Pre Trained Models in keras, tensorflow

Keras offers a wide range of pre-trained models that we can choose from and use in our code.
To read about all the pre-trained models available in Keras and their usage, refer to:
https://keras.rstudio.com/articles/applications.html

# 6. Creating and training a VGG16 model

We will be using the VGG16 model, which was built for image classification. It was developed by the Visual Geometry Group (VGG) at Oxford and described in the paper "Very Deep Convolutional Networks for Large-Scale Image Recognition".

Some useful links:
* https://keras.io/api/applications/vgg/
* https://keras.rstudio.com/reference/application_vgg.html
* https://www.analyticsvidhya.com/blog/2020/08/top-4-pre-trained-models-for-image-classification-with-python-code/

In [None]:
def create_model(input_shape, num_classes = None):
    """Build a classifier made of a frozen VGG16 base and a small dense head.

    Args:
        input_shape: Shape of the input images passed to ``VGG16``,
            e.g. ``(512, 512, 3)``.
        num_classes: Number of units in the softmax output layer. When
            ``None`` the module-level ``no_of_classes`` is used.

    Returns:
        An uncompiled Keras ``Model`` mapping the VGG16 input to the class
        probabilities.
    """
    if num_classes is None:
        # Fall back to the notebook-level class count for existing callers.
        num_classes = no_of_classes

    # VGG16 convolutional base with ImageNet weights and without its own
    # classification layers
    base_model = VGG16(input_shape = input_shape,
                       include_top = False,
                       weights = 'imagenet')

    # freeze the base so that only the new head is trained
    for layer in base_model.layers:
        layer.trainable = False

    # classification head: flatten -> dense -> dropout -> softmax
    x = layers.Flatten()(base_model.output)
    x = layers.Dense(512, activation = 'relu')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(num_classes, activation = 'softmax')(x)

    return models.Model(base_model.input,x)

In [None]:
# Derive the network input shape from IMAGE_SIZE so the model cannot drift out
# of sync with the images produced by the data generators. cv2.resize takes its
# size as (width, height) while Keras expects (height, width, channels), hence
# the swapped indices.
model = create_model((IMAGE_SIZE[1], IMAGE_SIZE[0], 3))

In [None]:
# Integer class labels -> sparse categorical cross-entropy; Adam with a learning rate of 1e-4
optimizer = Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Adding some callback functions from the tensorflow library
* EarlyStopping - https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
* ModelCheckpoint - https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint

In [None]:
# Halt training once 'val_loss' has gone 5 epochs without improving, then
# restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Write the model to 'best_model.hdf5' only when 'val_loss' reaches a new minimum
checkpoint = ModelCheckpoint(
    'best_model.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Maximum number of training epochs passed to model.fit; the EarlyStopping
# callback may end training sooner
EPOCHS = 50

In [None]:
# Train on the training generator, validating on the validation generator after
# every epoch; one epoch covers every batch of each generator
training_callbacks = [early_stopping, checkpoint]
history = model.fit(
    train_generator,
    epochs=EPOCHS,
    steps_per_epoch=len(train_generator),
    validation_data=valid_generator,
    validation_steps=len(valid_generator),
    callbacks=training_callbacks,
)

# 7. Evaluating the model

In [None]:
model.summary()

Plotting the training vs. validation accuracy and the training vs. validation loss

In [None]:
# Pull the per-epoch metrics recorded by model.fit
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']

epochs = range(len(acc))

# One figure per metric: (title, (training values, legend), (validation values, legend))
panels = [
    ('Training and validation accuracy',
     (acc, 'Training accuracy'),
     (val_acc, 'Validation accuracy')),
    ('Training and validation loss',
     (loss, 'Training Loss'),
     (val_loss, 'Validation Loss')),
]
for panel_number, (heading, training_curve, validation_curve) in enumerate(panels):
    if panel_number > 0:
        # The first panel uses the implicit current figure; later ones get a new figure
        plt.figure()
    plt.plot(epochs, training_curve[0], 'c-', label=training_curve[1])
    plt.plot(epochs, validation_curve[0], 'y-', label=validation_curve[1])
    plt.title(heading)
    plt.legend()

plt.show()

Load the best trained model and check its summary

In [None]:
# Restore the checkpointed model from disk into a separate model object
best_model_path = 'best_model.hdf5'
new_model = models.load_model(best_model_path)

# Print the restored architecture
new_model.summary()

In [None]:
# Run the restored model over every validation batch, collecting the predicted
# class (argmax over the model output) next to the true label
actual_y, pred_y = [], []

for batch_number in range(len(valid_generator)):
    image, label = valid_generator[batch_number]
    class_scores = new_model.predict(image)
    pred_y.extend(class_scores.argmax(axis=1))
    actual_y.extend(label)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of validation images whose predicted class matches the true one, shown as a percentage
acc = accuracy_score(actual_y, pred_y)
accuracy_report = 'Accuracy: {:5.2f}%'.format(acc * 100)
print(accuracy_report)