# This Notebook Summarizes Results for the Cassava Leaf Disease Classification Competition 

This notebook is part of my learning journey to deepen my knowledge in deep learning using Keras. 

**Main points of this work:**
* Implements transfer learning using EfficientNetB4 and fine tuning the weights of all layers.
* Using stratified K-fold cross-validation.
* Implementing custom image generator since the dataset is extremely large [1].
* Image augmentation using imgaug library [2].
* Label smoothing was found to improve the performance - Most likely due to noisy training data labeling.
* The custom image generator takes random 400x400x3 crops of the images during training. After each epoch, a custom validation step is applied where 6 random crops are taken per image with a less aggressive augmentation compared to training. The final prediction is based on the sum over all 6 random crops.
* Custom function implemented to reduce the learning rate on plateau based on the custom validation step.
* Custom function for keeping the best weights based on the custom validation step, very similar to early stopping.


**Other things that were tried but didn't show any improvements:**
* Using max instead of sum during validation.
* Taking the N highest values per feature during validation instead of sum.
* Fixed weights of EfficientNetB4 and adding trainable dense layers at the output.


**Other comments:**
* This notebook shows an example of training one fold. Folds are run in different notebooks due to Kaggle runtime limitation.

[1] https://medium.com/@mrgarg.rajat/training-on-large-datasets-that-dont-fit-in-memory-in-keras-60a974785d71 

[2] https://imgaug.readthedocs.io/en/latest/source/overview_of_augmenters.html

In [None]:
import numpy as np 
import pandas as pd
import time as time
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.lines as mlines
import keras
from keras import backend as K
from keras.utils import to_categorical
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.applications import MobileNetV2, EfficientNetB3, InceptionResNetV2, EfficientNetB4
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D, Flatten, BatchNormalization

from skimage.transform import rescale, resize
import seaborn as sn

import imgaug as ia
import imgaug.augmenters as iaa
from imgaug import parameters as iap

from sklearn.model_selection import StratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import pickle

In [None]:
# Settings
class settings:
    """Notebook-wide configuration; values are read directly as class attributes."""

    # --- Cross-validation ---
    Nfolds = 5                      # number of stratified folds
    run_folds = [4]                 # zero-based fold indices trained in this run

    # --- Image geometry ---
    imsize = 400                    # assuming square images
    input_shape = (imsize, imsize, 3)
    nsubcuts = 6                    # random crops per image in the validation step

    # --- Optimisation ---
    initial_learning_rate = 0.25e-4
    reduceLR = 0.5                  # factor the learning rate is multiplied by
    patience = 2                    # non-improving epochs counted before reducing LR
    tolerance = 1                   # not referenced by the code visible in this notebook
    label_smoothing = 0.1
    R_dropout_finalLayer = 0.4      # dropout rate in front of the output layer

    # --- Run control ---
    batchSize = 16
    batchSize_predict = 1
    numEpochs = 12
    debug = False                   # True -> train on a small subset only

In [None]:
path = '../input/cassava-leaf-disease-classification/'
# Load the training table and shuffle the rows reproducibly (fixed seed).
df = pd.read_csv(path + "train.csv")
df = shuffle(df, random_state=42)

# If we need to debug we can use a subset of all images:
if settings.debug:
    # Positional slice so that exactly the first 150 rows are kept,
    # regardless of the shuffled index labels.
    df = df.iloc[:150]

# Examples of images from the training set
* Image size is (600,800,3)

In [None]:
imgPath_train = path + 'train_images/'
fileList = os.listdir(imgPath_train)

# Preview a 2x6 grid of raw training images (entries 7..18 of the listing).
plt.figure(figsize=(40, 10))
gs = gridspec.GridSpec(2, 6)
gs.update(wspace=0, hspace=0.005)  # set the spacing between axes.
for k, file_pos in enumerate(range(7, 19)):
    img = Image.open(imgPath_train + fileList[file_pos])
    plt.subplot(gs[k])
    plt.imshow(img)
    plt.axis('off')

# Investigating label statistics

In [None]:
labels = np.unique(df['label'])
labels_text = [
    "Cassava Bacterial Blight (CBB)",
    "Cassava Brown Streak Disease (CBSD)",
    "Cassava Green Mottle (CGM)",
    "Cassava Mosaic Disease (CMD)",
    "Healthy",
]

# Count the samples of every class (raw counts; normalised further below).
classProb = np.zeros(len(labels))
for idx, k in enumerate(labels):
    n_samples = (df['label'] == k).sum()
    print(f"{k} contains {n_samples} samples")
    classProb[idx] = n_samples

# Visualizing the results in a pie-chart:
print()  # Empty line before figure
color = ['#58508d', '#bc5090', '#ff6361', '#ffa600', '#55AF21']
plt.figure(figsize=(15, 7))
plt.pie(
    classProb,
    shadow=True,
    explode=[0.4, 0, 0, 0, 0.0],
    labels=labels_text,
    autopct='%1.2f%%',
    colors=color,
    startangle=0,
    textprops={'fontsize': 14},
)

# Weight per class: squared inverse of the prior probability derived from
# the training data.
class_prior = classProb / classProb.sum()
class_weight_vect = np.square(1 / class_prior)

# Functions to return randomly cropped images
* We will use random cuts of size (400,400,3)

In [None]:
# Note: The crop size defaults to 400 and can be overridden through `crop_size`.

# Images are of shape (600, 800,3); by default we crop them into (400,400,3)

#Return one random crop:
def returnOneCropped(img, crop_size=400):
    """Return one randomly positioned square crop of an image.

    Args:
        img: Array indexed as (height, width, channels).
        crop_size: Side length of the square crop in pixels.

    Returns:
        The slice of `img` with shape (crop_size, crop_size, channels).

    Raises:
        ValueError: If the image is smaller than `crop_size` in height or width.
    """
    img_height, img_width, _ = img.shape
    if img_height < crop_size or img_width < crop_size:
        raise ValueError(
            f"image of shape {img.shape} is smaller than the requested "
            f"{crop_size}x{crop_size} crop"
        )

    # randint's upper bound is exclusive: the +1 makes the last valid offset
    # reachable and lets an image that is exactly crop_size wide/high through.
    left = np.random.randint(0, img_width - crop_size + 1)
    top = np.random.randint(0, img_height - crop_size + 1)
    return img[top:top + crop_size, left:left + crop_size, :]

#Return batch of randomly cropped images with test time augmentation applied:
def returnCroppedBatch(img, nBatches, crop_size=400):
    """Return `nBatches` random square crops of one image, each passed through `aug_TTA`.

    Args:
        img: Array indexed as (height, width, channels); the output buffer
            assumes 3 channels.
        nBatches: Number of random crops to draw.
        crop_size: Side length of every crop in pixels.

    Returns:
        Integer array of shape (nBatches, crop_size, crop_size, 3).

    Raises:
        ValueError: If the image is smaller than `crop_size` in height or width.
    """
    img_height, img_width, _ = img.shape
    if img_height < crop_size or img_width < crop_size:
        raise ValueError(
            f"image of shape {img.shape} is smaller than the requested "
            f"{crop_size}x{crop_size} crop"
        )

    imgCut = np.zeros((nBatches, crop_size, crop_size, 3))
    for k in range(nBatches):
        # randint's upper bound is exclusive: the +1 makes the last valid
        # offset reachable and accepts images exactly crop_size wide/high.
        left = np.random.randint(0, img_width - crop_size + 1)
        top = np.random.randint(0, img_height - crop_size + 1)
        crop = img[top:top + crop_size, left:left + crop_size, :]
        imgCut[k, :, :, :] = aug_TTA(image=crop)

    return imgCut.astype(int)

# Set up model for transfer learning
Here we define the model using pre-trained **EfficientNetB4**. All layers are set to be trainable. Further, the output uses a Global Average Pooling layer and an output layer with softmax activation and dropout on the weights.

In [None]:
# Construct the model for transfer learning
def return_model():
    """Build the classifier: EfficientNetB4 backbone, pooling, dropout, 5-way softmax.

    The backbone is created with ImageNet weights and without its top, using
    `settings.input_shape`; every backbone layer is marked trainable.

    Returns:
        An uncompiled Sequential model.
    """
    backbone = EfficientNetB4(
        weights='imagenet',
        include_top=False,
        input_shape=settings.input_shape,
    )

    # Fine-tune everything: mark each backbone layer as trainable.
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = True

    return Sequential([
        backbone,
        GlobalAveragePooling2D(),
        Dropout(settings.R_dropout_finalLayer),
        Dense(5, activation='softmax'),
    ])

#Custom loss used to be able to set label smoothing
def custom_loss(y_true, y_pred):
    """Categorical cross-entropy with the label smoothing taken from `settings`."""
    smoothing = settings.label_smoothing
    return keras.losses.categorical_crossentropy(
        y_true,
        y_pred,
        label_smoothing=smoothing,
    )

# Augmentation 
* Different augmentation for training and validation/test

In [None]:
# Train time augmentation:
# The individual augmenters are named first, then wrapped with the
# probability that each one is applied.
_train_contrast = iaa.LinearContrast((0.5, 1.25))
_train_dropout = iaa.CoarseDropout(
    (0.03, 0.15),
    size_percent=(0.01, 0.02),
    per_channel=0.0,
)
_train_crop = iaa.Crop(percent=(0, 0.25))
_train_affine = iaa.Affine(
    rotate=(-45, 45),
    shear=(-18, 18),
    mode=ia.ALL,
)

aug = iaa.Sequential(
    [
        iaa.Fliplr(0.5),
        iaa.Flipud(0.25),
        iaa.Sometimes(0.5, _train_contrast),
        iaa.Sometimes(0.95, _train_dropout),
        iaa.Sometimes(0.4, _train_crop),
        iaa.Sometimes(0.95, _train_affine),
    ],
    random_order=True,
)


#Validation and test time augmentation:
# Flips, an occasional light crop and an affine transform.
_tta_crop = iaa.Crop(percent=(0, 0.1))
_tta_affine = iaa.Affine(
    translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
    rotate=(-45, 45),
    shear=(-18, 18),
    cval=(0, 255),
    mode=ia.ALL,
)

aug_TTA = iaa.Sequential(
    [
        iaa.Fliplr(0.5),
        iaa.Flipud(0.25),
        iaa.Sometimes(0.25, _tta_crop),
        iaa.Sometimes(0.75, _tta_affine),
    ],
    random_order=True,
)

# Original image that will be used as an example to show training and validation/test augmentation:

In [None]:
# Load and display the single example image (entry 7 of the directory listing).
example_path = path + 'train_images/' + fileList[7]
img = Image.open(example_path)
plt.figure(figsize=(10, 10))
plt.imshow(img)
plt.show()

# Train time augmentation example:

In [None]:
# Twelve independent draws of the training pipeline: augment, then crop.
img = np.array(img)
plt.figure(figsize=(40, 10))
gs = gridspec.GridSpec(2, 6)
gs.update(wspace=0, hspace=0.000)  # set the spacing between axes.
for k in range(12):
    augmented = aug(image=img)
    img_plot = returnOneCropped(augmented)
    plt.subplot(gs[k])
    plt.imshow(img_plot)
    plt.axis('off')

# Validation/test time augmentation example:

In [None]:
# Twelve crops produced by the validation/test-time helper in a single call.
img = np.array(img)
plt.figure(figsize=(40, 10))
gs = gridspec.GridSpec(2, 6)
gs.update(wspace=0, hspace=0.005)  # set the spacing between axes.
img_plot = returnCroppedBatch(img, 12)
for k, crop in enumerate(img_plot):
    plt.subplot(gs[k])
    plt.imshow(crop)
    plt.axis('off')

# The custom image generator 

See https://medium.com/@mrgarg.rajat/training-on-large-datasets-that-dont-fit-in-memory-in-keras-60a974785d71 for info on basic structure 

In [None]:
class customImageGenerator(keras.utils.Sequence):
    """Keras Sequence that reads images from disk and serves randomly cropped batches.

    Every image contributes `nsubcuts` crops: a single plain random crop when
    `nsubcuts == 1`, otherwise `nsubcuts` crops from `returnCroppedBatch`
    (which applies `aug_TTA`). When `training` is truthy the full image is
    first passed through the training augmenter `aug`.

    Args:
        image_filenames: pandas Series of image paths.
        labels: pandas Series of integer class labels, aligned with
            `image_filenames`.
        batch_size: Number of images (not crops) per batch.
        imsize: Side length of the square crops; must match the crop size
            produced by `returnOneCropped` / `returnCroppedBatch`.
        nsubcuts: Number of crops taken per image.
        training: Whether to apply the training augmentation.
    """

    def __init__(self, image_filenames, labels, batch_size, imsize, nsubcuts, training):
        super().__init__()
        self.image_filenames = image_filenames
        self.labels = labels
        self.batch_size = batch_size
        self.imsize = imsize
        self.nsubcuts = nsubcuts
        self.training = training

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        # Plain integer floor division (np.int no longer exists in recent NumPy).
        return len(self.image_filenames) // self.batch_size

    def __getitem__(self, idx):
        """Return `(crops, one_hot_labels)` for batch number `idx`."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_x = self.image_filenames[start:stop]
        batch_y = self.labels[start:stop]
        nsubcuts = self.nsubcuts

        batch_x_split = np.empty(shape=(nsubcuts * len(batch_x), self.imsize, self.imsize, 3))

        for b, filename in enumerate(batch_x):
            # Close the file handle as soon as the pixels are in memory.
            with Image.open(filename) as im:
                img = np.array(im)
            if self.training:
                img = aug(image=img)

            if nsubcuts == 1:
                crops = returnOneCropped(img)
            else:
                # Use the instance's crop count so the result always fits
                # the slot reserved in the buffer.
                crops = returnCroppedBatch(img, nsubcuts)

            batch_x_split[nsubcuts * b:nsubcuts * (b + 1), :, :, :] = crops

        # Each image's label is repeated once per crop, in image order.
        batch_y_split = np.repeat(np.asarray(batch_y), nsubcuts)

        return batch_x_split, to_categorical(batch_y_split, 5)


class customImageGenerator_predict(keras.utils.Sequence):
    """Keras Sequence serving random test-time crops of a single image.

    Args:
        image_filenames: Path of the one image to open (passed straight to
            `Image.open`).
        batch_size: Stored but not used by the methods below.
        imsize: Stored but not used by the methods below.
        nsubcuts: Number of crops returned per item.
    """

    def __init__(self, image_filenames, batch_size, imsize, nsubcuts):
        super().__init__()
        self.image_filenames = image_filenames
        self.batch_size = batch_size
        self.imsize = imsize
        self.nsubcuts = nsubcuts

    def __len__(self):
        # NOTE(review): every item already holds `nsubcuts` crops, so a full
        # pass yields nsubcuts**2 crops -- confirm that this is intended.
        return self.nsubcuts

    def __getitem__(self, idx):
        """Return `nsubcuts` augmented random crops of the image (`idx` is ignored)."""
        with Image.open(self.image_filenames) as im:
            img = np.array(im)
        # returnCroppedBatch requires the number of crops as second argument.
        return returnCroppedBatch(img, self.nsubcuts)

# Training of model

In [None]:
#Define the stratified KFold split:
skf = StratifiedKFold(n_splits=settings.Nfolds, random_state=42, shuffle=True)

#Prepare the data for training and pre-allocate variables:
image_id = df.drop('label', axis=1)
label = df.label
val_acc_results = np.zeros((skf.n_splits, settings.numEpochs))
val_acc_results_best = np.zeros((skf.n_splits, settings.numEpochs))

# Start training for one or several folds:
for ctr, (train_index, val_index) in enumerate(skf.split(image_id, label)):
    if ctr not in settings.run_folds:
        continue

    print('##### Starting split: ' + str(ctr+1) +'/' +str(skf.n_splits) +' #####')

    model = return_model()
    model.compile(loss=custom_loss,
                  optimizer=keras.optimizers.Adam(learning_rate=settings.initial_learning_rate),
                  metrics=["accuracy"])
    image_id_train, image_id_val = image_id.iloc[train_index], image_id.iloc[val_index]
    label_train, label_val = label.iloc[train_index], label.iloc[val_index]

    train_files = imgPath_train + image_id_train['image_id']
    val_files = imgPath_train + image_id_val['image_id']
    y_true = np.array(label_val)

    #Loop for epochs:
    currentAccuracy = 0   # best validation accuracy seen so far in this fold
    patience = 1          # counter of non-improving epochs (starts at 1)
    for k_epochs in range(settings.numEpochs):
        t = time.time()
        # One crop per image, training augmentation switched on.
        history = model.fit(
            customImageGenerator(train_files, label_train, settings.batchSize, settings.imsize, 1, True),
            steps_per_epoch=len(image_id_train) // settings.batchSize,
            epochs=1)
        elapsed = time.time() - t
        print('Elapsed time: ' + str(np.round(elapsed)) +'s')

        ##########################################
        # Custom validation step:
        ##########################################
        t = time.time()
        #Predict on the validation data, using settings.nsubcuts number of random crops per image:
        yhat = model.predict(
            customImageGenerator(val_files, label_val, settings.batchSize_predict,
                                 settings.imsize, settings.nsubcuts, False))

        #Sum up the predictions and make prediction based on the max of the sum:
        # Consecutive groups of nsubcuts rows belong to the same image.
        n_val_images = yhat.shape[0] // settings.nsubcuts
        yhat_grouped = yhat[:n_val_images * settings.nsubcuts].reshape(
            n_val_images, settings.nsubcuts, yhat.shape[1])
        yhat_comb_sum = yhat_grouped.sum(axis=1)
        y_hat_labels_sum = np.argmax(yhat_comb_sum, 1)
        val_accuracy_sum = np.sum((y_hat_labels_sum - y_true) == 0) / len(y_hat_labels_sum)
        print('Accuracy sum:  ' +str(val_accuracy_sum))

        elapsed = time.time() - t
        print('Elapsed time: ' + str(np.round(elapsed)) +'s')

        # Check if the validation accuracy has improved, if so save the model. Also if the settings.patience is reached, reduce the learning rate if no improvement in validation accuracy is seen:
        val_acc_results[ctr, k_epochs] = val_accuracy_sum
        if val_accuracy_sum > currentAccuracy:
            currentAccuracy = val_accuracy_sum
            model.save('./split400x400_model_5fold_' +str(ctr) +'.h5')
        else:
            # NOTE(review): the counter is not reset when the accuracy improves,
            # so non-improving epochs are counted cumulatively -- confirm intent.
            if patience >= settings.patience:
                reduceLR = settings.reduceLR
                K.set_value(model.optimizer.lr, K.get_value(model.optimizer.lr) * reduceLR)
                print(str(val_accuracy_sum) +' <= ' +str(currentAccuracy) +' --- Lowering the LR to: ' +str( K.get_value(model.optimizer.lr)))
                patience = 1
            else:
                patience += 1
        #Store the best validation accuracy result:
        val_acc_results_best[ctr, k_epochs] = currentAccuracy

    #Save results for this fold:
    with open('results_fold_' +str(ctr) +'.pkl', 'wb') as f:
        pickle.dump([currentAccuracy, val_acc_results_best, val_acc_results, settings], f)


# Plotting the training results for all folds
* Note that these are the results for slightly different (minor) settings in terms of augmentation compared to this notebook.
* We store the model at the point where we have the best validation accuracy

In [None]:
color = ['#58508d', '#bc5090', '#ff6361', '#ffa600', '#0fa6f0', '#b05fff']
fs = 15
Nfolds = 5
plt.figure(figsize=(10, 8))

# Curves of the folds that were trained in separate notebooks.
for k in range(Nfolds):
    results_file = ('../input/new-cassava-trans-learn-stratkfold-fold-' + str(k)
                    + '/results_fold_' + str(k) + '.pkl')
    with open(results_file, 'rb') as f:
        currentAccuracy_old, val_acc_results_best_old, val_acc_results_old, settings_old = pickle.load(f)
    plt.plot(val_acc_results_old[k, :], 'o--', color=color[k])
    plt.plot(val_acc_results_best_old[k, :], 'o-', color=color[k])

# Curves of the fold trained in this notebook.
plt.plot(val_acc_results[settings.run_folds, :][0], 'x--', color='k')
plt.plot(val_acc_results_best[settings.run_folds, :][0], 'x-', color='k')

plt.grid('both', linestyle='--')

# Legend built from proxy artists: one marker per fold, plus the line styles.
legend_handles = [
    mlines.Line2D([], [], color=color[fold], marker='o', label='Fold ' + str(fold + 1), linewidth=0)
    for fold in range(5)
]
legend_handles.append(
    mlines.Line2D([], [], color='k', marker='x', label="This Notebook's fold", linewidth=0))
legend_handles.append(
    mlines.Line2D([], [], color='k', marker=None, label='Accuracy', linestyle='--'))
legend_handles.append(
    mlines.Line2D([], [], color='k', marker=None, label='Accuracy saved model', linestyle='-'))
plt.legend(handles=legend_handles, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=fs)

plt.xlabel('Epoch #')
plt.ylabel('Accuracy')


# Investigate the confusion matrix for the best model in this fold

In [None]:
# Plot the classification performance in a confusion matrix
#Define the stratified KFold split:
skf = StratifiedKFold(n_splits=settings.Nfolds, random_state=42, shuffle=True)
image_id = df.drop('label', axis=1)
label = df.label
# Re-create the split to recover the validation part of the selected fold.
for ctr, (train_index, val_index) in enumerate(skf.split(image_id, label)):
    if ctr in settings.run_folds:
        image_id_train, image_id_val = image_id.iloc[train_index], image_id.iloc[val_index]
        label_train, label_val = label.iloc[train_index], label.iloc[val_index]

#Using a previous fold here to get predictable results without having to re-run the whole training:
model = return_model()
model.load_weights('../input/new-cassava-trans-learn-stratkfold-fold-' +str(settings.run_folds[0]) +'/split400x400_model_5fold_' +str(settings.run_folds[0]) +'.h5')
yhat = model.predict(customImageGenerator(imgPath_train+image_id_val['image_id'], label_val, settings.batchSize_predict,  settings.imsize, settings.nsubcuts, False))

# Consecutive groups of nsubcuts rows belong to the same image: sum them and
# pick the class with the largest summed score.
n_val_images = yhat.shape[0] // settings.nsubcuts
yhat_comb_sum = yhat[:n_val_images * settings.nsubcuts].reshape(
    n_val_images, settings.nsubcuts, yhat.shape[1]).sum(axis=1)
y_hat_labels_sum = np.argmax(yhat_comb_sum, 1)
y_true = np.array(label_val)

In [None]:
labels_num = [0, 1, 2, 3]
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_true, y_hat_labels_sum, normalize='true')
sn.set(font_scale=1.4)  # for label size
heatmap_options = dict(
    annot=True,
    annot_kws={"size": 14},
    cmap="rocket_r",
    xticklabels=labels_text,
    yticklabels=labels_text,
)
sn.heatmap(cm, **heatmap_options)
plt.show()

# Illustrate the probability evolution when taking the sum of several random crops with augmentation 

* Note that the plots are showing the probability calculated from the cumulative sum

* Showing example of the accuracy on three different images


In [None]:
gs = gridspec.GridSpec(3, 1)
ctr = 0
# One figure per example image: running mean of the predictions over its crops.
for k in [8, 6, 1]:
    plt.figure(figsize=(10, 8))
    first_row = k * settings.nsubcuts
    yhat_this = yhat[first_row:first_row + settings.nsubcuts]
    n_rows = len(yhat_this)
    crops_so_far = (1 + np.arange(n_rows)).reshape([n_rows, 1])
    running_mean = np.cumsum(yhat_this, 0) / crops_so_far
    plt.plot(running_mean, 'o-', mec='gray')
    plt.xlabel('Subcut #')
    plt.ylabel('Probability')
    plt.title('Image #' + str(k))
    ctr = ctr + 1

    plt.legend(labels_text, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=fs)

Finally, we investigate the gain in validation accuracy for one fold from using N random crops with augmentation at test time. Note that for the full K-folds, gains were observed up to 12 random crops, and similarly for the public leaderboard. However, for a single fold the results may look a bit noisy, and for this specific case we seem to hit the maximum validation accuracy with 4 random crops. Note that the result depends on the random crops and the random augmentations that are applied, and that the fluctuations in accuracy decrease with the number of subcuts. However, the processing time naturally also increases with the number of subcuts that are applied.

In [None]:
# Loading data that was run in a separate notebook due to memory and limited run-time on Kaggle:
run_subcutSweep = False
if run_subcutSweep:
    Nsubcut_sweep = [1, 2, 4, 6, 8, 10, 12]
    #Using a previous fold here to get predictable results without having to re-run the whole training:
    skf = StratifiedKFold(n_splits=settings.Nfolds, random_state=42, shuffle=True)
    image_id = df.drop('label', axis=1)
    label = df.label
    for ctr, (train_index, val_index) in enumerate(skf.split(image_id, label)):
        if ctr in settings.run_folds:
            image_id_train, image_id_val = image_id.iloc[train_index], image_id.iloc[val_index]
            label_train, label_val = label.iloc[train_index], label.iloc[val_index]

    K.clear_session()
    model = return_model()
    model.load_weights('../input/new-cassava-trans-learn-stratkfold-fold-' +str(settings.run_folds[0]) +'/split400x400_model_5fold_' +str(settings.run_folds[0]) +'.h5')

    # Allocate the result buffers here instead of relying on arrays left over
    # from an earlier cell.
    y_true = np.array(label_val)
    val_accuracy_sum_sweep = np.zeros(len(Nsubcut_sweep))
    for n, n_crops in enumerate(Nsubcut_sweep):
        print("Number of subcuts: " + str(n_crops))
        y_hat_labels_sum = np.zeros(len(image_id_val), dtype=int)
        # Iterates in the same row order as label_val / y_true.
        for ctr, filename in enumerate(image_id_val['image_id']):
            with Image.open(path + "train_images/" + filename) as im:
                img = np.array(im)
            splitBatch = returnCroppedBatch(img, n_crops)
            # The crops are an in-memory array, so plain predict() is the right call.
            yhat = model.predict(np.array(splitBatch))
            y_hat_labels_sum[ctr] = np.argmax(np.sum(yhat, 0), 0)
        val_accuracy_sum_sweep[n] = np.sum((y_hat_labels_sum - y_true) == 0) / len(y_hat_labels_sum)

else:
    with open('../input/cassava-trans-learn-stratified-k-only-last-part/subcut_sweep4279.pkl', 'rb') as f:
        Nsubcut_sweep, val_accuracy_sum_sweep, settings_old = pickle.load(f)


plt.figure(figsize=(10, 5))
plt.plot(Nsubcut_sweep, val_accuracy_sum_sweep, 'ko-')
plt.grid('both', linestyle='--')
plt.xlabel('# of random crops with augmentations per image')
plt.ylabel('Accuracy')