# [Leaf pathology detection 2021](https://www.kaggle.com/c/plant-pathology-2021-fgvc8/data)
## This notebook compares state of the art DL models as feature extractors for the plant pathology detection
## During training, _on the fly_ data augmentation is applied with Keras preprocessing layers (random flip, rotation, contrast and translation)


Related notebooks (thanks for sharing) : 
* Used the output of [PP2021 - Duplicates Revealing](https://www.kaggle.com/nickuzmenkov/pp2021-duplicates-revealing) for duplicates in the dataset

First, let's import the relevant libraries and define the few settings we want to configure.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import pandas as pd

import tensorflow as tf

import tensorflow_addons as tfa
print("Using tensorflow ", tf.__version__)

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

In [None]:
class CFG:
    """Notebook-wide settings shared by the data pipeline and the model."""

    # Label names; the same list is used to binarize the targets and to
    # decode predictions, so its order must stay fixed.
    classes = [
        'complex', 'frog_eye_leaf_spot',
        'powdery_mildew', 'rust',
        'scab', 'healthy',
    ]

    # Data pipeline
    batch_size = 16    # images per batch
    test_size = 0.25   # fraction of the labeled data held out for validation
    img_size = 512     # images are resized to img_size x img_size

    # Reproducibility / workflow
    seed = 42          # random seed
    retrain = True     # when False, a saved model is loaded if one exists

## Check that the files exist and that the labeled files in train_csv are the same files as in train_images

If you don't have a png directory yet, create one with: _mkdir /kaggle/working/pngs/_

In [None]:
!mkdir /kaggle/working/pngs/

In [None]:
# Competition inputs.
train_dir = '../input/plant-pathology-2021-fgvc8/train_images/'
train_csv = '../input/plant-pathology-2021-fgvc8/train.csv'
test_dir = '../input/plant-pathology-2021-fgvc8/test_images/'
# Pairs of duplicated training images (output of a related notebook).
duplicates = '../input/duplicatescsv/duplicates.csv'
# Where figures are written, and where a previously trained model is looked up.
png_dir = "/kaggle/working/pngs/"
model_dir = "../input/models/"

# Sanity check: one True/False per line, in the order
# train_dir, train_csv, test_dir, duplicates, png_dir.
print(*(os.path.exists(location)
        for location in (train_dir, train_csv, test_dir, duplicates, png_dir)),
      sep="\n")

## Inspecting data :
* Problem statement
* Size and number of files of train and test datasets
* Number of true duplicates (same image, same labels) and of false duplicates (same image, different labels)

In [None]:
# Verify that every image listed in train.csv is present in the train_images directory.
imgs = os.listdir(train_dir)
df_train = pd.read_csv(train_csv)

# A set gives constant-time membership tests instead of scanning the list for every row.
available_imgs = set(imgs)
for image_name in df_train['image']:
    if image_name not in available_imgs:
        print("{} not in train_images".format(image_name))

In [None]:
# The duplicates file has no header row: each line is a pair of image names.
df_duplicates = pd.read_csv(duplicates, header=None)
df_duplicates.columns = ['img1', 'img2']

# Sizes of the labeled (train) and unlabeled (test) sets, then a preview of the labels.
print("The train dataset is composed of {} labeled images".format(len(df_train)))

print("The test dataset is composed of {} unlabeled images".format(len(os.listdir(test_dir))))
print(df_train.head())

# One row of df_duplicates per duplicated pair.
print("\nThere are {} duplicated images.\n".format(len(df_duplicates)))

In [None]:
# Sort the duplicated pairs into:
#   * true duplicates  -> both images carry the same labels (only img1 is recorded)
#   * false duplicates -> the two images carry different labels (the pair is recorded)
# Build the image -> labels lookup once instead of rebuilding a list and
# filtering the whole dataframe for every pair.
# NOTE: if an image name appeared several times in df_train, the last row would win.
labels_by_image = dict(zip(df_train["image"], df_train["labels"]))

true_duplicates = []
false_duplicates = []
for img1, img2 in zip(df_duplicates['img1'], df_duplicates['img2']):
    # First, check that both images are in the train dataset
    if img1 not in labels_by_image:
        print("{} not in training dataset".format(img1))
    elif img2 not in labels_by_image:
        print("{} not in training dataset".format(img2))
    elif labels_by_image[img1] == labels_by_image[img2]:
        true_duplicates.append(img1)
    else:
        false_duplicates.append((img1, img2))

print('There are {} true duplicates and {} false ones.'.format(len(true_duplicates),len(false_duplicates)))
print('Lets display the false duplicates')

In [None]:
# Show (and save) the first five false duplicates side by side,
# each image titled with its own label string.
count = 0
for pair in false_duplicates[:5]:
    img1, img2 = pair
    fig, axs = plt.subplots(1, 2)
    for panel, image_name in zip(axs, (img1, img2)):
        panel.imshow(plt.imread(train_dir + image_name))
        panel.set_title(df_train[df_train["image"] == image_name].reset_index()["labels"][0])
        panel.axis('off')
    plt.savefig(png_dir + "compare_false_dup" + str(count) + ".png")
    count += 1
    plt.show()

### Inspection results : 
* The problem is a multi-label problem -> using the micro-averaged F1-score as metric (see [here](https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1) for why) and converting the labels in the dataframe to one binary column per class
* There are some duplicated images with the same labels => may harm training => deleting one of them
* There are some duplicated images with different labels => produces noise in the data => deleting both




In [None]:
# Sanity check: the set of labels found in train.csv must be exactly CFG.classes.
# Each row holds a space-separated label string; flatten all rows into one list.
labels = [name for row in df_train['labels'] for name in row.split(' ')]

uniques = np.unique(labels)
assert len(uniques) == len(CFG.classes), 'ERROR : labels and CFG.classes mismatch'
assert all(name in CFG.classes for name in uniques), 'ERROR : labels and CFG.classes mismatch'

In [None]:
# Replace the space-separated 'labels' column by one 0/1 column per class.
df_train['labels'] = df_train['labels'].apply(lambda text: text.split(' '))

binarizer = MultiLabelBinarizer(classes=CFG.classes)
labels = pd.DataFrame(
    binarizer.fit_transform(df_train['labels'].values),
    index=df_train.index,
    columns=CFG.classes,
)

# The list-valued column is no longer needed once the indicator columns exist.
df_train.drop(columns='labels', inplace=True)
for col in CFG.classes:
    df_train[col] = labels[col]
print(df_train.head())

In [None]:
# Remove both images of every false duplicate and one image (img1) of every true duplicate.
init = df_train.shape[0]

images_to_drop = {name for pair in false_duplicates for name in pair}
images_to_drop.update(true_duplicates)
df_train = df_train[~df_train["image"].isin(images_to_drop)]

end = df_train.shape[0]
# Re-number the rows 0..n-1 after the removal.
df_train.reset_index(drop=True, inplace=True)
print("Deleted {} files".format(init-end))

## Let's now take a look at the proportion of labels in the train data

In [None]:
def value_counts(x):
    """Return the relative frequency of each distinct value in the Series x."""
    return pd.Series.value_counts(x, normalize=True)


# Row 1 of the per-column frequencies is the share of images carrying each label.
label_share = df_train[CFG.classes].apply(value_counts).loc[1]
df_occurence = pd.DataFrame({'origin': label_share})

bar = df_occurence.plot.barh(figsize=[15, 5], colormap='plasma')

## It is important to keep the same proportion of labels in both the train and validation datasets
Let's use [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html) and compare it with the usual train_test_split

In [None]:
# Stratified train/validation split on the binarized label columns.
sss = StratifiedShuffleSplit(n_splits=1, test_size= CFG.test_size, random_state=CFG.seed)
X = df_train['image']
y = df_train[CFG.classes]

# split() yields arrays of row positions, so rows are selected with .iloc;
# label-based .loc would only be correct while the index is exactly 0..n-1.
train_index, test_index = next(sss.split(X, y))
X_train, X_test = df_train.iloc[train_index], df_train.iloc[test_index]

## Just for comparison : a plain (non stratified) random split with the same size and seed
compare_X_train, compare_X_test = train_test_split( df_train,  test_size= CFG.test_size, random_state= CFG.seed)

In [None]:
# Share of images carrying each label, for the full data and for every split.
# The dict order fixes the order of the bars in the plot.
frames_by_name = {
    'origin': df_train,
    'stratified_train': X_train,
    'stratified_test': X_test,
    'compare_train': compare_X_train,
    'compare_test': compare_X_test,
}
df_occurence = pd.DataFrame({
    split_name: frame[CFG.classes].apply(value_counts).loc[1]
    for split_name, frame in frames_by_name.items()
})

bar = df_occurence.plot.barh(figsize=[15, 5], colormap='plasma')
plt.savefig(png_dir+'comparison_stratified.png')

## Now that the files are selected for train and test, let's create a tf.data.Dataset
Indeed, we can't load every image in memory, and streaming them from disk will help during training.

Note that parse_image does : 
* Load the image
* Decode it 
* Convert it into floats between 0 and 1 
* Resize the image 

During training, this preprocessing is done implicitly by the dataset pipeline.
Because it is not part of the model itself, the same logic would have to be re-implemented on the server if we wanted to export the model.
## We also use data augmentation for the training dataset :
The following steps are randomly applied, only on the training dataset : 
* Vertical/Horizontal flip 
* Rotate 
* Change contrast 
* Vertical/horizontal shift (translation)

In [None]:
def pred2labels(pred, thresh = 0.5, labels = CFG.classes):
    '''Turn one vector of per-class scores into a space-separated string of label names.
    pred : sequence of scores, one per entry of labels (same order).
    thresh (float, optional): a label is kept when its score is strictly greater than thresh. Default is 0.5.
    labels (list, optional): the label names. Default is CFG.classes.
    Returns an empty string when no score exceeds thresh.'''
    assert len(pred)==len(labels), 'Predictions must have shape : ({},)'.format(len(labels))
    kept = [name for name, score in zip(labels, pred) if score > thresh]
    return ' '.join(kept)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Random augmentations: flip, rotation, contrast and translation.
# Every random layer receives CFG.seed, so none of them is left with an unseeded random stream.
_preprocessing = tf.keras.layers.experimental.preprocessing
data_augmentation = tf.keras.Sequential([
    _preprocessing.RandomFlip("horizontal_and_vertical", seed=CFG.seed),
    _preprocessing.RandomRotation(0.2, seed=CFG.seed),
    _preprocessing.RandomContrast([0, 0.3], seed=CFG.seed),
    _preprocessing.RandomTranslation(height_factor=0.2, width_factor=0.2, seed=CFG.seed),
])

In [None]:
def parse_image(file_path):
    '''Load one training image and return it as a float32 tensor of size CFG.img_size x CFG.img_size.
    file_path : image file name, appended to train_dir.'''
    # Raw bytes of the file
    encoded = tf.io.read_file(train_dir + file_path)
    # JPEG bytes -> 3-channel uint8 tensor -> floats in the [0,1] range
    pixels = tf.image.convert_image_dtype(
        tf.image.decode_jpeg(encoded, channels=3),
        tf.float32,
    )
    # Bring every image to the size expected by the model
    return tf.image.resize(pixels, [CFG.img_size, CFG.img_size])

def prepare_dataset(X, augmentation = False):
    '''Build an endlessly repeating, batched tf.data.Dataset of (images, labels).
    X : dataframe with an 'image' column (file names read by parse_image) and one column per entry of CFG.classes.
    augmentation (bool, optional): if True, data_augmentation is applied to every batch. Default is False.
    The dataset repeats forever, so the consumer must bound it (e.g. steps_per_epoch / validation_steps in fit).'''
    dataset = tf.data.Dataset.from_tensor_slices((X['image'].values, X[CFG.classes].values ))
    # Decode and resize several images in parallel; element order is unchanged.
    dataset = dataset.map(lambda x, y: (parse_image(x), y),
                          num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(CFG.batch_size)

    if augmentation :
        # Augmentation runs on whole batches; training=True forces the random layers to be active.
        dataset = dataset.map(lambda x, y: (data_augmentation(x, training=True), y),
                              num_parallel_calls=AUTOTUNE)
    dataset = dataset.repeat().prefetch(buffer_size=AUTOTUNE)

    return dataset

In [None]:
# Augmentation is switched on for the training pipeline only.
ds_train, ds_test = (
    prepare_dataset(X_train, augmentation=True),
    prepare_dataset(X_test, augmentation=False),
)

In [None]:
# Look at the first training batch only: take(1) stops the iteration after one batch.
for inputs, outputs in ds_train.take(1).as_numpy_iterator():
    # Verify the shapes are still as we expect
    print("Input shape is:", inputs.shape, "output shape is:", outputs.shape)

    # Show the first image of the batch together with its label vector and decoded label names
    first_image, first_label = inputs[0], outputs[0]
    plt.imshow(first_image)
    plt.show()
    print('label of this input is', first_label, 'corresponding to', pred2labels(first_label))

## Everything is ready, let's create a function that builds the model and makes it easy to tweak if needed 
The model_transfert should be a pretrained model without its output layer, so that we can use it as a feature extractor

In [None]:
def create_cnn(input_shape, output_length,
               nb_cnn=3, nb_filters = 64, activation_cnn = 'relu', 
               model_transfert = None, fine_tune = False, 
               nb_FC_layer = 3, nb_FC_neurons = 512, reducing = False, activation_FC = 'relu',
               dropout = 0.0,
               activation_output = 'sigmoid',
               name = 'my_cnn_model'
               ):
    '''Create a CNN based model if model_transfert is None. Else, the model_transfert is used for feature extraction.
    input_shape (tuple): shape of one input image; the last dimension must be 3 (rgb).
    output_length (int): number of neurons of the output layer.
    nb_cnn (int, optional): number of Conv2D + MaxPooling2D blocks when model_transfert is None. Default is 3.
    nb_filters (int, optional): number of filters of each Conv2D layer. Default is 64.
    activation_cnn (str, optional): activation of the Conv2D layers. Default is 'relu'.
    model_transfert (optional): pretrained model without its output layer, used instead of the Conv2D blocks. Default is None.
    fine_tune (bool, optional): if False, model_transfert is frozen (trainable = False); if True, it is left as given. Default is False.
    nb_FC_layer (int, optional): number of fully connected layers. Default is 3.
    nb_FC_neurons (int, optional): number of neurons of the first fully connected layer. Default is 512.
    reducing (bool, optional): if True, each fully connected layer has half the neurons of the previous one;
        nb_FC_neurons must then be a multiple of 2**nb_FC_layer. Default is False.
    activation_FC (str, optional): activation of the fully connected layers. Default is 'relu'.
    dropout (float, optional): if not 0, a Dropout layer with this rate follows every fully connected layer. Default is 0.0.
    activation_output (str, optional): activation of the output layer. Default is 'sigmoid'.
    name (str, optional): name of the model. Default is 'my_cnn_model'.
    Returns the (uncompiled) tf.keras Sequential model.'''

    assert input_shape[-1] == 3, 'For the moment only models with rgb input is dealt'
    if reducing :
        assert nb_FC_neurons % 2**nb_FC_layer == 0 , 'If reducing, nb_FC_neurons must be multiple of 2**nb_FC_layer '

    model = tf.keras.models.Sequential(name=name)
    model.add(tf.keras.layers.InputLayer(input_shape=input_shape, name = 'Input_layer'))

    if model_transfert is None:
        # Home-made feature extractor : nb_cnn blocks of convolution + pooling
        for cnn in range(nb_cnn):
            model.add(tf.keras.layers.Conv2D( filters = nb_filters, kernel_size = (3,3), padding='same', activation = activation_cnn, name ='Conv2D_'+str(cnn+1) ))
            model.add(tf.keras.layers.MaxPooling2D( pool_size=(2, 2), name ='MaxPool_'+str(cnn+1)))
    else :
        # Pretrained feature extractor, frozen unless fine tuning is requested
        if not fine_tune :
            model_transfert.trainable = False

        model.add(model_transfert)
        model.add(tf.keras.layers.MaxPooling2D( pool_size=(2, 2), name ='MaxPool_transfer'))

    model.add(tf.keras.layers.Flatten())

    # Fully connected head. A single loop serves both modes, so dropout is placed
    # after every FC layer whether reducing is set or not.
    for FC in range(nb_FC_layer):
        # Integer division keeps the number of units an int (exact, thanks to the assertion above)
        units = nb_FC_neurons // 2**FC if reducing else nb_FC_neurons
        model.add(tf.keras.layers.Dense(units, activation= activation_FC, name='FC_layer_'+str(FC+1)))

        if dropout != 0.0:
            model.add(tf.keras.layers.Dropout(dropout, name = 'Dropout_'+str(FC+1)))

    model.add(tf.keras.layers.Dense(output_length, activation = activation_output ,name='Output_layer'))

    return model


def get_callbacks(monitor='val_loss',save_name=None,patience=8,mode=None):
    '''Returns the wanted callbacks to save models and avoid overfitting.
    monitor (str, optional): the monitor to check for the early stopping. Default is 'val_loss'
    save_name (str, optional): if not None, uses modelcheckpoint and saves checkpoints at the save_name. Default is None.
    patience (int, optional): number of epoch to wait for improvment of monitor. Default is 8.
    mode (str, optional): 'min', 'max' or 'auto', forwarded to the callbacks. If None, 'max' is used when the
        monitor name looks like a score (accuracy, F1, AUC, precision, recall) and 'auto' otherwise. Default is None.'''
    if mode is None:
        # Keras 'auto' mode guesses the direction from the monitor name and falls back to
        # "lower is better" for names it does not recognise, such as 'val_micro-F1'.
        # Score-like monitors are therefore maximised explicitly.
        score_tags = ('acc', 'f1', 'auc', 'precision', 'recall')
        monitor_name = monitor.lower()
        mode = 'max' if any(tag in monitor_name for tag in score_tags) else 'auto'

    callbacks = []
    if save_name :
        # Keep only the best model seen so far on disk
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(filepath=save_name,
                                                            monitor=monitor,
                                                            mode=mode,
                                                            save_best_only=True,
                                                            verbose=0))
    # Stop when the monitor no longer improves and roll back to the best weights
    callbacks.append(tf.keras.callbacks.EarlyStopping(monitor=monitor,
                                                      mode=mode,
                                                      patience=patience,
                                                      restore_best_weights=True))
    return callbacks

In [None]:
input_shape = (CFG.img_size, CFG.img_size, 3)
n_classes = len(CFG.classes)

# Only the creation of the pretrained base is guarded: it needs to download the
# imagenet weights. Errors from building or compiling the model are no longer hidden.
try :
    base = tf.keras.applications.Xception( include_top=False, weights='imagenet', input_shape=input_shape )
except Exception as err :
    base = None
    print('Internet not available')
    # Show the real cause, in case the failure is not a connectivity problem
    print('Could not create the Xception base : {!r}'.format(err))

if base is not None :
    model = create_cnn(input_shape=input_shape, output_length=n_classes,
                   model_transfert = base, fine_tune = True, 
                   nb_FC_layer = 2, nb_FC_neurons = 512, reducing = True, activation_FC = 'relu',
                   dropout = 0,
                   activation_output = 'sigmoid',
                   name='my_model'
                   )
    optimizer = tf.keras.optimizers.Adam(learning_rate=3.5e-5)

    # threshold=0.5 : every class is thresholded independently (multi label), using the same
    # cut-off as pred2labels. Without a threshold only the highest scoring class counts as predicted.
    f1_metrics = [
        tfa.metrics.F1Score(num_classes=n_classes, average=average, threshold=0.5, name=metric_name)
        for average, metric_name in (('micro', 'micro-F1'),
                                     ('macro', 'macro-F1'),
                                     ('weighted', 'weighted-F1'))
    ]

    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.BinaryAccuracy(name='acc')] + f1_metrics)

    model.summary()

In [None]:
# Reuse a saved model when one exists and retraining is not requested; otherwise train.
if os.path.exists(model_dir + 'model.h5') and not CFG.retrain : 
    print('Loading model from file')
    model = tf.keras.models.load_model(model_dir+'model.h5')
    history = None
else :
    # The datasets repeat forever, so the number of batches per pass is set explicitly :
    # 80% of the training batches per epoch and 20% of the validation batches per evaluation.
    # Keras documents both arguments as integers, hence the int() around the float floor division.
    steps_per_epoch = int((X_train.shape[0]*0.8)//CFG.batch_size)
    validation_steps = int((X_test.shape[0]*0.2)//CFG.batch_size)

    history = model.fit(ds_train,
                        validation_data=ds_test,
                        steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps,
                        callbacks = get_callbacks(monitor = 'val_micro-F1', save_name = '/kaggle/working/model.h5', patience = 4),
                        epochs = 10)

## Let's plot the training curves

In [None]:
# history is None when the model was loaded from file instead of being trained.
if not history :
    print('There is no history')
else :
    curves = history.history
    fig, axes = plt.subplots(1, 3, figsize=(30, 5))

    # One entry per panel : (title, [(history key, legend label, extra plot options), ...]).
    # Curves are drawn in the listed order, which is also the legend order.
    panels = [
        ('Loss', [
            ('loss', 'Train loss', {}),
            ('val_loss', 'Validation loss', {}),
        ]),
        ('Accuracy', [
            ('acc', 'Train accuracy', {}),
            ('val_acc', 'Validation accuracy', {}),
        ]),
        ('F1-score', [
            ('micro-F1', 'Train micro-F1', {'color': 'lightblue'}),
            ('val_micro-F1', 'Validation micro-F1', {'color': 'darkblue'}),
            ('macro-F1', 'Train macro-F1', {'color': 'red'}),
            ('val_macro-F1', 'Validation macro-F1', {'color': 'darkred'}),
            ('weighted-F1', 'Train weighted-F1', {'color': 'lightgreen'}),
            ('val_weighted-F1', 'Validation weighted-F1', {'color': 'darkgreen'}),
        ]),
    ]

    for panel, (title, series) in zip(axes, panels):
        for key, legend_label, options in series:
            panel.plot(curves[key], label = legend_label, **options)
        panel.set_title(title)
        panel.legend()

    plt.savefig(png_dir+'history_xception.png')

    plt.show()

## Now we are able to create the submission file

In [None]:
def parse_test_image(file_path):
    '''Load one image from its full path and return it as a float32 tensor of size CFG.img_size x CFG.img_size.
    file_path : path of the JPEG file, used as given (no directory is prepended).'''
    # Raw bytes of the file
    encoded = tf.io.read_file(file_path)
    # JPEG bytes -> 3-channel uint8 tensor -> floats in the [0,1] range
    pixels = tf.image.convert_image_dtype(
        tf.image.decode_jpeg(encoded, channels=3),
        tf.float32,
    )
    # Same target size as the training images
    return tf.image.resize(pixels, [CFG.img_size, CFG.img_size])



def predict_new(path, model):
    '''Predict the labels of a single image file.
    path : full path of the image.
    model : model whose predict method returns one score per class for each image of a batch.
    Returns the predicted label names as one space-separated string (see pred2labels).'''
    # The model works on batches : add a leading batch dimension of size 1
    single_batch = tf.expand_dims(parse_test_image(path), axis = 0)
    scores = model.predict(single_batch)
    # Only one image was sent, so only the first row of scores is decoded
    return pred2labels(scores[0])

In [None]:
# Predict every test image, then build the submission frame in one go.
# Collecting plain records avoids DataFrame.append, which copies the whole frame at
# every call and no longer exists in pandas 2.
records = [
    {'image': path, 'labels': predict_new(test_dir+path, model)}
    for path in os.listdir(test_dir)
]
# Explicit columns keep the expected header even when the test directory is empty.
df_sub = pd.DataFrame(records, columns=['image','labels'])

print(df_sub.head())
df_sub.to_csv('submission.csv', index=False)
print('Submission completed')