## Problem Statement
Plant Pathology 2021 - FGVC8 is a [Kaggle competition](https://www.kaggle.com/c/plant-pathology-2021-fgvc8) launched on March 15, 2021 and closed on May 27, 2021.

The exploration notebook can be found on [Kaggle](https://www.kaggle.com/xavierbarbier/plant-pathology-2021-fgvc8-eda) and the [full project on Github](https://github.com/xavierbarbier/Plant_Pathology_2021_FGVC8).

The goals of this notebook are:

* Use a distributed approach (TPU) to optimise training time
* Create a sample dataset for training
* Compare different pre-trained models
* Optimise and tune the selected model
* Train the optimised model on the full dataset

In [None]:
#-------------------
# importing libraries
#-------------------
import tensorflow as tf
from tensorflow.keras import layers
from kaggle_datasets import KaggleDatasets
from tensorflow import keras

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import PIL
import shutil
import csv

import matplotlib.image as img
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import Model,layers

In [None]:
!pip install tensorflow_addons

import tensorflow_addons as tfa

In [None]:
# Choose the distribution strategy: TPU when one can be resolved, default otherwise
AUTO = tf.data.experimental.AUTOTUNE

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved; the default strategy is used below
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Load data

In [None]:
# GCS location of the competition dataset; images are read from there
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
print(GCS_DS_PATH)

TRAIN_PATH = f"{GCS_DS_PATH}/train_images/"

# One row per image: file name and space-separated label string
train = pd.read_csv("../input/plant-pathology-2021-fgvc8/train.csv")

train.head()

# Create a stratified sample

## Split

In [None]:
# Number of images per distinct label combination in the full dataset
labels_counts = train["labels"].value_counts()

plt.barh(y=labels_counts.index, width=labels_counts)
plt.title("Labels counts")

We will use 10% of the full dataset and use sklearn's `train_test_split` to create our sample.

In [None]:
from sklearn.model_selection import train_test_split


# Stratified split on the raw label strings; only the 10% "test" part is kept
X_train, X_test, y_train, y_test = train_test_split(
    train['image'], train['labels'],
    test_size=0.1, random_state=12, stratify=train['labels'])

# Use the 10% part as the training sample.
# y_test.index holds index *labels*, so rows are selected with .loc rather than
# the positional .iloc. An explicit copy is taken because new columns are added
# to the sample later, which would otherwise raise SettingWithCopyWarning.
data_sample = train.loc[y_test.index].copy()

print("sample shape")
print(data_sample.shape)

In [None]:
# Number of images per distinct label combination in the 10% sample
labels_counts = data_sample["labels"].value_counts()

plt.barh(y=labels_counts.index, width=labels_counts)
plt.title("sample labels counts")

The sample dataset seems to have the same distribution as the full dataset.

In [None]:
# Full path of every sampled image
data_sample["path"] = TRAIN_PATH + data_sample["image"]

# Label name -> class index (position in the multi-hot target vector)
class_dict = {
    'scab': 0,
    'frog_eye_leaf_spot': 1,
    'rust': 2,
    'complex': 3,
    'powdery_mildew': 4,
    "healthy": 5,
}
num_classes = len(class_dict)
# Reverse lookup: class index -> label name
class_names = {index: name for name, index in class_dict.items()}

# Turn the space-separated label string into a list of class indices
data_sample["labels"] = data_sample["labels"].map(
    lambda text: [class_dict[name] for name in text.split(" ")]
)

data_sample.head()


## Data prep

In [None]:
#--------------
# Constants for the input pipeline
#--------------
HEIGHT = WIDTH = 299          # images are resized to HEIGHT x WIDTH
CHANNELS = 3                  # channels requested when decoding the JPEG
BATCH_SIZE = strategy.num_replicas_in_sync * 8   # 8 images per replica
SEED = 143
SPLIT = int(len(data_sample) * 0.8)
AUTO = tf.data.experimental.AUTOTUNE

def process_img(filepath,label):
    """Load one JPEG file and return it as a resized float32 image with its label.

    Args:
        filepath: path of the JPEG file to read.
        label: target associated with the image; passed through untouched.

    Returns:
        Tuple ``(image, label)`` where ``image`` has been decoded with
        ``CHANNELS`` channels, converted to float32 and resized to
        ``[HEIGHT, WIDTH]``.
    """
    # NOTE(review): no model-specific preprocess_input is applied here —
    # confirm this is what the pre-trained backbones expect.
    raw_bytes = tf.io.read_file(filepath)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, [HEIGHT,WIDTH]), label

In [None]:
def make_image_dataset(paths, label_lists):
    """Build an unbatched dataset of (image, multi-hot target) pairs.

    Args:
        paths: iterable of image file paths.
        label_lists: iterable aligned with ``paths``; each item is the list of
            class indices attached to the corresponding image.

    Returns:
        A ``tf.data.Dataset`` whose elements are produced by ``process_img``.
    """
    paths = list(paths)
    # One row per image, one column per class; 1 marks a class that is present
    targets = np.zeros((len(paths), num_classes))
    for row, class_ids in enumerate(label_lists):
        targets[row][class_ids] = 1

    dataset = tf.data.Dataset.from_tensor_slices((paths, targets))
    return dataset.map(process_img, num_parallel_calls=AUTO)


# Splitting the sample data into a train and a validation set
X_train, X_test, y_train, y_test = train_test_split(
    data_sample["path"], data_sample['labels'],
    test_size=0.33, random_state=12, stratify=data_sample['labels'])

# Same pipeline for both splits (previously duplicated inline)
train_ds = make_image_dataset(X_train, y_train)
val_ds = make_image_dataset(X_test, y_test)

print("Nb obs train set:",len(train_ds))
print("Nb obs valid set:",len(val_ds))

In [None]:
#--------------
# Number of full batches in each split (the datasets are not batched yet)
#--------------
STEPS_PER_EPOCH = len(train_ds) // BATCH_SIZE
VALID_STEPS = len(val_ds) // BATCH_SIZE

In [None]:
# Training pipeline: cache, repeat indefinitely, shuffle, batch, prefetch
train_ds = (
    train_ds
    .cache()
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)
# Validation pipeline: same without shuffling
val_ds = (
    val_ds
    .cache()
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)
print("Data Pipeline achieved !")

## Pre trained models

In [None]:
# Number of epochs for each comparison run, and the scoring metric
EPOCHS = 5

metrics = tfa.metrics.F1Score(
    num_classes=num_classes,
    average="macro",
    threshold=0.5,
    name="f1_score",
)

In [None]:
def compile_model(model, lr=1e-3):
    """Compile ``model`` for multi-label classification and return it.

    The model is compiled with the Adam optimizer, binary cross-entropy loss
    and a macro-averaged F1 score reported under the name ``"f1_score"``.

    Args:
        model: Keras model to compile (compiled in place).
        lr: learning rate given to the Adam optimizer.

    Returns:
        The same ``model`` instance, now compiled.
    """
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    loss = tf.keras.losses.BinaryCrossentropy()

    # threshold=0.5 scores every class independently, as required for
    # multi-label targets. Without a threshold F1Score only keeps the argmax
    # class, so at most one label per image could ever count as predicted.
    f1_metric = tfa.metrics.F1Score(num_classes=num_classes,
                                    average="macro",
                                    name="f1_score",
                                    threshold=0.5)

    model.compile(optimizer=optimizer, loss=loss, metrics=[f1_metric])

    return model

### INCEPTION V3

In [None]:


def create_model():
    """Build a multi-label classifier on top of a frozen InceptionV3.

    Features are taken at the backbone's ``'mixed7'`` layer and followed by
    global max pooling, batch normalisation and a sigmoid output layer with
    ``num_classes`` units.

    Returns:
        An uncompiled Keras ``Model``.
    """
    backbone = InceptionV3(weights="imagenet",
                           include_top=False,
                           input_shape=(HEIGHT, WIDTH, CHANNELS))

    # Freeze the ImageNet weights: only the new head is trained
    backbone.trainable = False

    features = backbone.get_layer('mixed7').output

    head = layers.GlobalMaxPooling2D()(features)
    head = layers.BatchNormalization()(head)
    head = layers.Flatten()(head)
    outputs = layers.Dense(num_classes, activation='sigmoid', dtype='float32')(head)

    return Model(backbone.input, outputs)

In [None]:

VERBOSE = 1

tf.keras.backend.clear_session()

# Build, compile and train inside the strategy scope
with strategy.scope():
    model = compile_model(create_model(), lr=1e-3)

    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_steps=VALID_STEPS,
        verbose=VERBOSE,
    )

In [None]:
# Plot F1 score (left) and loss (right) per epoch, training vs. validation
curves = history.history
acc, val_acc = curves['f1_score'], curves['val_f1_score']
loss, val_loss = curves['loss'], curves['val_loss']

epochs_range = range(len(curves['loss']))

fig, (ax_score, ax_loss) = plt.subplots(1, 2, figsize=(14, 8))

ax_score.plot(epochs_range, acc, label='Training f1_score')
ax_score.plot(epochs_range, val_acc, label='Validation f1_score')
ax_score.legend(loc='lower right')
ax_score.set_title('Training and Validation f1_score')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')
plt.show()

In [None]:
# Save the model trained above to an HDF5 file.
# The same path is written again after the final training run.
model.save('./my_model.h5')

### MOBILENET

In [None]:
def create_model():
    """Build a multi-label classifier on top of a frozen MobileNetV2.

    Features are taken at the backbone's ``'out_relu'`` layer and followed by
    global max pooling, batch normalisation and a sigmoid output layer with
    ``num_classes`` units.

    Returns:
        An uncompiled Keras ``Model``.
    """
    backbone = tf.keras.applications.MobileNetV2(
        input_shape=(HEIGHT, WIDTH, CHANNELS),
        weights="imagenet",
        include_top=False,
    )
    # Keep the ImageNet weights fixed; only the head below is trainable
    backbone.trainable = False

    tensor = backbone.get_layer('out_relu').output
    for head_layer in (layers.GlobalMaxPooling2D(),
                       layers.BatchNormalization(),
                       layers.Flatten()):
        tensor = head_layer(tensor)
    tensor = layers.Dense(num_classes, activation='sigmoid', dtype='float32')(tensor)

    return Model(backbone.input, tensor)

In [None]:
tf.keras.backend.clear_session()

# Train the MobileNetV2 variant with the same settings as the other candidates
with strategy.scope():
    model = compile_model(create_model(), lr=1e-3)

    fit_options = dict(
        epochs=EPOCHS,
        validation_data=val_ds,
        verbose=VERBOSE,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_steps=VALID_STEPS,
    )
    history = model.fit(train_ds, **fit_options)
                       

In [None]:
# Plot F1 score and loss per epoch for the training and validation sets
acc = history.history['f1_score']
val_acc = history.history['val_f1_score']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(len(history.history['loss']))

# (training curve, validation curve, name used in labels/title, legend position)
panels = [
    (acc, val_acc, 'f1_score', 'lower right'),
    (loss, val_loss, 'Loss', 'upper right'),
]

plt.figure(figsize=(14, 8))
for position, (train_curve, valid_curve, name, legend_loc) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, train_curve, label='Training ' + name)
    plt.plot(epochs_range, valid_curve, label='Validation ' + name)
    plt.legend(loc=legend_loc)
    plt.title('Training and Validation ' + name)
plt.show()

### RESNET 50

In [None]:
def create_model():
    """Build a multi-label classifier on top of a frozen ResNet50.

    Features are taken at the backbone's ``'conv5_block3_out'`` layer and
    followed by global max pooling, batch normalisation and a sigmoid output
    layer with ``num_classes`` units.

    Returns:
        An uncompiled Keras ``Model``.
    """
    image_shape = (HEIGHT, WIDTH, CHANNELS)
    backbone = tf.keras.applications.ResNet50(input_shape=image_shape,
                                              include_top=False,
                                              weights="imagenet")
    backbone.trainable = False  # pre-trained weights stay frozen

    pooled = layers.GlobalMaxPooling2D()(backbone.get_layer('conv5_block3_out').output)
    normalised = layers.BatchNormalization()(pooled)
    flat = layers.Flatten()(normalised)
    probabilities = layers.Dense(num_classes, activation='sigmoid', dtype='float32')(flat)

    return Model(backbone.input, probabilities)

In [None]:
VERBOSE = 1

tf.keras.backend.clear_session()

# Train the ResNet50 variant with the same settings as the other candidates
with strategy.scope():
    model = create_model()
    model = compile_model(model, lr=1e-3)

    history = model.fit(
        train_ds,
        epochs=EPOCHS,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_data=val_ds,
        validation_steps=VALID_STEPS,
        verbose=VERBOSE,
    )
                       

In [None]:
# Plot F1 score (left) and loss (right) per epoch, training vs. validation
curves = history.history
acc, val_acc = curves['f1_score'], curves['val_f1_score']
loss, val_loss = curves['loss'], curves['val_loss']

epochs_range = range(len(curves['loss']))

fig, (ax_score, ax_loss) = plt.subplots(1, 2, figsize=(14, 8))

ax_score.plot(epochs_range, acc, label='Training f1_score')
ax_score.plot(epochs_range, val_acc, label='Validation f1_score')
ax_score.legend(loc='lower right')
ax_score.set_title('Training and Validation f1_score')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')
plt.show()

## Pre-trained models conclusion

The MobileNetV2 model seems to give better results than InceptionV3 and ResNet50.

## MOBILENET Tuner

We will use the Keras Tuner Hyperband algorithm to tune:
* A dropout layer (as there is clear overfitting)
* Learning rate


In [None]:
def model_builder(hp):
    """Build and compile the frozen-MobileNetV2 classifier for Keras Tuner.

    Tuned hyperparameters:
        dropout: rate of the dropout layer placed before the output layer,
            from 0.0 to 0.5 in steps of 0.1 (default 0.2).
        learning_rate: Adam learning rate, one of 1e-2, 1e-3, 1e-4 or 1e-5.

    Args:
        hp: hyperparameter container supplied by the tuner.

    Returns:
        A compiled Keras ``Model`` reporting a macro F1 score named
        ``"f1_score"``.
    """
    pre_trained_model = tf.keras.applications.MobileNetV2(input_shape = (HEIGHT,WIDTH, CHANNELS),
                                include_top = False,
                                weights = "imagenet")

    # Freeze the pre-trained weights: only the new head is trained
    pre_trained_model.trainable = False

    last_layer = pre_trained_model.get_layer('out_relu')

    last_output = last_layer.output

    x = layers.GlobalMaxPooling2D()(last_output)
    x = layers.BatchNormalization()(x)
    x = layers.Flatten()(x)
    # Tune the dropout rate: 0.0 to 0.5 in steps of 0.1
    x = layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.1, default=0.2))(x)
    x = layers.Dense(num_classes, activation='sigmoid',dtype='float32')(x)

    model = Model( pre_trained_model.input, x )

    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.01, 0.001, 0.0001 or 0.00001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])

    loss = tf.keras.losses.BinaryCrossentropy()

    # threshold=0.5 scores every class independently, as required for
    # multi-label targets. Without a threshold F1Score only keeps the argmax
    # class, so the tuner would optimise a single-label score.
    f1_metric = tfa.metrics.F1Score(num_classes = num_classes,
                                    average = "macro",
                                    name = "f1_score",
                                    threshold = 0.5)

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
            loss=loss,
            metrics=[f1_metric])

    return model


### Keras hyperband tuner

In [None]:
import kerastuner as kt

tf.keras.backend.clear_session()

# Hyperband search maximising the validation F1 score
f1_objective = kt.Objective("val_f1_score", direction="max")
tuner = kt.Hyperband(
    model_builder,
    objective=f1_objective,
    max_epochs=10,
    distribution_strategy=strategy,
)

In [None]:
# Show the hyperparameters registered by model_builder and their ranges
print("Search space summary")
tuner.search_space_summary()

In [None]:
# val_ds repeats indefinitely, so validation_steps decides how much of it is
# evaluated. It must be the number of validation batches (VALID_STEPS), not the
# number of training batches, which would score the same images several times.
tuner.search(train_ds,
             epochs=5,
             steps_per_epoch=STEPS_PER_EPOCH,
             validation_data=val_ds,
             validation_steps=VALID_STEPS)



In [None]:
# Show the best trials found by the search
print("Search results summary")
tuner.results_summary()

In [None]:
# Keep the best trial's hyperparameters for the final training run
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
dropout, lr = best_hps.get('dropout'), best_hps.get('learning_rate')

In [None]:
# Display the learning rate selected by the tuner
lr

# Full data

We will now train the tuned model on the full dataset

## Data prep

In [None]:
# Label name -> class index (position in the multi-hot target vector)
class_dict = {
    'scab': 0,
    'frog_eye_leaf_spot': 1,
    'rust': 2,
    'complex': 3,
    'powdery_mildew': 4,
    "healthy": 5,
}
num_classes = len(class_dict)
# Reverse lookup: class index -> label name
class_names = {index: name for name, index in class_dict.items()}

# Turn the space-separated label string into a list of class indices.
# This rewrites train["labels"] in place, so the cell cannot be run twice.
train["labels"] = train["labels"].map(
    lambda text: [class_dict[name] for name in text.split(" ")]
)

# Full path of every image
train["path"] = TRAIN_PATH + train["image"]

train.head()

In [None]:
# Stratified 80/20 split of the full dataset into train and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    train["path"],
    train['labels'],
    test_size=0.2,
    random_state=12,
    stratify=train['labels'],
)

In [None]:
# Re-assemble (path, labels) frames for both splits
train_ds = pd.concat([X_train, y_train], axis=1)
valid_ds = pd.concat([X_test, y_test], axis=1)

# --- training split: multi-hot targets, then (image, target) dataset ---
files_ls = train_ds["path"].tolist()
labels = np.zeros((len(train_ds), num_classes))
for row, class_ids in enumerate(train_ds["labels"]):
    labels[row][class_ids] = 1

train_ds = (
    tf.data.Dataset
    .from_tensor_slices((files_ls, labels))
    .map(process_img, num_parallel_calls=AUTO)
)

# --- validation split: same steps ---
files_ls = valid_ds["path"].tolist()
labels = np.zeros((len(valid_ds), num_classes))
for row, class_ids in enumerate(valid_ds["labels"]):
    labels[row][class_ids] = 1

val_ds = (
    tf.data.Dataset
    .from_tensor_slices((files_ls, labels))
    .map(process_img, num_parallel_calls=AUTO)
)

# Number of full batches per split, computed before the datasets are repeated
STEPS_PER_EPOCH = len(train_ds) // BATCH_SIZE
VALID_STEPS = len(val_ds) // BATCH_SIZE

train_ds = (
    train_ds
    .cache()
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)
val_ds = (
    val_ds
    .cache()
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)
print("Data Pipeline achieved !")

### Callbacks

In [None]:
# Save the weights of the best epoch only (highest validation F1 score)
checkpoint_dir = "./raw_model.h5"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_dir,
    monitor="val_f1_score",
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop training when the validation F1 score has not improved for 10 epochs
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_f1_score",
    mode='max',
    patience=10,
)

# Multiply the learning rate by 0.1 after 2 epochs without validation F1 improvement
reducelr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_f1_score",
    mode='max',
    patience=2,
    factor=0.1,
    verbose=1,
)

callbacks = [callback, cp_callback, reducelr]

In [None]:


def create_model():
    """Build the tuned multi-label classifier on top of a frozen MobileNetV2.

    Same head as the comparison model plus a dropout layer whose rate is read
    from the module-level ``dropout`` value.

    Returns:
        An uncompiled Keras ``Model``.
    """
    backbone = tf.keras.applications.MobileNetV2(
        weights="imagenet",
        include_top=False,
        input_shape=(HEIGHT, WIDTH, CHANNELS),
    )
    # Pre-trained weights stay frozen; only the head is trained
    backbone.trainable = False

    head = backbone.get_layer('out_relu').output
    head = layers.GlobalMaxPooling2D()(head)
    head = layers.BatchNormalization()(head)
    head = layers.Flatten()(head)
    head = layers.Dropout(dropout)(head)
    outputs = layers.Dense(num_classes, activation='sigmoid', dtype='float32')(head)

    return Model(backbone.input, outputs)

## Training 

In [None]:
EPOCHS = 100
VERBOSE =1

tf.keras.backend.clear_session()

with strategy.scope():

    model = create_model()
    model = compile_model(model, lr=lr)

    # val_ds repeats indefinitely, so validation_steps decides how much of it
    # is evaluated each epoch. It must be VALID_STEPS (validation batches);
    # STEPS_PER_EPOCH (training batches) would score the validation images
    # several times per epoch.
    history = model.fit(train_ds,
            epochs=EPOCHS,
            validation_data = val_ds,
            verbose=VERBOSE,
            steps_per_epoch = STEPS_PER_EPOCH,
            validation_steps=VALID_STEPS,
            callbacks = callbacks)

In [None]:
# Plot F1 score and loss per epoch for the training and validation sets
acc = history.history['f1_score']
val_acc = history.history['val_f1_score']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(len(history.history['loss']))

# (training curve, validation curve, name used in labels/title, legend position)
panels = [
    (acc, val_acc, 'f1_score', 'lower right'),
    (loss, val_loss, 'Loss', 'upper right'),
]

plt.figure(figsize=(14, 8))
for position, (train_curve, valid_curve, name, legend_loc) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs_range, train_curve, label='Training ' + name)
    plt.plot(epochs_range, valid_curve, label='Validation ' + name)
    plt.legend(loc=legend_loc)
    plt.title('Training and Validation ' + name)
plt.show()

In [None]:
# Restore the weights written by the checkpoint callback
model.load_weights(checkpoint_dir)

# Save the model with those weights to an HDF5 file
model.save('./my_model.h5')

### Predictions

We now want to make some predictions on a small sample of images.

In [None]:
# Load the model back from the HDF5 file saved above
new_model = tf.keras.models.load_model('./my_model.h5')

In [None]:
# Taking 9 images as sample

# Reload the CSV so that "labels" holds the original label strings again
train = pd.read_csv("../input/plant-pathology-2021-fgvc8/train.csv")

files_ls = tf.io.gfile.glob(TRAIN_PATH + '*.jpg')

from random import sample

files_ls_sample = sample(files_ls,9)

train["path"] = TRAIN_PATH  +  train["image"]

# Explicit copy: columns are added to test_df later, and writing to a filtered
# slice of `train` would raise SettingWithCopyWarning.
test_df = train[train["path"].isin(files_ls_sample)].copy()

print("Sample shape")
test_df.shape



In [None]:
test_df.head(9)

In [None]:
# Same preprocessing as process_img, for images without a label

def process_img_test(filepath):
    """Load one JPEG file and return it as a resized float32 image.

    Args:
        filepath: path of the JPEG file to read.

    Returns:
        The image decoded with ``CHANNELS`` channels, converted to float32
        and resized to ``[HEIGHT, WIDTH]``.
    """
    raw_bytes = tf.io.read_file(filepath)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, [HEIGHT,WIDTH])

# Batched dataset of the sampled images, in the order of files_ls_sample
sample_paths = tf.data.Dataset.from_tensor_slices(files_ls_sample)
test_dataset = sample_paths.map(process_img_test).batch(BATCH_SIZE)



In [None]:
# making predictions
# One row of per-class scores for each image in test_dataset
predicts = new_model.predict(test_dataset) 

# Scores strictly above this value count as a predicted class (see get_labels)
threshold = 0.5

def get_labels(prediction):
  """Turn one vector of per-class scores into a space-separated label string.

  Every class whose score is above the module-level ``threshold`` is kept, in
  class-index order. When no score is above the threshold, the single class
  with the highest score is returned instead.

  Args:
    prediction: 1-D array of scores, one per class index of ``class_names``.

  Returns:
    The predicted class names joined with single spaces.
  """
  selected = [class_names[k] for k in np.where(prediction>threshold)[0]]
  if not selected:
    # Nothing passed the threshold: fall back to the best-scoring class
    selected = [class_names[np.argmax(prediction)]]
  return ' '.join(selected)

# Decode every prediction row into a label string
labels = [get_labels(pred) for pred in predicts]

# predicts follows the order of files_ls_sample (a random sample), while
# test_df keeps the row order of the CSV. Attach each prediction through its
# image path instead of by position, otherwise rows get another image's
# prediction.
pred_by_path = dict(zip(files_ls_sample, labels))
test_df = test_df.assign(pred=test_df["path"].map(pred_by_path))

## Showing some predictions

In [None]:
# Point "path" at the local copy of each image for matplotlib.
# assign() builds a new frame, so no filtered slice is written to.
test_df = test_df.assign(
    path="../input/plant-pathology-2021-fgvc8/train_images/" + test_df["image"]
)

# Showing image sample
plt.figure(figsize=(14,9))
for n, (row_id, row) in enumerate(test_df.iterrows(), start=1):
    plt.subplot(3,3,n)

    # displaying the image
    plt.imshow(img.imread(row["path"]))

    # Labels are an unordered set of class names. Compare them as sets so that
    # "rust frog_eye_leaf_spot" and "frog_eye_leaf_spot rust" count as equal.
    is_correct = set(row["pred"].split()) == set(row["labels"].split())
    color = "blue" if is_correct else "red"

    plt.title(row["pred"].title(), color=color)
    plt.axis("off")
_ = plt.suptitle("Model predictions on sample set (blue: correct, red: incorrect)")