# Intro
Welcome to the [Plant Pathology 2021 - FGVC8](https://www.kaggle.com/c/plant-pathology-2021-fgvc8/data) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/25563/logos/header.png)

<span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span>

# Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
from keras.utils import to_categorical, Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Activation
from keras.optimizers import RMSprop,Adam
from keras.applications import ResNet50

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
# Root directory of the competition data; every file path below is built from it.
path = '/kaggle/input/plant-pathology-2021-fgvc8/'
# Show the directory contents (displayed as the cell output).
os.listdir(path)

# Load Data

In [None]:
# Training table; later cells read its 'image' and 'labels' columns.
train_data = pd.read_csv(path+'train.csv')
# Submission template whose 'labels' column is filled with predictions at the end.
samp_subm = pd.read_csv(path+'sample_submission.csv')

# Functions

In [None]:
def plot_examples(label='healthy', n_examples=5):
    """Plot the first training images whose ``labels`` value equals ``label``.

    Reads the module-level ``train_data`` frame and ``path`` prefix.

    Args:
        label: Value of the ``labels`` column to select.
        n_examples: Number of image slots in the row (must be >= 1). If fewer
            matching images exist, the remaining slots are left blank instead
            of raising an ``IndexError``.

    Raises:
        FileNotFoundError: If a selected image cannot be read from disk.
    """
    # Filter once instead of once per plotted image.
    image_names = train_data.loc[train_data['labels'] == label, 'image'].iloc[:n_examples]

    # squeeze=False keeps a 2-D axes array even when n_examples == 1.
    fig, axs = plt.subplots(1, n_examples, figsize=(25, 12), squeeze=False)
    fig.subplots_adjust(hspace=.2, wspace=.2)
    axs = axs.ravel()

    for ax, image_name in zip(axs, image_names):
        img_path = path + 'train_images/' + image_name
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError('Could not read image: ' + img_path)
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(label)
        ax.set_xticklabels([])
        ax.set_yticklabels([])

    # Hide the slots for which no image was available.
    for ax in axs[len(image_names):]:
        ax.axis('off')

# Overview

In [None]:
# Compare the number of rows in train.csv with the number of image files on disk.
print('Number of train samples:', len(train_data))
print('Number of train images:', len(os.listdir(path+'train_images/')))

In [None]:
# Preview the first rows of the training table.
train_data.head()

# EDA
Labels

In [None]:
# Bar chart of how often each distinct 'labels' value occurs.
train_data['labels'].value_counts().plot.bar()

Plot Examples

In [None]:
# Distinct label values, ordered from most to least frequent.
label_counts_sorted = train_data['labels'].value_counts()
labels = label_counts_sorted.index.tolist()

# One row of example images per label.
for label in labels:
    plot_examples(label)

# Prepare Data For Data Generator

Label Encoding

In [None]:
# Map every label string to an integer class index. The indices are derived
# from the label list itself so the mapping always covers every label
# (a fixed range(12) would silently drop labels beyond the twelfth).
labels_dict = {label_name: class_idx for class_idx, label_name in enumerate(labels)}
# Replace the label strings in the 'labels' column with their class index.
train_data = train_data.replace({"labels": labels_dict})

Split Train Data

In [None]:
# Hold out 30% of the rows for validation.
train_part, val_part = train_test_split(train_data, test_size=0.3)
# Give both parts a fresh 0..n-1 index.
train_data = train_part.reset_index(drop=True)
val_data = val_part.reset_index(drop=True)

In [None]:
# Report the size of each split.
print('Number of train samples', len(train_data))
print('Number of val samples', len(val_data))

# Data Generator
Parameter

In [None]:
# Side length in pixels; passed to DataGenerator as img_size and used for the
# model input shape (q_size x q_size).
q_size = 128
# Number of image channels in the model input.
img_channel = 3
# Size of the one-hot label vector and of the model's output layer.
num_classes = 12
# Samples per batch produced by the generators.
batch_size = 64
# Number of training epochs passed to the fit call.
epochs = 2

Data Generator Class

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that loads, resizes and scales images batch by batch.

    Each batch is a tuple ``(X, y)``: ``X`` has shape
    ``(n, img_size, img_size, img_channel)`` with pixel values divided by 255,
    and ``y`` has shape ``(n, num_classes)`` holding one-hot integer labels.
    ``n`` equals ``batch_size`` except for the last batch, which contains only
    the remaining samples (no zero padding).

    Args:
        path: Directory prefix that is prepended to every image file name.
        list_IDs: Image file names, one per sample.
        labels: Integer class index per sample, aligned with ``list_IDs``.
        batch_size: Maximum number of samples per batch.
        img_size: Side length the images are resized to.
        img_channel: Number of channels of the loaded images.
        num_classes: Length of the one-hot label vectors.

    Raises:
        ValueError: If ``list_IDs`` and ``labels`` differ in length.
    """

    def __init__(self, path, list_IDs, labels, batch_size, img_size, img_channel, num_classes):
        super().__init__()
        if len(list_IDs) != len(labels):
            raise ValueError('list_IDs and labels must have the same length')
        self.path = path
        # Plain arrays make all lookups positional, independent of whatever
        # index a pandas Series passed in happens to carry.
        self.list_IDs = np.asarray(list_IDs)
        self.labels = np.asarray(labels)
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_channel = img_channel
        self.num_classes = num_classes
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        n_samples = len(self.list_IDs)
        return (n_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)``."""
        start = index * self.batch_size
        batch_positions = self.indexes[start:start + self.batch_size]
        # Select file names and labels with the SAME positions so that each
        # image is paired with its own label.
        return self.__data_generation(self.list_IDs[batch_positions],
                                      self.labels[batch_positions])

    def __data_generation(self, list_IDs_temp, labels_temp):
        """Load the given images and one-hot encode the matching labels.

        Raises:
            FileNotFoundError: If an image cannot be read from disk.
        """
        n_batch = len(list_IDs_temp)
        X = np.zeros((n_batch, self.img_size, self.img_size, self.img_channel))
        for i, ID in enumerate(list_IDs_temp):
            img_path = self.path + str(ID)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None on failure instead of raising.
                raise FileNotFoundError('Could not read image: ' + img_path)
            # NOTE: the image keeps OpenCV's channel order; no BGR->RGB
            # conversion is applied here.
            img = cv2.resize(img, (self.img_size, self.img_size))
            X[i, ] = img/255
        y = to_categorical(labels_temp, num_classes=self.num_classes).astype(int)
        return X, y

# Train On Subset
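To test the pipeline quickly, training can be restricted to a small subset by uncommenting the two slicing lines below; as written, the full data is used.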

In [None]:
# Maximum number of rows per split for quick experiments. It is only used by
# the commented-out slicing below, so as written the full splits are kept.
number_samples = 500
#train_data = train_data[0:number_samples]
#val_data = val_data[0:number_samples]

# Define Train, Val And Test Data

In [None]:
# Both generators read from the training image folder; they differ only in
# which rows (file names and labels) they serve.
train_images_dir = path+'train_images/'

train_generator = DataGenerator(
    path=train_images_dir,
    list_IDs=train_data['image'],
    labels=train_data['labels'],
    batch_size=batch_size,
    img_size=q_size,
    img_channel=img_channel,
    num_classes=num_classes,
)
val_generator = DataGenerator(
    path=train_images_dir,
    list_IDs=val_data['image'],
    labels=val_data['labels'],
    batch_size=batch_size,
    img_size=q_size,
    img_channel=img_channel,
    num_classes=num_classes,
)

# Class Weights
Based on the distribution of the labels we define the class weights.

In [None]:
# Class weights that compensate for class imbalance: each class is weighted
# inversely to its frequency (n_samples / (n_classes * class_count)), so rare
# classes contribute as much to the loss as frequent ones. Weighting classes
# by their frequency would do the opposite and amplify the imbalance.
label_counts = train_data['labels'].value_counts()
n_train_samples = len(train_data)
n_present_classes = len(label_counts)
class_weight = {
    class_idx: n_train_samples / (n_present_classes * count)
    for class_idx, count in label_counts.items()
}

# Define Metric
To match the leaderboard score, the [F1 score](https://www.tensorflow.org/addons/api_docs/python/tfa/metrics/F1Score) is used as the metric.

In [None]:
# F1 metric reported during training under the history keys 'f1_score' and
# 'val_f1_score'.
# NOTE(review): no `average` argument is passed - confirm in the tfa docs
# whether this yields one score per class rather than a single averaged value.
metrics = [tfa.metrics.F1Score(name = 'f1_score', num_classes=num_classes)]

# Load Pretrained Model

In [None]:
# Local file with ResNet50 weights for the network without its top layers.
weights='../input/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
# Convolutional backbone without the classification head, sized for
# q_size x q_size images with img_channel channels.
conv_base = ResNet50(weights=weights,
                     include_top=False,
                     input_shape=(q_size, q_size, img_channel))
# Keep the backbone weights trainable so they are updated during training.
conv_base.trainable = True

# Define Model

In [None]:
# Classifier: ResNet50 backbone followed by a small dense head.
model = Sequential()
model.add(conv_base)
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
# One output unit per class.
# NOTE(review): sigmoid + binary_crossentropy scores every class
# independently although the generator yields one-hot targets - confirm this
# is intended rather than softmax + categorical_crossentropy.
model.add(Dense(num_classes, activation='sigmoid'))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer=RMSprop(learning_rate=1e-4),
              loss='binary_crossentropy',
              metrics=metrics)

model.summary()

In [None]:
# Train on the batches served by the Sequence generators. `Model.fit` accepts
# Sequence objects directly; `fit_generator` is deprecated in its favour.
history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=epochs,
                    class_weight=class_weight)

# Analyse Training

In [None]:
# Plot the loss (left) and the F1 score (right) per epoch for the training and
# validation data.
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
fig.subplots_adjust(hspace = .2, wspace=.2)
axs = axs.ravel()

loss = history.history['loss']
loss_val = history.history['val_loss']
# Separate name for the x axis so the `epochs` hyperparameter is not
# overwritten (re-running the training cell would otherwise receive a range).
epoch_axis = range(1, len(loss)+1)

axs[0].plot(epoch_axis, loss, 'bo', label='loss_train')
axs[0].plot(epoch_axis, loss_val, 'ro', label='loss_val')
axs[0].set_title('Value of the loss function')
axs[0].set_xlabel('epochs')
axs[0].set_ylabel('value of the loss function')
axs[0].legend()
axs[0].grid()

# The tracked metric is the F1 score, so the plot is labelled accordingly.
acc = history.history['f1_score']
acc_val = history.history['val_f1_score']
axs[1].plot(epoch_axis, acc, 'bo', label='f1_train')
axs[1].plot(epoch_axis, acc_val, 'ro', label='f1_val')
axs[1].set_title('F1 score')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Value of F1 score')
# NOTE(review): the legend is left disabled as in the original; presumably the
# metric yields several series per epoch, which would duplicate legend entries.
#axs[1].legend()
axs[1].grid()
plt.show()

# Predict Test Data

In [None]:
# Every file in the test image folder becomes one row; the label column is a
# constant 0 placeholder because the generator requires labels.
files = os.listdir(path+'test_images')
test_data = pd.DataFrame({'image': files, 'labels': 0})

test_generator = DataGenerator(
    path=path+'test_images/',
    list_IDs=test_data['image'],
    labels=test_data['labels'],
    batch_size=batch_size,
    img_size=q_size,
    img_channel=img_channel,
    num_classes=num_classes,
)

In [None]:
# Class scores for every test image. `Model.predict` accepts Sequence objects
# directly; `predict_generator` is deprecated in its favour.
predict = model.predict(test_generator, verbose=1)

In [None]:
# Inverse of labels_dict: class index -> label string.
dict_rename = {v : k for k, v in labels_dict.items()}

# Most likely class per test image. The slice drops any extra rows the
# generator may have appended to fill up the last batch.
predicted_classes = predict.argmax(axis=1)[0:len(files)]

# Assign predictions by image name rather than by row position: the order
# returned by os.listdir is arbitrary and need not match the row order of the
# submission template.
# Assumes sample_submission.csv has an 'image' column with the file names.
label_by_image = {image_name: dict_rename[int(class_idx)]
                  for image_name, class_idx in zip(files, predicted_classes)}
predicted_labels = samp_subm['image'].map(label_by_image)
# Rows without a prediction keep the template's original label.
samp_subm['labels'] = predicted_labels.fillna(samp_subm['labels'])

# Export

In [None]:
# Write the submission file without the DataFrame index column.
samp_subm.to_csv('submission.csv', index=False)

In [None]:
# Display the final submission table as the cell output.
samp_subm