# Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import PIL
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
from tqdm import tqdm
import tensorflow_addons as tfa
import random
from sklearn.preprocessing import MultiLabelBinarizer

# Remove the column limit so wide DataFrames are displayed without truncation.
pd.set_option("display.max_columns", None)

# Marker that the setup cell ran to completion.
print("Done")


Let's explore the data:
how many images are in the dataset, which labels occur, and how frequent each of them is.

In [None]:
# Load the training annotations (columns used later: `image` and `labels`).
train = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')
print(len(train))
print(train.columns)

# Frequency of every distinct label string; computed once and reused for the plot.
label_combo_counts = train['labels'].value_counts()
print(label_combo_counts)

# Draw the chart directly: wrapping the call in print() only emitted the
# Axes object's repr. The trailing ';' keeps that repr out of the cell output.
label_combo_counts.plot.bar();

In [None]:
# Turn each space-separated label string into a list of label names.
# The isinstance guard makes the cell safe to re-run: values that were already
# split into lists are left untouched instead of raising AttributeError.
train['labels'] = train['labels'].apply(
    lambda value: value.split(' ') if isinstance(value, str) else value
)
train

First, I convert the label representation into a **one-hot (multi-hot) encoded format** using `MultiLabelBinarizer` from sklearn. Now we can see and plot the frequency of each individual label.

In [None]:
# Encode the per-image label lists as a 0/1 indicator matrix, one column per
# distinct label, aligned with the rows of `train`.
s = list(train['labels'])
mlb = MultiLabelBinarizer()
encoded = mlb.fit_transform(s)
trainx = pd.DataFrame(encoded, columns=mlb.classes_, index=train.index)
print(trainx.columns)

# Column sums give the number of images that carry each label.
per_label_totals = trainx.sum()
print(per_label_totals)

labels = list(per_label_totals.keys())
print(labels)
label_counts = per_label_totals.values.tolist()

# Bar chart of the individual label frequencies.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
sns.barplot(x=labels, y=label_counts, ax=ax)

# Let's view some of the images

In [None]:
# Show a 4x3 grid of randomly chosen training images, each titled with its
# label list and its (width, height) in pixels.
fig1 = plt.figure(figsize=(26,10))

for i in range(1, 13):

    # Sample over the whole frame. The previous hard-coded randrange(1, 18000)
    # could never pick row 0 nor any row from 18000 onwards.
    rand = random.randrange(len(train))
    row = train.iloc[rand]
    sample = os.path.join('../input/plant-pathology-2021-fgvc8/train_images/', row['image'])

    # The context manager closes the file handle once the image has been drawn.
    with PIL.Image.open(sample) as img:
        ax = fig1.add_subplot(4,3,i)
        ax.imshow(img)
        # Title the subplot through its own Axes rather than pyplot's implicit
        # "current axes" state.
        ax.set_title(f"{row['labels']}{img.size}")

# Lay the grid out once, after every subplot exists.
fig1.tight_layout()


# Image Size & Processing
From the titles we can see that some of the images are very large, e.g. (4000, 2672). Larger images are harder to process, so training the CNN on them takes much longer. Downsampling all 18632 images is also a time-consuming task. This is why I am going to use the resized images for this dataset, [resized-plant2021](https://www.kaggle.com/ankursingh12/resized-plant2021) by Ankur Singh, who has already downsampled the images to sizes of 256, 384, 512 & 640px.

There are 18632 images in the training set. Even with the downsampled images, we can't fit all of them into memory at once, so I use the `flow_from_dataframe` method from Keras. This method reads the images from storage one batch at a time instead of loading them all at once, which saves us from **GPU Out of Memory (OOM)** issues.

In [None]:
# Read the raw training CSV again into a separate frame.
# NOTE(review): `df` is not referenced by any other cell visible here; the
# generators below are fed `train` instead. Confirm whether this cell is still needed.
df=pd.read_csv("../input/plant-pathology-2021-fgvc8/train.csv")

# Marker that the cell ran to completion.
print("Done")

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Fraction of `train` held out for validation; both generators must use the
# same value so that their 'training' and 'validation' subsets stay disjoint.
VALIDATION_SPLIT = 0.2

# Training pipeline: rescale pixels to [0, 1] and apply light augmentation.
datagen = ImageDataGenerator(rescale=1/255.0,
                            rotation_range=5,
                            zoom_range=0.1,
                            shear_range=0.05,
                            horizontal_flip=True,
                            validation_split=VALIDATION_SPLIT)

# Validation pipeline: rescale only. Previously the validation subset was drawn
# from the augmenting generator, so validation metrics were measured on
# randomly rotated/zoomed/sheared/flipped images.
valid_datagen = ImageDataGenerator(rescale=1/255.0,
                                   validation_split=VALIDATION_SPLIT)

# Arguments shared by both flows. The images come from the pre-resized 256px
# dataset and are resized again to the 224x224 input of the network.
# class_mode='categorical' with list-valued labels yields multi-hot targets.
flow_kwargs = dict(
    directory='../input/resized-plant2021/img_sz_256',
    x_col='image',
    y_col='labels',
    target_size=(224,224),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=444,
)

train_generator = datagen.flow_from_dataframe(
    train,
    subset='training',
    **flow_kwargs
    )

#'../input/plant-pathology-2021-fgvc8/train_images'
valid_generator = valid_datagen.flow_from_dataframe(
    train,
    subset='validation',
    **flow_kwargs
    )

# Transfer Learning
Transfer learning is the process of using frozen weights from a large pre-trained model for a downstream task, which in our case is classifying leaf diseases. As we can't use the internet in this notebook, I will use a dataset of Keras's pretrained models containing the 'imagenet' weights. The output/top layer of a pretrained model is a dense layer whose number of nodes equals the number of output classes. All the models here are pre-trained on 'imagenet', hence they have an output/top layer of 1000 nodes. We have to replace the output/top layer with our own dense layer with 6 nodes (for 6 classes).

I am going to be using **Xception**.


In [None]:
# Seed TensorFlow's global random generator.
seed = 1200
tf.random.set_seed(seed)

# Xception without its top (classification) layers, initialised from a local
# weights file and built for 224x224 RGB input.
weights_path = '../input/keras-pretrained-models/xception_weights_tf_dim_ordering_tf_kernels_notop.h5'
model = keras.applications.Xception(
    include_top=False,
    weights=weights_path,
    input_shape=(224, 224, 3),
)

# Print the symbolic input and output tensors of the base network.
for tensor in (model.input, model.output):
    print(tensor)

# Activation, Losses & Metrics

As this is a multilabel classification problem, we can't use softmax here, hence the sigmoid activation.

Binary crossentropy is used instead of categorical crossentropy. We use categorical cross-entropy in multi-class problems, but for multi-label problems, we use binary cross-entropy. Think of it this way, an image may have multiple labels, and we need the probabilities that each of these labels corresponds to the given image - this can be considered as n independent binary classifiers for the n labels.


In [None]:
# Six sigmoid units, one independent probability per label.
head = keras.layers.Dense(
    6,
    activation='sigmoid',
    name='dense_top',
    kernel_initializer=keras.initializers.RandomUniform(seed=seed),
    bias_initializer=keras.initializers.Zeros(),
)

# Base network -> global average pooling -> new classification head.
new_model = tf.keras.Sequential([
    model,
    keras.layers.GlobalAveragePooling2D(),
    head,
])

# Freeze everything except the last layer so only the new head is trained.
for layer in new_model.layers[:-1]:
    layer.trainable = False

new_model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Macro-averaged F1 over the 6 labels. An explicit threshold is required for
# multi-label targets: without it the metric turns only the arg-max prediction
# into a 1, i.e. it scores the model as if every image had exactly one label.
f1 = tfa.metrics.F1Score(num_classes=6, average='macro', threshold=0.5)

new_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1])

# `monitor` must be the *name* of a logged quantity. Passing the metric object
# itself meant the callback never found a value to watch, so early stopping
# and best-weight restoration never took effect.
callbacks = [
    EarlyStopping(monitor='val_f1_score', patience=4, mode='max', restore_best_weights=True)
]

# `fit` accepts generators directly; `fit_generator` is deprecated.
# NOTE(review): batches hold 32 images, so samples//256 and samples//128 cover
# only part of each subset per epoch - presumably a deliberate speed-up; confirm.
history = new_model.fit(train_generator,
                    validation_data=valid_generator,
                    epochs=65,
                    steps_per_epoch=train_generator.samples//256, # = 58
                    validation_steps=valid_generator.samples//128,
                    callbacks=callbacks)


# Submission

For submission I will resize the test images and then predict the labels for them.

In [None]:
# The sample submission lists the test image file names in its `image` column.
test = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')

# Write a 256x256 copy of every test image into the working directory.
for img_name in tqdm(test['image']):
    path = f'../input/plant-pathology-2021-fgvc8/test_images/{img_name}'
    with PIL.Image.open(path) as img:
        resized = img.resize((256, 256))
        resized.save(f'./{img_name}')

In [None]:
def _select_label_indices(probabilities, threshold=0.3):
    """Pick the predicted class indices for every sample.

    Args:
        probabilities: iterable of per-sample score sequences (one score per class).
        threshold: minimum score for a class to be selected.

    Returns:
        A list with one list of int class indices per sample. A sample with no
        score at or above `threshold` gets the single index of its highest score.
    """
    selected = []
    for scores in probabilities:
        # enumerate() yields the true position of every score; the previous
        # `pred.index(value)` lookup returned the first position holding that
        # value, so tied scores were mapped to the same index.
        hits = [idx for idx, score in enumerate(scores) if score >= threshold]
        if not hits:
            hits = [int(np.argmax(scores))]
        selected.append(hits)
    return selected


# Inference pipeline: rescale only. Reusing the training `datagen` would apply
# random rotation/zoom/shear/flip to the test images.
test_datagen = ImageDataGenerator(rescale=1/255.0)

test_data = test_datagen.flow_from_dataframe(
    test,
    directory = './',
    x_col="image",
    y_col= None,
    color_mode="rgb",
    # Must match the (224, 224, 3) input the network was built and trained
    # with; the previous (256, 256) did not.
    target_size = (224,224),
    classes=None,
    class_mode=None,
    batch_size=32,
    # Keep file order so predictions line up with the rows of `test`.
    shuffle=False,
    seed=40,
)

preds = new_model.predict(test_data)
print(preds)
preds = preds.tolist()

indices = _select_label_indices(preds, threshold=0.3)

print(indices)

In [None]:
# Invert the generator's name -> index mapping into index -> name.
labels = {index: name for name, index in train_generator.class_indices.items()}
print(labels)

# One space-separated label string per test image, in prediction order.
testlabels = [
    ' '.join(str(labels[i]) for i in image)
    for image in indices
]

print(testlabels)


Remove the resized images from the output directory before submission. If any files other than 'submission.csv' are present, submitting will throw an error.

In [None]:
# Collect every .jpg in the working directory and delete each one.
delfiles = tf.io.gfile.glob('./*.jpg')

for stale_image in delfiles:
    os.remove(stale_image)

In [None]:
# Training vs. validation accuracy for every completed epoch.
acc_fig, acc_ax = plt.subplots(figsize=(15, 6))
epoch_list = list(range(1, len(history.history['accuracy']) + 1))
for series in ('accuracy', 'val_accuracy'):
    acc_ax.plot(epoch_list, history.history[series], label=series)
acc_ax.set_xlabel('epoches')
acc_ax.set_ylabel('accuracy')
acc_ax.legend()
plt.show()

In [None]:
# Training vs. validation loss for every completed epoch.
loss_fig, loss_ax = plt.subplots(figsize=(15, 6))
epoch__list = list(range(1, len(history.history['loss']) + 1))
for series in ('loss', 'val_loss'):
    loss_ax.plot(epoch__list, history.history[series], label=series)
loss_ax.set_xlabel('epoches')
loss_ax.set_ylabel('loss')
loss_ax.legend()
plt.show()

In [None]:
# Training vs. validation F1 score for every completed epoch.
f1_fig, f1_ax = plt.subplots(figsize=(15, 6))
epoch__list = list(range(1, len(history.history['f1_score']) + 1))
for series in ('f1_score', 'val_f1_score'):
    f1_ax.plot(epoch__list, history.history[series], label=series)
f1_ax.set_xlabel('epoches')
f1_ax.set_ylabel('f1')
f1_ax.legend()
plt.show()

In [None]:
# Mean of each logged quantity over all completed epochs.
for key in ('loss', 'accuracy', 'val_loss', 'val_accuracy'):
    values = history.history[key]
    print(f"{key}_av : {sum(values) / len(values)}")

In [None]:
# Fill the sample submission's `labels` column with the predicted label
# strings and write it out as the submission file.
sub = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')
sub['labels'] = testlabels
sub.to_csv('submission.csv', index=False)

# Display the final submission table as the cell output.
sub