In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import PIL
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
from tqdm import tqdm
import tensorflow_addons as tfa
import random
from sklearn.preprocessing import MultiLabelBinarizer

# Notebook-wide display configuration.
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None
# Draw every matplotlib/seaborn figure on seaborn's dark grid background.
sns.set_style('darkgrid')

# Dataset Exploration

In [None]:
# Load the training metadata; later cells read its 'image' (file name) and
# 'labels' (space-separated label string) columns.
train = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')
print(len(train))
print(train.columns)

# Frequency of each raw label string (label combinations counted as-is).
# The plot call is the cell's last statement and ends with ';' so the figure
# is rendered without printing the Axes repr.
train['labels'].value_counts().plot.bar(title='Raw Label String Frequency');

In [None]:
# Preview the first rows of the training metadata loaded above.
train.head()


In [None]:
# Expand the space-separated label strings into one 0/1 indicator column per
# class. MultiLabelBinarizer is already imported in the notebook's import cell.
split_labels = train.labels.apply(lambda x: x.split())

# Keep the fitted binarizer around: mlb.classes_ defines the column order.
mlb = MultiLabelBinarizer()
labels = pd.DataFrame(mlb.fit_transform(split_labels), columns=mlb.classes_)

# Number of images carrying each class, drawn on the explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 6))
labels.sum().plot.bar(ax=ax, title='Target Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Number of images');

In [None]:
# How many classes are attached to each image, and how often each count occurs.
labels_per_image = labels.sum(axis=1)
label_count_freq = labels_per_image.value_counts()

fig, ax = plt.subplots(figsize=(20, 6))
label_count_freq.plot.bar(ax=ax, title='Distribution of Number of Labels per Image');

In [None]:
# Display a 4x3 grid of randomly chosen training images, each titled with its
# label string followed by the image size reported by PIL.
fig1 = plt.figure(figsize=(26, 10))

for i in range(1, 13):
    # Sample a row position from the frame's real length instead of a
    # hard-coded bound, so row 0 is eligible and no index can be out of range.
    rand = random.randrange(len(train))
    sample = os.path.join('../input/plant-pathology-2021-fgvc8/train_images', train['image'].iloc[rand])

    # The context manager releases the file handle once the pixels are drawn.
    with PIL.Image.open(sample) as img:
        ax = fig1.add_subplot(4, 3, i)
        ax.imshow(img)
        ax.set_title(f"{train['labels'].iloc[rand]}{img.size}")

# Lay the grid out once, after every subplot exists.
fig1.tight_layout()

# Preprocessing and Augmentation

In [None]:
# Put the image file name in front of the indicator columns so the frame can
# feed flow_from_dataframe (x_col='image', remaining columns as targets).
# The guard keeps the cell idempotent: re-running it must not prepend a second
# 'image' column, which would shift every `labels.columns[1:]` lookup below.
if 'image' not in labels.columns:
    labels = pd.concat([train['image'], labels], axis=1)
labels.head()

In [None]:
# A single generator serves both splits: it rescales pixel values by 1/255
# and reserves 20% of the rows for validation.
image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
)

# Everything the two flows have in common; only `subset` differs between them.
# Targets are every column after the leading 'image' column, passed through
# unchanged (class_mode='raw').
shared_flow_args = dict(
    dataframe=labels,
    directory='../input/plant-pathology-2021-fgvc8/train_images',
    x_col='image',
    y_col=labels.columns.tolist()[1:],
    class_mode='raw',
    color_mode="rgb",
    target_size=(224, 224),
    batch_size=64,
)

train_generator = image_data_generator.flow_from_dataframe(subset='training', **shared_flow_args)

valid_generator = image_data_generator.flow_from_dataframe(subset='validation', **shared_flow_args)

# Modelling

In [None]:
# Multi-label classifier: MobileNetV2 feature extractor -> global average
# pooling -> 6 independent sigmoid outputs (one per target column).
inputs = tf.keras.Input(shape=(224, 224, 3))

# Passing input_shape explicitly matches the backbone to the 224x224 RGB
# inputs, instead of relying on Keras' fallback for an unspecified shape.
# NOTE(review): Keras documents that MobileNetV2 expects pixels scaled to
# [-1, 1] (mobilenet_v2.preprocess_input), while the generators in this
# notebook rescale to [0, 1] -- confirm whether that mismatch is intended.
backbone = tf.keras.applications.MobileNetV2(include_top=False, input_shape=(224, 224, 3))
x = backbone(inputs)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
outputs = tf.keras.layers.Dense(6, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs, outputs)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))

model.summary()
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Macro-averaged F1 over the 6 outputs. threshold=0.5 makes the metric treat
# each sigmoid output as its own yes/no decision; with the default
# threshold=None only the argmax class would count as positive, which is
# wrong for multi-label targets.
f1 = tfa.metrics.F1Score(num_classes=6, average='macro', threshold=0.5)

# EarlyStopping looks `monitor` up as a key in the epoch logs, so it must be
# the metric's logged name (validation metrics carry the 'val_' prefix), not
# the metric object itself -- otherwise the callback never finds a value and
# never stops or restores weights.
# NOTE(review): validation F1 is assumed to be the intended stopping signal.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor=f'val_{f1.name}',
        patience=3,
        mode='max',
        restore_best_weights=True,
    )
]

# Re-compile so the F1 metric is tracked during training.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=[f1],
)

history = model.fit(train_generator, validation_data=valid_generator, epochs=10, callbacks=callbacks)


In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
fig, ax = plt.subplots(figsize=(20, 6))
pd.DataFrame(history.history)[['loss', 'val_loss']].plot(ax=ax, title='Model Loss Curve')
ax.set_xlabel('Epoch')
ax.set_ylabel('Binary cross-entropy loss');

# Submission

In [None]:
# Load the sample submission; its 'image' column drives the test generator
# below and its second column is overwritten with the predicted labels.
submissions = pd.read_csv('../input/plant-pathology-2021-fgvc8/sample_submission.csv')
# Preview the first rows of the submission template.
submissions.head()

In [None]:
# Test-time pipeline: same 1/255 rescaling as training, no validation split.
test_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# No targets (y_col/class_mode are None) and no shuffling, so predictions come
# back in the same order as the rows of `submissions`.
test_flow_args = dict(
    directory='../input/plant-pathology-2021-fgvc8/test_images',
    x_col="image",
    y_col=None,
    target_size=(224, 224),
    color_mode="rgb",
    classes=None,
    class_mode=None,
    shuffle=False,
    batch_size=1,
)
test_generator = test_data_generator.flow_from_dataframe(submissions, **test_flow_args)

# With batch_size=1, one prediction step is taken per test file.
n_test_steps = len(test_generator.filenames)
predictions = model.predict(test_generator, steps=n_test_steps)

In [None]:
# Convert the per-class probabilities into space-separated label strings and
# write the submission file.
thresh = 0.5

# mlb.classes_ is the column order the indicator targets were built with, so
# it lines up with the model's output units; reading it directly does not
# depend on the layout of the `labels` frame.
class_names = np.asarray(mlb.classes_)

# Every submission row must receive a prediction; a length mismatch would
# silently misalign images and labels.
if len(predictions) != len(submissions):
    raise ValueError(
        f"got {len(predictions)} predictions for {len(submissions)} submission rows"
    )

predicted_labels = []
for probs in predictions:
    chosen = class_names[probs >= thresh]
    if chosen.size == 0:
        # Nothing cleared the threshold: fall back to the single most likely
        # class rather than emitting an empty label string.
        chosen = class_names[[int(np.argmax(probs))]]
    predicted_labels.append(' '.join(chosen))

# Fill the whole label column (second column of the template), not just a
# fixed number of leading rows, so the cell works for any test-set size.
submissions.iloc[:, 1] = predicted_labels

submissions.to_csv('submission.csv', index=False)

In [None]:
# Persist the trained model to 'mobilenetv2.h5' in the working directory
# (the .h5 suffix presumably selects Keras' HDF5 format -- confirm for the
# installed TensorFlow version).
model.save('mobilenetv2.h5')