In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tqdm.auto import tqdm

# Apply seaborn's dark-grid theme to the figures drawn in later cells.
sns.set_style(style='darkgrid')

# Dataset Exploration

In [None]:
# Load the training annotations; later cells read its `image` and `labels` columns.
train_csv = '../input/plant-pathology-2021-fgvc8/train.csv'
train = pd.read_csv(train_csv)
print(train.shape)
train.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Every entry of `train.labels` is split on whitespace into a list of class
# names; the lists are built once and reused for both fitting and transforming.
label_tokens = train.labels.apply(lambda text: text.split())

mlb = MultiLabelBinarizer()
mlb.fit(label_tokens)

# One 0/1 indicator column per class discovered by the binarizer.
labels = pd.DataFrame(mlb.transform(label_tokens), columns=mlb.classes_)

# Number of images carrying each class.
fig, ax = plt.subplots(figsize=(20, 6))
labels.sum().plot.bar(ax=ax, title='Target Class Distribution');

In [None]:
fig, ax = plt.subplots(figsize=(20, 6))

# Sum the indicator columns per row, then tally how often each total occurs.
labels_per_image = labels.sum(axis=1)
count_frequencies = labels_per_image.value_counts()
count_frequencies.plot.bar(ax=ax, title='Distribution of Number of Labels per Image');

In [None]:
# Show the first image of every distinct label string on a fixed 3x4 grid.
fig, ax = plt.subplots(3, 4, figsize=(20, 10))
first_per_label = train.groupby('labels').first().reset_index()

for position, sample in enumerate(first_per_label.values):
    # Column 0 is the group key ('labels'); column 1 is the first remaining
    # column of `train` (presumably 'image' -- verify against train.csv).
    label_text, file_name = sample[0], sample[1]
    panel = ax.flat[position]  # row-major: same cell as ax[position // 4][position % 4]
    panel.imshow(plt.imread(f"../input/plant-pathology-2021-fgvc8/train_images/{file_name}"))
    panel.set_title(label_text)
    panel.axis('off')
fig.suptitle('Image Samples', fontsize=18);

# Preprocessing and Augmentation

In [None]:
# Build the training table: the image file name followed by one 0/1 column per
# remaining class. 'healthy' is removed as a target; the submission cell assigns
# it whenever no other class passes the threshold.
#
# Dropping with errors='ignore' keeps this cell idempotent. On a re-run `labels`
# already carries 'image' and no longer has 'healthy', which previously produced
# a duplicated 'image' column and a KeyError on the drop.
indicator_columns = labels.drop(columns=['image', 'healthy'], errors='ignore')
labels = pd.concat([train['image'], indicator_columns], axis=1)
labels.head()

In [None]:
batch_size = 128

# Rescale pixel values by 1/255 and reserve 10% of the rows as the validation subset.
image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    validation_split=0.1,
)


def _flow_subset(subset):
    """Return a dataframe iterator over `labels` for the given split.

    Args:
        subset: 'training' or 'validation', forwarded to `flow_from_dataframe`.

    Returns:
        The iterator produced by `image_data_generator.flow_from_dataframe`,
        yielding 224x224 RGB batches with the raw indicator columns as targets.
    """
    return image_data_generator.flow_from_dataframe(
        dataframe=labels,
        directory='../input/plant-pathology-2021-fgvc8/train_images',
        x_col='image',
        # Every column after the first ('image') is a target.
        y_col=labels.columns.tolist()[1:],
        class_mode='raw',
        color_mode="rgb",
        target_size=(224, 224),
        batch_size=batch_size,
        subset=subset,
    )


train_generator = _flow_subset('training')

valid_generator = _flow_subset('validation')

# Modelling

In [None]:
# The number of sigmoid outputs must equal the number of target columns the
# generators feed (every column of `labels` after the first, 'image').
num_targets = len(labels.columns[1:])

inputs = tf.keras.Input(shape=(224, 224, 3))
# InceptionV3 backbone without its classification head, using the default
# `weights` argument.
# NOTE(review): the generators rescale pixels by 1/255, whereas Keras'
# InceptionV3 `preprocess_input` scales to [-1, 1] -- confirm the mismatch
# with the pretrained weights is intended.
x = tf.keras.applications.InceptionV3(include_top=False)(inputs)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# Independent sigmoid per class: an image may carry several labels at once.
outputs = tf.keras.layers.Dense(num_targets, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs, outputs)
model.compile(
    loss='binary_crossentropy',
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases reject.
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
)

model.summary()
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Multiply the learning rate by 0.01 once val_loss has gone one epoch without improving.
rlp = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.01,
    patience=1,
    verbose=1,
)
# Stop after three epochs without val_loss improvement and restore the best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

training_callbacks = [rlp, es]
history = model.fit(
    train_generator,
    epochs=10,
    validation_data=valid_generator,
    callbacks=training_callbacks,
)

In [None]:
# Plot the per-epoch training and validation loss recorded by `model.fit`.
fig, ax = plt.subplots(figsize=(20, 6))
loss_curves = pd.DataFrame(history.history)[['loss', 'val_loss']]
loss_curves.plot(ax=ax, title='Model Loss Curve')
# Label the axes so the figure stands alone; the trailing ';' hides the repr.
ax.set(xlabel='Epoch', ylabel='Loss');

In [None]:
# Save the trained model to an HDF5 file in the working directory.
model.save(filepath='InceptionV3.h5')

# Submission

In [None]:
# The sample submission provides the `image` names to predict on and the
# output layout that the final cell fills in.
sample_submission_csv = '../input/plant-pathology-2021-fgvc8/sample_submission.csv'
submissions = pd.read_csv(sample_submission_csv)
submissions.head()

In [None]:
# Same 1/255 pixel rescaling as the training generator, with no validation split.
test_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# Unlabelled and unshuffled, one image per batch, so the prediction rows follow
# the generator's file order.
test_flow_options = dict(
    directory='../input/plant-pathology-2021-fgvc8/test_images',
    x_col="image",
    y_col=None,
    classes=None,
    class_mode=None,
    color_mode="rgb",
    target_size=(224, 224),
    batch_size=1,
    shuffle=False,
)
test_generator = test_data_generator.flow_from_dataframe(submissions, **test_flow_options)

# With batch_size=1, one prediction step is needed per test file.
n_test_steps = len(test_generator.filenames)
predictions = model.predict(test_generator, steps=n_test_steps)

In [None]:
thresh = 0.5
# Target columns in the same order as the model's outputs (all but 'image').
class_names = labels.columns[1:]

# Each prediction row is written to the submission row at the same position, so
# the counts must match before anything is assigned.
if len(predictions) != len(submissions):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(submissions)} submission rows"
    )

# Decode every test row. The previous loop stopped after the first three, which
# leaves the sample-submission placeholders in place on a larger test set.
# A row with no class at or above the threshold is reported as 'healthy'.
decoded_labels = [
    ' '.join(class_names[row >= thresh]) or 'healthy'
    for row in predictions
]
# Column 1 is the label column of the submission frame, as in the original loop.
submissions.iloc[:, 1] = decoded_labels

submissions.to_csv('submission.csv', index=False)