In [None]:
# Global training configuration shared by the cells below.
IMG_SIZE = (256, 256)  # target size handed to the image loaders and the model input
EPOCHS = 100           # upper bound on epochs per training run
BATCH_SIZE = 16        # samples per batch for the in-memory generators

# Data preparation

In [None]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from keras import optimizers
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten, BatchNormalization, GlobalAveragePooling2D
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.models import Sequential, Model
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from skimage.transform import resize
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Root folder of the competition data; the trailing slash matters because
# paths are built by plain string concatenation throughout the notebook.
data_path = '/kaggle/input/plant-pathology-2020-fgvc7/'
# Read the training table and the test table, in that order.
train, test = (pd.read_csv(data_path + csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first ten rows of the training table.
train.head(10)

In [None]:
# Preview the first ten rows of the test table.
test.head(10)

In [None]:
# Column-wise mean of the numeric columns.
# Presumably the label columns are 0/1 indicators, which would make each mean
# the share of images carrying that label -- TODO confirm against train.csv.
train.mean(axis=0, numeric_only=True)

In [None]:
def plot_samples(df, class_name):
    """Display a 2x5 grid of ten randomly drawn images whose `class_name` column equals 1.

    Rows are drawn with `np.random.randint`, i.e. with replacement, so the same
    image can appear more than once. Image files are read from
    `data_path + 'images/'` using the `image_id` column plus a ".jpg" suffix.

    Args:
        df: table with an "image_id" column and a `class_name` column.
        class_name: name of the label column to filter on.
    """
    positives = df.loc[df[class_name] == 1].reset_index(drop=True)
    plt.figure(figsize=(30, 8))
    picks = np.random.randint(len(positives), size=10)
    for slot, row in enumerate(picks, start=1):
        plt.subplot(2, 5, slot)
        image_file = data_path + 'images/' + positives["image_id"][row] + ".jpg"
        plt.imshow(plt.imread(image_file))
        plt.axis("off")
    # Leave head-room at the top of the figure for the super-title.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.suptitle(f"Examples of class {class_name}", fontsize="x-large")
    plt.show()

In [None]:
# Random examples of training images labelled 'healthy'.
plot_samples(train, 'healthy')

In [None]:
# Random examples of training images labelled 'multiple_diseases'.
plot_samples(train, 'multiple_diseases')

In [None]:
# Random examples of training images labelled 'rust'.
plot_samples(train, 'rust')

In [None]:
# Random examples of training images labelled 'scab'.
plot_samples(train, 'scab')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the labelled rows for validation, stratified on the four
# label columns so both parts keep the same class mix.
# NOTE: this rebinds `train`, so running the cell twice splits the already
# reduced frame again.
label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
train, valid = train_test_split(
    train,
    test_size=0.15,
    stratify=train[label_columns],
    random_state=42,
)

# Row count of the training part before any oversampling is applied.
ori_train_size = len(train)

In [None]:
import os

# Build the folder tree read by flow_from_directory: one sub-folder per class
# for train/val, and a single placeholder sub-folder for the unlabelled test set.
# exist_ok=True keeps the cell re-runnable when the tree (or part of it) is
# already present, e.g. after an interrupted run.
for split_name in ('train', 'val'):
    for class_name in ('healthy', 'multiple_diseases', 'rust', 'scab'):
        os.makedirs(os.path.join('data', split_name, class_name), exist_ok=True)
os.makedirs(os.path.join('data', 'test', 'test'), exist_ok=True)

In [None]:
from shutil import copyfile

classes = ['healthy', 'multiple_diseases', 'rust', 'scab']


def copy_labelled_images(df, split_name):
    """Copy every image listed in `df` into 'data/<split_name>/<class>/'.

    The class of a row is the label column (one of `classes`) holding the
    largest value. Columns are looked up by name, so the result does not
    depend on the column order of the table.

    Args:
        df: table with an 'image_id' column and one column per entry of `classes`.
        split_name: sub-folder of 'data/' to copy into ('train' or 'val').
    """
    label_indices = df[classes].to_numpy().argmax(axis=1)
    for filename, label_idx in zip(df['image_id'], label_indices):
        clss = classes[label_idx]
        copyfile(data_path + 'images/' + filename + '.jpg',
                 'data/' + split_name + '/' + clss + '/' + filename + '.jpg')


# train data
copy_labelled_images(train, 'train')

# val data
copy_labelled_images(valid, 'val')

In [None]:
# test data: no labels, so every image goes into the single 'data/test/test/' folder
df = test
for filename in df.iloc[:, 0]:
    source_file = data_path + 'images/' + filename + '.jpg'
    copyfile(source_file, 'data/test/test/' + filename + '.jpg')

## Load all data

In [None]:
# Loader used to read images from disk: only multiplies pixel values by 1/255.
all_datagen = ImageDataGenerator(rescale=1. / 255)

# Random augmentations applied to training batches.
augmentation_options = dict(
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Generator with default settings (no augmentation options set), used for batching only.
test_datagen = ImageDataGenerator()

In [None]:
# Options common to the train and validation directory loaders.
shared_flow_options = dict(target_size=IMG_SIZE, class_mode="categorical", seed=42)

# batch_size is set to the row count of each split, so batch 0 is requested
# as one batch spanning the whole split.
all_train_gen = all_datagen.flow_from_directory(
    directory='data/train/',
    batch_size=train.shape[0],
    **shared_flow_options
)
all_val_gen = all_datagen.flow_from_directory(
    directory='data/val/',
    batch_size=valid.shape[0],
    **shared_flow_options
)

# load all data into memory: each entry is the (images, labels) pair of batch 0
all_train_data, all_val_data = all_train_gen[0], all_val_gen[0]

## Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Resample row indices instead of the image arrays themselves: the sampler only
# needs something to index, and a column of integers is far cheaper to shuffle
# around than the pixel data.
# random_state pins the random draw so a re-run duplicates the same images,
# in line with the seed 42 used for the split and the directory loaders.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
train_data_idx = np.arange(len(all_train_data[0])).reshape(-1, 1)
# Labels are one-hot encoded; argmax turns them into the integer classes the sampler expects.
X_over, y_over = oversample.fit_resample(train_data_idx, all_train_data[1].argmax(axis=1))

In [None]:
# Entries of X_over past the original length are the indices of the images
# drawn again by the oversampler; append those images after the originals.
n_original = len(all_train_data[0])
extra_idx = X_over[n_original:len(X_over)].reshape(-1)
oversam_train_data = np.concatenate(
    [all_train_data[0], all_train_data[0][extra_idx]], axis=0
)
# Drop the old tuple so only the combined array stays referenced here.
del all_train_data

In [None]:
# Re-encode the resampled integer labels as one-hot rows (4 classes) and pair
# them with the combined image array.
labels_onehot = tf.keras.utils.to_categorical(y_over, num_classes=4)
all_train_data = (oversam_train_data, labels_onehot)
del oversam_train_data

## Data loader

In [None]:
# Keep a second reference to the oversampled (images, labels) tuple so it can be
# restored after all_train_data is rebound. This is an alias, not a copy.
saved_train_data = all_train_data

In [None]:
# Restore: point all_train_data back at the tuple kept in saved_train_data.
all_train_data = saved_train_data

In [None]:
# Augmenting generator for the training arrays, plain batching generator for
# the validation arrays; both tuples unpack to (images, labels).
train_generator = train_datagen.flow(*all_train_data, batch_size=BATCH_SIZE)
val_generator = test_datagen.flow(*all_val_data, batch_size=BATCH_SIZE)
# Batches per pass over each generator, used as step counts when fitting.
STEP_SIZE_TRAIN, STEP_SIZE_VALID = len(train_generator), len(val_generator)

# Model

In [None]:
# Learning-rate schedule parameters: linear warm-up, optional plateau, exponential decay.
LR_START = 1e-5
LR_MAX = 8e-4
LR_MIN = 1e-5
LR_RAMPUP_EPOCHS = 5
LR_SUSTAIN_EPOCHS = 0
LR_EXP_DECAY = 0.8


def lrfn(epoch):
    """Return the learning rate for a zero-based `epoch`.

    - epoch < LR_RAMPUP_EPOCHS: linear ramp from LR_START towards LR_MAX.
    - next LR_SUSTAIN_EPOCHS epochs: constant LR_MAX.
    - afterwards: exponential decay from LR_MAX towards LR_MIN with factor
      LR_EXP_DECAY per epoch.
    """
    decay_start = LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    if epoch < decay_start:
        return LR_MAX
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - decay_start) + LR_MIN

# Evaluate the schedule for every epoch index and plot it as a sanity check.
rng = list(range(EPOCHS))
y = list(map(lrfn, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

In [None]:
class CNN_Model(object):
    """Plain convolutional classifier with four softmax outputs, plus its training loop.

    The constructor builds and compiles ``self.model``. ``train()`` fits it on the
    notebook-level ``train_generator`` / ``val_generator``, so those names (together
    with ``STEP_SIZE_TRAIN``, ``STEP_SIZE_VALID`` and ``lrfn``) must be defined
    before ``train()`` is called.

    Args:
        trainable: when true, the model summary is printed after the network is built.
    """

    # (filters, number of Conv2D + BatchNormalization pairs) for each convolutional stage.
    # Every stage but the last is followed by 2x2 max pooling; the last one by
    # global average pooling.
    _CONV_STAGES = ((64, 2), (64, 4), (128, 4), (128, 4), (256, 2))

    def __init__(self, trainable=True):
        self.batch_size = BATCH_SIZE
        self.trainable = trainable
        self.num_epochs = EPOCHS

        # Building model
        self._build_model()

        if trainable:
            self.model.summary()

        # The optimizer comes from tf.keras, the same namespace the model and
        # layers are built from, so no standalone-keras objects are mixed in.
        self.model.compile(loss="categorical_crossentropy",
                           optimizer=tf.keras.optimizers.Adam(),
                           metrics=['accuracy'])

    def _build_model(self):
        """Assemble the Sequential network and store it in ``self.model``."""
        self.model = Sequential()

        # Only the very first layer declares the input shape.
        first_layer_kwargs = {'input_shape': (IMG_SIZE[0], IMG_SIZE[1], 3)}
        last_stage = len(self._CONV_STAGES) - 1
        for stage_idx, (filters, n_convs) in enumerate(self._CONV_STAGES):
            for _ in range(n_convs):
                self.model.add(Conv2D(filters, (3, 3), activation='relu', **first_layer_kwargs))
                self.model.add(BatchNormalization())
                first_layer_kwargs = {}
            if stage_idx < last_stage:
                self.model.add(MaxPooling2D(pool_size=(2, 2)))
            else:
                self.model.add(GlobalAveragePooling2D())

        # Classifier head. Flatten follows global pooling, which already yields
        # one vector per image; it is kept so the layer list matches earlier
        # versions of this model.
        self.model.add(Flatten())
        self.model.add(Dense(512, activation='relu'))
        self.model.add(BatchNormalization())
        self.model.add(Dense(4, activation='softmax'))

    def train(self):
        """Fit ``self.model`` on the notebook-level generators for up to ``self.num_epochs`` epochs.

        Uses three callbacks: the ``lrfn`` learning-rate schedule, a checkpoint
        that writes 'pretrained_myCNN.h5' whenever ``val_accuracy`` improves, and
        early stopping after 10 epochs without a ``val_accuracy`` gain of at
        least 0.001.
        """
        # schedule learning rate
        lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)
        # Model Checkpoint
        # NOTE(review): a new checkpoint callback is created on every call, so a
        # later train() call presumably starts tracking "best" from scratch and
        # can overwrite a better file from an earlier call -- confirm intended.
        cpt_save = tf.keras.callbacks.ModelCheckpoint('pretrained_myCNN.h5', save_best_only=True,
                                                      monitor='val_accuracy', mode='max')

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10,
                                                          min_delta=0.001, mode='max')

        print("Training......")

        self.model.fit(train_generator,
                       steps_per_epoch=STEP_SIZE_TRAIN,
                       validation_data=val_generator,
                       validation_steps=STEP_SIZE_VALID,
                       callbacks=[cpt_save, lr_callback, early_stopping], verbose=1,
                       epochs=self.num_epochs)

In [None]:
# Build and compile the network; trainable=True also prints the model summary.
cnn_model = CNN_Model(trainable=True)

In [None]:
# Train with oversampled data: train() reads the notebook-level train_generator,
# which at this point wraps the oversampled training arrays.
cnn_model.train()

In [None]:
# Original data: keep only the first ori_train_size samples of images and
# labels, i.e. drop the entries appended by the oversampling step.
all_train_data = tuple(part[:ori_train_size] for part in all_train_data)
all_train_data[0].shape

In [None]:
# Rebuild both generators so that training now draws from the current
# (non-oversampled) all_train_data; validation data is unchanged.
train_images, train_labels = all_train_data
val_images, val_labels = all_val_data
train_generator = train_datagen.flow(train_images, train_labels, batch_size=BATCH_SIZE)
val_generator = test_datagen.flow(val_images, val_labels, batch_size=BATCH_SIZE)
# Refresh the step counts to match the new generator lengths.
STEP_SIZE_TRAIN, STEP_SIZE_VALID = len(train_generator), len(val_generator)

In [None]:
# Reload the best weights: 'pretrained_myCNN.h5' is the file the checkpoint
# callback in train() writes whenever val_accuracy improves.
cnn_model.model.load_weights('pretrained_myCNN.h5')

In [None]:
# Continue training with original data: train_generator now wraps the
# non-oversampled arrays. train() creates fresh callbacks on each call, so the
# learning-rate schedule presumably starts again from epoch 0 -- confirm intended.
cnn_model.train()

# Test

In [None]:
# Fresh model instance for inference (trainable=False skips the summary printout),
# loaded with the weights stored in the checkpoint file.
test_model = CNN_Model(trainable=False).model
test_model.load_weights('pretrained_myCNN.h5')

In [None]:
# Load all data: request the whole test folder as a single, un-shuffled batch
# so the array order follows all_test_gen.filenames.
test_flow_options = {
    'directory': 'data/test/',
    'target_size': IMG_SIZE,
    'batch_size': test.shape[0],
    'class_mode': 'binary',
    'shuffle': False,
}
all_test_gen = all_datagen.flow_from_directory(**test_flow_options)
all_test_data = all_test_gen[0]

In [None]:
# Batch the in-memory test arrays without shuffling so predictions stay aligned
# with the order the images were loaded in.
test_generator = test_datagen.flow(all_test_data[0], all_test_data[1], batch_size=BATCH_SIZE, shuffle=False)

# A single predict() call walks every batch of the generator in order. Calling
# predict() once per batch inside a Python loop is the usage the Keras
# documentation advises against.
result = np.asarray(test_model.predict(test_generator))
# Class probabilities rounded to two decimals for the submission table.
rnd_res = result.round(2)

In [None]:
label_names = ['healthy', 'multiple_diseases', 'rust', 'scab']
submission = pd.DataFrame(data=rnd_res, columns=label_names)

# Generator filenames look like '<sub-folder>/<image_id>.jpg'. Strip folder and
# extension with os.path so the result does not depend on the path separator.
image_ids = [os.path.splitext(os.path.basename(name))[0] for name in all_test_gen.filenames]
submission.insert(loc=0, column='image_id', value=image_ids)

# Re-order the rows to follow test.csv. A validated left join keeps every test
# row, so a missing prediction shows up as NaN instead of silently vanishing.
df = pd.read_csv(data_path + 'test.csv')
submission = pd.merge(df, submission, on=['image_id'], how='left', validate='one_to_one')
assert not submission[label_names].isna().any().any(), "some test images have no prediction"
submission

In [None]:
# Write the submission table to 'submission.csv' without the DataFrame index column.
submission.to_csv('submission.csv',index=False)

In [None]:
import shutil

# Clean output: remove the temporary image folders created for the loaders.
# ignore_errors=True keeps the cell re-runnable when 'data' is already gone.
shutil.rmtree('data', ignore_errors=True)