### Install and import necessary libraries

In [None]:
# !pip install -q efficientnet
# !pip install opencv-python==3.4.2.17
# !pip install opencv-contrib-python==3.4.2.17
# !conda install tensorflow-gpu -y
# !conda install keras=2.3.1 -y
# !conda install pandas -y
# !conda install tqdm -y
# !conda install scikit-learn -y
# !conda install plotly -y

In [None]:
import os
import gc
import re

import cv2
import math
import numpy as np
import scipy as sp
import pandas as pd

import tensorflow as tf
from IPython.display import SVG
import efficientnet.tfkeras as efn
from keras.utils import plot_model
import tensorflow.keras.layers as L
from keras.utils import model_to_dot
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
# from kaggle_datasets import KaggleDatasets
from tensorflow.keras.applications import InceptionResNetV2

# import seaborn as sns
from tqdm import tqdm
import matplotlib.cm as cm
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid

tqdm.pandas()
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from collections import OrderedDict


from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD, NMF
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn import cluster
from joblib import parallel_backend, Parallel, delayed

# import warnings
# warnings.filterwarnings("ignore")

### Load the data and define hyperparameters

In [None]:
# Number of training epochs. NOTE: this value is reassigned to 40 in the
# configuration cell further down, so 20 is never used for training.
EPOCHS = 20
# NOTE(review): SAMPLE_LEN is not referenced in the visible part of the notebook.
SAMPLE_LEN = 100
# Locations of the competition images and CSV files, relative to the notebook.
IMAGE_PATH = "../input/plant-pathology-2020-fgvc7/images/"
TEST_PATH = "../input/plant-pathology-2020-fgvc7/test.csv"
TRAIN_PATH = "../input/plant-pathology-2020-fgvc7/train.csv"
SUB_PATH = "../input/plant-pathology-2020-fgvc7/sample_submission.csv"

# Submission template, test image ids, and training image ids with label columns.
sub = pd.read_csv(SUB_PATH)
test_data = pd.read_csv(TEST_PATH)
train_data = pd.read_csv(TRAIN_PATH)

##  device placement and hyperparameters <a id="2.1"></a>

In [None]:
# Hardware detection: use a TPU strategy when a TPU cluster can be resolved,
# otherwise fall back to TensorFlow's current (default) distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # Raised when no TPU can be resolved in this environment.
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    
def seed_everything(seed=0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy and TensorFlow, and
    exports ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` to the environment.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # processes launched after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Global seed; also passed as `random_state` to train_test_split and TruncatedSVD.
SEED=2048
seed_everything(SEED)

# Report how many replicas the chosen strategy uses and which GPUs are visible.
print("REPLICAS: ", strategy.num_replicas_in_sync)
print("GPUs: {}".format(tf.config.experimental.list_physical_devices('GPU')))

# # Data access
# GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
# Let tf.data pick parallelism and prefetch buffer sizes.
AUTO = tf.data.experimental.AUTOTUNE
# Overrides the EPOCHS = 20 assigned when the data was loaded.
EPOCHS = 40
# Global batch size: 16 samples per replica.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

# Fraction of the labelled data held out for validation.
VALIDATION_SIZE = 0.15
# Images are resized to IMAGE_SIZE x IMAGE_SIZE pixels.
IMAGE_SIZE = 400

# multiprocessing
# NOTE(review): N_JOBS is not referenced in the visible part of the notebook.
N_JOBS=-1

## image path and labels <a id="2.2"></a>

In [None]:
def format_path(st):
    """Return the path of the JPEG file for image id `st` under IMAGE_PATH."""
    return ''.join((IMAGE_PATH, st, '.jpg'))

# Image paths for the test set and for the labelled (train + validation) set.
test_paths = test_data['image_id'].apply(format_path).values
trainval_paths = train_data['image_id'].apply(format_path).values

# Label columns 'healthy' through 'scab', converted to float32.
trainval_labels = train_data.loc[:, 'healthy':'scab'].values.astype(np.float32)

# Hold out VALIDATION_SIZE of the labelled samples; seeded for reproducibility.
(train_paths, valid_paths,
 train_labels, valid_labels) = train_test_split(
    trainval_paths,
    trainval_labels,
    test_size=VALIDATION_SIZE,
    random_state=SEED,
)

for caption, value in (
    ('train samples: ', len(train_paths)),
    ('valid samples: ', len(valid_paths)),
    ('test samples: ', len(test_paths)),
    ('path example: ', train_paths[0]),
    ('label example: ', train_labels[0]),
):
    print(caption, value)

## image loading<a id="2.3"></a>

In [None]:
def decode_image(filename, label=None, image_size=(IMAGE_SIZE, IMAGE_SIZE)):
    """Load a JPEG file as a float32 image scaled to [-1, 1] and resized.

    Args:
        filename: Path of the JPEG file to read.
        label: Optional label that is passed through unchanged.
        image_size: Target (height, width) given to `tf.image.resize`.

    Returns:
        The image tensor alone when `label` is None, otherwise `(image, label)`.
    """
    raw = tf.io.read_file(filename)
    pixels = tf.cast(tf.image.decode_jpeg(raw, channels=3), tf.float32)
    # Scale 8-bit pixel values to [-1, 1] instead of [0, 1]; see
    # https://www.tensorflow.org/tutorials/images/transfer_learning
    # https://github.com/keras-team/keras-applications/blob/bc89834ed36935ab4a4994446e34ff81c0d8e1b7/keras_applications/imagenet_utils.py#L42
    pixels = tf.image.resize((pixels / 127.5) - 1, image_size)

    if label is not None:
        return pixels, label
    return pixels

def data_augment(image, label=None):
    """Randomly flip an image left-right and then up-down.

    Args:
        image: Image tensor to augment.
        label: Optional label that is passed through unchanged.

    Returns:
        The flipped image alone when `label` is None, otherwise `(image, label)`.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image))
    return flipped if label is None else (flipped, label)

##  training history display<a id="2.4"></a>

In [None]:
def display_training_curves(training, validation, title, subplot):
    """Plot a training and a validation curve on one panel of the current figure.

    Source: https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu

    Args:
        training: Per-epoch values of the metric on the training data.
        validation: Per-epoch values of the metric on the validation data.
        title: Metric name, used for the panel title and the y-axis label.
        subplot: Three-digit matplotlib subplot code (e.g. 211); a code whose
            last digit is 1 creates a new figure first.
    """
    starts_new_figure = (subplot % 10 == 1)
    if starts_new_figure:
        plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
        plt.tight_layout()

    panel = plt.subplot(subplot)
    panel.set_facecolor('#F8F8F8')
    for series in (training, validation):
        panel.plot(series)
    panel.set_title('model '+ title)
    panel.set_ylabel(title)
    # panel.set_ylim(0.28,1.05)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'valid.'])

## get pipeline component by name <a id="2.5"></a>

In [None]:
def get_backbone(cnn='VGG16'):
    """Build an ImageNet-pretrained convolutional backbone without its top layers.

    Args:
        cnn: One of 'ResNet101V2', 'VGG16', 'InceptionResNetV2', 'MobileNetV2'.

    Returns:
        The `tf.keras.applications` model for `cnn`, built for inputs of shape
        (IMAGE_SIZE, IMAGE_SIZE, 3).

    Raises:
        AssertionError: If `cnn` is not one of the supported names.
    """
    constructors = {
        'ResNet101V2': tf.keras.applications.ResNet101V2,
        'VGG16': tf.keras.applications.VGG16,
        'InceptionResNetV2': tf.keras.applications.InceptionResNetV2,
        'MobileNetV2': tf.keras.applications.MobileNetV2,
    }
    assert cnn in constructors
    build = constructors[cnn]
    return build(
        input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
        weights='imagenet',
        include_top=False)
 

def get_classifier(name='linearSVM'):
    """Create an unfitted scikit-learn classifier by name.

    Args:
        name: 'linearSVM', 'rbfSVM' or 'LR'.

    Returns:
        A class-balanced, probability-enabled `SVC` for the two SVM names, a
        default `LogisticRegression` for 'LR', and None for any other name.
    """
    classifier = None
    if name == 'LR':
        classifier = LogisticRegression()
    elif name == 'linearSVM' or name == 'rbfSVM':
        # Previously tried for 'linearSVM':
        #     LinearSVC(class_weight='balanced', probability=True)
        svm_kernel = 'linear' if name == 'linearSVM' else 'rbf'
        classifier = SVC(kernel=svm_kernel,
                         class_weight='balanced',
                         probability=True)
    return classifier

    
def get_dim_reductor(name='PCA_128'):
    """Create an unfitted dimensionality reducer from a '<METHOD>_<N>' name.

    Args:
        name: Method and number of components joined by one underscore,
            e.g. 'PCA_128'. Methods: 'PCA', 'KPCA', 'LSA', 'NMF'.

    Returns:
        The matching scikit-learn estimator, or None for an unknown method.

    Raises:
        ValueError: If `name` does not split into exactly two parts or the
            second part is not an integer.
    """
    method, n_components = name.split('_')
    size = int(n_components)

    reducer = None
    if method == 'PCA':
        reducer = PCA(n_components=size)
    elif method == 'KPCA':
        reducer = KernelPCA(kernel='rbf', n_components=size)
    elif method == 'LSA':
        # LSA is a truncated SVD, seeded with the notebook-wide SEED.
        reducer = TruncatedSVD(n_components=size,
                               random_state=SEED)
    elif method == 'NMF':
        reducer = NMF(n_components=size)
    return reducer
    

## create dataset object  <a id="2.6"></a>

In [None]:
def _augmented_pipeline(paths, labels, batch_size, repeat):
    """Decode, cache, augment, optionally repeat, then shuffle/batch/prefetch."""
    pipeline = tf.data.Dataset.from_tensor_slices((paths, labels))
    # Cache the decoded images so only the random flips are recomputed.
    pipeline = pipeline.map(decode_image, num_parallel_calls=AUTO).cache()
    pipeline = pipeline.map(data_augment, num_parallel_calls=AUTO)
    if repeat:
        pipeline = pipeline.repeat()
    return pipeline.shuffle(512).batch(batch_size).prefetch(AUTO)


# All labelled samples, one pass per iteration (no repeat).
trainval_dataset = _augmented_pipeline(
    trainval_paths, trainval_labels, BATCH_SIZE, repeat=False)

# Training split, repeated indefinitely; model.fit bounds it with steps_per_epoch.
train_dataset = _augmented_pipeline(
    train_paths, train_labels, BATCH_SIZE, repeat=True)

# Same training split with a fixed batch size of 64.
train_dataset_1 = _augmented_pipeline(
    train_paths, train_labels, 64, repeat=True)

# Validation split: decoded only, then batched and cached.
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_paths, valid_labels))
valid_dataset = valid_dataset.map(decode_image, num_parallel_calls=AUTO)
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: decode and batch only. Random flips are deliberately NOT
# applied here: a single randomly augmented pass makes the predictions
# non-reproducible and inconsistent with how the validation set is evaluated.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(test_paths)
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

## create output directories  <a id="2.7"></a>

In [None]:
# Checkpoints and submission CSVs are written below ../output.
ckpt_dir = '../output/best_models'
submission_dir = '../output/submissions'
for _out_dir in (ckpt_dir, submission_dir):
    os.makedirs(_out_dir, exist_ok=True)

## End-to-end finetuned CNN <a id="3.3"></a>

CNNs to be finetuned:

* ResNet101V2
* VGG16
* InceptionResNetV2
* MobileNetV2

The first three models are heavy architectures, while the last, MobileNetV2, is a lightweight one. All available pretrained CNNs are listed here: [Module: tf.keras.applications](https://www.tensorflow.org/api_docs/python/tf/keras/applications)

### learning rate scheduler

In [None]:
# Learning-rate schedule parameters: linear ramp-up from LR_START to LR_MAX,
# a plateau at LR_MAX, then exponential decay towards LR_MIN.
LR_START = 0.0001
LR_MAX = 0.00005 * 8
LR_MIN = 0.0001
LR_RAMPUP_EPOCHS = 4
LR_SUSTAIN_EPOCHS = 6
LR_EXP_DECAY = .8

def lrfn(epoch):
    """Return the learning rate for a (0-based) epoch index.

    Args:
        epoch: Epoch number.

    Returns:
        LR_START..LR_MAX interpolated linearly during the first
        LR_RAMPUP_EPOCHS epochs, LR_MAX for the next LR_SUSTAIN_EPOCHS epochs,
        and afterwards a value decaying by LR_EXP_DECAY per epoch towards LR_MIN.
    """
    if epoch >= LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        # Decay phase: the gap above LR_MIN shrinks geometrically.
        epochs_decayed = epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS
        return (LR_MAX - LR_MIN) * LR_EXP_DECAY**epochs_decayed + LR_MIN
    if epoch >= LR_RAMPUP_EPOCHS:
        # Plateau phase.
        return LR_MAX
    # Ramp-up phase.
    return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    
# Keras callback that sets the learning rate from lrfn at the start of each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

# Preview the schedule over the configured number of epochs.
rng = list(range(EPOCHS))
y = list(map(lrfn, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

### finetune models

In [None]:
# One summary record (losses / accuracies at the best epoch) per finetuned CNN.
record_ls = []

# Finetune each backbone end-to-end, keep the checkpoint with the best
# validation accuracy, and write one test-set submission per backbone.
for cnn in ['ResNet101V2', 'VGG16', 'InceptionResNetV2', 'MobileNetV2']:
    print(f'Finitune {cnn}...')
    record = OrderedDict()
    
    with strategy.scope():
        # build model: pretrained backbone -> global max pooling -> 4-way softmax
        backbone = get_backbone(cnn)
        model =  tf.keras.Sequential([
            backbone,
            L.GlobalMaxPooling2D(),
#             L.Dropout(0.3),
            L.Dense(4, activation='softmax')
            ])
        model.compile(
            optimizer = 'adam',
            loss = 'categorical_crossentropy',
            metrics=['categorical_accuracy']
            )
        model.summary()
        
        
        # Save the model only when validation accuracy improves.
        ckpt_path = os.path.join(ckpt_dir,
                                 f'finetuned_{cnn}.h5')
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            ckpt_path,
            verbose=1,
            monitor='val_categorical_accuracy',
            save_best_only=True,
            mode='auto') 

        # train_dataset repeats indefinitely, so the epoch length is set explicitly.
        STEPS_PER_EPOCH = train_labels.shape[0] // BATCH_SIZE
        history = model.fit(
            train_dataset, 
            epochs=EPOCHS, 
            callbacks=[lr_callback, checkpoint],
            steps_per_epoch=STEPS_PER_EPOCH,
            validation_data=valid_dataset
            )
        
        # display training curves
        display_training_curves(
            history.history['loss'], 
            history.history['val_loss'], 
            'loss', 211)
        display_training_curves(
            history.history['categorical_accuracy'], 
            history.history['val_categorical_accuracy'], 
            'accuracy', 212)
        plt.show()
        
        # Record the metrics of the epoch with the highest validation accuracy,
        # i.e. the epoch whose weights the checkpoint callback kept.
        record['model'] = f'finetuned_{cnn}'
        best_idx = np.argmax(history.history['val_categorical_accuracy'])
        record['train_loss'] = history.history['loss'][best_idx]
        record['valid_loss'] = history.history['val_loss'][best_idx]
        record['train_acc'] = history.history['categorical_accuracy'][best_idx]
        record['valid_acc'] = history.history['val_categorical_accuracy'][best_idx]
        record_ls.append(record)
        
        # run testing with best model weights
        model.load_weights(ckpt_path)
        
        print('record: ', record['valid_loss'], record['valid_acc'])
#         val_loss, val_acc = model.evaluate(valid_dataset)
#         print('confirmation: ', val_loss, val_acc)
        
        # Overwrite the label columns of the submission template with the
        # predicted probabilities and save one CSV per backbone.
        print('Start inference on test dataset.')
        probs = model.predict(test_dataset, verbose=1)
        sub.loc[:, 'healthy':] = probs
        sub.to_csv(os.path.join(submission_dir, f'finetune_{cnn}.csv'), index=False)
#         sub.head()
    
    # release memory
    # https://forums.fast.ai/t/how-could-i-release-gpu-memory-of-keras/2023/19
    del model
    K.clear_session()
    gc.collect()

In [None]:
# Collect the per-model records into one table.
report_df = pd.DataFrame(record_ls)

# Show the whole table, without pandas truncating rows or columns.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None):
    display(report_df)

report_df.to_csv(os.path.join('../output', 'fintune_cnn_report.csv'),
                 index=False)

End-to-end finetuning of the CNNs gives compelling results. Can we do better by preprocessing the input images?

## Retry finetuning CNNs after background removal <a id="3.4"></a>

The background may distract from disease detection and classification, so let's see whether background removal helps. Following [victorlouisdg's kernel](https://www.kaggle.com/victorlouisdg/plant-pathology-opencv-background-removal), I use OpenCV's GrabCut algorithm (`cv2.grabCut`) to do this job.

In [None]:
from mpl_toolkits.axes_grid1 import ImageGrid

In [None]:
# Directory that receives the background-removed copies of the train and test
# images; they keep the original file names.
bg_removed_img_dir = '../input/images_no_bg'
os.makedirs(bg_removed_img_dir, exist_ok=True)

### preprocessing images using grabcut

In [None]:
def init_grabcut_mask(h, w):
    """Build the initial GrabCut label mask for an h x w image.

    The whole image starts as "probable background", the central half
    (in each dimension) as "probable foreground", and the central fifth
    as definite foreground.

    Args:
        h: Image height in pixels.
        w: Image width in pixels.

    Returns:
        A uint8 array of shape (h, w) holding cv2.GC_* labels.
    """
    mask = np.full((h, w), cv2.GC_PR_BGD, dtype=np.uint8)

    likely_rows = slice(h//4, 3*h//4)
    likely_cols = slice(w//4, 3*w//4)
    mask[likely_rows, likely_cols] = cv2.GC_PR_FGD

    sure_rows = slice(2*h//5, 3*h//5)
    sure_cols = slice(2*w//5, 3*w//5)
    mask[sure_rows, sure_cols] = cv2.GC_FGD
    return mask

# Preview the initial mask layout.
plt.imshow(init_grabcut_mask(3*136, 3*205))

In [None]:
def remove_background(image):
    """Black out the background of an image using one GrabCut iteration.

    Args:
        image: Image array as accepted by `cv2.grabCut`; its first two
            dimensions are taken as height and width.

    Returns:
        A copy of `image` in which every pixel labelled background or
        probable background by GrabCut is set to zero.
    """
    height, width = image.shape[:2]
    mask = init_grabcut_mask(height, width)
    # Scratch arrays for GrabCut's background / foreground models.
    bg_model = np.zeros((1, 65), np.float64)
    fg_model = np.zeros((1, 65), np.float64)
    # grabCut refines `mask` in place, starting from the labels set above.
    cv2.grabCut(image, mask, None, bg_model, fg_model, 1, cv2.GC_INIT_WITH_MASK)

    # cv2.GC_PR_BGD == 2 and cv2.GC_BGD == 0 are the two background labels.
    is_background = (mask == cv2.GC_PR_BGD) | (mask == cv2.GC_BGD)
    keep = np.where(is_background, 0, 1).astype('uint8')
    # add_contours(result, keep)  # optional, adds visualizations
    return cv2.bitwise_and(image, image, mask=keep)

In [None]:
# visualize samples

# Visualize samples: consecutive grid cells show the resized original image
# and then its background-removed version.
num_show = 5

rows, cols = num_show, 2
axes_pad = 0.2
fig_h = 4.0 * rows + axes_pad * (rows-1)
fig_w = 4.0 * cols + axes_pad * (cols-1)
fig = plt.figure(figsize=(fig_w, fig_h))
grid = ImageGrid(fig, 111, nrows_ncols=(rows, cols), axes_pad=0.2)

for i, ax in enumerate(grid):
    # Even cells show the original, odd cells the processed image.
    sample_idx, is_processed_cell = divmod(i, 2)
    img_path = trainval_paths[sample_idx]
    img = cv2.resize(cv2.imread(img_path), (IMAGE_SIZE, IMAGE_SIZE))
    if is_processed_cell:
        img = remove_background(img)
    # cv2.imread yields BGR; reverse the channel axis for matplotlib (RGB).
    ax.imshow(img[:, :, ::-1])

In [None]:
# Write a resized, background-removed copy of every labelled image,
# keeping the original file name.
for img_path in tqdm(trainval_paths):
    img = cv2.imread(img_path)
    img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE))
    nobg = remove_background(img)
    out_path = os.path.join(bg_removed_img_dir, os.path.basename(img_path))
    cv2.imwrite(out_path, nobg)

In [None]:
# Write a resized, background-removed copy of every test image,
# keeping the original file name.
for img_path in tqdm(test_paths):
    img = cv2.imread(img_path)
    img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE))
    nobg = remove_background(img)
    out_path = os.path.join(bg_removed_img_dir, os.path.basename(img_path))
    cv2.imwrite(out_path, nobg)

### prepare preprocessed dataset

In [None]:
def format_path_nobg(st):
    """Return the path of the background-removed JPEG for image id `st`."""
    file_name = st + '.jpg'
    return os.path.join(bg_removed_img_dir, file_name)

# Paths of the background-removed images for the test and labelled sets.
test_paths_new = test_data['image_id'].apply(format_path_nobg).values
trainval_paths_new = train_data['image_id'].apply(format_path_nobg).values

# Label columns 'healthy' through 'scab', converted to float32.
trainval_labels_new = train_data.loc[:, 'healthy':'scab'].values.astype(np.float32)

# Same test_size and random_state as the split of the original images.
(train_paths_new, valid_paths_new,
 train_labels_new, valid_labels_new) = train_test_split(
    trainval_paths_new,
    trainval_labels_new,
    test_size=VALIDATION_SIZE,
    random_state=SEED,
)

for caption, value in (
    ('train samples: ', len(train_paths_new)),
    ('valid samples: ', len(valid_paths_new)),
    ('test samples: ', len(test_paths_new)),
    ('path example: ', train_paths_new[0]),
    ('label example: ', train_labels_new[0]),
):
    print(caption, value)

In [None]:
# Training pipeline on background-removed images: decode, cache, random flips,
# repeat indefinitely, shuffle, batch, prefetch.
train_dataset_new = tf.data.Dataset.from_tensor_slices(
    (train_paths_new, train_labels_new))
train_dataset_new = train_dataset_new.map(
    decode_image, num_parallel_calls=AUTO).cache()
train_dataset_new = train_dataset_new.map(
    data_augment, num_parallel_calls=AUTO).repeat()
train_dataset_new = train_dataset_new.shuffle(512).batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline on background-removed images: decode, batch, cache, prefetch.
valid_dataset_new = tf.data.Dataset.from_tensor_slices(
    (valid_paths_new, valid_labels_new))
valid_dataset_new = valid_dataset_new.map(decode_image, num_parallel_calls=AUTO)
valid_dataset_new = valid_dataset_new.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline on background-removed images: decode and batch only. Random
# flips are deliberately NOT applied here: a single randomly augmented pass
# makes the predictions non-reproducible and inconsistent with how the
# validation set is evaluated.
test_dataset_new = (
    tf.data.Dataset
    .from_tensor_slices(test_paths_new)
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

### run training

In [None]:
# One summary record (losses / accuracies at the best epoch) per finetuned CNN,
# this time trained on the background-removed images.
record_ls = []

# Finetune each backbone end-to-end on the background-removed data, keep the
# checkpoint with the best validation accuracy, and write one test-set
# submission per backbone.
for cnn in ['ResNet101V2', 'VGG16', 'InceptionResNetV2', 'MobileNetV2']:
    print(f'Finitune {cnn}...')
    record = OrderedDict()
    
    with strategy.scope():
        # build model: pretrained backbone -> global max pooling -> 4-way softmax
        backbone = get_backbone(cnn)
        model =  tf.keras.Sequential([
            backbone,
            L.GlobalMaxPooling2D(),
#             L.Dropout(0.3),
            L.Dense(4, activation='softmax')
            ])
        model.compile(
            optimizer = 'adam',
            loss = 'categorical_crossentropy',
            metrics=['categorical_accuracy']
            )
        model.summary()
        
        
        # Save the model only when validation accuracy improves.
        ckpt_path = os.path.join(ckpt_dir,
                                 f'finetuned_{cnn}_nobg.h5')
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            ckpt_path,
            verbose=1,
            monitor='val_categorical_accuracy',
            save_best_only=True,
            mode='auto') 

        # train_dataset_new repeats indefinitely, so the epoch length is set
        # explicitly, from the labels of the dataset that is actually trained on.
        STEPS_PER_EPOCH = train_labels_new.shape[0] // BATCH_SIZE
        history = model.fit(
            train_dataset_new, 
            epochs=EPOCHS, 
            callbacks=[lr_callback, checkpoint],
            steps_per_epoch=STEPS_PER_EPOCH,
            validation_data=valid_dataset_new
            )
        
        # display training curves
        display_training_curves(
            history.history['loss'], 
            history.history['val_loss'], 
            'loss', 211)
        display_training_curves(
            history.history['categorical_accuracy'], 
            history.history['val_categorical_accuracy'], 
            'accuracy', 212)
        plt.show()
        
        # Record the metrics of the epoch with the highest validation accuracy,
        # i.e. the epoch whose weights the checkpoint callback kept.
        record['model'] = f'finetuned_{cnn}_nobg'
        best_idx = np.argmax(history.history['val_categorical_accuracy'])
        record['train_loss'] = history.history['loss'][best_idx]
        record['valid_loss'] = history.history['val_loss'][best_idx]
        record['train_acc'] = history.history['categorical_accuracy'][best_idx]
        record['valid_acc'] = history.history['val_categorical_accuracy'][best_idx]
        record_ls.append(record)
        
        # run testing with best model weights
        model.load_weights(ckpt_path)
        
        print('record: ', record['valid_loss'], record['valid_acc'])
#         val_loss, val_acc = model.evaluate(valid_dataset_new)
#         print('confirmation: ', val_loss, val_acc)
        
        # Overwrite the label columns of the submission template with the
        # predicted probabilities and save one CSV per backbone.
        print('Start inference on test dataset.')
        probs = model.predict(test_dataset_new, verbose=1)
        sub.loc[:, 'healthy':] = probs
        sub.to_csv(os.path.join(submission_dir,
                                f'finetune_{cnn}_nobg.csv'),
                   index=False)
#         sub.head()
    
    # release memory
    # https://forums.fast.ai/t/how-could-i-release-gpu-memory-of-keras/2023/19
    del model
    K.clear_session()
    gc.collect()

In [None]:
# Collect the per-model records of the background-removed runs into one table.
report_df = pd.DataFrame(record_ls)

# Show the whole table, without pandas truncating rows or columns.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None):
    display(report_df)

report_df.to_csv(
    os.path.join('../output', 'fintune_cnn_nobg_report.csv'),
    index=False,
)

For all models, the accuracy is slightly decreased compared to the scenario without background removal. But the overfitting problem seems to be eased for InceptionResNetV2, since its training accuracy and validation accuracy are very close.

# Ensembling <a id="4"></a>

Ensembling involves the averaging of multiple prediction vectors to reduce errors and improve accuracy. Now, I will ensemble the predictions of the finetuned MobileNetV2, ResNet101V2 and InceptionResNetV2 models (the latter both with and without background removal) to (hopefully) produce better results.

In [None]:
# Submission files (inside submission_dir) whose predictions are averaged.
# The names must match the files written by the finetuning loops:
# 'finetune_<cnn>.csv' and 'finetune_<cnn>_nobg.csv'.
ensemble_subs = ['finetune_MobileNetV2.csv',
                 'finetune_ResNet101V2.csv',
                 'finetune_InceptionResNetV2.csv',
                 'finetune_InceptionResNetV2_nobg.csv']

# Reload the submission template; `sub` was overwritten in place by the
# finetuning loops above.
sub = pd.read_csv(SUB_PATH)

In [None]:
# Average the class probabilities of all selected submissions.
cnt = len(ensemble_subs)

member_probs = [
    pd.read_csv(os.path.join(submission_dir, sub_name))
      .loc[:, 'healthy':]
      .to_numpy()
    for sub_name in ensemble_subs
]

# The built-in sum starts at 0 and adds the arrays in list order.
final = sum(member_probs) / cnt
sub.loc[:, 'healthy':] = final

sub.to_csv(os.path.join(submission_dir, 'ensemble.csv'),
           index=False)