#### Reference From: https://www.kaggle.com/tarunpaparaju/plant-pathology-2020-eda-models

<h2>Requirements</h2>

In [None]:
#!pip install opencv-python

In [None]:
#pip install tensorflow
# !pip install --upgrade tensorflow
# print(tf.__version__)
# print(tf.keras.__version__)

In [None]:
!pip install -q efficientnet

In [None]:
#!pip install keras

In [None]:
#!pip install plotly

<h2>Data Description</h2>

The data which is used in this research is taken from the <b style="font-size: 20px"><a href="https://www.kaggle.com/c/plant-pathology-2020-fgvc7/data">plant-pathology-2020-fgvc7</a></b>

<h3>1. Images</h3>
<p>A folder containing the train and test images, in jpg format.</p>

<h3>2. Train Data</h3><p>(train.csv)</p>

<p>Data which is used to train the models is stored in this file</p>

<h5>Columns:</h5>

<p><b>image_id:</b> ID of the image which is located in images folder</p>
<p><b>healthy:</b> Indicates whether the leaf is healthy: 1 if it is healthy, 0 otherwise.</p>
<p><b>scab:</b> Indicates whether the leaf is affected by scab: 1 if it is affected, 0 otherwise.</p>
<p><b>rust:</b> Indicates whether the leaf is affected by rust: 1 if it is affected, 0 otherwise.</p>
<p><b>multiple_diseases:</b> Indicates whether the leaf is affected by multiple diseases (such as scab and rust together): 1 if it is affected, 0 otherwise.</p>

<h3>3. Test Data</h3>

<p>This data is used to test how effectively the model works on unseen data.</p>

<h5>Columns:</h5>
<p><b>image_id:</b> ID of the image which is located in images folder</p>

In [None]:
import gc
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import cv2

from tqdm import tqdm
# tqdm.pandas()

import tensorflow as tf

import efficientnet.tfkeras as efn

from tensorflow.keras.models import Sequential
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model


from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import cohen_kappa_score, accuracy_score, confusion_matrix

from kaggle_datasets import KaggleDatasets


import warnings
warnings.filterwarnings('ignore')

<h2>Load the Data</h2>

In [None]:
DIR_INPUT = '../input/plant-pathology-2020-fgvc7'
train_data_path = DIR_INPUT + "/train.csv"
test_data_path = DIR_INPUT + "/test.csv"
images_path = DIR_INPUT + "/images/"

In [None]:
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

In [None]:
train = pd.read_csv(train_data_path)

In [None]:
test = pd.read_csv(test_data_path)

<h2>Exploring the Data</h2>

In [None]:
train.head()

In [None]:
# Rows and Columns in the data

train.shape

In [None]:
# summary of the dataset: 1821 rows, 5 columns, no null values

print(train.info())

In [None]:
# Data Description

train.describe()

#### Structure of test data

In [None]:
test.head()

In [None]:
# Rows and Columns in the test data

test.shape

In [None]:
# Let us find the duplicates in the dataset if any in train data

bool_series = train["image_id"].duplicated()
len(bool_series[bool_series].index.values)

In [None]:
# Count the duplicated image IDs in the test data (this check must inspect `test`, not `train`)

bool_series = test["image_id"].duplicated()
int(bool_series.sum())

There are no duplicate image IDs in either the train or the test data.

In [None]:
# Let us find the missing values percentage of train data in column wise

missing_values = round(100*(train.isnull().sum()/len(train.index)),2)
missing_values

In [None]:
# Let us find the missing values percentage of test data in column wise

missing_values = round(100*(test.isnull().sum()/len(test.index)),2)
missing_values

#### Data Distribution

In [None]:
def plot_data_distribution(column_name, labels=None, colours=None, xlabel='', ylabel=''):
    """Plot how many training rows have 0 and 1 in a binary label column.

    Reads the module-level ``train`` DataFrame and draws a two-bar chart,
    annotating each bar with its count and its percentage of all rows.

    Args:
        column_name: Name of a 0/1 column in ``train``.
        labels: Two bar labels, for value 0 and value 1 in that order.
            Defaults to ``['0', '1']``.
        colours: Two bar colours in the same order, or None for matplotlib's default.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
    """
    if labels is None:
        labels = ['0', '1']

    # value_counts() sorts by frequency, not by value. Re-order explicitly to
    # (0, 1) so the bars always line up with `labels`, `colours` and the
    # annotations, even if class 1 happens to be the majority.
    counts = train[column_name].value_counts().reindex([0, 1], fill_value=0)
    total = train[column_name].count()

    plt.bar(labels, counts, color=colours)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # One annotation per bar: left bar is value 0, right bar is value 1
    for value, x_pos in ((0, 0.20), (1, 0.70)):
        plt.annotate('{}\n({:.4}%)'.format(counts[value], counts[value] / total * 100),
                     (x_pos, 0.45), xycoords='axes fraction')
    plt.tight_layout()
    plt.show()

In [None]:
# Let us find the number of healthy and unhealthy leaves in the data
plot_data_distribution('healthy', labels=['Un-Healthy', 'Healthy'], colours=['#FF6666','#66FF66'], xlabel='Healthy', ylabel='Count')

In [None]:
# Let us find the number of leaves with and without scab in the data
plot_data_distribution('scab', labels=['Leaves without Scab', 'Leaves with Scab'], colours=['#66FF66', '#F27900'], xlabel='Scab', ylabel='Count')

In [None]:
# Let us find the number of leaves with and without rust in the data

plot_data_distribution('rust', labels=['Leaves without Rust', 'Leaves with Rust'], colours=['#66FF66', '#b7410e'], xlabel='Rust', ylabel='Count')

In [None]:
# Let us find the number of leaves with and without multiple diseases in the data

plot_data_distribution('multiple_diseases', labels=['Leaves without multiple diseases', 'Leaves with multiple diseases'], colours=['#66FF66', '#b7410e'], xlabel='Multiple Diseases', ylabel='Count')

In [None]:
# Share of each class among the training images, shown as a pie chart

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.axis('equal')
leaves = ['Healthy', 'Multiple Diseases', 'Rust', 'Scab']
# Number of rows flagged with 1 in each label column, in the same order as `leaves`
healthy, multiple, rust, scab = (
    len(train[train[column] == 1])
    for column in ('healthy', 'multiple_diseases', 'rust', 'scab')
)
total_count = [healthy, multiple, rust, scab]
ax.pie(total_count, labels = leaves,autopct='%1.2f%%')
plt.show()

### Sample images from the dataset

In [None]:
# Load images 
def load_image(image_id):
    """Read ``<images_path><image_id>.jpg`` and return it as an RGB array.

    Args:
        image_id: Image identifier without the ``.jpg`` extension.

    Returns:
        The image with its channels converted from OpenCV's BGR order to RGB.

    Raises:
        FileNotFoundError: If OpenCV could not read the file.
    """
    image_file = images_path + image_id + ".jpg"
    image = cv2.imread(image_file)
    # cv2.imread signals an unreadable/missing file by returning None rather
    # than raising, which would otherwise surface as an opaque cvtColor error.
    if image is None:
        raise FileNotFoundError("Could not read image: " + image_file)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [None]:
train_images = train["image_id"][:500].apply(load_image)

In [None]:
# Function for displaying sample images of a given class from the dataset
def get_sample_images(image_type, skip = 0, take = 100):
    """Display a grid (up to 3x3) of sample training images of one class.

    Reads the module-level ``train`` DataFrame and the pre-loaded
    ``train_images`` Series. Rows are selected where the requested label
    column is 1 and every other label column is 0.

    Args:
        image_type: One of 'healthy', 'multiple_diseases', 'rust', 'scab'.
            Any other value falls back to 'healthy'.
        skip: Number of matching rows to skip from the start.
        take: Maximum number of matching rows to consider after skipping.

    Returns:
        None. The figure is shown as a side effect.
    """
    label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
    if image_type not in label_columns:
        image_type = 'healthy'

    # Build the filter from the column names themselves so the requested class
    # can never be matched against the wrong column.
    mask = train[image_type] == 1
    for column in label_columns:
        if column != image_type:
            mask = mask & (train[column] == 0)
    data = train[mask][skip:][:take]

    # train_images may contain only part of the training rows; keep the
    # selected rows that actually have a loaded image (order preserved).
    available = data.index[data.index.isin(train_images.index)]
    images = train_images.loc[available]

    cols = 3
    rows = min(3, len(images) // cols)
    if rows == 0:
        print("Not enough loaded '{}' images to display".format(image_type))
        return None

    # squeeze=False keeps `ax` two-dimensional even when there is a single row
    fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(30, rows*20/3), squeeze=False)
    for row in range(rows):
        for col in range(cols):
            ax[row, col].imshow(images.iloc[row * cols + col])
    plt.show()
    return None

#### Sample images of Healthy Leaves

In [None]:
get_sample_images("healthy", 3, 6)

#### Sample images of leaves with multiple diseases

In [None]:
get_sample_images("multiple_diseases", 3, 6)

#### Sample images of leaves with rust

In [None]:
get_sample_images("rust", 3, 6)

#### Sample images of leaves with scab

In [None]:
get_sample_images("scab", 3, 6)

### Data Cleaning

In [None]:
# Finding Duplicates

def get_duplicates(images):
    """Show each pair of suspected duplicate images side by side.

    Args:
        images: Iterable of pairs of image IDs (without the ``.jpg``
            extension); each ID is read from ``images_path``.

    One 10x10 figure is created per pair, with the two images in the top row
    and each subplot titled with its image ID.
    """
    def _read_rgb(image_id):
        # OpenCV reads BGR; convert so matplotlib shows the true colours
        bgr = cv2.imread(images_path + image_id + ".jpg")
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    for pair in images:
        first_id, second_id = pair[0], pair[1]
        # Read both files before creating the figure
        pictures = [(first_id, _read_rgb(first_id)), (second_id, _read_rgb(second_id))]

        fig = plt.figure(figsize=(10,10))
        for slot, (image_id, picture) in enumerate(pictures, start=1):
            axis = fig.add_subplot(2, 2, slot)
            axis.set_title(image_id)
            axis.imshow(picture)

In [None]:
get_duplicates(np.array([['Train_379', 'Train_1173'], ['Test_683', 'Test_1691'], ['Test_570', 'Test_1212']]))

In [None]:
train_df = train.loc[train.image_id != 'Train_1173']
test_df = test.loc[(test.image_id != 'Test_1691') & (test.image_id != 'Test_1212')]

### Preprocessing

In [None]:
image_sample = GCS_DS_PATH + '/images/Train_1307.jpg'

In [None]:
def read_file(fileName, label=None):
    """Read a JPEG file and decode it into a 3-channel image tensor.

    Args:
        fileName: Path of the JPEG file to read.
        label: Optional label passed through unchanged.

    Returns:
        The decoded image, or ``(image, label)`` when a label is given.
    """
    image = tf.image.decode_jpeg(tf.io.read_file(fileName), channels=3)
    return image if label is None else (image, label)

In [None]:
plt.imshow(read_file(image_sample))

In [None]:
def image_brightness(image, label=None, brightness=0.2):
    """Read an image file and apply a random brightness adjustment.

    Args:
        image: Path of the JPEG file to read (via ``read_file``).
        label: Optional label passed through unchanged.
        brightness: Value passed to ``tf.image.random_brightness`` as its
            maximum delta.

    Returns:
        The adjusted image, or ``(image, label)`` when a label is given.
    """
    adjusted = tf.image.random_brightness(read_file(image), brightness)
    if label is not None:
        return adjusted, label
    return adjusted

In [None]:
plt.imshow(image_brightness(image_sample))

In [None]:
def image_flipping(image, label=None, flip_type='left_right'):
    """Read an image file and randomly flip it.

    Args:
        image: Path of the JPEG file to read (via ``read_file``).
        label: Optional label passed through unchanged.
        flip_type: 'left_right' for a random horizontal flip, 'up_down' for a
            random vertical flip; any other value leaves the image unchanged.

    Returns:
        The (possibly flipped) image, or ``(image, label)`` when a label is
        given, matching ``read_file`` and ``image_brightness``.
    """
    image = read_file(image)
    if flip_type == 'left_right':
        image = tf.image.random_flip_left_right(image)
    elif flip_type == 'up_down':
        image = tf.image.random_flip_up_down(image)

    # Return the label alongside the image instead of silently dropping it
    if label is None:
        return image
    return image, label

### Preparing the Data

In [None]:
def map_image_with_path(image):
    """Return the path of ``<image>.jpg`` in the ``images`` folder under ``GCS_DS_PATH``."""
    parts = (GCS_DS_PATH, '/images/', image, '.jpg')
    return "".join(parts)

In [None]:
test_paths = test.image_id.apply(map_image_with_path).values
train_paths = train.image_id.apply(map_image_with_path).values

In [None]:
# Build paths and labels from `train_df` (the training data with the known
# duplicate image removed) in a single place so the two stay aligned.
# Deriving both here, rather than re-splitting `train_paths` in place, also
# keeps this cell safe to re-run.
label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
labels = train_df[label_columns].values
all_train_paths = train_df.image_id.apply(map_image_with_path).values
train_paths, valid_paths, train_labels, valid_labels = train_test_split(all_train_paths, labels, test_size=0.15, random_state=1)

In [None]:
print("The number of training data : ", train_paths.shape[0])
print("The number of validation data : ", valid_paths.shape[0])

#### Setup TPU Config

In [None]:
# Lets tf.data choose parallelism / prefetch buffer sizes at runtime
AUTO = tf.data.experimental.AUTOTUNE
# Detect hardware and pick the matching distribution strategy.
# `strategy` is used later to scale the batch size and to build the models.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: treat this as "no TPU available"
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Connected to TPU')
else:
    print("Not connected to a TPU runtime. Using CPU/GPU strategy")
    strategy = tf.distribute.MirroredStrategy() # Fallback used when no TPU was found

# Number of replicas the strategy keeps in sync
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Global batch size: 16 samples per replica
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Images are resized to IMG_SIZE x IMG_SIZE before being fed to the models
IMG_SIZE = 600
EPOCHS = 25 # @param {type: "slider", min:10, max:100}
# Batches per epoch (the training dataset repeats indefinitely, so this must be explicit)
STEPS_PER_EPOCH = train_labels.shape[0] // BATCH_SIZE
# Labels are one-hot rows, so the class count is the number of columns (axis 1), not rows
nb_classes = train_labels.shape[1]

In [None]:
# Learning-rate schedule parameters: linear ramp-up from LR_START to LR_MAX,
# a short plateau at LR_MAX, then exponential decay towards LR_MIN.
LR_START = 0.00001
LR_MAX = 0.0005
LR_MIN = 0.00001
LR_RAMPUP_EPOCHS = 5
LR_SUSTAIN_EPOCHS = 1
LR_EXP_DECAY = .8

def lrfn(epoch):
    """Return the learning rate for the given (0-based) epoch.

    Epochs below LR_RAMPUP_EPOCHS ramp linearly from LR_START towards LR_MAX;
    the next LR_SUSTAIN_EPOCHS epochs hold LR_MAX; later epochs decay
    exponentially (factor LR_EXP_DECAY per epoch) towards LR_MIN.
    """
    # Ramp-up phase
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    # Plateau phase
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX
    # Decay phase
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
    
# Keras callback that applies lrfn at the start of every epoch
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)

# Preview the schedule over the planned number of epochs
rng = list(range(EPOCHS))
y = list(map(lrfn, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

In [None]:
def decode_image(filename, label=None, image_size=(IMG_SIZE, IMG_SIZE)):
    """Read a JPEG file, scale its pixels to [0, 1] and resize it.

    Args:
        filename: Path of the JPEG file to read.
        label: Optional label passed through unchanged.
        image_size: Target (height, width) passed to ``tf.image.resize``.

    Returns:
        The float32 image, or ``(image, label)`` when a label is given.
    """
    raw = tf.io.read_file(filename)
    pixels = tf.cast(tf.image.decode_jpeg(raw, channels=3), tf.float32) / 255.0
    resized = tf.image.resize(pixels, image_size)
    if label is not None:
        return resized, label
    return resized

In [None]:
def data_augmentation(image, label=None):
    """Apply a random 90-degree rotation and random flips to an image.

    Args:
        image: Image tensor to augment.
        label: Optional label passed through unchanged.

    Returns:
        The augmented image, or ``(image, label)`` when a label is given.
    """
    # Draw the number of quarter turns with a TensorFlow op. When this function
    # is used in Dataset.map it is traced into a graph once, so a NumPy/Python
    # random number would be frozen at trace time and every image would get
    # the same rotation.
    quarter_turns = tf.random.uniform([], minval=0, maxval=4, dtype=tf.int32)
    image = tf.image.rot90(image, k=quarter_turns)
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)

    if label is None:
        return image
    else:
        return image, label

In [None]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_paths, train_labels))
    # Shuffle the lightweight (path, label) pairs before any image is decoded,
    # with a buffer covering the whole training set. This gives a full reshuffle
    # on every pass and avoids buffering hundreds of decoded images in memory.
    .shuffle(len(train_paths))
    # Repeat indefinitely; the epoch length is set by steps_per_epoch in fit()
    .repeat()
    .map(decode_image, num_parallel_calls=AUTO)
    .map(data_augmentation, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

In [None]:
# Validation pipeline: decode, batch, then cache the batches so later passes
# over the validation data reuse them; no augmentation or shuffling.
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_paths, valid_labels))
valid_dataset = valid_dataset.map(decode_image, num_parallel_calls=AUTO)
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

In [None]:
# Test pipeline: paths only (no labels), decoded and batched in file order so
# predictions line up with the rows of `test`.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(test_paths)
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
    # Overlap input loading with inference, as the train/validation pipelines do
    .prefetch(AUTO)
)

### EfficientNet - B7

In [None]:
def build_efficientnet_model(weights='imagenet'):
    """Build and compile an EfficientNet-B7 classifier under ``strategy``.

    The network is an EfficientNet-B7 backbone without its top layers,
    followed by global average pooling and a softmax layer with one unit per
    label column of ``train_labels``. All backbone layers are trainable.

    Args:
        weights: Pre-trained weights identifier passed to ``efn.EfficientNetB7``.

    Returns:
        The compiled Keras model. Its summary is printed as a side effect.
    """
    with strategy.scope():
        en = efn.EfficientNetB7(input_shape=(IMG_SIZE, IMG_SIZE, 3), weights=weights, include_top=False)
        en.trainable = True

        model = tf.keras.Sequential([
            en,
            tf.keras.layers.GlobalAveragePooling2D(),
            tf.keras.layers.Dense(train_labels.shape[1], activation='softmax')
        ])
        model.compile(optimizer='adam',
            loss = 'categorical_crossentropy',
            metrics=['categorical_accuracy']
        )

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()
    return model

In [None]:
modelB7 = build_efficientnet_model()

In [None]:
# tf.keras.utils.model_to_dot(Model(model.layers[0].input, model.layers[0].layers[11].output), dpi=70).create(prog='dot', format='svg')
tf.keras.utils.plot_model(
    Model(modelB7.layers[0].input, modelB7.layers[0].layers[11].output),
    show_shapes=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=True,
    dpi=80,
)

In [None]:
%%time
# checkpoint=tf.keras.callbacks.ModelCheckpoint(f"Enet_model.h5", monitor='val_categorical_accuracy', verbose=1, save_best_only=True,
#        save_weights_only=True,mode='max')

historyB7 = modelB7.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset, 
    verbose=1,
    callbacks=[lr_callback],
    steps_per_epoch=STEPS_PER_EPOCH,
)

In [None]:
def plot_training(params, title, ylabel, xlabel, legend=[]):
    """Plot two training curves on one chart.

    Args:
        params: Sequence whose first two items are the series to plot
            (e.g. training and validation values per epoch).
        title: Chart title.
        ylabel: Label for the y axis.
        xlabel: Label for the x axis.
        legend: Legend entries, in the same order as the plotted series.
    """
    first_curve, second_curve = params[0], params[1]
    for curve in (first_curve, second_curve):
        plt.plot(curve)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(legend, loc="lower right")
    plt.show()

In [None]:
plot_training(
    [historyB7.history["loss"], historyB7.history["val_loss"]],
    "Loss",
    "loss",
    "epoch",
    ["train", "validation"]
)

In [None]:
plot_training(
    [historyB7.history["categorical_accuracy"], historyB7.history["val_categorical_accuracy"]],
    "model accuracy",
    "accuracy",
    "epoch",
    ["train", "validation"]
)

In [None]:
# model.load_weights("../input/%s/best_weight.h5" % (os.listdir('../input')[0]))
# Model.evaluate accepts a tf.data.Dataset directly; evaluate_generator is deprecated.
# The last returned value is the compiled metric (categorical accuracy).
print("The Accuracy on the Validation data : {:.2f}%".format(100 * modelB7.evaluate(valid_dataset, verbose = 1)[-1]))

In [None]:
predictedB7 = modelB7.predict(test_dataset,verbose = 1)

#### Sample Predicted Images EfficientNet

In [None]:
def predict_efficient(img):
    """Predict the class of one image with ``modelB7`` and plot the result.

    Args:
        img: Image ID (without extension), loaded through ``load_image``.

    Shows the image next to a bar chart with a single bar of height 1 at the
    class that received the highest predicted score.
    """
    # Load the image once and reuse it for both inference and display
    image = load_image(img)
    img_cv = cv2.resize(image/255.0, (IMG_SIZE, IMG_SIZE)).reshape(-1, IMG_SIZE, IMG_SIZE, 3)

    # Run the three layers of the Sequential model in order on the single-image batch
    preds = modelB7.layers[2](modelB7.layers[1](modelB7.layers[0](img_cv))).numpy()[0]

    variables = ['Healthy', 'Multiple Diseases', 'Rust', 'Scab']

    # One-hot vector marking the highest-scoring class (first one on ties)
    pred = [0] * len(variables)
    pred[int(np.argmax(preds))] = 1

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(11, 3))

    axes[0].imshow(image)
    axes[0].set_title(img)
    axes[1].bar(variables, pred, color=['#66FF66', '#F27900', '#b7410e', '#cc9966'])
    axes[1].set_title(img)

In [None]:
predict_efficient(test['image_id'][0])
predict_efficient(test['image_id'][2])
predict_efficient(test['image_id'][3])
predict_efficient(test['image_id'][966])

### Efficientnet Noisy Student

In [None]:
modelNoisy = build_efficientnet_model('noisy-student')

In [None]:
%%time
# Train the noisy-student EfficientNet with the same data, schedule and epoch
# settings as the ImageNet-initialised model.
# NOTE(review): the result is stored in `historyB7`, which replaces the history
# returned by the earlier modelB7.fit call. The plotting cells that follow read
# `historyB7`, so a rename (e.g. to a noisy-specific name) must change them together.
# checkpoint=tf.keras.callbacks.ModelCheckpoint(f"Enet_model.h5", monitor='val_categorical_accuracy', verbose=1, save_best_only=True,
#        save_weights_only=True,mode='max')

historyB7 = modelNoisy.fit(
    train_dataset, 
    epochs=EPOCHS,
    validation_data=valid_dataset, 
    verbose=1, 
    callbacks=[lr_callback],
    steps_per_epoch=STEPS_PER_EPOCH,
)

In [None]:
plot_training(
    [historyB7.history["loss"], historyB7.history["val_loss"]],
    "Loss",
    "loss",
    "epoch",
    ["train", "validation"]
)

In [None]:
plot_training(
    [historyB7.history["categorical_accuracy"], historyB7.history["val_categorical_accuracy"]],
    "model accuracy",
    "accuracy",
    "epoch",
    ["train", "validation"]
)

In [None]:
predictedB7Noisy = modelNoisy.predict(test_dataset,verbose = 1)

### DenseNet

In [None]:
def build_densenet_model(weights='imagenet'):
    """Build and compile a DenseNet121 classifier under ``strategy``.

    The network is a DenseNet121 backbone without its top layers, followed by
    global average pooling and a softmax layer with one unit per label column
    of ``train_labels``. All backbone layers are trainable.

    Args:
        weights: Pre-trained weights identifier passed to ``DenseNet121``.

    Returns:
        The compiled Keras model. Its summary is printed as a side effect.
    """
    with strategy.scope():
        dn = DenseNet121(input_shape=(IMG_SIZE, IMG_SIZE, 3), weights=weights, include_top=False)
        dn.trainable = True

        model = tf.keras.Sequential([
            dn,
            tf.keras.layers.GlobalAveragePooling2D(),
            tf.keras.layers.Dense(train_labels.shape[1], activation='softmax')
        ])
        model.compile(optimizer='adam',
            loss = 'categorical_crossentropy',
            metrics=['categorical_accuracy']
        )

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()
    return model

In [None]:
model_dense = build_densenet_model()

In [None]:
tf.keras.utils.plot_model(
    Model(model_dense.layers[0].input, model_dense.layers[0].layers[13].output),
    show_shapes=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=True,
    dpi=65,
)

In [None]:
%%time
# checkpoint=tf.keras.callbacks.ModelCheckpoint(f"Enet_model.h5", monitor='val_categorical_accuracy', verbose=1, save_best_only=True,
#        save_weights_only=True,mode='max')

historyDense = model_dense.fit(
    train_dataset, 
    epochs=EPOCHS, 
    validation_data=valid_dataset, 
    verbose=1, 
    callbacks=[lr_callback],
    steps_per_epoch=STEPS_PER_EPOCH,
)

In [None]:
plot_training(
    [historyDense.history["loss"], historyDense.history["val_loss"]],
    "Loss",
    "loss",
    "epoch",
    ["train", "validation"]
)

In [None]:
plot_training(
    [historyDense.history["categorical_accuracy"], historyDense.history["val_categorical_accuracy"]],
    "model accuracy",
    "accuracy",
    "epoch",
    ["train", "validation"]
)

In [None]:
predictDense = model_dense.predict(test_dataset, verbose=1)

#### Predict Images

In [None]:
def predict_densenet(img):
    """Predict the class of one image with ``model_dense`` and plot the result.

    Args:
        img: Image ID (without extension), loaded through ``load_image``.

    Shows the image next to a bar chart with a single bar of height 1 at the
    class that received the highest predicted score.
    """
    # Load the image once and reuse it for both inference and display
    image = load_image(img)
    img_cv = cv2.resize(image/255.0, (IMG_SIZE, IMG_SIZE)).reshape(-1, IMG_SIZE, IMG_SIZE, 3)

    # Run the three layers of the Sequential model in order on the single-image batch
    preds = model_dense.layers[2](model_dense.layers[1](model_dense.layers[0](img_cv))).numpy()[0]

    variables = ['Healthy', 'Multiple Diseases', 'Rust', 'Scab']

    # One-hot vector marking the highest-scoring class (first one on ties)
    pred = [0] * len(variables)
    pred[int(np.argmax(preds))] = 1

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(11, 3))

    axes[0].imshow(image)
    axes[0].set_title(img)
    axes[1].bar(variables, pred, color=['#66FF66', '#F27900', '#b7410e', '#cc9966'])
    axes[1].set_title(img)

In [None]:
predict_densenet(test['image_id'][0])
predict_densenet(test['image_id'][2])
predict_densenet(test['image_id'][3])
predict_densenet(test['image_id'][966])