# Imports

In [None]:
#################################
import pandas as pd
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import seaborn as sns
#################################
from sklearn.model_selection import train_test_split
import tensorflow as tf
#################################

# Constants

In [None]:
# Root directory of the Kaggle competition input data; the CSV files and the
# images/ folder are addressed by string concatenation onto this prefix.
BASE_PATH = '/kaggle/input/plant-pathology-2020-fgvc7/'
# Presumably the cap on images sampled per class for the colour statistics
# (same value as plot_colour_distribution's default) -- TODO confirm.
SAMPLE_LENGTH = 100

# Data Inspection

## Loading Data

In [None]:
# Read the training table (image ids plus label columns) and preview it.
train_df = pd.read_csv(f'{BASE_PATH}train.csv')
print(train_df.head(n=5))

## Displaying Images by Labels

In [None]:
def load_image(image_id):
    """Read ``images/<image_id>.jpg`` under BASE_PATH and return the pixel array."""
    image_path = f'{BASE_PATH}images/{image_id}.jpg'
    return mpimg.imread(image_path)

In [None]:
def display_image(image,binary=False,rcmap=False,show=False,label=False):
    """Plot an image next to its individual colour channels.

    One figure is created with a single row of panels: the full image,
    then channels 0, 1 and 2 (titled R, G and B) and, optionally, a text
    panel showing ``label``.

    Args:
        image: Array indexed as ``image[..., channel]``. The length of its
            last axis determines how many panels the row is divided into
            (assumes an H x W x 3 RGB array -- TODO confirm for other inputs).
        binary: If truthy, draw every channel with the ``'binary'`` colormap
            instead of ``'Reds'`` / ``'Greens'`` / ``'Blues'``.
        rcmap: If truthy, use the reversed (``'_r'``) colormap variant.
        show: If truthy, call ``plt.show()`` before returning.
        label: Text, or an array-like of labels, drawn in an extra panel.
            ``False``, ``None`` or an empty value means no label panel.
    """
    # An array with several entries has no single truth value (``if label:``
    # raises), so array-likes are tested by size and scalars by truthiness.
    if np.ndim(label) > 0:
        has_label = np.size(label) > 0
    else:
        has_label = bool(label)

    channel_cmaps = ('Reds', 'Greens', 'Blues')
    cmap_suffix = '_r' if rcmap else ''

    f = plt.figure(figsize=(10,3))
    subplot_max = image.shape[-1] + 1
    if has_label:
        subplot_max += 1

    # Draw on the axes objects directly instead of relying on pyplot's
    # implicit "current axes" state.
    ax = f.add_subplot(1, subplot_max, 1)
    ax.set_title('Image - RGB')
    ax.imshow(image)
    ax.axis('off')

    for channel, (letter, cmap) in enumerate(zip('RGB', channel_cmaps)):
        ax = f.add_subplot(1, subplot_max, channel + 2)
        ax.set_title(f'Image - {letter}')
        ax.imshow(
            image[..., channel],
            cmap=('binary' if binary else cmap) + cmap_suffix,
        )
        ax.axis('off')

    if has_label:
        # The label panel sits right after the RGB panel and three channels.
        ax = f.add_subplot(1, subplot_max, len(channel_cmaps) + 2)
        ax.text(0.5, 0.5, label)
        ax.axis('off')

    if show:
        plt.show()

# Preview the first five training images together with the names of the
# label columns that are set to 1 for each of them.
subset_images = train_df['image_id'][:5].apply(load_image)
for position, picture in enumerate(subset_images):
    row = train_df.iloc[position]
    active_labels = row.index.values[row == 1]
    display_image(picture, True, label=active_labels)
plt.show()

In [None]:
def plot_colour_distribution(column,max_sample_length=100,subset=None):
    """Plot the distribution of per-image mean R, G and B values.

    Args:
        column: Label column of ``train_df``; rows where it equals 1 are
            used when ``subset`` is not given.
        max_sample_length: Upper bound on the number of images loaded.
        subset: Optional sequence of image ids to analyse instead of the
            rows selected through ``column``. Previously this argument was
            accepted but silently overwritten.
    """
    if subset is None:
        subset = train_df[train_df[column] == 1]['image_id']
    subset = pd.Series(subset).reset_index(drop=True)

    max_sample_length = max(0, min(max_sample_length, len(subset)))

    # One list of per-image means for each of the three colour channels.
    channel_means = ([], [], [])
    for image_id in subset.iloc[:max_sample_length]:
        img = load_image(image_id)
        for channel, means in enumerate(channel_means):
            means.append(np.mean(img[..., channel]))

    for means, colour in zip(channel_means, 'rgb'):
        sns.distplot(means, color=colour)
    plt.show()

### Healthy Leaves

In [None]:
# Draw five random healthy leaves and show each with its colour channels
# rendered in the binary colormap.
subset_images = (
    train_df[train_df['healthy'] == 1]
    .sample(n=5)['image_id']
    .apply(load_image)
    .reset_index(drop=True)
)
for leaf in subset_images:
    display_image(leaf, True)
plt.show()

In [None]:
# Use the shared sample-size constant instead of repeating the literal 100.
plot_colour_distribution('healthy', SAMPLE_LENGTH)

### Leaves with Multiple Diseases

In [None]:
# Draw five random leaves labelled multiple_diseases and show each with its
# colour channels rendered in the binary colormap.
subset_images = (
    train_df[train_df['multiple_diseases'] == 1]
    .sample(n=5)['image_id']
    .apply(load_image)
    .reset_index(drop=True)
)
for leaf in subset_images:
    display_image(leaf, True)
plt.show()

In [None]:
# Use the shared sample-size constant instead of repeating the literal 100.
plot_colour_distribution('multiple_diseases', SAMPLE_LENGTH)

### Leaves with Rust

In [None]:
# Draw five random leaves labelled rust and show each with its colour
# channels rendered in the binary colormap.
subset_images = (
    train_df[train_df['rust'] == 1]
    .sample(n=5)['image_id']
    .apply(load_image)
    .reset_index(drop=True)
)
for leaf in subset_images:
    display_image(leaf, True)
plt.show()

In [None]:
# Use the shared sample-size constant instead of repeating the literal 100.
plot_colour_distribution('rust', SAMPLE_LENGTH)

### Leaves with Scab

In [None]:
# Draw five random leaves labelled scab and show each with its colour
# channels rendered in the binary colormap.
subset_images = (
    train_df[train_df['scab'] == 1]
    .sample(n=5)['image_id']
    .apply(load_image)
    .reset_index(drop=True)
)
for leaf in subset_images:
    display_image(leaf, True)
plt.show()

In [None]:
# Use the shared sample-size constant instead of repeating the literal 100.
plot_colour_distribution('scab', SAMPLE_LENGTH)

* Rust is clearly visible in the red channel.
* Scab tends to have a spotty appearance in the different channels, with the veins often having lower green values than in healthy leaves.

## Amount of Data per Category

In [None]:
# Sum every column after the first (the label columns) and draw the
# per-label totals as a bar chart with horizontal tick labels.
label_totals = train_df.iloc[:, 1:].sum()
label_totals.plot.bar(rot=0)
plt.show()

Healthy, Rust and Scab have almost equal numbers of examples, but multiple_diseases is severely under-represented.

## Planned Preprocessing Steps
As the diseases are quite localized, cropping the images to just the leaves should help. Image flips would help compensate for the small number of images. The class imbalance would need to be addressed using duplicates, heavier augmentation for multiple_diseases, or class weights in the loss function.
* Crop to leaves
* Flip Images / Augment data
* 4 ~ 6 times augmentation for multiple_diseases

# Training skeleton

## Loading Dataset

### Constants

In [None]:
# Target (height, width) every image is resized to; also the spatial part of
# the model's input_shape.
IMAGE_SIZE = (256,256)
# Fraction of the labelled data held out for validation (test_size of the split).
VALIDATION_SIZE = 0.15
# Number of examples per batch; also used to derive steps per epoch.
BATCH_SIZE = 16
# Size of the tf.data shuffle buffer, in elements.
SHUFFLE_BUFFER = 512

In [None]:
# Labelled table (split into train/validation later) and unlabelled test table.
train_val_df = pd.read_csv(f'{BASE_PATH}train.csv')
test_df = pd.read_csv(f'{BASE_PATH}test.csv')

def path_to_image(image_id, labels=None):
    """Load one JPEG by id and return it as a resized float32 tensor.

    The file ``images/<image_id>.jpg`` under BASE_PATH is read, decoded to
    three channels, converted to float32 and resized to IMAGE_SIZE.

    Args:
        image_id: Image id (string or string tensor) without extension.
        labels: Optional labels passed through untouched.

    Returns:
        The image tensor alone when ``labels`` is None, otherwise the pair
        ``(image, labels)``.
    """
    # Plain ``+`` is kept so that this also works when image_id is a tensor.
    raw_bytes = tf.io.read_file(BASE_PATH + 'images/' + image_id + '.jpg')
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    resized = tf.image.resize(as_float, IMAGE_SIZE)
    return resized if labels is None else (resized, labels)

### Splitting Dataset

In [None]:
# Hold out VALIDATION_SIZE of the labelled rows, stratifying on all label
# columns (everything after the first column); the seed is fixed so the
# split is reproducible.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=VALIDATION_SIZE,
    random_state=1526,
    stratify=train_val_df.iloc[:, 1:],
)

In [None]:
def df_to_dataset(df,no_y=False):
    """Build a shuffled, endlessly repeating, batched dataset from a dataframe.

    Args:
        df: Dataframe with an ``image_id`` column; all columns after the
            first are used as the label matrix unless ``no_y`` is set.
        no_y: If truthy, yield images only (no labels).

    Returns:
        A ``tf.data.Dataset`` of batches of BATCH_SIZE elements produced by
        ``path_to_image``. The dataset repeats indefinitely, so consumers
        must bound it with an explicit number of steps.
    """
    image_ids = df['image_id'].values
    if no_y:
        t_slice = (image_ids,)
    else:
        t_slice = (image_ids, df[df.columns[1:]].values)

    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset
        .from_tensor_slices(t_slice)
        # Shuffle the lightweight (id, labels) records before decoding: the
        # buffer then holds SHUFFLE_BUFFER small records instead of that many
        # decoded float32 images, and the first batch no longer waits for
        # SHUFFLE_BUFFER images to be read and resized.
        .shuffle(SHUFFLE_BUFFER)
        # Repeating after the shuffle gives every pass its own fresh order.
        .repeat()
        .map(path_to_image, num_parallel_calls=autotune)
        .batch(BATCH_SIZE)
        .prefetch(autotune)
    )

# Training and validation pipelines both come from the shared builder.
train_dataset, val_dataset = (
    df_to_dataset(frame) for frame in (train_df, val_df)
)

# The test pipeline is assembled by hand: images only, in file order,
# one finite pass, batched.
test_ids = tf.data.Dataset.from_tensor_slices((test_df['image_id'].values,))
test_dataset = test_ids.map(
    path_to_image,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
).batch(BATCH_SIZE)

## Defining Model

### Hyperparameters

In [None]:
# Adam optimizer; 3e-4 is passed as its first positional argument
# (presumably the learning rate -- confirm against the installed TF version).
optimizer = tf.keras.optimizers.Adam(3e-4)
# Number of training epochs.
epochs = 20
# The datasets repeat forever, so epoch lengths are set explicitly; integer
# division drops the final partial batch.
steps_per_epoch = len(train_df) // BATCH_SIZE
validation_steps = len(val_df) // BATCH_SIZE

In [None]:
# ImageNet-initialised ResNet50 without its classification head, globally
# average-pooled, followed by a 4-way softmax classifier.
# NOTE(review): inputs are fed as converted float32 images without
# ResNet50's preprocess_input -- confirm this is intended.
model = tf.keras.Sequential()
model.add(
    tf.keras.applications.ResNet50(
        weights='imagenet',
        include_top=False,
        pooling='avg',
        input_shape=IMAGE_SIZE + (3,),
    )
)
model.add(tf.keras.layers.Dense(4, activation='softmax'))

In [None]:
# Categorical cross-entropy loss, tracking accuracy, using the optimizer
# configured above.
model.compile(
    optimizer=optimizer,
    metrics=['accuracy'],
    loss='categorical_crossentropy',
)

In [None]:
# Train for the configured number of epochs; both datasets repeat forever,
# so the step counts bound each epoch and each validation pass.
history = model.fit(
    x=train_dataset,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=validation_steps,
    epochs=epochs,
)

In [None]:
# Training vs. validation accuracy per epoch. x and y are passed by keyword
# because newer seaborn releases no longer accept them positionally; the
# labels make the two curves distinguishable.
for metric_key, curve_name in (('accuracy', 'train'), ('val_accuracy', 'validation')):
    values = history.history[metric_key]
    sns.lineplot(x=list(range(len(values))), y=values, label=curve_name)
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch. x and y are passed by keyword
# because newer seaborn releases no longer accept them positionally; the
# labels make the two curves distinguishable.
for metric_key, curve_name in (('loss', 'train'), ('val_loss', 'validation')):
    values = history.history[metric_key]
    sns.lineplot(x=list(range(len(values))), y=values, label=curve_name)
plt.legend()
plt.show()

In [None]:
# Run the trained model over the batched test dataset and keep its outputs.
a = model.predict(test_dataset)

In [None]:
# Start from the sample submission and overwrite every column after the
# first with the model's predictions.
sample_df = pd.read_csv(f'{BASE_PATH}sample_submission.csv')
prediction_columns = sample_df.columns[1:]
sample_df[prediction_columns] = a

In [None]:
# Write the filled-in table to predictions.csv without the dataframe index.
sample_df.to_csv('predictions.csv',index=False)

In [None]:
# For each row take the column with the largest value, then count how often
# each column wins.
predicted_labels = sample_df[sample_df.columns[1:]].idxmax(axis=1)
predicted_labels.value_counts()