In [None]:
import os
import json

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
import albumentations as A
from sklearn import metrics as sk_metrics

from PIL import Image
from timeit import default_timer as timer

In [None]:
import h5py
# List the top-level groups stored in the pretrained ResNet50 weight file.
# The context manager closes the HDF5 handle as soon as the keys are printed,
# instead of leaving it open for the rest of the kernel session.
with h5py.File('../input/resnet50-pretrained/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5', 'r') as f1:
    print(list(f1.keys()))

In [None]:
# Importing necessary libs

import random
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.utils import plot_model

import pickle
from keras.applications.resnet50 import ResNet50 as ResModel

In [None]:
# Input locations used by the rest of the notebook (relative to the working dir).
MODEL_BASE = '../input/resnet50'
BASE_PATH = '../input/cassava-leaf-disease-classification/'
TEST_PATH = '../input/cassava-leaf-disease-classification/test_images/'
# Training images live in a sub-folder of the competition dataset.
TRAIN_PATH = os.path.join(BASE_PATH, 'train_images')

In [None]:
# Load the training labels. The path is derived from BASE_PATH so the dataset
# location is defined in a single place.
df_train = pd.read_csv(os.path.join(BASE_PATH, "train.csv"))
df_train.info()

In [None]:
# Fraction of the training rows that carries each label, shown as a pie chart.
dist = df_train['label'].value_counts().reset_index()
dist.columns = ['label', 'percentage']
dist['percentage'] = dist['percentage'] / len(df_train)
labels, sizes = dist['label'], dist['percentage']

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
)
# An equal aspect ratio keeps the pie circular.
ax1.axis('equal')

plt.show()

Clearly, the most common class is label 3.

In [None]:
# Read the label-number -> disease-name mapping shipped with the dataset.
# The keys are converted to int while the dictionary is built.
with open("../input/cassava-leaf-disease-classification/label_num_to_disease_map.json") as f:
    map_dis = {int(label_id): disease for label_id, disease in json.load(f).items()}

print(json.dumps(map_dis, indent=4))

Cassava Mosaic Disease (CMD) is the most frequent disease.

Let's check the number of samples in train dir, samples in each class and the dimensions

In [None]:
# Number of image files on disk and number of labelled rows per class.
inp_files = os.listdir(TRAIN_PATH)
samples_per_class = df_train['label'].value_counts()

print(f"Number of training samples: {len(inp_files)}")
print(f"Number of samples in each  class: \n {samples_per_class}")

In [None]:
# Tally the array shapes (as reported by cv2) of 300 training images.
img_shapes = {}
unreadable = []
# os.listdir returns names in arbitrary order; sorting makes the 300-image
# sample the same on every run.
for image_name in sorted(os.listdir(TRAIN_PATH))[:300]:
    image = cv2.imread(os.path.join(TRAIN_PATH, image_name))
    if image is None:
        # cv2.imread signals an unreadable/non-image file by returning None
        # rather than raising, so guard before touching `.shape`.
        unreadable.append(image_name)
        continue
    img_shapes[image.shape] = img_shapes.get(image.shape, 0) + 1

print(img_shapes)
if unreadable:
    print(f"Skipped {len(unreadable)} unreadable file(s), e.g. {unreadable[:5]}")

Adding the class names to the train dataframe so that each label is shown with its disease name

In [None]:
# Add a `class_name` column holding the disease name looked up from `map_dis`.
df_train = df_train.assign(class_name=df_train["label"].map(map_dis))
df_train

Let's see the image distribution in the dataset (train)

In [None]:
# Horizontal bar chart: number of training rows per disease name.
fig_counts, ax_counts = plt.subplots(figsize=(8, 4))
sns.countplot(y="class_name", data=df_train, ax=ax_counts);

There's a clear imbalance in the class distribution. This can bias the model against the under-represented classes on test data.
### Possible ways to reduce imbalance
* Try different models and see what fits best - IMP: Accuracy is the key metric, so can't change that
* ReSampling 
    * Undersampling majority class
    * Oversampling minority class
* Generate synthetic samples

### First let's plot some images from the dataset and take a look at them

In [None]:
def plot(class_id, label, n_samples=2):
    """Display randomly chosen training images of one class in a square grid.

    Args:
        class_id: Value compared against ``df_train["label"]`` to select rows.
        label: Text used as the title of every displayed image.
        n_samples: How many images to draw. Defaults to 2, the value that was
            previously hard-coded; it is capped at the number of rows
            available for the class so sampling never raises.

    Images that cv2 cannot read are reported and skipped.
    """
    class_rows = df_train[df_train["label"] == class_id]
    n_samples = min(n_samples, len(class_rows))
    image_ids = class_rows.sample(n_samples)['image_id'].tolist()

    # Side of the smallest square grid that fits all images. It must be an
    # int because plt.subplot does not accept float grid sizes.
    grid = max(1, int(np.ceil(np.sqrt(n_samples))))

    plt.figure(figsize=(20, 20))
    for index, image_id in enumerate(image_ids):
        image = cv2.imread(os.path.join(TRAIN_PATH, image_id))
        if image is None:
            # cv2.imread returns None for missing or corrupt files.
            print(f"Could not read image: {image_id}")
            continue
        plt.subplot(grid, grid, index + 1)
        # OpenCV loads BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(label, fontsize=12)
    plt.show()

In [None]:
# Show two sample images for every entry of the label -> disease mapping.
for class_id, disease_name in map_dis.items():
    plot(int(class_id), disease_name)

## Split dataset for train and validation
20% for val

In [None]:
# Labels are cast to str because the generators below read them with
# class_mode="categorical".
df_train = df_train.astype({"label": str})

# Hold out 20% for validation. The classes are heavily imbalanced, so the
# split is stratified on the label to keep the same class proportions in
# both parts; an unstratified split can under-represent the rare classes.
train, test = train_test_split(
    df_train,
    test_size = 0.2,
    random_state = 42,
    stratify = df_train["label"],
)
print(train.shape, test.shape)

## Creating an ImageDataGenerator to augment the images and create batches

In [None]:
# Square input resolution fed to the network.
IMG_SIZE = 224
size = (IMG_SIZE, IMG_SIZE)

# Random augmentation settings applied to every batch drawn from `datagen`.
# NOTE(review): no `rescale` / `preprocessing_function` is configured, so raw
# pixel values are passed on — confirm this is intended for the ImageNet
# weights used later.
augmentation_config = dict(
    rotation_range = 40,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True,
    vertical_flip = True,
    fill_mode = 'nearest',
)
datagen = ImageDataGenerator(**augmentation_config)

In [None]:
# Shuffled, augmented training batches read from TRAIN_PATH, with one-hot labels.
train_gen = datagen.flow_from_dataframe(
    dataframe = train,
    directory = TRAIN_PATH,
    x_col = "image_id",
    y_col = "label",
    class_mode = "categorical",
    target_size = size,
    interpolation = "nearest",
    batch_size = 64,
    shuffle = True,
    seed = 42,
)

In [None]:
# Validation images must not be randomly augmented: the validation loss and
# accuracy drive checkpointing, early stopping and the LR schedule, so they
# have to be measured on the same, unmodified images every epoch.
# A plain generator (no augmentation) is therefore used instead of `datagen`.
# If `datagen` is ever given a `rescale` or `preprocessing_function`, apply
# the same one here so both splits see identically scaled inputs.
valid_datagen = ImageDataGenerator()

valid_gen = valid_datagen.flow_from_dataframe(
                    test,
                    directory = TRAIN_PATH,
                    x_col = "image_id",
                    y_col = "label",
                    target_size = size,
                    class_mode = "categorical",
                    batch_size = 64,
                    shuffle = False,
                    seed = 42,
                    interpolation = "nearest"
)

## Creating a RESNET50 model with pretrained imagenet weights

In [None]:
# Standard parameters / hyperparameters for training.

N_CLASS = 5
EPOCHS = 50
learn_rate = .001
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras versions reject.
sgd = SGD(learning_rate=learn_rate, momentum=.9, nesterov=False)

# Training: whole batches only (the data is reshuffled, so dropping the
# partial batch does not permanently exclude any sample).
STEP_SIZE_TRAIN = train_gen.n // train_gen.batch_size
# Validation: round up so the last, partial batch is evaluated as well.
# With floor division and shuffle=False the same trailing samples would be
# left out of every validation pass.
STEP_SIZE_VALID = (valid_gen.n + valid_gen.batch_size - 1) // valid_gen.batch_size

**NOTE:** Play around with optimizers and loss.

### Defining a couple of tunable training callbacks: learning-rate annealer, checkpoint and early stopping
**(NOTE: Experiment with these the most and fine tune it over time. Currently  using the std vals)**

In [None]:
# Learning-rate annealer: cut the LR by 5x when validation accuracy stalls.
#  - the metric compiled as 'accuracy' is logged as 'val_accuracy'; the old
#    key 'val_acc' does not exist, so the callback never triggered
#  - accuracy should increase, hence mode='max' (it was 'min')
#  - min_lr must be below the initial LR (0.001), otherwise no reduction
#    can ever take effect
lrr = ReduceLROnPlateau(monitor = 'val_accuracy',
                        factor = 0.2,
                        patience = 3,
                        min_lr = 1e-6,
                        mode = 'max',
                        verbose = 1)

# Save the model whenever the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint('best_model.hdf5',
                             monitor = 'val_loss',
                             verbose = 1,
                             mode = 'min',
                             save_best_only = True)

# Stop after 5 epochs without a validation-loss improvement and roll the
# model back to the best weights seen.
early_stop = EarlyStopping(monitor = 'val_loss',
                           patience = 5,
                           mode = 'min',
                           restore_best_weights = True)

### Defining Base model with imagenet weights and adding custom layers

In [None]:
# ResNet50 backbone with ImageNet weights and without its classification top.
# The `classes` argument was dropped: it only applies when include_top=True,
# so passing N_CLASS here had no effect and wrongly suggested a 5-way output.
base_model = ResNet50(include_top = False, weights = 'imagenet', input_shape = (IMG_SIZE, IMG_SIZE, 3))

In [None]:
# Stack a fully connected classification head on top of the ResNet50 backbone.
model_resnet = models.Sequential([
    base_model,
    # Flatten the backbone's feature maps into a single vector.
    layers.Flatten(),
    # Dense layers of decreasing width, with dropout between the later ones.
    layers.Dense(1024, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dropout(.4),
    layers.Dense(256, activation='relu'),
    layers.Dropout(.3),
    layers.Dense(128, activation='relu'),
    layers.Dropout(.2),
    # One softmax output per class.
    layers.Dense(N_CLASS, activation='softmax'),
])

# Layer-by-layer summary of the assembled model.
model_resnet.summary()

In [None]:
# Configure training: SGD optimizer, categorical cross-entropy on the one-hot
# labels, and accuracy as the reported metric.
model_resnet.compile(
    loss = 'categorical_crossentropy',
    optimizer = sgd,
    metrics = ['accuracy'],
)

## Training and Plots

In [None]:
# Train the model; the per-epoch metrics are kept in `history`.
training_callbacks = [early_stop, checkpoint, lrr]

history = model_resnet.fit(
    train_gen,
    steps_per_epoch = STEP_SIZE_TRAIN,
    epochs = EPOCHS,
    validation_data = valid_gen,
    validation_steps = STEP_SIZE_VALID,
    callbacks = training_callbacks,
)

## Model Evaluation

In [None]:
# Loss and accuracy on the validation generator. `Model.evaluate` accepts
# generators directly; `evaluate_generator` is deprecated and no longer
# available in newer Keras versions.
model_resnet.evaluate(valid_gen, steps = STEP_SIZE_VALID)

In [None]:
# Per-epoch metrics recorded during training.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs = range(len(acc))

# Figure 1: accuracy curves.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'c-', label='Training accuracy')
ax_acc.plot(epochs, val_acc, 'y-', label='Validation accuracy')
ax_acc.set_title('Training and validation accuracy')
ax_acc.legend()

# Figure 2: loss curves.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'c-', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'y-', label='Validation Loss')
ax_loss.set_title('Training and validation loss')
ax_loss.legend()

plt.show()

In [None]:
## print(predictions)

## Once the entire code runs successfully, shift to the submission notebook.
Cross-check that "best_model.hdf5" is saved in the output dir