# Intro
Welcome to the [Cassava Leaf Disease Classification](https://www.kaggle.com/c/cassava-leaf-disease-classification) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/13836/logos/header.png)

There are 5 classes (click a name for further information):
* 0: [Cassava Bacterial Blight (CBB)](https://en.wikipedia.org/wiki/Bacterial_blight_of_cassava)
* 1: [Cassava Brown Streak Disease (CBSD)](https://en.wikipedia.org/wiki/Cassava_brown_streak_virus_disease)
* 2: [Cassava Green Mottle (CGM)](https://en.wikipedia.org/wiki/Cassava_green_mottle_virus)
* 3: [Cassava Mosaic Disease (CMD)](https://en.wikipedia.org/wiki/Cassava_mosaic_virus)
* 4: Healthy

We will give a simple starter notebook based on a CNN.

<span style="color: royalblue;">Please upvote the notebook if it helps you. Thank you.</span>

# Libraries
We load some standard libraries and packages of sklearn and keras.

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import json
import random

from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

from keras.utils import to_categorical, Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from keras.optimizers import RMSprop,Adam
from keras.applications import ResNet50

# Path

In [None]:
# Root directory of the competition data; every file path below is built from it.
path = '/kaggle/input/cassava-leaf-disease-classification/'
# List the directory contents (shown as the cell output).
os.listdir(path)

# Functions
We define some helper functions for visualizations.

In [None]:
def plot_bar(data, name):
    """Draw a bar chart of how often each value occurs in column `name` of `data`.

    Bars are ordered by value (ascending index order), a grid is added and the
    figure is shown immediately.
    """
    counts = data[name].value_counts().sort_index()
    categories = list(counts.index)
    heights = counts.tolist()
    plt.bar(categories, heights)
    plt.grid()
    plt.show()
    
def plot_examples(label=0):
    """Show up to five training images of class `label` side by side.

    Reads the module-level `train_data`, `label_data` and `path`.

    Args:
        label: Integer class id; matched against `train_data['label']`.
    """
    fig, axs = plt.subplots(1, 5, figsize=(25, 12))
    fig.subplots_adjust(hspace=.2, wspace=.2)
    axs = axs.ravel()
    # Slicing (rather than indexing positions 0..4) avoids an IndexError when
    # the class has fewer than five rows; unused axes simply stay empty.
    sample_ids = train_data[train_data['label'] == label].index[:5]
    for ax, idx in zip(axs, sample_ids):
        img = cv2.imread(path+'train_images/'+train_data.loc[idx, 'image_id'])
        # OpenCV decodes images in BGR channel order while matplotlib expects
        # RGB; without the conversion red and blue are swapped on screen.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(label_data[str(train_data.loc[idx, 'label'])])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

# Load Data

In [None]:
# Training metadata; the code below reads its 'image_id' and 'label' columns.
train_data = pd.read_csv(path+'train.csv')
# Sample submission; its 'image_id' column names the test images to predict.
samp_subm = pd.read_csv(path+'sample_submission.csv')

In [None]:
# Load the mapping from label number (string key) to disease name.
with open(path+'label_num_to_disease_map.json') as json_file:
    label_data = json.load(json_file)

# EDA

In [None]:
# Compare the number of rows in train.csv with the number of image files on disk.
print('number of train data:', len(train_data))
print('number of train images:', len(os.listdir(path+'train_images/')))
print('number of test images:', len(os.listdir(path+'test_images/')))

In [None]:
# Display the label-number-to-name mapping.
label_data

Distribution of the labels. The classes are imbalanced: class 3 alone accounts for about 61% of the training samples.

In [None]:
# Fraction of training rows that belong to class 3.
len(train_data[train_data['label']==3])/len(train_data.index)

In [None]:
# Bar chart of the number of training rows per label.
plot_bar(train_data, 'label')

There are several techniques to mitigate class imbalance:
1. Remove instances from the majority class 3.
2. Use data augmentation to add extra samples to the minority classes. For images, this is generally achieved by distorting the data (translation, rotation, scaling) as well as by adding different types of noise such as Gaussian or Poisson noise.
3. Additional use of dropout and regularization methods.

In the section **Prepare Data** we start by reducing the number of images of class 3.

# Some Examples

## Healthy

In [None]:
# Sample images of class 4 (healthy leaves).
plot_examples(label=4)

## Cassava Bacterial Blight (CBB)

In [None]:
# Sample images of class 0 (CBB).
plot_examples(label=0)

## Cassava Brown Streak Disease (CBSD)

In [None]:
# Sample images of class 1 (CBSD).
plot_examples(label=1)

## Cassava Green Mottle (CGM)

In [None]:
# Sample images of class 2 (CGM).
plot_examples(label=2)

## Cassava Mosaic Disease (CMD)

In [None]:
# Sample images of class 3 (CMD).
plot_examples(label=3)

# Prepare Data

To reduce the class imbalance we randomly remove 10,500 images of class 3 from the training data.

In [None]:
# Randomly pick 10,500 rows of class 3 and remove them from the training frame.
# NOTE: random.sample raises ValueError if fewer than 10,500 class-3 rows exist,
# e.g. when this cell is executed a second time.
ids_label_3 = list(train_data[train_data['label']==3].index)
ids_label_3_subset = random.sample(ids_label_3, 10500)
# The sampled values are index labels, so drop by label directly instead of
# re-interpreting them as positions through train_data.index[...].
train_data.drop(index=ids_label_3_subset, inplace=True)
# Renumber the remaining rows 0..n-1 so later integer lookups line up again.
train_data.reset_index(drop=True, inplace=True)

In [None]:
# Label distribution after the class 3 rows were removed.
plot_bar(train_data, 'label')

# Parameters

In [None]:
# Number of images per batch produced by the data generator.
batch_size = 64
# Images are resized to img_size x img_size pixels.
img_size = 256
# Number of colour channels per image.
img_channel = 3

## Train Labels And Class Weights

In [None]:
# One-hot encode the integer labels, as required by the categorical cross-entropy loss used below.
y_train = to_categorical(train_data['label'])

In [None]:
# Weight every class inversely to its frequency: n_samples / (n_classes * count).
# Rare classes therefore get weights above 1 and frequent classes below 1.
# Using the plain class frequency as weight would instead favour the majority class.
label_counts = train_data['label'].value_counts().sort_index()
# Keys come from the counted labels themselves, so a missing class cannot shift
# the label -> weight assignment.
class_weight = {
    int(label): len(train_data) / (len(label_counts) * count)
    for label, count in label_counts.items()
}

In [None]:
# Display the computed per-class weights.
class_weight

## Image Data Generator

In [None]:
class DataGenerator(Sequence):
    """Keras `Sequence` that loads images from disk and returns (X, y) batches.

    Args:
        path: Directory prefix that is prepended to every image id.
        list_IDs: Image file names, indexable with the integers 0..n-1.
        labels: Per-image targets aligned with `list_IDs`. Each entry is either
            a length-5 row or a scalar, which is broadcast over the row.
        batch_size: Maximum number of images per batch.
        img_size: Images are resized to `img_size` x `img_size` pixels.
        img_channel: Number of colour channels of the loaded images.
    """

    def __init__(self, path, list_IDs, labels, batch_size, img_size, img_channel):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.path = path
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_channel = img_channel
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches per epoch (the last one may be smaller)."""
        n_samples = len(self.list_IDs)
        # Integer ceiling division.
        return (n_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number `index` as a tuple (X, y)."""
        batch_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(batch_indexes)

    def __data_generation(self, batch_indexes):
        """Load the images and labels of the samples selected by `batch_indexes`.

        Returns:
            X: float array (len(batch_indexes), img_size, img_size, img_channel)
               with pixel values scaled to [0, 1].
            y: int array (len(batch_indexes), 5) with the matching labels.

        Raises:
            FileNotFoundError: If an image file cannot be read.
        """
        # Size the arrays to the actual batch: a fixed batch_size would pad the
        # final batch with blank images that carry all-zero targets.
        n_batch = len(batch_indexes)
        X = np.zeros((n_batch, self.img_size, self.img_size, self.img_channel))
        y = np.zeros((n_batch, 5), dtype=int)
        for i, k in enumerate(batch_indexes):
            file_name = self.path + self.list_IDs[k]
            data_file = cv2.imread(file_name)
            # cv2.imread signals failure by returning None instead of raising.
            if data_file is None:
                raise FileNotFoundError('could not read image: ' + file_name)
            img = cv2.resize(data_file, (self.img_size, self.img_size))
            X[i, ] = img/255.
            # Look the label up by the sample's global index k, not by its
            # position i inside the batch; otherwise every batch would receive
            # the labels of the first batch.
            y[i, ] = self.labels[k]
        return X, y

# Define Model

In [None]:
# Local weights file passed to ResNet50 below (a "notop" variant, judging by the file name).
weights='../input/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'

In [None]:
# ResNet50 convolutional base without its classification head, initialised
# from the `weights` file and sized for img_size x img_size x img_channel inputs.
conv_base = ResNet50(include_top=False,
                     weights=weights,
                     input_shape=(img_size, img_size, img_channel))
# Keep the base trainable so it is fine-tuned together with the layers added on top.
conv_base.trainable = True

In [None]:
# Classification head on top of the ResNet50 base: flatten the feature maps,
# two 64-unit ReLU layers (each followed by batch normalisation), light
# dropout, and a 5-way softmax output.
model = Sequential([
    conv_base,
    Flatten(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(5, activation='softmax'),
])

In [None]:
# Adam with categorical cross-entropy for the one-hot targets.
# `learning_rate` is the supported argument name; the old alias `lr` is deprecated.
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer structure and parameter counts.
model.summary()

# Train Model

In [None]:
# Number of passes over the training data.
epochs = 10

In [None]:
# Batches of training images paired with their one-hot labels.
train_generator = DataGenerator(path+'train_images/', train_data['image_id'], y_train, batch_size, img_size, img_channel)

In [None]:
# Train on the generator. Model.fit accepts a Sequence directly, whereas
# fit_generator is deprecated.
history = model.fit(train_generator,
                    epochs=epochs)

# Predict Test Data

In [None]:
# Feed the test images in batches of `batch_size`. Using len(samp_subm) as the
# batch size would allocate one array holding every test image at once, which
# exhausts memory on a large test set. The 'label' column only serves as a
# placeholder target; its values are not used for prediction.
test_generator = DataGenerator(path+'test_images/', samp_subm['image_id'], samp_subm['label'], batch_size, img_size, img_channel)

In [None]:
# Class probabilities for every test image. Model.predict accepts a Sequence
# directly, whereas predict_generator is deprecated.
predict = model.predict(test_generator, verbose=1)

In [None]:
# Take the most probable class per image; the slice guards against the
# prediction array being longer than the submission frame.
samp_subm['label'] = predict.argmax(axis=1)[0:len(samp_subm)]

# Export Data

In [None]:
# Write the submission file without the DataFrame index column.
samp_subm.to_csv('submission.csv', index=False)

# Analyse Wrong Predictions
We analyse the predictions on the train set.

In [None]:
# Class probabilities for the training images. Model.predict accepts a Sequence
# directly, whereas predict_generator is deprecated.
pred = model.predict(train_generator, verbose=1)

In [None]:
# Compare true and predicted class ids on the training set. Both arrays are cut
# to len(train_data) in case the prediction array is longer than the frame.
conf_mat = confusion_matrix(y_train.argmax(axis=1)[0:len(train_data)], pred.argmax(axis=1)[0:len(train_data)])
fig, ax = plot_confusion_matrix(conf_mat=conf_mat,
                                show_normed=False,
                                show_absolute=True,
                                figsize=(8, 8))
# plt.show() renders through the active backend; fig.show() only emits a
# warning on the non-GUI backends used in notebooks.
plt.show()