# Intro
Welcome to the Cassava Leaf Disease Classification competition.

There are 5 classes (click the links for further information):
* 0: [Cassava Bacterial Blight (CBB)](https://en.wikipedia.org/wiki/Bacterial_blight_of_cassava)
* 1: [Cassava Brown Streak Disease (CBSD)](https://en.wikipedia.org/wiki/Cassava_brown_streak_virus_disease)
* 2: [Cassava Green Mottle (CGM)](https://en.wikipedia.org/wiki/Cassava_green_mottle_virus)
* 3: [Cassava Mosaic Disease (CMD)](https://en.wikipedia.org/wiki/Cassava_mosaic_virus)
* 4: Healthy

This is a simple starter notebook based on a CNN.

# Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt

In [None]:
from keras.utils import to_categorical, Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from keras.optimizers import RMSprop,Adam

# Path

In [None]:
# Root directory of the competition data. The trailing slash matters: file
# and folder names are appended to it by plain string concatenation.
path = '/kaggle/input/cassava-leaf-disease-classification/'
# List the directory contents (shown as the cell output).
os.listdir(path)

# Functions

In [None]:
def plot_bar(data, name):
    """Draw a bar chart of how often each value occurs in column ``name``.

    Args:
        data: Table (e.g. a pandas DataFrame) that holds the column.
        name: Label of the column whose value frequencies are plotted.

    The bars are ordered by ascending value, a grid is overlaid and the
    figure is shown immediately. Nothing is returned.
    """
    counts = data[name].value_counts().sort_index()
    categories = counts.index.tolist()
    heights = counts.tolist()
    plt.bar(categories, heights)
    plt.grid()
    plt.show()

# Load Data

In [None]:
# Read the training table and the submission template from the data directory.
train_data = pd.read_csv(os.path.join(path, 'train.csv'))
samp_subm = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

# EDA

In [None]:
# Compare the number of rows in the training table with the number of
# image files found in the train and test folders.
n_train_rows = len(train_data)
print('number of train data:', n_train_rows)
train_files = os.listdir(path+'train_images/')
print('number of train images:', len(train_files))
test_files = os.listdir(path+'test_images/')
print('number of test images:', len(test_files))

Distribution of the labels:

In [None]:
# Bar chart of how many rows of the training table carry each 'label' value.
plot_bar(train_data, 'label')

Plot an image:

In [None]:
# Display one training image as a visual sanity check.
sample_file = path+'train_images/'+'1000015157.jpg'
img = cv2.imread(sample_file)
if img is None:
    # cv2.imread reports an unreadable or missing file by returning None
    # instead of raising, so fail here with a clear message.
    raise FileNotFoundError('could not read image: ' + sample_file)
# OpenCV decodes images in BGR channel order while matplotlib expects RGB;
# convert so the colours are displayed correctly.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.show()

# Prepare Data For Model

In [None]:
# Shared input settings: number of images per batch, edge length in pixels
# of the square images fed to the model, and number of colour channels.
batch_size, img_size, img_channel = 3, 256, 3

## Train Labels And Class Weights

In [None]:
# One-hot encode the integer class labels, one row per row of train_data.
y_train = to_categorical(train_data['label'])

In [None]:
# Weight each class inversely to its frequency so that rare classes count as
# much in the loss as common ones ("balanced" heuristic:
# n_samples / (n_classes * class_count)). Using the raw class share as the
# weight would instead amplify the majority class.
# Keys come from the labels actually present rather than an assumed 0..4
# range, so weights can never be paired with the wrong class.
label_counts = train_data['label'].value_counts().sort_index()
class_weight = {
    int(label): len(train_data) / (len(label_counts) * count)
    for label, count in label_counts.items()
}

## Image Data Generator

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that loads image files from disk batch by batch.

    Every batch holds ``batch_size`` images read with OpenCV, resized to
    ``img_size`` x ``img_size`` and standardised with the batch's own mean
    and standard deviation, together with the labels of those same samples.

    Args:
        path: Prefix that is concatenated with each image ID to form the
            file path (so a directory must end with a path separator).
        list_IDs: Image file names, one per sample.
        labels: Per-sample labels aligned with ``list_IDs``: either one row
            of 5 values per sample or one scalar per sample (a scalar is
            broadcast over the 5 label columns).
        batch_size: Number of samples per batch.
        img_size: Edge length in pixels of the square output images.
        img_channel: Number of colour channels of the output images.

    Samples are served in the order given. Trailing samples that do not
    fill a complete batch are not served.
    """

    # Width of the label matrix returned with every batch.
    _NUM_CLASSES = 5

    def __init__(self, path, list_IDs, labels, batch_size, img_size, img_channel):
        super().__init__()
        self.path = path
        # Store plain positional containers so that indexing by sample
        # position works even if a pandas object with a non-default index
        # is passed in.
        self.list_IDs = list(list_IDs)
        self.labels = np.asarray(labels)
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_channel = img_channel
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of complete batches."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair of batch number ``index``.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``.
        """
        if not 0 <= index < len(self):
            raise IndexError('batch index out of range')
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(batch_indexes)

    def __data_generation(self, batch_indexes):
        """Load, resize and standardise the samples at ``batch_indexes``.

        Returns:
            ``X``: float32 array of shape (batch, img_size, img_size,
            img_channel); ``y``: int array of shape (batch, 5).

        Raises:
            FileNotFoundError: If an image file cannot be read.
        """
        n_samples = len(batch_indexes)
        X = np.empty((n_samples, self.img_size, self.img_size, self.img_channel),
                     dtype='float32')
        y = np.empty((n_samples, self._NUM_CLASSES), dtype=int)
        for row, sample_idx in enumerate(batch_indexes):
            file_name = self.path + self.list_IDs[sample_idx]
            image = cv2.imread(file_name)
            if image is None:
                # cv2.imread returns None instead of raising on failure.
                raise FileNotFoundError('could not read image: ' + file_name)
            X[row] = cv2.resize(image, (self.img_size, self.img_size))
            # Look the label up by the sample's position in the dataset, not
            # by its position inside the batch; otherwise every batch would
            # receive the labels of the first batch_size samples.
            y[row] = self.labels[sample_idx]
        X -= X.mean()
        spread = X.std()
        # A constant batch has zero spread; skip the division to avoid NaNs.
        if spread > 0:
            X /= spread
        return X, y

# Define Model

We load VGG16 with weights pretrained on ImageNet, freeze all but its last 4 layers so that only those are fine-tuned on this dataset, and add a few new layers on top as the classification head.

In [None]:
from keras.applications import VGG16
import keras
from keras import models
from keras import layers
from keras import optimizers

# Convolutional part of VGG16 (without its dense top) initialised with the
# ImageNet weights and sized for the notebook's input images.
vgg_conv = VGG16(weights='imagenet', include_top=False, input_shape=(img_size, img_size, img_channel))

# Freeze every layer except the last four, so only those get fine-tuned.
frozen_layers = vgg_conv.layers[:-4]
for frozen in frozen_layers:
    frozen.trainable = False

# Full model: the VGG16 base followed by a new dense head that ends in a
# 5-way softmax.
model = models.Sequential([
    vgg_conv,
    layers.Flatten(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(5, activation='softmax'),
])

# Show a summary of the model. Check the number of trainable parameters
#model.summary()

# Alternative kept for reference: a small CNN trained from scratch.
# model = Sequential()
# model.add(Conv2D(128, input_shape=(img_size,img_size,img_channel), kernel_size=5, strides=4, activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPool2D(pool_size=(2)))
# model.add(Conv2D(128, kernel_size=5, activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPool2D(pool_size=(2)))
# model.add(Conv2D(256, kernel_size=5, activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPool2D(pool_size=(2)))
# model.add(Flatten())
# model.add(Dense(64, activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(5, activation='softmax'))

In [None]:
# Compile for multi-class classification with one-hot targets, tracking
# accuracy. `learning_rate` is the supported argument name; the older `lr`
# alias is deprecated.
model.compile(optimizer=RMSprop(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the model, including parameter counts.
model.summary()

# Train Model

In [None]:
# Number of passes over the training generator.
epochs = 10

In [None]:
# Batch generator over the training images and their one-hot labels.
train_generator = DataGenerator(
    path=path+'train_images/',
    list_IDs=train_data['image_id'],
    labels=y_train,
    batch_size=batch_size,
    img_size=img_size,
    img_channel=img_channel,
)

In [None]:
# Train on the batch generator with per-class loss weights.
# Model.fit accepts a Sequence directly and takes the same arguments;
# fit_generator is deprecated in its favour.
history = model.fit(train_generator,
                    epochs=epochs,
                    class_weight=class_weight,
                    workers=4)

# Predict Test Data

In [None]:
# Batch generator over the test images. A batch size of 1 ensures no image
# is left out, since the generator only serves complete batches. The
# 'label' column of the submission template fills the labels argument.
test_generator = DataGenerator(
    path=path+'test_images/',
    list_IDs=samp_subm['image_id'],
    labels=samp_subm['label'],
    batch_size=1,
    img_size=img_size,
    img_channel=img_channel,
)

In [None]:
# Class probabilities for every test image. Model.predict accepts a
# Sequence directly; predict_generator is deprecated in its favour.
predict = model.predict(test_generator, verbose=1)

In [None]:
# Store the index of the highest-scoring class of each prediction row as
# the submitted label.
samp_subm['label'] = np.argmax(predict, axis=1)

# Export Data

In [None]:
# Write the submission table to 'submission.csv' without the index column.
samp_subm.to_csv('submission.csv', index=False)