# <center> 🌿 Cassava Disease Classification 🌿 </center>
<center><img src  = "https://media.istockphoto.com/photos/cassava-plant-tapioca-leaf-picture-id1074190342?k=6&m=1074190342&s=612x612&w=0&h=dZgP3KRT8-T9L30zvTMqHRiyatCsM1d54NwKon0Uk4g=" height = 600 width = 600 ></center>

# Problem Statement

This is a vision-based classification competition. Our task is to classify each cassava image into one of four disease categories or a fifth category indicating a healthy leaf. With our help, farmers may be able to quickly identify diseased plants, potentially saving their crops before the disease inflicts irreparable damage.

For more information: https://www.kaggle.com/c/cassava-leaf-disease-classification/overview

# 📚 Loading Libraries 📚

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers,models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.densenet import DenseNet121
import warnings
warnings.simplefilter("ignore")
from PIL import Image
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Importing Data

In [None]:
import os
# Root folder of the competition data; listing it shows which files and sub-folders are available.
Dir = '../input/cassava-leaf-disease-classification'
os.listdir(Dir)

In [None]:
# Number of entries in the training image folder.
train_image_names = os.listdir('../input/cassava-leaf-disease-classification/train_images')
print(len(train_image_names))

In [None]:
# Number of entries in the test image folder.
test_image_names = os.listdir('../input/cassava-leaf-disease-classification/test_images')
print(len(test_image_names))

In [None]:
# Load the training table (the rest of the notebook reads its `image_id` and
# `label` columns) and preview the first rows.
train_df = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
train_df.head()

In [None]:
import seaborn as sns

# Bar chart of how many training images carry each label.
# The column is passed explicitly as `x`: recent seaborn versions treat the first
# positional argument as `data`, which would not produce a per-label count.
sns.countplot(x=train_df['label'])

We can see that the data is imbalanced: category 3 has many more images than any other category.

In [None]:
# Absolute number of training images per label.
train_df['label'].value_counts()

In [None]:
# Share of each label in percent, rounded to two decimals.
label_counts = train_df['label'].value_counts()
label_share = label_counts / len(train_df['label'])
np.round(label_share * 100, 2)

In [None]:
# Total number of rows in the training table.
len(train_df)

In [None]:
import json
# Read the label-number -> disease-name mapping file and pretty-print it.
with open('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json') as map_file:
    label_map = json.load(map_file)
print(json.dumps(label_map, indent=4))

# Let's do some Exploratory Data Analysis

In [None]:
import cv2
# Show a 3x3 grid of nine randomly drawn training images with label 0.
picked = train_df[train_df.label == 0].sample(9)
image_dir = "../input/cassava-leaf-disease-classification/train_images"
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
for ax, file_name in zip(axes.ravel(), picked.image_id):
    bgr = cv2.imread(os.path.join(image_dir, file_name))
    # Convert from BGR to RGB channel order before handing the image to matplotlib.
    ax.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    ax.axis("off")

plt.show()

**Identification**:

Look to see if leaves are drying and dying early. Look for angular spots on the leaves, and cut out small pieces of the leaf from the edge of the spots and place them in a drop of water. Look for bacterial streaming - the streaming appears as white streaks in the water. Look for dark brown to black streaks on the green part of the stem, and for the presence of sticky liquid. Look for browning in the vascular tissues, i.e., the water conducting tubes, after peeling the bark and splitting the stem.

In [None]:
# Show a 3x3 grid of nine randomly drawn training images with label 1.
chosen = train_df[train_df.label == 1].sample(9)
plt.figure(figsize=(12, 12))
for position, file_name in enumerate(chosen.image_id, start=1):
    plt.subplot(3, 3, position)
    path = os.path.join("../input/cassava-leaf-disease-classification/train_images", file_name)
    # Convert from BGR to RGB channel order before plotting.
    rgb = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
    plt.imshow(rgb)
    plt.axis("off")

plt.show()

**Identification**:

Look for the yellow blotches along the veins from the midrib; these become patches as they join together. Look for occasional streaks on the stems, and dry brown root rots. Note that another virus disease, caused by Cassava mosaic virus (CMV), causes similar symptoms. However, CMV occurs on young expanding leaves and causes leaf distortions.

In [None]:
# Show a 3x3 grid of nine randomly drawn training images with label 2.
picked = train_df[train_df.label == 2].sample(9)
image_dir = "../input/cassava-leaf-disease-classification/train_images"
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
for ax, file_name in zip(axes.ravel(), picked.image_id):
    bgr = cv2.imread(os.path.join(image_dir, file_name))
    # Convert from BGR to RGB channel order before handing the image to matplotlib.
    ax.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    ax.axis("off")

plt.show()

**Identification**:

Look for yellow patterns on the leaves, from small dots to irregular patches of yellow and green. Look for leaf margins that are distorted. The plants may be stunted.

In [None]:
# Show a 3x3 grid of nine randomly drawn training images with label 3.
chosen = train_df[train_df.label == 3].sample(9)
plt.figure(figsize=(12, 12))
for position, file_name in enumerate(chosen.image_id, start=1):
    plt.subplot(3, 3, position)
    path = os.path.join("../input/cassava-leaf-disease-classification/train_images", file_name)
    # Convert from BGR to RGB channel order before plotting.
    rgb = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
    plt.imshow(rgb)
    plt.axis("off")
plt.show()

**Identification**:

Infected leaves are white or pale yellow with pale green patches and will often be twisted, an unusual shape, and stunted. Cassava mosaic disease causes low yields.

Now let's see the **healthy leaves**.

In [None]:
# Show a 3x3 grid of nine randomly drawn training images with label 4.
picked = train_df[train_df.label == 4].sample(9)
image_dir = "../input/cassava-leaf-disease-classification/train_images"
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
for ax, file_name in zip(axes.ravel(), picked.image_id):
    bgr = cv2.imread(os.path.join(image_dir, file_name))
    # Convert from BGR to RGB channel order before handing the image to matplotlib.
    ax.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    ax.axis("off")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Baseline: predict label 3 for every row and compare against the actual labels.
y_pred = [3] * len(train_df.label)
baseline_accuracy = accuracy_score(y_pred, train_df.label)
print("The baseline accuracy is {}".format(baseline_accuracy))

As we saw, about 61% of the images belong to category 3 alone, so always predicting category 3 gives this baseline accuracy.

# Model

In [None]:
# Batch size used by both data generators.
Batch_size = 16
# Target size images are resized to by the generators; also the model's input size.
img_height, img_width = 300, 300

In [None]:
# Inspect the label column's dtype (it is cast to string before building the generators).
train_df['label'].dtype

# Data Augmentation

In [None]:
# Labels are handed to the generator as strings (presumably what
# class_mode="categorical" expects — confirm against the Keras docs).
train_df['label'] = train_df['label'].astype(str)

# Augmentation for training: horizontal and vertical flips; 20% of the rows are
# set aside as the validation subset.
gen = ImageDataGenerator(
    validation_split=0.2,
    horizontal_flip=True,
    vertical_flip=True,
)

# Batches for the "training" subset, read from the train_images folder.
train_datagen = gen.flow_from_dataframe(
    train_df,
    directory=os.path.join(Dir, "train_images"),
    x_col="image_id",
    y_col="label",
    class_mode="categorical",
    target_size=(img_height, img_width),
    batch_size=Batch_size,
    subset="training",
    seed=42,
)

There are 17118 training images.

In [None]:
# Validation data gets no augmentation; only the same 20% split is configured.
val_gen = ImageDataGenerator(validation_split=0.2)

# Batches for the "validation" subset, read from the train_images folder.
val_datagen = val_gen.flow_from_dataframe(
    train_df,
    directory=os.path.join(Dir, "train_images"),
    x_col="image_id",
    y_col="label",
    class_mode="categorical",
    target_size=(img_height, img_width),
    batch_size=Batch_size,
    subset="validation",
    seed=42,
)

There are 4279 validation images.

In [None]:
# Length of each generator (number of batches, not number of images).
len(train_datagen), len(val_datagen)

**Calculation**:

The length of a generator is its number of batches, not its number of images. For training that is ceil(17118 / 16) = 1070 batches (17118 ≈ 21397 × 0.8).

Similarly, the validation generator has ceil(4279 / 16) = 268 batches (4279 ≈ 21397 × 0.2).

In [None]:
# Pull one batch from the training generator: a pair of (images, labels).
img, label = next(train_datagen)

next() is used to get the next batch of images and labels.

In [None]:
# Display the labels of the batch fetched above (presumably one-hot rows, given
# class_mode="categorical" — verify from the output).
label

In [None]:
# Number of batches needed to see every sample once. Step counts must be whole
# numbers, so use ceiling division: the last, possibly smaller, batch still
# counts as one step.
Steps_per_train = (train_datagen.n + train_datagen.batch_size - 1) // train_datagen.batch_size
Steps_per_val = (val_datagen.n + val_datagen.batch_size - 1) // val_datagen.batch_size

In [None]:
# Show the computed steps per epoch for training and validation.
Steps_per_train, Steps_per_val

I have tried out various models like:

1. A CNN model with 3 convolution layers: the accuracy came out to be 61.65%, which is about the same as the baseline accuracy.
2. A CNN model with 4 convolution layers: the accuracy came out to be 67.35%, which is better than the previous one.
3. A CNN model with 4 convolution layers plus Dropout and Batch Normalization: the accuracy was 63.59%.
4. Transfer learning: I tried out various models like ResNet, DenseNet, VGG16 and EfficientNet. The best accuracy came from EfficientNet.

# Transfer Learning

In [None]:
from tensorflow.keras.applications.efficientnet import EfficientNetB3
def create_model():
    """Build and compile the EfficientNetB3-based 5-class classifier.

    The network is an ImageNet-initialised EfficientNetB3 backbone (without its
    classification top) followed by global average pooling, a 256-unit ReLU
    layer, dropout (0.3) and a 5-way softmax output. Input images have shape
    ``(img_height, img_width, 3)``.

    Returns:
        A compiled ``Sequential`` model using Adam (learning rate 1e-4),
        categorical cross-entropy with light label smoothing, and the
        ``categorical_accuracy`` metric.
    """
    backbone = EfficientNetB3(
        include_top=False,
        weights='imagenet',
        input_shape=(img_height, img_width, 3),
        drop_connect_rate=0.3,
    )

    # GlobalAveragePooling2D already produces a flat (batch, channels) tensor,
    # so no separate Flatten layer is needed before the dense head.
    model = models.Sequential([
        backbone,
        layers.GlobalAveragePooling2D(),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(5, activation='softmax'),
    ])

    loss = tf.keras.losses.CategoricalCrossentropy(
        label_smoothing=0.0001,
        name='categorical_crossentropy'
    )
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    # that newer Keras versions reject.
    model.compile(optimizer=Adam(learning_rate=1e-4),
                  loss=loss,
                  metrics=["categorical_accuracy"])
    return model


# Instantiate the network and print its layer-by-layer summary.
model = create_model()
model.summary()

In [None]:
# Render a diagram of the model architecture.
tf.keras.utils.plot_model(model)

In [None]:
# Scale the learning rate by 0.2 (never below 1e-6) when val_loss has not
# improved for 2 epochs.
rlronp = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss",
                                              factor=0.2,
                                              mode="min",
                                              min_lr=1e-6,
                                              patience=2,
                                              verbose=1)

# Stop training when val_loss has not improved for 3 epochs and restore the
# weights of the best epoch.
estop = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                         mode="min",
                                         patience=3,
                                         verbose=1,
                                         restore_best_weights=True)

# Model.fit accepts generators directly; Model.fit_generator is deprecated and
# has been removed from recent TensorFlow/Keras releases.
# Step counts must be whole numbers of batches, so any fractional value is
# rounded up to include the final partial batch.
history = model.fit(
    train_datagen,
    steps_per_epoch=int(np.ceil(Steps_per_train)),
    epochs=5,
    validation_data=val_datagen,
    validation_steps=int(np.ceil(Steps_per_val)),
    callbacks=[rlronp, estop]
)
# Persist the trained model in HDF5 format.
model.save("Casava_Model" + ".h5")

# Plots of Accuracy and Loss

In [None]:
# List the metric names recorded during training.
history.history.keys()

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation accuracy per epoch (dots = training, line = validation).
recorded = history.history
train_acc = recorded["categorical_accuracy"]
val_acc = recorded["val_categorical_accuracy"]
epochs = range(1, len(train_acc) + 1)
curves = (
    (train_acc, "bo", "Training Accuracy"),
    (val_acc, "b", "Validation Accuracy"),
)
for values, fmt, caption in curves:
    plt.plot(epochs, values, fmt, label=caption)
plt.title("Training and Validation Accuracy")
plt.legend()

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation loss per epoch (dots = training, line = validation).
plt.figure(figsize=(8, 6))
recorded = history.history
train_loss = recorded["loss"]
val_loss = recorded["val_loss"]
epochs = range(1, len(train_loss) + 1)
curves = (
    (train_loss, "bo", "Training Loss"),
    (val_loss, "b", "Validation Loss"),
)
for values, fmt, caption in curves:
    plt.plot(epochs, values, fmt, label=caption)
plt.title("Training and Validation Loss")
plt.legend()

If you find my work helpful, please upvote! Open for criticism!!

Thank You!