In [None]:
import numpy as np
import pandas as pd
import math
import cv2
import os
import sys
import csv
import time
import re
import keras
import collections
import json
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import applications
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from kaggle_datasets import KaggleDatasets
from functools import partial
from sklearn.model_selection import train_test_split
from scipy import stats
import shutil

**DATA ANALYSIS**

In [None]:
# Locations of the competition files inside the Kaggle input directory.
base_path = "../input/cassava-leaf-disease-classification/"
sample_path = "../input/cassava-leaf-disease-classification/sample_submission.csv"
train_path = "../input/cassava-leaf-disease-classification/train.csv"
test_path = "../input/cassava-leaf-disease-classification/test_images"
label_json_path = "../input/cassava-leaf-disease-classification/label_num_to_disease_map.json"
images_dir_path = "../input/cassava-leaf-disease-classification/train_images"

# Training metadata, with the label column stored as pandas "string" dtype.
train_csv = pd.read_csv(train_path)
train_csv = train_csv.assign(label=train_csv['label'].astype('string'))

# Disease names from the label map, flattened into a plain Python list.
label_class = (
    pd.read_json(label_json_path, orient='index')
    .to_numpy()
    .ravel()
    .tolist()
)

# Count the files present in the training image directory.
input_files = os.listdir(os.path.join(base_path, "train_images"))
print(f"Number of images in the dataset: {len(input_files)}")

There are 5 classes in the dataset:

In [None]:
# Read the class-id -> disease-name mapping. JSON object keys are strings,
# so they are converted to integers while building the dictionary.
with open(os.path.join(base_path, "label_num_to_disease_map.json")) as file:
    map_classes = {int(key): name for key, name in json.load(file).items()}

print(json.dumps(map_classes, indent=4))

In [None]:
# Upper bound on the number of training rows to analyse; the whole file is
# used when it contains fewer rows than this.
samples = 30000

df_test = pd.read_csv(sample_path)

# .head(samples) keeps exactly `samples` rows. The previous label-based slice
# df.loc[:samples, :] is end-inclusive and therefore kept samples + 1 rows.
df = pd.read_csv(train_path).head(samples)

num_classes = len(df["label"].unique())
num_data = len(df)
df.head()  # first 5 rows, to show how the data is formatted

Number of images per class:

In [None]:
# Two-column table: class id and the number of images carrying that label.
data = (
    df['label']
    .value_counts()
    .rename_axis('class')
    .reset_index(name='no. of images')
)

print(data.head(10))

Bar plot showing the number of images per class:

In [None]:
# Class-id -> disease-name mapping (JSON object keys are strings).
with open(os.path.join(base_path, "label_num_to_disease_map.json"), 'r') as f:
    json_data = json.load(f)

# Name both columns explicitly. A bare value_counts().reset_index() yields
# different column names depending on the pandas version, so looking up
# sort_train['index'] fails with a KeyError on pandas >= 2.0.
# Column 'index' holds the class id, column 'label' holds the image count
# (the layout older pandas versions produced).
sort_train = (
    df["label"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="label")
)
sort_train['disease'] = sort_train['index'].apply(lambda x: str(x)+' : '+json_data[str(x)])

print(collections.Counter(df.label))
sns.barplot(y=sort_train.disease, x=sort_train.label)
plt.xlabel('Number of images'); plt.ylabel('')
plt.show()

Plotting 16 randomly sampled training images in a 4×4 grid:

In [None]:
def visualize_batch(image_ids, labels):
    """Show training images in a four-column grid, each titled with its label.

    Args:
        image_ids: file names of images located in ``<base_path>/train_images``.
        labels: one label per image, displayed as the subplot title.

    Raises:
        FileNotFoundError: if an image cannot be read from disk.
    """
    pairs = list(zip(image_ids, labels))
    if not pairs:
        # Nothing to draw; avoid creating an empty figure.
        return

    n_cols = 4
    # Grow the grid with the input instead of assuming at most 16 images.
    n_rows = math.ceil(len(pairs) / n_cols)
    plt.figure(figsize=(16, 3 * n_rows))

    for ind, (image_id, label) in enumerate(pairs):
        image_path = os.path.join(base_path, "train_images", image_id)
        # cv2.imread returns None (it does not raise) when the file is unreadable.
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.subplot(n_rows, n_cols, ind + 1)
        plt.imshow(image)
        plt.title(f"label: {label}")
        plt.axis("off")

    plt.show()
    

# Draw 16 random rows from the training table and display their images.
tmp_df = df.sample(n=16)
image_ids, labels = tmp_df["image_id"].values, tmp_df["label"].values

visualize_batch(image_ids, labels)

**Data augmentation and preprocessing**

In [None]:
# Data augmentation and pre-processing
BATCH_SIZE = 48
IMG_SIZE = 320

# Training images: random geometric and photometric distortions, then rescaling
# by 1/255. validation_split reserves 20% of the rows for validation.
train_gen = ImageDataGenerator(
    rescale=1/255,
    validation_split=0.2,
    rotation_range=270,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=25,
    zoom_range=0.3,
    brightness_range=[0.1,0.9],
    channel_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
)

# Validation images: rescaling only, with the same validation split.
valid_gen = ImageDataGenerator(rescale=1/255, validation_split=0.2)

In [None]:
# Arguments shared by the training and validation generators.
common_flow_args = dict(
    dataframe=train_csv,
    directory=images_dir_path,
    x_col="image_id",
    y_col="label",
    target_size=(IMG_SIZE, IMG_SIZE),
    class_mode="categorical",
    batch_size=BATCH_SIZE,
)

# Shuffled, augmented batches from the "training" subset.
train_generator = train_gen.flow_from_dataframe(
    shuffle=True,
    subset="training",
    **common_flow_args,
)

# Unshuffled batches from the "validation" subset.
valid_generator = valid_gen.flow_from_dataframe(
    shuffle=False,
    subset="validation",
    **common_flow_args,
)

In [None]:
# Preview augmented images from one training batch in a 5 x 3 grid.
batch = next(train_generator)
images = batch[0]
labels = batch[1]

n_rows, n_cols = 5, 3
n_preview = n_rows * n_cols

plt.figure(figsize=(15,9))
# Only the first 15 images are drawn. The previous loop broke at i == 15
# *after* plotting, so a 16th image was drawn over the first subplot.
for i, (img, label) in enumerate(zip(images[:n_preview], labels[:n_preview])):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.axis('off')
    plt.imshow(img)
    # The label is a one-hot vector; argmax recovers the class index.
    plt.title(label_class[np.argmax(label)])

plt.tight_layout()
plt.show()

**Model building**

In [None]:
# Baseline architecture: InceptionResNetV2 with ImageNet weights and without
# its top classification layers.
base = applications.InceptionResNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=[IMG_SIZE,IMG_SIZE,3],
)

In [None]:
# Classification head stacked on top of the pretrained base.
model = tf.keras.Sequential([
    base,
    BatchNormalization(axis=-1),
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dense(5, activation='softmax'),
])

# SGD with momentum; tracks accuracy ('acc') and true positives ('tp').
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9),
    metrics=['acc', tf.keras.metrics.TruePositives(name='tp')],
)

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: divide by 1.25 on even epochs after epoch 3.

    Args:
        epoch: epoch index.
        lr: current learning rate.

    Returns:
        ``lr / 1.25`` when ``epoch > 3`` and ``epoch`` is even, otherwise
        ``lr`` unchanged.
    """
    keep_current_rate = epoch <= 3 or epoch % 2 != 0
    if keep_current_rate:
        return lr
    return lr/1.25

# Checkpoint callback: writes the model to ./Cassava.h5, monitoring val_loss
# and keeping only the best result.
callback0 = tf.keras.callbacks.ModelCheckpoint(
    "./Cassava.h5",
    monitor='val_loss',
    save_best_only=True,
)

# Learning-rate callback driven by scheduler().
callback1 = tf.keras.callbacks.LearningRateScheduler(scheduler)

**TRAINING**

In [None]:
def plot_history(history):
    """Plot the loss and accuracy curves stored in a training history.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (for example the return value of
            ``model.fit``).

    Metric names containing 'loss' or 'acc' are plotted; names that also
    contain 'val' are drawn as validation curves. If no training-loss series
    is present, a message is printed and nothing is plotted.
    """
    recorded = history.history

    def select(token, validation):
        # Metric names containing `token`, filtered on whether 'val' occurs.
        return [name for name in recorded.keys()
                if token in name and ('val' in name) == validation]

    loss_list = select('loss', False)
    val_loss_list = select('loss', True)
    acc_list = select('acc', False)
    val_acc_list = select('acc', True)

    if not loss_list:
        print('Loss is missing in history')
        return

    epochs = range(1, len(recorded[loss_list[0]]) + 1)

    def draw(names, line_format, label_prefix):
        # One curve per metric; the legend entry shows the last epoch's value.
        for name in names:
            values = recorded[name]
            plt.plot(epochs, values, line_format,
                     label=label_prefix + format(values[-1], '.5f') + ')')

    ## Loss
    plt.figure(1)
    draw(loss_list, 'b', 'Training loss (')
    draw(val_loss_list, 'g', 'Validation loss (')

    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    ## Accuracy
    plt.figure(2)
    draw(acc_list, 'b', 'Training accuracy (')
    draw(val_acc_list, 'g', 'Validation accuracy (')

    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [None]:
# Train for 3 epochs with checkpointing and the learning-rate schedule.
history = model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=3,
    callbacks=[callback0, callback1],
)

In [None]:
# Draw the loss and accuracy curves recorded during training.
plot_history(history)

In [None]:
test_img_path = os.path.join(test_path, "2216849948.jpg")

img = cv2.imread(test_img_path)
# cv2.imread returns None (it does not raise) when the file is unreadable.
if img is None:
    raise FileNotFoundError(f"Could not read image: {test_img_path}")

# OpenCV loads images as BGR; convert to RGB so matplotlib shows true colours.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
resized_img = cv2.resize(img, (IMG_SIZE, IMG_SIZE)).reshape(-1, IMG_SIZE, IMG_SIZE, 3)/255

plt.figure(figsize=(8,4))
plt.title("TEST IMAGE")
plt.imshow(resized_img[0])
plt.show()

In [None]:
preds = []
ss = pd.read_csv(sample_path)

for image in ss.image_id:
    # Resize through load_img(target_size=...) so test images are prepared the
    # same way as the images fed by flow_from_dataframe during training.
    # smart_resize crops to the target aspect ratio before resizing, which
    # gives the model differently framed inputs than it was trained on.
    img = tf.keras.preprocessing.image.load_img(
        os.path.join(test_path, image),
        target_size=(IMG_SIZE, IMG_SIZE),
    )
    img = tf.keras.preprocessing.image.img_to_array(img)
    # Add the batch dimension and apply the same 1/255 rescaling as training.
    img = np.expand_dims(img, axis=0) / 255
    prediction = model.predict(img, verbose=0)
    preds.append(int(np.argmax(prediction)))

my_submission = pd.DataFrame({'image_id': ss.image_id, 'label': preds})
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission table that was written to disk.
print("Submission File: \n---------------\n")
print(my_submission.head()) 

> **Results**

In [None]:
# Number of batches needed to cover every validation sample, including the
# final partial batch (floor division would drop it).
STEP_SIZE = math.ceil(valid_generator.n / valid_generator.batch_size)

# Restart the iterator at its first batch before predicting.
valid_generator.reset()

# `steps` must be passed by keyword: the second positional parameter of
# Model.predict is batch_size, so the old call never set the step count.
predict = model.predict(valid_generator, steps=STEP_SIZE)
print(predict.shape)
print(predict[:5])

In [None]:
good_preds = []
bad_preds = []

val_filenames = valid_generator.filenames
val_label = valid_generator.labels
# Class names converted to integers, in the order of class_indices.
label_map = [int(name) for name in valid_generator.class_indices.keys()]
# Predicted class index for every row of `predict`.
cla = np.argmax(predict, axis=-1)

# Each record is [file name, predicted label, true label, confidence of the
# predicted class]; records are routed by whether the prediction is correct.
for idx, res in enumerate(predict):
    predicted = label_map[cla[idx]]
    actual = label_map[val_label[idx]]
    record = [val_filenames[idx], predicted, actual, res[cla[idx]]]
    if predicted == actual:
        good_preds.append(record)
    else:
        bad_preds.append(record)

print("wrong predictions: ", len(bad_preds), " right predictions: ", len(good_preds), " acc: ", np.round(100*(len(predict)-len(bad_preds))/len(predict),2))

In [None]:
# Plot the most confident correct predictions.
fig = plt.figure(figsize=(16, 8))

# Sort numerically by confidence (column 3) *before* building the array:
# np.array() turns these mixed str/int/float rows into strings, and sorting
# strings would compare confidences lexicographically.
good_preds = sorted(good_preds, key=lambda row: float(row[3]), reverse=True)
# reshape keeps the array two-dimensional even when there are no rows.
good_preds = np.array(good_preds).reshape(-1, 4)

columns = 5
rows = 1
# Never index past the available rows.
n_shown = min(columns * rows, len(good_preds))

# Start at row 0; the previous loop ran over rows 1..5 and skipped the most
# confident prediction.
for i in range(n_shown):
    n = good_preds[i, 0]
    img = cv2.imread(os.path.join(images_dir_path, n))
    # OpenCV loads images as BGR; matplotlib expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    lbl = good_preds[i, 2]
    lbl2 = good_preds[i, 1]
    fig.add_subplot(rows, columns, i + 1)
    plt.imshow(img)
    plt.axis("off")
    plt.title("Label = " + str(lbl) + "\nClassified:" + str(lbl2) + "\nConfidence:" + str(good_preds[i, 3]))
plt.show()

In [None]:
# Plot the worst predictions: wrong classifications made with the highest
# confidence.
fig = plt.figure(figsize=(16, 8))

# Sort numerically by confidence (column 3) *before* building the array:
# np.array() turns these mixed str/int/float rows into strings, and sorting
# strings would compare confidences lexicographically.
bad_preds = sorted(bad_preds, key=lambda row: float(row[3]), reverse=True)
# reshape keeps the array two-dimensional even when there are no rows.
bad_preds = np.array(bad_preds).reshape(-1, 4)

columns = 5
rows = 1
# Never index past the available rows.
n_shown = min(columns * rows, len(bad_preds))

# Start at row 0; the previous loop ran over rows 1..5 and skipped the most
# confident mistake.
for i in range(n_shown):
    n = bad_preds[i, 0]
    img = cv2.imread(os.path.join(images_dir_path, n))
    # OpenCV loads images as BGR; matplotlib expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    lbl = bad_preds[i, 2]
    lbl2 = bad_preds[i, 1]
    fig.add_subplot(rows, columns, i + 1)
    plt.imshow(img)
    plt.axis("off")
    # Confidence comes from bad_preds; the title previously printed the
    # confidence of an unrelated row of good_preds.
    plt.title("Label = " + str(lbl) + "\nClassified:" + str(lbl2) + "\nConfidence:" + str(bad_preds[i, 3]))
plt.show()

In [None]:
seed_value = 131
max_sample_per_class = 14000
from collections import Counter
no_classes_keep = 1000

# Labels ordered from most to least frequent.
c = df.label.values
count = Counter(c).most_common(no_classes_keep)
keep_labels = [label for label, _ in count]

# Sample at most max_sample_per_class rows per class. The cap is applied per
# class without being overwritten, the draw is seeded with seed_value so it is
# reproducible, and the pieces are concatenated once (DataFrame.append was
# removed in pandas 2.0 and grew the frame quadratically).
per_class_samples = []
for label in keep_labels:
    class_rows = df[df.label == label]
    n_keep = min(len(class_rows), max_sample_per_class)
    per_class_samples.append(class_rows.sample(n=n_keep, random_state=seed_value))
train_keep = (
    pd.concat(per_class_samples, ignore_index=True)
    if per_class_samples
    else pd.DataFrame()
)

# Per-class image counts of the training table. reset_index adds an 'index'
# column with the row position, next to 'label' and 'frequency'.
train_val = df.label.value_counts()
train_keep_df = pd.DataFrame({'label':train_val.index, 'frequency':train_val.values})
train_keep_df.reset_index(inplace=True)

# Column 2 of the prediction tables holds the true label. Count validation
# images per true label: overall, misclassified and correctly classified.
good_rows = np.asarray(good_preds).reshape(-1, 4)
bad_rows = np.asarray(bad_preds).reshape(-1, 4)
all_preds = np.concatenate((good_rows, bad_rows), axis=0)
val_img_per_class = Counter(all_preds[:,2])
bad_val_labels = Counter(bad_rows[:,2])
good_val_labels = Counter(good_rows[:,2])

In [None]:
# Index the training table by label. Guarded so that re-running this cell does
# not raise once 'label' is already the index.
if "label" in df.columns:
    df.set_index("label", inplace=True)

# Training-image count per class id. The lookup is keyed on the 'label'
# column: train_keep_df has a positional row index ordered by frequency, so
# train_keep_df.loc[int(label)] returned the count of whichever class sat at
# that row position rather than the count of `label`.
train_frequency = {
    int(class_id): int(frequency)
    for class_id, frequency in zip(train_keep_df["label"], train_keep_df["frequency"])
}

# Up to five distinct true labels (column 2), in order of first appearance in
# each prediction table. This replaces the manual position scan, which could
# read past the end of the array and, in the second loop, indexed good_preds
# with bad_label_pos.
max_labels = 5
bad_labels = list(dict.fromkeys(np.asarray(bad_preds).reshape(-1, 4)[:, 2]))[:max_labels]
good_labels = list(dict.fromkeys(np.asarray(good_preds).reshape(-1, 4)[:, 2]))[:max_labels]

print("\nAccuracy")
for label in bad_labels:
    print("label:", label, "has", train_frequency.get(int(label), 0), "images in the class and ",  )
    print("label:", label, "has", val_img_per_class[label], " validation images images in the class \nand ",  bad_val_labels[label], " images classified wrong")

print("\nAccuracy")
for label in good_labels:
    print("label:", label, "has", val_img_per_class[label], " validation images images in the class \nand ",  good_val_labels[label], " images classified correct")