In [None]:
from IPython.display import Image
# Render one training image inline as a first visual check of the data.
Image("../input/cassava-leaf-disease-classification/train_images/100042118.jpg")

**Credits**

* [Notebook](https://www.kaggle.com/kretes/eda-distributions-images-and-no-duplicates)

As the second-largest provider of carbohydrates in Africa, cassava is a key food security crop grown by smallholder farmers because it can withstand harsh conditions. At least 80% of household farms in Sub-Saharan Africa grow this starchy root, but viral diseases are major sources of poor yields. With the help of data science, it may be possible to identify common diseases so they can be treated.

Existing methods of disease detection require farmers to solicit the help of government-funded agricultural experts to visually inspect and diagnose the plants. This suffers from being labor-intensive, low-supply and costly. As an added challenge, effective solutions for farmers must perform well under significant constraints, since African farmers may only have access to mobile-quality cameras with low-bandwidth.

In this competition, we introduce a dataset of 21,367 labeled images collected during a regular survey in Uganda. Most images were crowdsourced from farmers taking photos of their gardens, and annotated by experts at the National Crops Resources Research Institute (NaCRRI) in collaboration with the AI lab at Makerere University, Kampala. This is in a format that most realistically represents what farmers would need to diagnose in real life.

Your task is to classify each cassava image into four disease categories or a fifth category indicating a healthy leaf. With your help, farmers may be able to quickly identify diseased plants, potentially saving their crops before they inflict irreparable damage.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import json
import sys
from glob import glob
import gc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import Image,display
import seaborn as sns
import matplotlib.image as mpimg
import scipy.spatial.distance as dist
from sklearn.model_selection import train_test_split
from skimage.measure import compare_ssim
import os
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

import math, re
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
from tensorflow import keras
from functools import partial
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
# Record the TensorFlow version this notebook ran with.
print(f"Tensorflow version {tf.__version__}")


In [None]:
# Load the training label table from the competition input directory.
train = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
# Cell output: (number of rows, number of columns).
train.shape

In [None]:
train.head()

In [None]:
# Parse the JSON file that maps each numeric label to its disease name.
with open("../input/cassava-leaf-disease-classification/label_num_to_disease_map.json") as map_file:
    map_dis = json.load(map_file)

print("Image Class Labels:")
print(json.dumps(map_dis, indent=4))

**Let's check for any missing values in the train labels df**

In [None]:
# Per column: how many values are missing and how many are present.
missing = train.isna().sum()
all_val = train.notna().sum()

missing_train = pd.DataFrame({'Missing': missing, 'AllObservations': all_val})
missing_train

**Looks good. Let's review the class distributions**

In [None]:
## Distinct classes
print(f"Distinct number of classes is :{train['label'].nunique()}")

In [None]:
# Kernel density estimate of the numeric class labels, as a quick look at class balance.
plt.figure(figsize = (8, 8))
plt.title('Leaf Classes Density plot')
# NOTE(review): `label` is a discrete class id, so the smoothed curve between the
# integer values carries no meaning; a count plot may read better. Also confirm
# that `shade` is still accepted by the installed seaborn version.
sns.kdeplot(train['label'], color="blue", shade=True)
plt.show()

**There is substantial class imbalance: a large share of the images belong to class 3 alone**

In [None]:
# Images per class, largest class first, with the running total and the running
# share of all images.
leaf_id_count = (
    train.groupby(['label'])['label']
    .count()
    .to_frame()
    .rename(columns={'label': 'Count_Images'})
    .reset_index()
    .sort_values(by=['Count_Images'], ascending=False)
    .assign(
        Cummulative_Count=lambda d: d['Count_Images'].cumsum(),
        Cummulative_Pctg=lambda d: d['Cummulative_Count'] / d['Count_Images'].sum(),
        Row_id=lambda d: np.arange(len(d)),
    )
)

# Cumulative share of images as classes are added from most to least frequent.
fig, ax = plt.subplots()
ax.plot(leaf_id_count['Row_id'], leaf_id_count['Cummulative_Pctg']);
ax.set(
    xlabel='Count of Classes',
    ylabel='Cummulative %',
    title='Cummulative distribution of images by class count',
);

**Let's compare the images of majority class vs. minority classes**

In [None]:
mainPath = '../input/cassava-leaf-disease-classification/train_images/'

# Every .jpg found in mainPath or any directory below it.
all_img_paths = [
    jpg_path
    for dirpath, _dirnames, _files in os.walk(mainPath)
    for jpg_path in glob(os.path.join(dirpath, '*.jpg'))
]
# Bare file names, in the same order as all_img_paths.
all_filenames = [os.path.basename(jpg_path) for jpg_path in all_img_paths]
# Lookup table: file name -> full path.
path_dict = dict(zip(all_filenames, all_img_paths))

In [None]:
# Distinct class labels, in order of first appearance in the training table.
all_cats = train['label'].unique().tolist()

In [None]:
# For every class, show its first four training images in a 2x2 grid.
for img_cat in all_cats:
    sample_ids = train.loc[train['label'] == img_cat, 'image_id'].tolist()[:4]
    sample_paths = [path_dict[image_id] for image_id in sample_ids]
    print("Sample image in Category:" + str(map_dis[str(img_cat)]))

    # Exactly four images are read; a class with fewer raises IndexError.
    sample_images = [mpimg.imread(sample_paths[k]) for k in range(4)]

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
    # ravel() walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
    for number, (ax, img) in enumerate(zip(axes.ravel(), sample_images), start=1):
        ax.imshow(img)
        ax.set_title(f"Image {number}")
    plt.show()

**Looking beyond what meets the eye in the pictures**


For this exercise we will use intensity histograms

In [None]:
img0 = cv2.imread('../input/cassava-leaf-disease-classification/train_images/1000015157.jpg')
if img0 is None:
    # cv2.imread signals an unreadable file by returning None instead of raising.
    raise FileNotFoundError('Could not read sample image 1000015157.jpg')
# cv2.imread delivers channels in BGR order; convert to RGB so that channel
# 0/1/2 really is red/green/blue, matching the colours and legend below.
img0 = cv2.cvtColor(img0, cv2.COLOR_BGR2RGB)

# All pixel values of all channels together, then one overlay per channel.
plt.hist(img0.ravel(), bins = 256, color = 'orange')
for channel, colour in enumerate(['red', 'Green', 'Blue']):
    plt.hist(img0[:, :, channel].ravel(), bins = 256, color = colour, alpha = 0.5)
plt.xlabel('Intensity Value')
plt.ylabel('Count')
plt.legend(['Total', 'Red Channel', 'Green Channel', 'Blue Channel'])
print('Intensity Histogram of Sample Image for Cassava Bacterial Blight (CBB)')
plt.show()

In [None]:
img0 = cv2.imread('../input/cassava-leaf-disease-classification/train_images/1000201771.jpg')
if img0 is None:
    # cv2.imread signals an unreadable file by returning None instead of raising.
    raise FileNotFoundError('Could not read sample image 1000201771.jpg')
# cv2.imread delivers channels in BGR order; convert to RGB so that channel
# 0/1/2 really is red/green/blue, matching the colours and legend below.
img0 = cv2.cvtColor(img0, cv2.COLOR_BGR2RGB)

# All pixel values of all channels together, then one overlay per channel.
plt.hist(img0.ravel(), bins = 256, color = 'orange')
for channel, colour in enumerate(['red', 'Green', 'Blue']):
    plt.hist(img0[:, :, channel].ravel(), bins = 256, color = colour, alpha = 0.5)
plt.xlabel('Intensity Value')
plt.ylabel('Count')
plt.legend(['Total', 'Red Channel', 'Green Channel', 'Blue Channel'])
print('Intensity Histogram of Sample image for Cassava Mosaic Disease (CMD)')
plt.show()

In [None]:
img0 = cv2.imread('../input/cassava-leaf-disease-classification/train_images/1001723730.jpg')
if img0 is None:
    # cv2.imread signals an unreadable file by returning None instead of raising.
    raise FileNotFoundError('Could not read sample image 1001723730.jpg')
# cv2.imread delivers channels in BGR order; convert to RGB so that channel
# 0/1/2 really is red/green/blue, matching the colours and legend below.
img0 = cv2.cvtColor(img0, cv2.COLOR_BGR2RGB)

# All pixel values of all channels together, then one overlay per channel.
plt.hist(img0.ravel(), bins = 256, color = 'orange')
for channel, colour in enumerate(['red', 'Green', 'Blue']):
    plt.hist(img0[:, :, channel].ravel(), bins = 256, color = colour, alpha = 0.5)
plt.xlabel('Intensity Value')
plt.ylabel('Count')
plt.legend(['Total', 'Red Channel', 'Green Channel', 'Blue Channel'])
print('Intensity Histogram of Sample image for Healthy Leaf')
plt.show()

**There could be interesting insights to how the diseases get manifested in the images in the underlying RGB channels**

**Let's look at Image resolutions provided in the training set**

In [None]:
import collections
from PIL import Image

In [None]:
DIR = '../input/cassava-leaf-disease-classification/train_images'

# Tally the (width, height) of every file in the training image directory.
# Each file is opened inside a `with` block so its handle is released
# immediately instead of one handle staying open per image.
imageSizes_train = collections.Counter()
for filename in os.listdir(f"{DIR}/"):
    with Image.open(f'{DIR}/{filename}') as img:
        imageSizes_train[img.size] += 1

In [None]:
def isdf(imageSizes):
    """Turn a Counter of image dimensions into a frequency table.

    Args:
        imageSizes: ``collections.Counter`` keyed by image dimension.

    Returns:
        DataFrame ordered from most to least common dimension with the columns
        ``imageDim``, ``count``, ``fraction`` (share of all images),
        ``count_cum`` (running count) and ``count_cum_fraction`` (running share).
    """
    total = sum(imageSizes.values())
    return pd.DataFrame(
        imageSizes.most_common(), columns=["imageDim", "count"]
    ).assign(
        fraction=lambda t: t['count'] / total,
        count_cum=lambda t: t['count'].cumsum(),
        count_cum_fraction=lambda t: t['count_cum'] / total,
    )

# Size table for the training images, tagged with the split it describes.
train_isdf = isdf(imageSizes_train).assign(set='train')


In [None]:
train_isdf

**All the images are of the size 800X600. For image classification case we need to change this to square format like 512X512 or 224X224. We will take care of this in our Data generator**

**Let's build a baseline model on GPU using a pre-trained network**

For this demonstration we will use EfficientNet

In [None]:
import tensorflow as tf
import skimage.io
from skimage.transform import resize
from imgaug import augmenters as iaa
from sklearn import preprocessing 
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Add, Dense, Dropout, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D,GlobalAveragePooling2D,Concatenate, ReLU, LeakyReLU,Reshape, Lambda
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.applications import EfficientNetB0 
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.vgg16 import decode_predictions
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import metrics
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.imagenet_utils import preprocess_input
from tqdm import tqdm
import imgaug as ia
from imgaug import augmenters as iaa
import keras.backend as K
# Use the channels-last image layout for the Keras backend.
K.set_image_data_format('channels_last')
# NOTE(review): this pins the backend learning phase to 1 for the whole session;
# presumably 1 means "training", which would then also apply to the later
# model.predict call (e.g. dropout staying active) — confirm this is intended.
K.set_learning_phase(1)

In [None]:
##Basic Model params
batch_size = 64
seed = 42
shape = (224, 224, 3) ##desired shape of the image for resizing purposes
val_sample = 0.2 # 10 % as validation sample

In [None]:
# Two-column frame (file name, full path) built from the path lookup table.
path_df = pd.DataFrame(path_dict.items())
path_df = path_df.rename(
    columns={path_df.columns[0]: "image_id", path_df.columns[1]: "path"}
)
# Attach the on-disk path to every labelled training row.
train_all = train.merge(path_df, on='image_id')
#del path_df,path_dict,all_img_paths,all_filenames
gc.collect()

In [None]:
train_all.head()

**UDFs**

In [None]:
def getTrainParams():
    """Assemble the training inputs from the global ``train_all`` frame.

    Returns:
        Tuple ``(paths, labels, encoder)``: a NumPy array of image paths, a
        NumPy array with the ``LabelBinarizer`` encoding of the label-encoded
        classes, and the fitted ``LabelEncoder``.
    """
    frame = train_all.copy()
    encoder = preprocessing.LabelEncoder()
    frame['label'] = encoder.fit_transform(frame['label'])

    binarized = LabelBinarizer().fit_transform(frame['label'].tolist())
    image_paths = np.array(train_all['path'].tolist())

    return image_paths, np.array(binarized), encoder

In [None]:
class Leaf_DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding ``(images, labels)`` batches read from disk.

    Every image is read with ``skimage.io.imread``, divided by 255, resized to
    ``shape`` and cast to float32.

    Args:
        paths: NumPy array of image file paths (it is indexed with index arrays).
        labels: NumPy array of per-image labels, aligned with ``paths``.
        batch_size: Number of source images per batch.
        shape: Target ``(height, width, channels)`` of each image.
        shuffle: Shuffle the sample order at construction and after each epoch.
        use_cache: Keep resized images in an in-memory float16 array so every
            file is decoded at most once.
        augment: Append three randomly augmented copies of the batch to the
            original images, so a batch holds four times as many samples.
    """

    def __init__(self, paths, labels, batch_size, shape, shuffle = False, use_cache = False, augment = False):
        self.paths, self.labels = paths, labels
        self.batch_size = batch_size
        self.shape = shape
        self.shuffle = shuffle
        self.use_cache = use_cache
        self.augment = augment
        if use_cache == True:
            # One slot per sample, plus a flag telling whether the slot is filled.
            self.cache = np.zeros((paths.shape[0], shape[0], shape[1], shape[2]), dtype=np.float16)
            self.is_cached = np.zeros((paths.shape[0]))
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.paths) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as an ``(X, y)`` tuple."""
        indexes = self.indexes[idx * self.batch_size : (idx+1) * self.batch_size]
        paths = self.paths[indexes]

        if self.use_cache == True:
            # Decode only the samples of this batch that are not cached yet.
            # `pos` is the position inside the batch and `sample` the position
            # in the full dataset; the cache is addressed by the latter.
            for pos in np.where(self.is_cached[indexes] == 0)[0]:
                sample = indexes[pos]
                self.cache[sample] = self.__load_image(paths[pos])
                self.is_cached[sample] = 1
            # Fancy indexing copies, so later changes to X never touch the cache.
            X = self.cache[indexes]
        else:
            X = np.zeros((paths.shape[0], self.shape[0], self.shape[1], self.shape[2]))
            for i, path in enumerate(paths):
                X[i] = self.__load_image(path)

        y = self.labels[indexes]

        if self.augment == True:
            seq = iaa.Sequential([
                iaa.OneOf([
                    iaa.Fliplr(0.5), # horizontal flips
                    iaa.Crop(percent=(0, 0.1)), # random crops
                    #iaa.ContrastNormalization((0.75, 1.5)),
                    #iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5),
                    #iaa.Multiply((0.8, 1.2), per_channel=0.2),
                    #iaa.Affine(rotate=90),
                    #iaa.Affine(rotate=180),
                    #iaa.Affine(rotate=270),
                    iaa.Flipud(0.5),
                ])], random_order=True)

            # Original images followed by three independently augmented copies;
            # the labels are repeated in the same order.
            X = np.concatenate((X, seq.augment_images(X), seq.augment_images(X), seq.augment_images(X)), 0)
            X = X.astype('float32')
            y = np.concatenate((y, y, y, y), 0)
            y = y.astype('float32')

        return X, y

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.paths))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __iter__(self):
        """Create a generator that iterate over the Sequence."""
        for item in (self[i] for i in range(len(self))):
            yield item

    def __load_image(self, path):
        """Read one image, scale it by 1/255 and resize it to ``self.shape``."""
        image_norm = skimage.io.imread(path)/255.0
        # Use the shape given to this generator, not the notebook-level `shape`
        # global, so a generator built with another size resizes correctly.
        im = resize(image_norm, (self.shape[0], self.shape[1], self.shape[2]), mode='reflect')
        im = im.astype('float32')
        return im

In [None]:
## Test data generator to generate predictions
class Leaf_TestDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding label-free image batches for prediction.

    Every image is read with ``skimage.io.imread``, divided by 255, resized to
    ``shape`` and cast to float32.

    Args:
        paths: NumPy array of image file paths (it is indexed with index arrays).
        batch_size: Number of source images per batch.
        shape: Target ``(height, width, channels)`` of each image.
        shuffle: Shuffle the sample order at construction and after each epoch.
            Batches then no longer follow the order of ``paths``.
        use_cache: Keep resized images in an in-memory float16 array so every
            file is decoded at most once.
        augment: Append three randomly augmented copies of the batch to the
            original images, so a batch holds four times as many rows as
            there are paths in it.
    """

    def __init__(self, paths, batch_size, shape, shuffle = False, use_cache = False, augment = False):
        self.paths= paths
        self.batch_size = batch_size
        self.shape = shape
        self.shuffle = shuffle
        self.use_cache = use_cache
        self.augment = augment
        if use_cache == True:
            # One slot per sample, plus a flag telling whether the slot is filled.
            self.cache = np.zeros((paths.shape[0], shape[0], shape[1], shape[2]), dtype=np.float16)
            self.is_cached = np.zeros((paths.shape[0]))
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per pass (the last batch may be smaller)."""
        return int(np.ceil(len(self.paths) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return the image array of batch number ``idx``."""
        indexes = self.indexes[idx * self.batch_size : (idx+1) * self.batch_size]
        paths = self.paths[indexes]

        if self.use_cache == True:
            # Decode only the samples of this batch that are not cached yet.
            # `pos` is the position inside the batch and `sample` the position
            # in the full dataset; the cache is addressed by the latter.
            for pos in np.where(self.is_cached[indexes] == 0)[0]:
                sample = indexes[pos]
                self.cache[sample] = self.__load_image(paths[pos])
                self.is_cached[sample] = 1
            # Fancy indexing copies, so later changes to X never touch the cache.
            X = self.cache[indexes]
        else:
            X = np.zeros((paths.shape[0], self.shape[0], self.shape[1], self.shape[2]))
            for i, path in enumerate(paths):
                X[i] = self.__load_image(path)

        if self.augment == True:
            seq = iaa.Sequential([
                iaa.OneOf([
                    iaa.Fliplr(0.5), # horizontal flips
                    iaa.Crop(percent=(0, 0.1)), # random crops
                    #iaa.ContrastNormalization((0.75, 1.5)),
                    #iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5),
                    #iaa.Multiply((0.8, 1.2), per_channel=0.2),
                    #iaa.Affine(rotate=90),
                    #iaa.Affine(rotate=180),
                    #iaa.Affine(rotate=270),
                    iaa.Flipud(0.5),
                ])], random_order=True)

            # Original images followed by three independently augmented copies.
            X = np.concatenate((X, seq.augment_images(X), seq.augment_images(X), seq.augment_images(X)), 0)
            X = X.astype('float32')

        return X

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.paths))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __iter__(self):
        """Create a generator that iterate over the Sequence."""
        for item in (self[i] for i in range(len(self))):
            yield item

    def __load_image(self, path):
        """Read one image, scale it by 1/255 and resize it to ``self.shape``."""
        image_norm = skimage.io.imread(path)/255.0
        # Use the shape given to this generator, not the notebook-level `shape`
        # global, so a generator built with another size resizes correctly.
        im = resize(image_norm, (self.shape[0], self.shape[1], self.shape[2]), mode='reflect')
        im = im.astype('float32')
        return im

**Define the model**

In [None]:
def create_model(input_shape, n_out):
    """Build an EfficientNetB0 classifier whose pretrained backbone is frozen.

    Args:
        input_shape: Shape of one input image, e.g. ``(224, 224, 3)``.
        n_out: Number of output classes.

    Returns:
        Uncompiled ``Model`` mapping an image batch to ``n_out`` class
        probabilities that sum to one per image.
    """
    inp = Input(input_shape)
    #x = img_augmentation(inp)
    # NOTE(review): the tf.keras EfficientNet models are documented as expecting
    # raw [0, 255] pixel values; confirm that feeding images already divided by
    # 255 is intended.
    pretrain_model = EfficientNetB0(include_top=False, weights='imagenet', input_tensor=inp)

    # Freeze the whole backbone so that only the new head is trained.
    # To fine-tune its upper layers as well, freeze only pretrain_model.layers[:160].
    for layer in pretrain_model.layers:
        layer.trainable = False

    x = GlobalAveragePooling2D()(pretrain_model.output)
    x = Dropout(0.25)(x)
    # Each image has exactly one class and the model is trained with categorical
    # cross-entropy, so the head must output a softmax distribution rather than
    # independent sigmoids.
    x = Dense(n_out, activation="softmax")(x)

    return Model(inp, x)

In [None]:
from tensorflow.keras.metrics import categorical_accuracy,top_k_categorical_accuracy
def top_5_accuracy(y_true, y_pred):
    """Top-5 categorical accuracy of ``y_pred`` against ``y_true``.

    NOTE(review): the model's output size is the number of distinct labels; if
    that is five, as the task description states, this metric is always 1.0.
    It is not referenced in the visible cells.
    """
    return top_k_categorical_accuracy(y_true, y_pred, k=5)

In [None]:
# One output unit per distinct label in the training table.
nlabls = train['label'].nunique()
model = create_model(input_shape=(224,224,3), n_out=nlabls)
# Labels are fed LabelBinarizer-encoded (see getTrainParams), hence the
# categorical loss and accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.summary()

In [None]:
paths, labels,_ = getTrainParams()

# Reproducibly shuffled sample order. `np.int` no longer exists in current
# NumPy releases, so the builtin `int` is used as the dtype.
keys = np.arange(paths.shape[0], dtype=int)
np.random.seed(seed)
np.random.shuffle(keys)
lastTrainIndex = int((1-val_sample) * paths.shape[0])

# Split through the shuffled keys. Previously `keys` was computed but never
# applied, so the validation set was simply the tail of the table.
trainKeys = keys[:lastTrainIndex]
valKeys = keys[lastTrainIndex:]

pathsTrain = paths[trainKeys]
labelsTrain = labels[trainKeys]

pathsVal = paths[valKeys]
labelsVal = labels[valKeys]

print(paths.shape, labels.shape)
print(pathsTrain.shape, labelsTrain.shape, pathsVal.shape, labelsVal.shape)


In [None]:
# Training batches are reshuffled every epoch; augmentation and caching are off.
train_generator = Leaf_DataGenerator(pathsTrain, labelsTrain, batch_size, shape, use_cache=False, augment = False, shuffle = True)
# Validation batches keep a fixed order.
val_generator = Leaf_DataGenerator(pathsVal, labelsVal, batch_size, shape, use_cache=False, shuffle = False)

In [None]:
## Per-class image count and each class's share of all training rows.
## NOTE(review): `weight` here is the class frequency (count / total rows), so
## the most common class gets the largest value; weights meant to counter
## imbalance would normally be inversely related to frequency — confirm.
class_wt = pd.DataFrame(train[['image_id', 'label']].groupby(['label']).agg(['count']))
class_wt.reset_index(inplace=True)
# After reset_index, column 0 is the label and column 1 the image count.
class_wt['weight']= class_wt.iloc[:,1]/train.shape[0]
class_wt

In [None]:
# Inverse-frequency ("balanced") class weights:
#     n_samples / (n_classes * samples_in_class)
# so rare classes weigh more and the dominant class less. The former hard-coded
# values were the class frequencies themselves, which gave the majority class
# the largest weight and reinforced the imbalance instead of compensating it.
_label_counts = train['label'].value_counts()
class_weight = {
    int(label): float(len(train) / (len(_label_counts) * count))
    for label, count in _label_counts.items()
}

In [None]:
epochs = 2
use_multiprocessing = False  # kept for reference; not passed to fit()

# Model.fit accepts keras.utils.Sequence objects directly; fit_generator is the
# deprecated spelling of the same call.
base_cnn = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=val_generator,
    # Evaluate on every validation batch rather than a hard-coded first 24.
    validation_steps=len(val_generator),
    class_weight = class_weight,
    epochs=epochs,
    #callbacks = [clr],
    verbose=1)

In [None]:
# Save the trained model to 'EfficientNetB0_2epochs.h5' in the working directory.
model.save('EfficientNetB0_2epochs.h5')

**Generate Submission**

In [None]:
TestPath = '../input/cassava-leaf-disease-classification/test_images/'

# Every .jpg found in TestPath or below it, keyed by bare file name.
all_testimg_paths = [
    jpg_path
    for dirpath, _dirnames, _files in os.walk(TestPath)
    for jpg_path in glob(os.path.join(dirpath, '*.jpg'))
]
all_testfilenames = [os.path.basename(jpg_path) for jpg_path in all_testimg_paths]
test_path_dict = dict(zip(all_testfilenames, all_testimg_paths))
testPaths = np.array(list(test_path_dict.values()))

# The generator is not shuffled, so row i of pred_prob belongs to testPaths[i].
test_generator = Leaf_TestDataGenerator(testPaths, batch_size, shape, use_cache=False, shuffle = False)
pred_prob = model.predict(test_generator)
pred = np.argmax(pred_prob, axis=-1)

# One row per test image: file name and predicted class index.
sub_df = pd.DataFrame({'image_id': list(test_path_dict.keys())})
sub_df['label'] = pred
sub_df.to_csv('submission.csv', index=False)

**Work in progress....**

If you found this helpful, please upvote!