# Cassava Leaf Disease Exploratory Data Analysis

This is an EDA to explore the dataset for the Cassava Leaf Disease Classification challenge

Manihot esculenta, or the cassava plant, is the most widely grown root crop in the world and the third largest source of carbohydrates for human food. However, it is vulnerable to a number of viral diseases.

The aim of the challenge is to classify images of cassava leaves into 1 of 5 categories: 4 types of disease or a healthy leaf.

Here we will look at a dataset of labelled images to use in training a classification model.

In [None]:
# Imports
!pip install -q imutils
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from pathlib import Path
import cv2,imutils
from PIL import Image

In [None]:
# Root directory of the competition data; later cells build the CSV, JSON
# and train_images paths relative to this location
base_path = Path('../input/cassava-leaf-disease-classification')

In [None]:
# Load the image/label table and the label-number -> disease-name lookup
train_df = pd.read_csv(base_path.joinpath('train.csv'))
disease_map = pd.read_json(
    base_path.joinpath('label_num_to_disease_map.json'),
    typ='series',
)

In [None]:
# Peek at the first five rows of the raw table
train_df.head(n=5)

In [None]:
# Attach a full and a short disease name to every row, keyed by the numeric label
map_dict = disease_map.to_dict()
short_map_dict = {0: 'CBB', 1: 'CBSD', 2: 'CGM', 3: 'CMD', 4: 'Healthy'}

# Both name columns start out as copies of the numeric label ...
train_df['class'] = train_df['label']
train_df['class_short'] = train_df['label']
# ... which are then translated per column, before the numeric column is
# renamed to make its meaning explicit
train_df = (
    train_df
    .replace({'class': map_dict, 'class_short': short_map_dict})
    .rename(columns={'label': 'label_number'})
)


In [None]:
# Peek at the first five rows again, now including the class-name columns
train_df.head(n=5)

Now each image has a number and a disease name to classify it

In [None]:
# Total number of labelled training images (one row per image)
len_df = train_df.shape[0]
print(f"Number of train images: {len_df}")

In [None]:
# Image count per class, ordered alphabetically by class name
train_df['class'].value_counts(sort=False).sort_index()

In [None]:
# Bar chart of the image count per class
train_df['class'].value_counts().sort_index().plot.bar()

As you can see there is a huge class imbalance.

## Resolution of images

In [None]:
# Record the pixel width and height of every training image
from PIL import Image

width_list = []
height_list = []
for image_id in train_df['image_id']:
    # Image.open is lazy and keeps the underlying file open; the context
    # manager closes it immediately so one handle per image is not leaked.
    with Image.open(base_path/'train_images'/image_id) as im:
        width, height = im.size  # PIL reports size as (width, height)
    width_list.append(width)
    height_list.append(height)

In [None]:
# True when every image has the same width as the first image
len(width_list) == width_list.count(width_list[0])

In [None]:
# True when every image has the same height as the first image
len(height_list) == height_list.count(height_list[0])

Since both the cells above result in True, the width and height of all the images are the same, 800 and 600 pixels respectively

## Detecting outliers

In [None]:
train_path = '../input/cassava-leaf-disease-classification/train_images/'

# Per-image statistics are gathered in plain lists and attached to train_df in
# one go; writing single cells with .loc inside the loop is very slow.
widths, heights, means, normalized_means = [], [], [], []

for img_name in train_df['image_id']:
    bgr = cv2.imread(train_path + img_name)
    if bgr is None:
        # cv2.imread reports a missing/unreadable file by returning None
        raise FileNotFoundError(f'Could not read image: {train_path + img_name}')
    img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    # min-max normalize each image into the range [0, 1]
    norm_image = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

    # Image arrays are laid out as (rows, columns, channels), i.e.
    # (height, width, depth) - height comes first, not width.
    height, width, depth = img.shape
    widths.append(width)
    heights.append(height)

    # mean pixel value of the original and of the normalized image
    means.append(float(img.mean()))
    normalized_means.append(float(norm_image.mean()))

train_df['width'] = widths
train_df['height'] = heights
train_df['mean'] = means
train_df['normalized_mean'] = normalized_means
    

In [None]:
# Dataset-wide mean of each RGB channel, for the raw and the min-max
# normalized images.
# The accumulators must be NumPy arrays: with a plain list, `+=` would append
# the three channel means to the list instead of adding them element-wise.
mean_images = np.zeros(3)
norm_mean_images = np.zeros(3)
n_images = len(train_df)

for img_name in train_df['image_id']:
    img = cv2.cvtColor(cv2.imread(train_path+img_name), cv2.COLOR_BGR2RGB)
    # normalize each image in the range [0,1]
    norm_img = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

    # average over rows and columns, keeping one value per channel
    mean_images += np.mean(img, axis=(0,1))
    norm_mean_images += np.mean(norm_img, axis=(0,1))

# Raw pixels are 0-255, so rescale their mean to [0, 1]
mean_images = mean_images/n_images/255
# Normalized pixels are already in [0, 1]; only average over the images
norm_mean_images = norm_mean_images/n_images

In [None]:
# Standard deviation of every RGB channel over all pixels of all training
# images, scaled to [0, 1].
# The images are streamed one at a time and reduced to running totals, so the
# whole dataset never has to be held in memory at once.
train_path = '../input/cassava-leaf-disease-classification/train_images/'
files = os.listdir(train_path)

# Exact integer totals per channel: sum(x) and sum(x^2)
channel_sum = np.zeros(3, dtype=np.int64)
channel_sq_sum = np.zeros(3, dtype=np.int64)
pixel_count = 0

for fname in files:
    with Image.open(train_path + fname) as im:
        # widen to int64 so that squaring 8-bit values cannot overflow;
        # only the first three channels (R, G, B) are used
        pixels = np.asarray(im).astype(np.int64)[:, :, :3].reshape(-1, 3)
    channel_sum += pixels.sum(axis=0)
    channel_sq_sum += (pixels ** 2).sum(axis=0)
    pixel_count += pixels.shape[0]

if pixel_count == 0:
    raise ValueError(f'No images found in {train_path}')

# Population variance: Var[x] = E[x^2] - E[x]^2
channel_mean = channel_sum / pixel_count
channel_var = channel_sq_sum / pixel_count - channel_mean ** 2
# clip tiny negative values caused by floating-point rounding before the sqrt
imgs_std = np.sqrt(np.maximum(channel_var, 0)) / 255
print(imgs_std)

In [None]:
# Side-by-side box plots of the per-image mean pixel value, one box per class:
# raw mean in the left subplot, normalized mean in the right subplot
fig = make_subplots(rows=1, cols=2, subplot_titles=['Mean', 'Normalized Mean'])
colours = [
    'rgb(69, 130, 191)',
    'rgb(102, 179, 70)',
    'rgb(237, 232, 71)',
    'rgb(240, 158, 50)',
    'rgb(222, 55, 29)',
]

for i, class_name in enumerate(train_df['class_short'].unique()):
    class_rows = train_df[train_df['class_short'] == class_name]

    # subplot column 1 holds 'mean', column 2 holds 'normalized_mean'
    for subplot_col, stat in enumerate(['mean', 'normalized_mean'], start=1):
        box = go.Box(
            y=class_rows[stat],
            name=class_name,
            showlegend=False,
            fillcolor=colours[i],
            marker=dict(size=6, color='black', outliercolor=colours[i]),
            line=dict(width=1),
        )
        fig.add_trace(box, 1, subplot_col)

fig.update_layout(title='Outlier Detection Box Plots')
fig.show()

In [None]:
# Collect the image file names of each class into a list, for display later
healthy_images = train_df.loc[train_df['class_short'] == 'Healthy', 'image_id'].tolist()
cbb_images = train_df.loc[train_df['class_short'] == 'CBB', 'image_id'].tolist()
cbsd_images = train_df.loc[train_df['class_short'] == 'CBSD', 'image_id'].tolist()
cgm_images = train_df.loc[train_df['class_short'] == 'CGM', 'image_id'].tolist()
cmd_images = train_df.loc[train_df['class_short'] == 'CMD', 'image_id'].tolist()

In [None]:
def showImages(images):
    """Plot a 3x3 grid of images drawn at random from ``images``.

    ``images`` is a sequence of file names inside ``base_path/'train_images'``.
    Each of the nine picks is a separate ``np.random.choice`` draw, so the
    same image can show up more than once.
    """
    picks = [np.random.choice(images) for _ in range(9)]
    plt.figure(figsize=(10, 9))
    for position, file_name in enumerate(picks, start=1):
        plt.subplot(3, 3, position)
        picture = plt.imread(base_path/'train_images'/file_name)
        plt.imshow(picture, cmap='gray')
        plt.axis('off')
    plt.tight_layout()

In [None]:
def findOutliers(df):
    """Flag the rows of ``df`` whose ``normalized_mean`` is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, with the quartiles taken over ``df['normalized_mean']``.

    The result is written in place to a boolean ``outlier`` column (True for
    an outlier, False if it isn't); nothing is returned. The comparison is
    vectorized, so it works for any index, not only a 0..n-1 range index.
    """
    values = df['normalized_mean']
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    df['outlier'] = (values < lower_fence) | (values > upper_fence)

In [None]:
def displayOutliers(df, class_name):
    """Print the number of outlier images in ``df`` and plot them in a grid.

    ``df`` must have an ``image_id`` column and the ``outlier`` column written
    by ``findOutliers``. ``class_name`` is only used in the printed message.
    The images are read from ``base_path/'train_images'`` and shown three per
    row. When there are no outliers nothing is plotted.

    Returns the sub-DataFrame that holds only the outlier rows.
    """
    # Get only outlier images
    outlier_df = df.loc[df['outlier'].eq(True)]
    outlier_images = outlier_df['image_id'].to_list()
    n_outliers = len(outlier_images)
    print(f'Number of Outlier Images in {class_name} Class: {n_outliers}\n')

    if n_outliers == 0:
        # a figure of height 0 is invalid, and there is nothing to draw anyway
        return outlier_df

    # Plot outlier images. plt.subplot needs an integer row count, rounded UP
    # so that a final, partially filled row still fits into the grid.
    n_cols = 3
    n_rows = (n_outliers + n_cols - 1) // n_cols
    plt.figure(figsize=(10, n_outliers))
    for position, file_name in enumerate(outlier_images, start=1):
        plt.subplot(n_rows, n_cols, position)
        img = plt.imread(base_path/'train_images'/file_name)
        plt.imshow(img, cmap='gray')
        plt.axis('off')
    plt.tight_layout()

    return outlier_df

## **Healthy leaf images**

In [None]:
# Rows of the Healthy class; the old row labels are kept in an 'index' column
healthy_df = train_df.loc[train_df['class_short'] == 'Healthy']
healthy_df.reset_index(inplace=True)
print(f'Number of Images in the Healthy Class: {len(healthy_df)}')
showImages(healthy_images)

### Healthy leaf outliers

In [None]:
# Flag the Healthy outliers in place, then show them and keep their rows
findOutliers(df=healthy_df)
healthy_outlier_df = displayOutliers(df=healthy_df, class_name='Healthy')

## **CBB (Cassava Bacterial Blight) images**

In [None]:
# Rows of the CBB class; the old row labels are kept in an 'index' column
CBB_df = train_df.loc[train_df['class_short'] == 'CBB']
CBB_df.reset_index(inplace=True)
print(f'Number of Images in the CBB Class: {len(CBB_df)}')
showImages(cbb_images)

### CBB outliers

In [None]:
# Flag the CBB outliers in place, then show them and keep their rows
findOutliers(df=CBB_df)
CBB_outlier_df = displayOutliers(df=CBB_df, class_name='CBB')

In [None]:
# Show the rows of the CBB images that were flagged as outliers
CBB_outlier_df

## **CBSD (Cassava Brown Streak Disease) images**

In [None]:
# Rows of the CBSD class; the old row labels are kept in an 'index' column
CBSD_df = train_df.loc[train_df['class_short'] == 'CBSD']
CBSD_df.reset_index(inplace=True)
print(f'Number of Images in the CBSD Class: {len(CBSD_df)}')
showImages(cbsd_images)

### CBSD outliers

In [None]:
# Flag the CBSD outliers in place, then show them and keep their rows
findOutliers(df=CBSD_df)
CBSD_outlier_df = displayOutliers(df=CBSD_df, class_name='CBSD')

## **CGM (Cassava Green Mottle) images**

In [None]:
# Rows of the CGM class; the old row labels are kept in an 'index' column
CGM_df = train_df.loc[train_df['class_short'] == 'CGM']
CGM_df.reset_index(inplace=True)
print(f'Number of Images in the CGM Class: {len(CGM_df)}')
showImages(cgm_images)

### CGM outliers

In [None]:
# Flag the CGM outliers in place, then show them and keep their rows
findOutliers(CGM_df)
CGM_outlier_df = displayOutliers(CGM_df, 'CGM')

## **CMD (Cassava Mosaic Disease) images**

In [None]:
# Rows of the CMD class; the old row labels are kept in an 'index' column
CMD_df = train_df.loc[train_df['class_short'] == 'CMD']
CMD_df.reset_index(inplace=True)
print(f'Number of Images in the CMD Class: {len(CMD_df)}')
showImages(cmd_images)

### CMD outliers

In [None]:
# Flag the CMD outliers in place, then show them and keep their rows
findOutliers(df=CMD_df)
CMD_outlier_df = displayOutliers(df=CMD_df, class_name='CMD')

In [None]:
# Drop the images flagged as CBB outliers from train_df and renumber the rows.
# NOTE(review): only the CBB outliers are removed here; the outliers of the
# other classes stay in train_df - confirm that this is intended.
cond = train_df['image_id'].isin(CBB_outlier_df['image_id'])
train_df = train_df.drop(index=train_df.index[cond]).reset_index(drop=True)