# Cassava Leaf Disease EDA



Cassava anthracnose disease (CAD) is widespread in most of the cassava-growing regions of Africa. The disease is caused by a fungus (*Colletotrichum gloeosporioides*) that is also capable of causing diseases in other food crops. It is estimated that CAD causes yield losses of 30% or more in susceptible cultivars. The disease affects both leaf and stem production. Severe anthracnose attacks can cause the death of stems, which can affect the availability of planting materials, especially in large-scale production systems.

In [None]:
# Pin seaborn to 0.11.1 for this notebook. Later cells call sb.histplot,
# which presumably needs a 0.11-series release -- TODO confirm before unpinning.
!pip install seaborn==0.11.1

In [None]:
import sys               
import time              
import pickle     
import numpy as np
import pandas as pd
import json
from scipy import ndimage, stats, signal

import matplotlib.pyplot as plt
import seaborn as sb
from PIL import Image, ImageStat
from skimage import io, color


%matplotlib inline

In [None]:
# Display the seaborn version that was actually imported, to check that the
# pinned install above took effect.
sb.__version__

In [None]:
# Quick visual sanity check: open one image from the test folder and draw it.
sample_test_image = '../input/cassava-leaf-disease-classification/test_images/2216849948.jpg'
temp = Image.open(sample_test_image)
plt.imshow(temp);  # trailing ';' keeps the notebook from echoing the return value

In [None]:
# Read the training table and preview its first rows.
train_csv = '../input/cassava-leaf-disease-classification/train.csv'
train = pd.read_csv(train_csv)
train.head()

In [None]:
# Parse the label -> disease-name JSON shipped with the dataset, then
# pretty-print it with sorted keys. The handle is only needed for parsing,
# so the print happens after the file has been closed.
with open('../input/cassava-leaf-disease-classification/label_num_to_disease_map.json','rb') as map_file:
    map_disease = json.load(map_file)
print(json.dumps(map_disease, indent=4, sort_keys=True))

In [None]:
# Work on a copy so the EDA columns added below do not modify `train`.
df_eda = train.copy()

In [None]:
# Per-image statistics, appended in df_eda row order so that each list can
# later be attached to df_eda as a column.
brightness=[]
median=[]
contrast=[]
size=[]
kurtosis=[]

# Loop-invariant directory prefix, built once instead of on every iteration.
train_images_dir = '../input/cassava-leaf-disease-classification/train_images/'

# Column 0 of df_eda holds the image file name; iterate it directly rather
# than indexing row by row with range()/iloc.
for image_name in df_eda.iloc[:,0]:
    path = train_images_dir + image_name

    # Kurtosis of the grayscale pixel intensities. fisher=False selects
    # Pearson's definition (no "-3" excess correction).
    im_sk = io.imread(path)
    temp_im_sk = color.rgb2gray(im_sk)
    kurtosis.append(stats.kurtosis(temp_im_sk.ravel(), fisher=False))

    # The context manager makes the lifetime of the underlying file handle
    # explicit; everything that needs pixel data happens inside it.
    with Image.open(path) as im:
        stat = ImageStat.Stat(im.convert('L'))  # statistics of the 8-bit grayscale version
        brightness.append(stat.rms[0])      # root-mean-square pixel value
        median.append(stat.median[0])       # median pixel value
        contrast.append(stat.stddev[0])     # standard deviation of pixel values
        size.append(np.array(im).shape)     # array shape of the image as stored

In [None]:
# Attach the per-image statistics collected above as new df_eda columns.
df_eda['brightness']=brightness
df_eda['median']=median
df_eda['contrast']=contrast
df_eda['kurtosis']=kurtosis
# Each entry of `size` is a NumPy array shape; the first two entries are
# read as height and width.
df_eda['height']= [item[0] for item in size]
df_eda['width']= [item[1] for item in size]
# A 2-D shape has no channel axis (single-channel image); record it as
# 1 channel instead of failing with IndexError on item[2].
df_eda['channels']= [item[2] if len(item) > 2 else 1 for item in size]

In [None]:
# Display the table with the newly added statistic and size columns.
df_eda

In [None]:
# Bar chart of how many training images each class has.
plt.figure(figsize=(6,4))
# Fix the bar order explicitly and build the tick labels from that same list,
# so a label can never be attached to the wrong bar. This avoids relying on
# the key order of the JSON mapping or on a hard-coded class count.
class_ids = sorted(df_eda['label'].unique())
sb.countplot(data=df_eda, y='label', order=class_ids)
plt.yticks(ticks=range(len(class_ids)),
           labels=[map_disease[str(class_id)] for class_id in class_ids])
plt.title('Representation of classes on the training data')
plt.show()

In [None]:
def show_sample_imgs(label):
    """Draw a 2x3 grid of randomly chosen training images of one class.

    Reads the notebook-level ``df_eda`` (rows are filtered on its ``label``
    column and the file name is taken from column 0) and ``map_disease``
    (used for the figure title). Images are drawn independently with
    ``np.random.randint``, so the same image can appear more than once.

    Parameters
    ----------
    label : int
        Class label to sample from; it is looked up in ``map_disease`` as
        ``str(label)``.
    """
    plt.figure(figsize=(12,7))
    plt.suptitle(map_disease[str(label)] + ' sample images', y=0.9)

    # Filter once: the class subset is the same for all six draws.
    class_rows = df_eda.query("label == @label")
    n_rows = class_rows.shape[0]

    # Subplot positions are 1-based, so iterate 1..6 directly.
    for position in range(1, 7):
        image_name = class_rows.iloc[np.random.randint(n_rows),0]
        path = '../input/cassava-leaf-disease-classification/train_images/' + image_name
        # Keep the file handle scoped to the draw call.
        with Image.open(path) as img:
            plt.subplot(2,3,position)
            plt.imshow(img)

In [None]:
# One sample grid per class label, 0 through 4.
for class_label in range(5):
    show_sample_imgs(class_label)

In [None]:
# 2x2 grid of per-class histograms, one panel per image statistic.
plt.figure(figsize=(15,8))
# Every column except the first one is passed to seaborn.
hist_data = df_eda.iloc[:,1:]
hist_features = ('brightness', 'median', 'contrast', 'kurtosis')
for panel, feature in enumerate(hist_features, start=1):
    plt.subplot(2,2,panel)
    sb.histplot(data=hist_data, x=feature, hue='label')

To be continued...