# Exploratory Data Analysis For Cassava Dataset 

In [None]:
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sns
import pandas as pd 
import os 
import json
from PIL import Image

In [None]:

# Root directory of the dataset; the trailing slash is kept so the derived
# paths below can be formed by plain string interpolation.
dataset_dir = '../input/cassava-leaf-disease-classification/'

# CSV file that is loaded into the training DataFrame.
train_labelmap = f'{dataset_dir}train.csv'

# Directory of training images (trailing slash retained because image paths
# are built by appending the file name to this string).
train_image = f'{dataset_dir}train_images/'


In [None]:
# Load the training CSV into a DataFrame and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer=train_labelmap)
train_df.head(n=5)

In [None]:
# Load the label-number -> disease-name mapping from its JSON file.
# JSON text is UTF-8, so the encoding is given explicitly instead of relying
# on the platform's default encoding.
with open(
    "../input/cassava-leaf-disease-classification/label_num_to_disease_map.json",
    encoding="utf-8",
) as map_file:
    # json.load parses straight from the file object.
    class_index = json.load(map_file)

# Display the mapping as the cell output.
class_index

In [None]:
# Add a human-readable disease name for every row. Labels are cast to str
# before the lookup because keys of a parsed JSON object are always strings.
train_df["disease_name"] = (
    train_df["label"]
    .astype(str)
    .map(class_index)
)

In [None]:
# Display the DataFrame, which at this point also carries the disease_name column.
train_df

In [None]:
# Count how many training rows fall under each disease name.
train_df_dist = train_df["disease_name"].value_counts()

# Show the per-disease counts as the cell output.
train_df_dist

In [None]:
# Bar chart of the number of training rows per numeric label.
plt.figure(figsize=(9, 8))
# NOTE(review): set_color_codes remaps matplotlib's single-letter color
# shorthands; no such shorthand is passed below, so confirm it is needed.
sns.set_color_codes(palette="pastel")
sns.countplot(data=train_df, x="label")
plt.show()

### Visualize images in each class (disease)

In [None]:
def show_images(df, label, rows=1, no_of_image=20):
    '''
    Plot a random sample of training images that carry one label.

    df - input dataframe with "label" and "image_id" columns
    label - leaf disease label which we want to visualize
    rows - number of rows in the image grid
    no_of_image - maximum number of images to visualize; fewer are shown
                  when df holds fewer rows for the label

    Returns None. When df has no rows for the label, a notice is printed
    and nothing is plotted.
    '''
    # extract rows which have the requested label
    img_df = df[df["label"] == label]

    # never ask for more rows than exist: DataFrame.sample raises ValueError
    # when the requested size exceeds the population (sampling w/o replacement)
    sample_size = min(no_of_image, len(img_df))
    if sample_size <= 0:
        print(f"No images to show for label {label}")
        return

    # file names of the sampled images
    images = img_df.sample(sample_size)['image_id'].values

    # integer ceiling division: add_subplot needs integer grid dimensions,
    # whereas np.ceil would hand it a float
    cols = (sample_size + rows - 1) // rows

    fig = plt.figure(figsize=(30, 30))
    for n, img_id in enumerate(images):
        ax = fig.add_subplot(rows, cols, n + 1)
        # open inside a context manager so the file handle is released as
        # soon as the image has been drawn
        with Image.open(os.path.join(train_image, img_id)) as img:
            # draw on this subplot explicitly rather than the implicit
            # "current axes"
            ax.imshow(img)
        ax.set_title(img_id)
    plt.show()
    

### Cassava Bacterial Blight (CBB)
label : 0 

In [None]:
# Show up to 30 randomly sampled label-0 images in a grid with 6 rows.
show_images(df=train_df, label=0, rows=6, no_of_image=30)

### Cassava Brown Streak Disease (CBSD)
label : 1

In [None]:
# Show up to 30 randomly sampled label-1 images in a grid with 6 rows.
show_images(df=train_df, label=1, rows=6, no_of_image=30)

### Cassava Green Mottle (CGM) 
label : 2

In [None]:
# Show up to 30 randomly sampled label-2 images in a grid with 6 rows.
show_images(df=train_df, label=2, rows=6, no_of_image=30)

### Cassava Mosaic Disease (CMD)
label : 3 

In [None]:
# Show up to 30 randomly sampled label-3 images in a grid with 6 rows.
show_images(df=train_df, label=3, rows=6, no_of_image=30)

### Healthy leaf
label : 4

In [None]:
# Show up to 30 randomly sampled label-4 images in a grid with 6 rows.
show_images(df=train_df, label=4, rows=6, no_of_image=30)