In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mpimg
from skimage import color
import os

In [None]:
# All competition files live under one directory; build every path from it so
# the cell does not depend on the notebook's current working directory.
DATA_DIR = '/kaggle/input/cassava-leaf-disease-classification'

train_images = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
sample_sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
TRAIN_IMAGES_PATH = os.path.join(DATA_DIR, 'train_images')
TEST_IMAGES_PATH = os.path.join(DATA_DIR, 'test_images')

# Mapping from label number (as a string key) to disease name.
with open(os.path.join(DATA_DIR, 'label_num_to_disease_map.json')) as json_data:
    label_map = json.load(json_data)

In [None]:
# Preview the first rows of the training table.
train_images.head()

In [None]:
# Number of training images per label. The column is passed by keyword:
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x = 'label', data = train_images)

This plot indicates that we have a class imbalance in the dataset.

In [None]:
# Show the label-number to disease-name mapping loaded from the JSON file.
label_map

In [None]:
def select_imgs(n, label):
    '''Return a list of n image ids sampled (with a fixed seed) from the rows of train_images carrying the given label'''
    has_label = train_images['label'] == label
    sampled_rows = train_images[has_label].sample(n = n, random_state = 0)
    return sampled_rows['image_id'].tolist()

def plot_images(df, ids, label = None):
    '''Plot training images on a two-row grid.

    Parameters
    ----------
    df : unused; kept so that existing calls keep working.
    ids : iterable of image file names, resolved against TRAIN_IMAGES_PATH.
    label : optional label number; when given, label_map[str(label)] is used
        as the figure title.

    Any number of ids is accepted. For an odd count the last grid cell is
    left blank, and an empty input prints a note instead of raising.
    '''
    ids = list(ids)
    n = len(ids)
    if n == 0:
        # e.g. a brightness filter that matched no image
        print('No images to plot.')
        return

    # Round up so an odd number of images still fits in two rows.
    n_cols = (n + 1) // 2
    # squeeze=False keeps `ax` two-dimensional even with a single column.
    fig, ax = plt.subplots(2, n_cols, figsize = (20,10), squeeze = False)
    for i, im_id in enumerate(ids):
        img = mpimg.imread(os.path.join(TRAIN_IMAGES_PATH, im_id))
        ax[i // n_cols][i % n_cols].imshow(img)
    # Hide the frame and ticks on every cell, including any unused one.
    for cell in ax.ravel():
        cell.axis('off')
    # Set the title before tight_layout so the layout pass can leave room for
    # it (on Matplotlib versions where tight_layout accounts for suptitle).
    if label is not None:
        fig.suptitle(label_map[str(label)])
    fig.tight_layout()
    plt.show()

In [None]:
# Eight sampled training images for label 0
plot_images(train_images, select_imgs(n = 8, label = 0), label = 0)

In [None]:
# Eight sampled training images for label 1
plot_images(train_images, select_imgs(n = 8, label = 1), label = 1)

In [None]:
# Eight sampled training images for label 2
plot_images(train_images, select_imgs(n = 8, label = 2), label = 2)

In [None]:
# Eight sampled training images for label 3
plot_images(train_images, select_imgs(n = 8, label = 3), label = 3)

In [None]:
# Eight sampled training images for label 4
plot_images(train_images, select_imgs(n = 8, label = 4), label = 4)

Looking at these images, we realise that the colour of the diseased leaves is one of the key identifiers of the disease. Networks trained on greyscale images may therefore not perform well.

We shall now illustrate the differences in the brightness across all images.

In [None]:
def get_brightness(image):
    '''Return the mean of the image's grayscale conversion, multiplied by 255'''
    grayscale = color.rgb2gray(image)
    return 255 * np.mean(grayscale)

# Measure the brightness of every distinct training image and gather the
# results into a dataframe with one row per image id.
image_list = train_images['image_id'].unique().tolist()
brightness_array = []
for img in image_list:
    image = mpimg.imread(os.path.join(TRAIN_IMAGES_PATH, img))
    brightness = get_brightness(image)
    brightness_array += [brightness]

df = pd.DataFrame({
    'image_id': image_list,
    'brightness': brightness_array,
})

In [None]:
# Average of the per-image brightness values computed by get_brightness.
print('Mean Brightness is: ', df['brightness'].mean())

In [None]:
# Largest per-image brightness value in the training set.
print('Max Brightness is: ', df['brightness'].max())

In [None]:
# Smallest per-image brightness value in the training set.
print('Min Brightness is: ', df['brightness'].min())

In [None]:
# Histogram of per-image brightness, labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.hist(df['brightness'])
ax.set(title = 'Distribution of image brightness',
       xlabel = 'Brightness',
       ylabel = 'Number of images')
plt.show()

In [None]:
# Images with brightness above 160; display the first eight of them
bright_ids = df.loc[df['brightness'] > 160, 'image_id']
plot_images(train_images, bright_ids.head(8))

In [None]:
# Images with brightness below 50; display the first eight of them
dark_ids = df.loc[df['brightness'] < 50, 'image_id']
plot_images(train_images, dark_ids.head(8))