In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
       # print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## This notebook is a culmination of various techniques explored by Kagglers - [ilovescience](https://www.kaggle.com/tanlikesmath/cassava-classification-eda-fastai-starter), [Kevin Joseph Scaria](https://www.kaggle.com/kevinscaria/cassava-leaf-disease-eda-modelling), [Yaroslav Isaienkov](https://www.kaggle.com/ihelon/cassava-leaf-disease-exploratory-data-analysis), [Aayush Jain](https://www.kaggle.com/foolofatook/starter-eda-cassava-leaf-disease)

# Loading Data & Basic Visualization

## Importing Libraries

In [None]:
import cv2
import json
import seaborn as sns
import albumentations as A
import matplotlib.pyplot as plt

import tensorflow as tf

# Root folder of the dataset files; every path in this notebook is joined onto it.
BASE_DIR = "../input/cassava-leaf-disease-classification/"

## Let's have a look at the class labels in the dataset, i.e., the disease names

In [None]:
# Load the label-number -> disease-name lookup as a pandas Series and print it
label_map_path = os.path.join(BASE_DIR, 'label_num_to_disease_map.json')
label_mapping = pd.read_json(label_map_path, typ='series')
print(label_mapping)

## Reading the CSV file which has details on the images and labels

In [None]:
# train.csv holds one row per training image (image_id and label columns are used below)
train_csv_path = os.path.join(BASE_DIR, "train.csv")
df = pd.read_csv(train_csv_path)

## Creating a mapping from image_id and label -> Path of the image (Useful for segregating the images label-wise into different folders)

In [None]:
# Create a new column that holds the full path of each image.
# The images are read from the 'train_images' folder everywhere else in this
# notebook, so the path has to be built with that exact directory name.
df['path'] = df['image_id'].map(lambda x: os.path.join(BASE_DIR, 'train_images', x))

# Shuffle the rows and rebuild a clean 0..n-1 index
df = df.sample(frac=1).reset_index(drop=True)

## Mapping the label -> Class name

In [None]:
# Attach the disease name that corresponds to each numeric label
df = df.assign(class_name=df["label"].map(label_mapping))

In [None]:
# Report the number of rows, then show the frame as the cell output
total_images = df.shape[0]
print('Total number of images in the dataset - {}'.format(total_images))
df

## Visualizing the distribution of data

In [None]:
# Horizontal bar chart with the number of images per class
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(y="class_name", data=df, ax=ax)

# Image Visualizations

## Reading a single image and its shape

In [None]:
# cv2.imread returns the pixels in BGR channel order while matplotlib expects
# RGB, so convert before plotting; otherwise red and blue are swapped on screen.
img = cv2.imread(os.path.join(BASE_DIR, 'train_images', '3613193696.jpg'))
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
print('Shape of the Image - {}'.format(img.shape))

## Function for easy class-wise visualization

In [None]:
def visualize_batch(image_ids, labels):
    """Plot a batch of training images in a 3-column grid, titled by class.

    Args:
        image_ids: iterable of image file names inside BASE_DIR/train_images.
        labels: iterable of labels paired positionally with image_ids
            (pairing uses zip, so extra items on the longer side are ignored).

    Raises:
        FileNotFoundError: if an image file cannot be read.

    The grid has at least three rows (the original 3 x 3 layout) and grows by
    whole rows when more than nine images are passed, instead of failing on
    the tenth subplot.
    """
    pairs = list(zip(image_ids, labels))

    n_cols = 3
    # Ceiling division; never fewer than 3 rows so small batches look as before
    n_rows = max(3, -(-len(pairs) // n_cols))
    plt.figure(figsize=(16, 4 * n_rows))

    for ind, (image_id, label) in enumerate(pairs):
        image_path = os.path.join(BASE_DIR, "train_images", image_id)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads BGR; matplotlib expects RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.subplot(n_rows, n_cols, ind + 1)
        plt.imshow(image)
        plt.title(f'Class: {label}', fontsize = 12)
        plt.axis("off")

    plt.show()

## Random sampling

In [None]:
# Draw nine rows at random and plot them with their class names
random_df = df.sample(n=9)
image_ids = random_df["image_id"].to_numpy()
labels = random_df["class_name"].to_numpy()

visualize_batch(image_ids, labels)

## Visualization of Images - Class : Cassava Bacterial Blight (CBB)

In [None]:
# All rows with label 0, and how many of them there are
cbb_df = df.loc[df["label"] == 0]
print('Total train images for Class {} - {}'.format(label_mapping[0], len(cbb_df)))

# cbb_df is rebound to a nine-row random sample; the augmentation cells below reuse it
cbb_df = cbb_df.sample(n=9)
image_ids = cbb_df["image_id"].to_numpy()
labels = cbb_df["class_name"].to_numpy()

visualize_batch(image_ids, labels)

## Visualization of Images - Class : Cassava Brown Streak Disease (CBSD)

In [None]:
# All rows with label 1, and how many of them there are
cbsd_df = df.loc[df["label"] == 1]
print('Total train images for Class {} - {}'.format(label_mapping[1], len(cbsd_df)))

# cbsd_df is rebound to a nine-row random sample; the augmentation cells below reuse it
cbsd_df = cbsd_df.sample(n=9)
image_ids = cbsd_df["image_id"].to_numpy()
labels = cbsd_df["class_name"].to_numpy()

visualize_batch(image_ids, labels)

## Visualization of Images - Class : Cassava Green Mottle (CGM)

In [None]:
# All rows with label 2, and how many of them there are
cgm_df = df.loc[df["label"] == 2]
print('Total train images for Class {} - {}'.format(label_mapping[2], len(cgm_df)))

# cgm_df is rebound to a nine-row random sample; the augmentation cells below reuse it
cgm_df = cgm_df.sample(n=9)
image_ids = cgm_df["image_id"].to_numpy()
labels = cgm_df["class_name"].to_numpy()

visualize_batch(image_ids, labels)

## Visualization of Images - Class : Cassava Mosaic Disease (CMD)

In [None]:
# All rows with label 3, and how many of them there are
cmd_df = df.loc[df["label"] == 3]
print('Total train images for Class {} - {}'.format(label_mapping[3], len(cmd_df)))

# cmd_df is rebound to a nine-row random sample; the augmentation cells below reuse it
cmd_df = cmd_df.sample(n=9)
image_ids = cmd_df["image_id"].to_numpy()
labels = cmd_df["class_name"].to_numpy()

visualize_batch(image_ids, labels)

## Visualization of Images - Class : Healthy (H)

In [None]:
# All rows with label 4, and how many of them there are
h_df = df.loc[df["label"] == 4]
print('Total train images for Class {} - {}'.format(label_mapping[4], len(h_df)))

# h_df is rebound to a nine-row random sample; the augmentation cells below reuse it
h_df = h_df.sample(n=9)
image_ids = h_df["image_id"].to_numpy()
labels = h_df["class_name"].to_numpy()

visualize_batch(image_ids, labels)

# Exploring Data Augmentations

## Augmentations in TensorFlow

In [None]:
def augmentImage(image_id, seed = 0):
    """Load one training image and build a set of TensorFlow augmentations of it.

    Args:
        image_id: file name of the image inside BASE_DIR/train_images.
        seed: seed passed to every random tf.image operation in this function.

    Returns:
        An 11-tuple of image tensors in this order: original, random
        brightness, random contrast, random crop, central crop, left-right
        flip, up-down flip, transpose, random hue, random saturation and
        random JPEG quality.
    """
    # Read the file and decode it as a 3-channel JPEG
    image = tf.io.read_file(os.path.join(BASE_DIR, "train_images", image_id))
    image = tf.image.decode_jpeg(image, channels = 3)

    original_image = image

    # Brightness, Contrast
    brightness = tf.image.random_brightness(image, 0.4, seed = seed)
    contrast = tf.image.random_contrast(image, 0.2, 0.6, seed = seed)

    # Cropping
    # The random crop is a fixed 500 x 600 window, so the image must be at least that large
    crop = tf.image.random_crop(image, size = [500, 600, 3], seed = seed)
    central_crop = tf.image.central_crop(image, central_fraction = 0.3)

    # Flippings
    # replace with random_flip_left_right when using as part of a augmentation pipeline
    left_right = tf.image.flip_left_right(image)
    # replace with random_flip_up_down when using as part of a augmentation pipeline
    up_down = tf.image.flip_up_down(image)
    transpose = tf.image.transpose(image)

    # Color-based
    hue = tf.image.random_hue(image, 0.2, seed = seed)
    saturation = tf.image.random_saturation(image, 5, 10, seed = seed)
    # Seeded like every other random op here so the `seed` argument governs all of them
    jpeg_quality = tf.image.random_jpeg_quality(image, 75, 85, seed = seed)

    # Not part of the returned tuple (candidates to add later):
    # tf.image.rot90(image) and tf.image.rgb_to_grayscale(image)

    return (original_image, brightness, contrast, crop, central_crop, left_right, up_down, transpose, hue, saturation, jpeg_quality)

In [None]:
def displayAugmentations(l, index):
    """Show the TensorFlow augmentations of one image on a 3 x 4 subplot grid.

    Args:
        l: list of image file names.
        index: position in `l` of the image to augment and display.
    """
    # One title per element of the tuple returned by augmentImage, in the same order
    panel_titles = [
        'Input Image',
        'Augmented - Brightness',
        'Augmented - Contrast',
        'Augmented - Crop',
        'Augmented - Center Cropped',
        'Augmented - Horizontal Flip',
        'Augmented - Vertical Flip',
        'Augmented - Transpose',
        'Augmented - Hue',
        'Augmented - Saturation',
        'Augmented - Jpeg Quality',
    ]

    augmentedImages = augmentImage(l[index])

    plt.figure(figsize=(16, 12))
    for position, title in enumerate(panel_titles):
        plt.subplot(3, 4, position + 1)
        # Tensors are converted to uint8 NumPy arrays for imshow
        panel = augmentedImages[position].numpy().astype("uint8")
        plt.imshow(panel)
        plt.title(title)
        plt.axis("off")

In [None]:
# Image file names of each per-class frame, as plain Python lists
cbb_list, cbsd_list, cgm_list, cmd_list, h_list = (
    class_frame['image_id'].tolist()
    for class_frame in (cbb_df, cbsd_df, cgm_df, cmd_df, h_df)
)

In [None]:
# TensorFlow augmentations of the first image in the CBB list
print('Cassava Bacterial Blight Images')
displayAugmentations(l = cbb_list, index = 0)

In [None]:
# TensorFlow augmentations of the first image in the CBSD list.
# The disease is Cassava Brown Streak Disease; the printed label is spelled accordingly.
print('Cassava Brown Streak Disease Images')
displayAugmentations(cbsd_list, 0)

In [None]:
# TensorFlow augmentations of the first image in the CGM list
print('Cassava Green Mottle Images')
displayAugmentations(l = cgm_list, index = 0)

In [None]:
# TensorFlow augmentations of the first image in the CMD list
print('Cassava Mosaic Disease Images')
displayAugmentations(l = cmd_list, index = 0)

In [None]:
# TensorFlow augmentations of the first image in the healthy list
print('Healthy Leaf Images')
displayAugmentations(l = h_list, index = 0)

## Augmentations in Albumentations

In [None]:
def apply_augmentation(image_id):
    """Load one training image and apply twelve Albumentations transforms to it.

    Each transform is applied independently to the same input image.

    Args:
        image_id: file name of the image inside BASE_DIR/train_images.

    Returns:
        A 13-tuple: the RGB input image followed by the outputs of coarse
        dropout, CLAHE, center crop, Gaussian noise, grid distortion,
        horizontal flip, brightness, contrast, fog, rain, random resized crop
        and 90-degree rotation, in that order.

    Raises:
        FileNotFoundError: if the image file cannot be read.
    """
    image_path = os.path.join(BASE_DIR, "train_images", image_id)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads BGR; convert to RGB for display
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Every transform uses p = 1.0: the results are shown side by side under an
    # "Augmented - ..." title, so each transform must actually be applied.
    coarse_dropout = A.CoarseDropout(p = 1.0, max_holes = 100, max_height = 50, max_width = 50, min_holes = 35, min_height = 10, min_width = 10)
    clahe = A.CLAHE (clip_limit = 4.0, tile_grid_size=(8, 8), always_apply = False, p = 1.0)
    c_crop = A.CenterCrop(600, 375, always_apply = False, p = 1.0)
    gauss_noise = A.GaussNoise(var_limit=(10.0, 160.0), mean = 25, always_apply = False, p = 1.0)
    distort = A.GridDistortion(always_apply = False, p = 1.0, num_steps = 6, distort_limit=(-0.40, 0.59), interpolation = 1, border_mode = 0, value = (0, 0, 0), mask_value = None)
    h_flip = A.HorizontalFlip(always_apply = False, p = 1.0)
    bright = A.RandomBrightness(always_apply = False, p = 1.0, limit=(-0.30, 0.43))
    contrast = A.RandomContrast(always_apply = False, p = 1.0, limit=(-0.38, 0.30))
    fog = A.RandomFog(always_apply = False, p = 1.0, fog_coef_lower=0.07, fog_coef_upper=0.26, alpha_coef = 1.0)
    rain = A.RandomRain(always_apply = False, p = 1.0, slant_lower = 2, slant_upper = 4, drop_length = 7, drop_width = 1, drop_color = (0, 0, 0), blur_value = 1, brightness_coefficient = 0.97, rain_type = 'drizzle')
    r_crop = A.RandomResizedCrop(always_apply = False, p = 1.0, height = 130, width = 229, scale = (0.08, 1.0), ratio = (0.75, 1.4), interpolation = 0)
    rotate = A.RandomRotate90(always_apply = False, p = 1.0)
    # To be tried - Cutout, Elastic Transform, ISONoise, HSV, MotionBlur, RandomSizedCrop, ShiftScaleRotate

    # Order here defines the order of the returned tuple (after the input image)
    transforms = (coarse_dropout, clahe, c_crop, gauss_noise, distort, h_flip,
                  bright, contrast, fog, rain, r_crop, rotate)

    return (img, *(transform(image = img)["image"] for transform in transforms))

In [None]:
def displayAAugmentations(l, index):
    """Show the Albumentations results for one image on a 4 x 4 subplot grid.

    Args:
        l: list of image file names.
        index: position in `l` of the image to augment and display.
    """
    # One title per element of the tuple returned by apply_augmentation, in the same order
    panel_titles = [
        'Input Image',
        'Augmented - Coarse Dropout',
        'Augmented - CLAHE',
        'Augmented - Center Crop',
        'Augmented - Gaussian Noise',
        'Augmented - Distortion',
        'Augmented - Horizontal Flip',
        'Augmented - Brightness',
        'Augmented - Contrast',
        'Augmented - Fog',
        'Augmented - Rain',
        'Augmented - ResizedCrop',
        'Augmented - Rotation',
    ]

    augmentedImages = apply_augmentation(l[index])

    plt.figure(figsize=(16, 12))
    for position, title in enumerate(panel_titles):
        plt.subplot(4, 4, position + 1)
        panel = augmentedImages[position].astype("uint8")
        plt.imshow(panel)
        plt.title(title)
        plt.axis("off")

In [None]:
# Albumentations results for the first image in the CBB list
print('Cassava Bacterial Blight Images')
displayAAugmentations(l = cbb_list, index = 0)

In [None]:
# Albumentations results for the first image in the CBSD list.
# The disease is Cassava Brown Streak Disease; the printed label is spelled accordingly.
print('Cassava Brown Streak Disease Images')
displayAAugmentations(cbsd_list, 0)

In [None]:
# Albumentations results for the first image in the CGM list
print('Cassava Green Mottle Images')
displayAAugmentations(l = cgm_list, index = 0)

In [None]:
# Albumentations results for the first image in the CMD list
print('Cassava Mosaic Disease Images')
displayAAugmentations(l = cmd_list, index = 0)

In [None]:
# Albumentations results for the first image in the healthy list
print('Healthy Leaf Images')
displayAAugmentations(l = h_list, index = 0)