In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from scipy.ndimage.measurements import center_of_mass
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
from multiprocessing import cpu_count
from PIL import Image

import os
import glob
import imageio
import cv2
import gc
import joblib
import json

In [None]:
# Image geometry: per the "Make TFRecords" notes further down, the source JPEGs are 800x600 (width x height)
IMG_HEIGHT = 600
IMG_WIDTH = 800
# NOTE(review): presumably the side of the random 600x600 square crop mentioned in the
# "Make TFRecords" notes; not referenced by the code visible in this notebook — confirm
IMG_SIZE = 600
# NOTE(review): presumably the number of colour channels per image; not referenced by the visible code — confirm
N_CHANNELS = 3
# number of items per chunk produced by split_in_chunks; every chunk is written to one TFRecord file
CHUNK_SIZE = 1024
# NOTE(review): evaluates to 1024; not referenced by the visible code — confirm where it is consumed
N_VAL_IMGS = 128 * 8

In [None]:
# directory holding the training JPEGs
IMG_DIR = '/kaggle/input/cassava-leaf-disease-classification/train_images/'

# training metadata
train = pd.read_csv('/kaggle/input/cassava-leaf-disease-classification/train.csv')
# full path of every image: the image directory followed by the row's image_id
train['file_path'] = [f'{IMG_DIR}{image_id}' for image_id in train['image_id']]

# total number of rows (images) in the training metadata
N_IMGS = train.shape[0]

In [None]:
# load the mapping from label number (a string key in the JSON file) to disease name
with open('/kaggle/input/cassava-leaf-disease-classification/label_num_to_disease_map.json') as f:
    label_map = json.load(f)

# disease name of every row, looked up through the string form of its label
train['disease'] = [label_map.get(str(label)) for label in train['label']]
# combined "<label> (<disease>)" text for every row
train['label+disease'] = [
    f'{label} ({disease})'
    for label, disease in zip(train['label'], train['disease'])
]

In [None]:
# raise the maximum displayed column width to 99 characters, then preview the first rows of train
pd.set_option('display.max_colwidth', 99)
display(train.head(n=5))

# Class Imbalance Visualization

The graph below clearly shows that the dataset is imbalanced: label number 3 accounts for 61% of the images.

In [None]:
# number of rows for every "label (disease)" combination
disease_counts = train['label+disease'].value_counts()
display(disease_counts)

# share of label 3 in the whole dataset, truncated to a whole percentage
label_counts = train['label'].value_counts()
label_3_share = label_counts.loc[3] / label_counts.sum() * 100
print(f'Label 3 contributes to {int(label_3_share)}%')

# bar chart of the same counts, sorted by the "label (disease)" text
plt.figure(figsize=(20, 6))
disease_counts.sort_index().plot.bar()
plt.xticks(size=14, rotation=15)
plt.show()

# Split train into train and validation sets

The dataset is split into 5 stratified folds, meaning the class imbalance is preserved and equal in all folds. Using 5 folds means each fold consists of 80% training data and 20% validation data, so each sample is used for validation exactly once across all folds.

In [None]:
def get_fold_data():
    """Split the module-level `train` DataFrame into 5 stratified folds.

    The split is stratified on the `label` column and shuffled with a fixed
    seed (42), so repeated calls on the same data give the same folds.

    Returns:
        dict: maps 'fold_0' ... 'fold_4' to {'train': {...}, 'val': {...}};
        each inner dict holds the arrays 'file_paths', 'labels' and
        'image_ids' of the rows belonging to that split.
    """
    labels = train['label'].values
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    def select(row_idxs):
        # StratifiedKFold yields *positional* indices, so select with .iloc;
        # .loc would treat them as index labels, which is only equivalent
        # while `train` still has its default RangeIndex
        rows = train.iloc[row_idxs]
        return {
            'file_paths': rows['file_path'].values,
            'labels': rows['label'].values,
            'image_ids': rows['image_id'].values,
        }

    fold_data = dict()
    # only the number of samples and the labels matter for the split, so a
    # plain index range of the same length as the labels serves as X
    for idx, (train_fold_idxs, val_fold_idxs) in enumerate(skf.split(np.arange(len(labels)), labels)):
        fold_data[f'fold_{idx}'] = {
            'train': select(train_fold_idxs),
            'val': select(val_fold_idxs),
        }

    return fold_data

fold_data = get_fold_data()

In [None]:
# show number of train and validation images per fold and the label count in the validation set
for fold, (k, v) in enumerate(fold_data.items()):
    print(f'--- FOLD {fold} ---')
    print(f'length train: {len(v["train"]["labels"])}, length validation: {len(v["val"]["labels"])}')
    print(f'validation label count: {pd.Series(v["val"]["labels"]).value_counts().to_dict()}')
    print()

# Show Training Images

In [None]:
def show_train_images(rows, cols):
    """Plot the first rows*cols images of `train` in a rows x cols grid.

    Every subplot is titled with the label and disease of its image.

    Args:
        rows: number of grid rows.
        cols: number of grid columns.
    """
    # squeeze=False keeps `axes` two-dimensional even when rows or cols is 1,
    # so the [row, col] indexing below always works
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols*6, rows*6), squeeze=False)
    # select rows positionally and number them with enumerate, so the grid
    # position never depends on the index labels of the DataFrame
    first_rows = train.iloc[:rows*cols][['file_path', 'label', 'disease']]
    for idx, (fp, label, disease) in enumerate(first_rows.itertuples(index=False, name=None)):
        img = imageio.imread(fp)
        ax = axes[idx // cols, idx % cols]
        ax.imshow(img)
        ax.set_title(f'Label: {label}, disease: {disease}')

show_train_images(5,4)

In [None]:
def split_in_chunks(data, chunk_size=None):
    """Split a sequence into consecutive chunks of at most `chunk_size` items.

    Args:
        data: any sliceable object with a length (list, NumPy array, str, ...).
        chunk_size: maximum number of items per chunk; when omitted, the
            module-level CHUNK_SIZE is used.

    Returns:
        list: the slices of `data` in their original order. The last slice may
        be shorter than `chunk_size`; empty input gives an empty list.

    Raises:
        ValueError: if the chunk size is not positive.
    """
    if chunk_size is None:
        # resolved at call time so callers that rely on the global keep working
        chunk_size = CHUNK_SIZE
    if chunk_size <= 0:
        # a negative step would silently produce no chunks at all
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

# Split the file paths, labels and image_ids of every fold in chunks for TFRecords.
# The nesting of fold_data is kept (fold -> 'train'/'val' -> field), but every
# field now holds a list of chunks instead of one flat array.
fold_data_chunks = {
    fold_k: {
        split_k: {
            field_k: split_in_chunks(values)
            for field_k, values in split_v.items()
        }
        for split_k, split_v in fold_v.items()
    }
    for fold_k, fold_v in fold_data.items()
}

In [None]:
# Sanity check of the split: with 5 folds every image should occur 5 times in
# total and exactly once in a validation set
def occurances():
    """Print the minimum and maximum occurrence count of the file paths.

    Collects every file path from the train and validation chunks of all folds
    in `fold_data_chunks` (s_all) and from the validation chunks only (s_val),
    counts how often each path occurs and prints the smallest and largest
    count of both collections.
    """
    all_paths = [
        fp
        for fold_v in fold_data_chunks.values()
        for split in ('train', 'val')
        for chunk in fold_v[split]['file_paths']
        for fp in chunk
    ]
    val_paths = [
        fp
        for fold_v in fold_data_chunks.values()
        for chunk in fold_v['val']['file_paths']
        for fp in chunk
    ]

    s_all = pd.Series(all_paths).value_counts()
    s_val = pd.Series(val_paths).value_counts()
    print(f's_all min: {s_all.min()}, s_all max: {s_all.max()}')
    print(f's_val min: {s_val.min()}, s_val max: {s_val.max()}')

occurances()

# Make TFRecords

Create TFRecords containing the raw JPEGs, which keeps image augmentation possible: the JPEGs are sized 800\*600, so a random 600\*600 square can be selected from each JPEG, giving a slightly different image every epoch. Using TFRecords instead of individual JPEGs is also more efficient: a TFRecord holds up to 1024 images and can be read at once, which significantly reduces the number of disk reads.

In [None]:
def process_img(file_path):
    """Read the file at `file_path` and return its raw content, without decoding it."""
    raw_file = tf.io.read_file(file_path)
    return raw_file.numpy()

In [None]:
def _serialize_example(image, label, image_id):
    """Serialize one sample to the bytes of a tf.train.Example.

    Args:
        image: raw (still encoded) image bytes.
        label: integer class label.
        image_id: image identifier string; stored as encoded bytes.

    Returns:
        bytes: the serialized example with the features 'image', 'label' and 'image_id'.
    """
    features = tf.train.Features(feature={
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
        'image_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_id.encode()])),
    })
    return tf.train.Example(features=features).SerializeToString()


def make_tfrecords(fold_data_chunks):
    """Write every chunk of every fold to its own TFRecord file.

    Files are written to ./<fold>/<split>/batch_<idx>.tfrecords, where <idx>
    is the position of the chunk within its split. Each record stores the raw
    image file content together with its label and image_id.

    Args:
        fold_data_chunks: dict mapping fold name -> split name -> dict with
            the keys 'file_paths', 'labels' and 'image_ids', each a list of
            chunks in matching order.
    """
    for fold_k, fold_v in fold_data_chunks.items():
        print('*'*10, fold_k.upper(), '*'*10)

        for k, v in fold_v.items():
            # exist_ok=True makes re-runs safe without hiding real errors, and
            # the folder of one split is still created when only the folder
            # of the other split already exists
            os.makedirs(f'./{fold_k}/{k}', exist_ok=True)

            # make TFRecords for each chunk; total is the number of chunks
            data = zip(v['file_paths'], v['labels'], v['image_ids'])
            for idx, (file_paths_chunk, labels_chunk, image_ids_chunk) in tqdm(enumerate(data), total=len(v['labels'])):
                # read the images in parallel using joblib
                jobs = [joblib.delayed(process_img)(fp) for fp in file_paths_chunk]
                processed_images_chunk = joblib.Parallel(n_jobs=cpu_count(), verbose=0)(jobs)

                # write the raw JPEGS to a TFRecord, including the label and image_id
                with tf.io.TFRecordWriter(f'./{fold_k}/{k}/batch_{idx}.tfrecords') as file_writer:
                    for image, label, image_id in zip(processed_images_chunk, labels_chunk, image_ids_chunk):
                        file_writer.write(_serialize_example(image, label, image_id))


make_tfrecords(fold_data_chunks)

# Check TFRecords

Quick check to see if everything went OK: show the first train and validation batch.

In [None]:
# Check the images stored in the TFRecords
def decode_tfrecord(record_bytes):
    """Parse one serialized example into (image, label, image_id).

    Args:
        record_bytes: a serialized example holding the scalar features
            'image' (string), 'label' (int64) and 'image_id' (string).

    Returns:
        tuple: the JPEG-decoded image as float32 divided by 255, the label
        cast to int32, and the image_id as a string tensor.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
        'image_id': tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_single_example(record_bytes, feature_spec)

    # decode the JPEG bytes and rescale the pixel values by 1/255
    pixels = tf.io.decode_jpeg(example['image'])
    pixels = tf.cast(pixels, tf.float32) / 255

    label = tf.cast(example['label'], tf.int32)

    return pixels, label, example['image_id']

def show_tfrecords(file_path, rows=4, cols=3):
    """Plot the first rows*cols images of a TFRecord file in a grid.

    Every subplot is titled with the label and image_id of its record.

    Args:
        file_path: path of the TFRecord file to inspect.
        rows: number of grid rows (default 4).
        cols: number of grid columns (default 3).
    """
    # squeeze=False keeps `ax` two-dimensional even for a single row or column
    fig, ax = plt.subplots(rows, cols, figsize=(cols*6, rows*6), squeeze=False)
    tfrecord = tf.data.TFRecordDataset(file_path)
    # take() limits the dataset to the number of grid cells, so no manual break is needed
    samples = tfrecord.map(decode_tfrecord).take(rows * cols)
    for idx, (image, label, image_id) in enumerate(samples):
        # round before casting: the float /255 and *255 round trip can land
        # just below the original integer, which a plain cast would truncate
        image = tf.cast(tf.round(image * 255), tf.uint8)
        image = tf.squeeze(image)
        row, col = idx // cols, idx % cols
        ax[row, col].imshow(image)
        # .numpy() yields the plain values, so the title never depends on how a tensor is formatted
        ax[row, col].title.set_text(f'Label {label.numpy()}, image_id: {image_id.numpy().decode()}')
    plt.show()

print('TRAIN BATCH')
show_tfrecords(f'./fold_0/train/batch_0.tfrecords')
print('VAL BATCH')
show_tfrecords(f'./fold_0/val/batch_0.tfrecords')