- I have prepared both datasets: JPEG and TFRecords.
- The prepared datasets are available at the following links.
- JPEG ([128x128](https://www.kaggle.com/spidermandance/cassava-jpeg-128x128), [196x196](https://www.kaggle.com/spidermandance/cassava-jpeg-196x196), [256x256](https://www.kaggle.com/spidermandance/cassava-jpeg-256x256), [384x384](https://www.kaggle.com/spidermandance/cassava-jpeg-384x384), [512x512](https://www.kaggle.com/spidermandance/cassava-jpeg-512x512))
- TFRecords ([128x128](https://www.kaggle.com/spidermandance/cassava-tfrecords-128x128), [196x196](https://www.kaggle.com/spidermandance/cassava-tfrecords-196x196), [256x256](https://www.kaggle.com/spidermandance/cassava-tfrecords-256x256), [384x384](https://www.kaggle.com/spidermandance/cassava-tfrecords-384x384), [512x512](https://www.kaggle.com/spidermandance/cassava-tfrecords-512x512))

(11/21/2020 ver3)
- Fix: Removed the `img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)` line from the TFRecords cell.
- Add: Image check.

In [None]:
import os
import cv2
import pandas as pd
from tqdm.notebook import tqdm


# Root directory of the input dataset.
DATA_PATH = '/kaggle/input/cassava-leaf-disease-classification'

# Directory the original training JPEGs are read from, and the directory the
# resized copies are written to.
JPEG_PATH = os.path.join(DATA_PATH, 'train_images')
JPEG_SAVE_PATH = '/kaggle/train_images_jpeg'

# CSV file whose `label` column supplies the target of each image.
CSV_PATH = os.path.join(DATA_PATH, 'train.csv')

RESIZE = 128  # output side length in pixels; images are resized to RESIZE x RESIZE
NUM_TFREDORDS = 1338  # maximum number of images written into each TFRecord file
IMG_QUALITY = 95  # JPEG quality used when encoding images for the TFRecords
DEBUG = False  # when True, only the first 25 files and labels are processed


# Make sure the output directory exists, then load the label table.
os.makedirs(JPEG_SAVE_PATH, exist_ok=True)
train_df = pd.read_csv(CSV_PATH)

## Size checking

In [None]:
img_size = {}
files = sorted(os.listdir(JPEG_PATH))
# NOTE(review): labels are paired with files by position from here on, which
# assumes the rows of train.csv follow the same order as the sorted directory
# listing -- confirm before filtering or reordering `files`.
targets = train_df['label']

# https://www.kaggle.com/nakajima/duplicate-train-images
# drop_img_id = ['3551135685.jpg', '911861181.jpg', '1562043567.jpg', '3551135685.jpg']
# files = [f for f in files if f not in drop_img_id]

if DEBUG:
    files = files[:25]
    targets = targets[:25]

# Count how many images share each array shape returned by OpenCV.
for img_id in tqdm(files):
    img_path = os.path.join(JPEG_PATH, img_id)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None rather than
        # raising, which would otherwise surface as an opaque AttributeError.
        raise ValueError(f'Could not read image: {img_path}')
    img_size[img.shape] = img_size.get(img.shape, 0) + 1

print(f'Size of each the image: {img_size}')

## JPEG

In [None]:
# Write a RESIZE x RESIZE copy of every listed image under the same file name.
for img_id in tqdm(files):
    load_path = os.path.join(JPEG_PATH, img_id)
    save_path = os.path.join(JPEG_SAVE_PATH, img_id)
    img = cv2.imread(load_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be decoded.
        raise ValueError(f'Could not read image: {load_path}')
    img = cv2.resize(img, (RESIZE, RESIZE))
    # Pass the quality explicitly so IMG_QUALITY governs the JPEG export as
    # well as the TFRecords, and fail loudly if the file was not written.
    if not cv2.imwrite(save_path, img, [cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY]):
        raise IOError(f'Could not write image: {save_path}')
    
# Bundle the resized JPEGs into a gzip-compressed tarball named after the output size.
!tar -czf 'train_images_{RESIZE}x{RESIZE}.tar.gz' /kaggle/train_images_jpeg/*.jpg

### image checking

In [None]:
import matplotlib.pyplot as plt


def jpeg_display(directory_path):
    """Show up to the first 25 images of `files` in a 5x5 grid.

    Each panel is titled with the file name and the matching entry of
    `targets`. Both `files` and `targets` are notebook-level variables.

    Args:
        directory_path: Directory that contains the files named in `files`.

    Raises:
        ValueError: If one of the images cannot be read.
    """
    grid = 5
    fig, axes = plt.subplots(grid, grid, figsize=(16, 16))
    # Clamp to the number of available files so a short listing does not
    # raise IndexError.
    shown = min(len(files), grid * grid)
    for i, ax in enumerate(axes.flat):
        if i >= shown:
            # Leave unused grid cells blank.
            ax.axis('off')
            continue
        img_path = os.path.join(directory_path, files[i])
        img = cv2.imread(img_path)
        if img is None:
            raise ValueError(f'Could not read image: {img_path}')
        # OpenCV loads pixels as BGR while matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        ax.imshow(img)
        ax.set_title(f'{files[i]}: {targets[i]}')
    plt.show()

In [None]:
# Preview the original images.
jpeg_display(JPEG_PATH)

In [None]:
# Preview the resized copies of the same files for visual comparison.
jpeg_display(JPEG_SAVE_PATH)

## TFRecords

In [None]:
import math
import tensorflow as tf
import matplotlib.pyplot as plt


# Number of TFRecord files needed when each one holds at most NUM_TFREDORDS images.
num_iter = math.ceil(len(files) / NUM_TFREDORDS)


def _bytes_feature(value):
  """Wrap a string / bytes value in a `tf.train.Feature` holding a bytes_list.

  An eager tensor is first converted with `.numpy()`, because `BytesList`
  cannot unpack a string from an EagerTensor.
  """
  eager_tensor_type = type(tf.constant(0))
  payload = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[payload])
  return tf.train.Feature(bytes_list=bytes_list)


def _int64_feature(value):
  """Wrap a bool / enum / int / uint value in a `tf.train.Feature` holding an int64_list."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)


def serialize_example(feature0, feature1, feature2):
  """Serialize one image record as a `tf.train.Example` string.

  Args:
    feature0: Stored under 'image' as a bytes feature.
    feature1: Stored under 'image_name' as a bytes feature.
    feature2: Stored under 'target' as an int64 feature.

  Returns:
    The serialized `tf.train.Example`.
  """
  features = tf.train.Features(feature={
      'image': _bytes_feature(feature0),
      'image_name': _bytes_feature(feature1),
      'target': _int64_feature(feature2),
  })
  return tf.train.Example(features=features).SerializeToString()



# Write the resized images into `num_iter` TFRecord files, each holding at
# most NUM_TFREDORDS examples; the example count is part of the file name.
for i in range(num_iter):
    print(f'Writing TFRecord: {i}')
    start = NUM_TFREDORDS * i
    cnt = min(NUM_TFREDORDS, len(files) - start)
    tf_filename = f'ld_train{str(i).zfill(2)}-{cnt}.tfrec'

    with tf.io.TFRecordWriter(tf_filename) as wf:
        for j in range(cnt):
            idx = start + j
            img_id = files[idx]
            img_path = os.path.join(JPEG_PATH, img_id)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None instead of raising on an unreadable file.
                raise ValueError(f'Could not read image: {img_path}')
            img = cv2.resize(img, (RESIZE, RESIZE))
            # No BGR -> RGB conversion here: the pixels go straight back into
            # OpenCV's encoder.  (Fix: 20201121)

            ok, encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))
            if not ok:
                raise ValueError(f'Could not encode image: {img_path}')
            # tobytes() replaces the deprecated ndarray.tostring() alias.
            img = encoded.tobytes()
            img_id = str.encode(img_id)
            # NOTE(review): the label is looked up by position, which assumes
            # train.csv rows are in the same order as `files` -- confirm.
            target = train_df['label'][idx]

            example = serialize_example(img, img_id, target)

            wf.write(example)

### image checking

In [None]:
import numpy as np


def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel float32 tensor scaled by 1/255."""
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    as_float = tf.cast(pixels, tf.float32)
    return as_float / 255.0


def parse_example(example):
    """Parse one serialized record into `(image, image_name, target)`.

    The 'image' bytes are decoded with `decode_image`; 'image_name' and
    'target' are returned as parsed.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_name': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['image_name'], parsed['target']


def display_one(image, title, target, subplot, red=False, titlesize=16):
    """Draw `image` in the given subplot slot and return the next slot.

    Args:
        image: Image passed to `plt.imshow`.
        title: Text shown before the colon in the panel title.
        target: Value shown after the colon in the panel title.
        subplot: `(rows, cols, index)` triple selecting the panel.
        red: Accepted for interface compatibility; not used.
        titlesize: Accepted for interface compatibility; not used.

    Returns:
        The `(rows, cols, index + 1)` triple for the following panel.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    caption = f'{title}: {target}'
    plt.title(caption)
    # Same grid, advanced by one position.
    return (*subplot[:2], subplot[2] + 1)


def display_batch_of_images(databatch):
    """Plot a batch of images in a square-ish grid with name/target titles.

    Args:
        databatch: `(images, labels, targets)` triple. `images` must provide
            `.numpy()`. `labels` (UTF-8 encoded image names) and `targets`
            may each be None, in which case the corresponding part of every
            title is left empty / shown as None.

    Images that do not fit into the `rows x cols` grid are not drawn. An empty
    batch draws nothing.
    """
    images, labels, targets = databatch
    images = images.numpy()
    # Substitute placeholders *before* calling .numpy(), otherwise a missing
    # labels/targets entry would fail on the attribute access.
    labels = [None] * len(images) if labels is None else labels.numpy()
    targets = [None] * len(images) if targets is None else targets.numpy()

    # An empty batch would give rows == 0 and a division by zero below.
    if len(images) == 0:
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows
    shown = rows * cols

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.2
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE / cols * rows))
    else:
        plt.figure(figsize=(FIGSIZE / rows * cols, FIGSIZE))

    # magic formula tested to work from 1x1 to 10x10 images; it does not
    # depend on the image, so compute it once
    dynamic_titlesize = FIGSIZE * SPACING / max(rows, cols) * 40 + 3

    # display
    for image, label, target in zip(images[:shown], labels[:shown], targets[:shown]):
        title = label.decode('utf-8') if label is not None else ''
        subplot = display_one(image, title, target, subplot, False, titlesize=dynamic_titlesize)

    # layout: the previous two-way branch used the same spacing on both sides
    # (and referenced an undefined name), so a single call is equivalent
    plt.tight_layout()
    plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
# Read the first TFRecord file back and display its first 25 examples.
first_file_count = min(len(files), NUM_TFREDORDS)
resize_file = f'ld_train00-{first_file_count}.tfrec'
dataset = tf.data.TFRecordDataset([resize_file])
dataset = dataset.map(parse_example).batch(25)
data = iter(dataset)
first_batch = next(data)
display_batch_of_images(first_batch)