Hello Fellow Kagglers,

This notebook demonstrates how to create TFRecords from the extra training data crawled in [this](https://www.kaggle.com/markwijkhuizen/google-landmark-recognition-extra-data-tfrec-pub) notebook, using [this](https://www.kaggle.com/markwijkhuizen/google-landmark-recognition-extra-train-data-pub) dataset created by that notebook. This dataset contains images of classes with a low number of samples. Using the complete [Google Landmarks Dataset v2](https://github.com/cvdfoundation/google-landmark), which contains over 4 million images, all classes are filled up to 25 samples. This method is further explained in the notebook and resulted in over 400,000 training samples of classes with few samples.

TFRecords are a highly efficient way to read many small files, such as JPEGs. Instead of reading many small images individually, a single TFRecord file containing many images can be read at once. Moreover, each record inside a TFRecord file can contain additional data, such as the label. Using the TFRecord format, several thousand images per second can be read on a TPU.

The TFRecords dataset produced in this notebook can be found [here](https://www.kaggle.com/markwijkhuizen/google-landmark-recognition-extra-train-tfrecs-pub).

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from kaggle_datasets import KaggleDatasets
from multiprocessing import cpu_count

import sys
import cv2
import imageio
import joblib
import pickle

# Activate pandas progress apply bar
tqdm.pandas()

# Report library and interpreter versions for reproducibility
print(f'tensorflow version: {tf.__version__}')
print(f'tensorflow keras version: {tf.keras.__version__}')
# The stray "P" that used to precede the version string has been removed
print(f'python version: {sys.version}')

In [None]:
# Smaller side of the image, can be adjusted
IMG_SIZE = 384
# Number of colour channels per image
N_CHANNELS = 3
# Prefix used in the names of the generated TFRecords files
VERSION = '1A'

In [None]:
# Load the extra training metadata from the pickled DataFrame
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source
train = pd.read_pickle('/kaggle/input/google-landmark-recognition-extra-train-data-pub/train_extra.pkl.xz')

In [None]:
# Preview the first rows of the loaded DataFrame
display(train.head())

In [None]:
# There are 402,962 additional training images in the dataset
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None"
train.info()

In [None]:
# Load Landmark ID to label mapper
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source
with open('/kaggle/input/landmark-recognition-2021-tfrecords-384-part-1/landmark_id2label.pkl', 'rb') as f:
    landmark_id2label = pickle.load(f)

# Add label to DataFrame
train['label'] = train['landmark_id'].apply(landmark_id2label.get)

# Fail fast on landmark IDs that have no label: they would otherwise surface
# only much later as an int(None) error while writing the TFRecords
n_unmapped = int(train['label'].isna().sum())
if n_unmapped > 0:
    raise ValueError(f'{n_unmapped} rows have a landmark_id without a label mapping')

In [None]:
# Sanity check, all labels should fall in the range [0, 81312]
train['label'].describe()

In [None]:
# Preview the DataFrame again, now including the label column
display(train.head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None"
train.info()

# File Path

In [None]:
# Add File Path to Image
def to_file_path(i):
    """Build the path of the JPEG file that belongs to image id ``i``.

    The first three characters of the id select the nested directories; the
    first character is used for two consecutive directory levels.
    """
    first, second, third = i[0], i[1], i[2]
    return f'/kaggle/input/google-landmark-recognition-extra-train-data-pub/train/{first}/{first}/{second}/{third}/{i}.jpg'

# Derive the file path of every image from its id (with a progress bar) and store it as pandas string dtype
train['file_path'] = train['id'].progress_apply(to_file_path).astype('string')

In [None]:
# Shuffle DataFrame
# frac=1 samples every row; the fixed random_state makes the order reproducible
train = train.sample(frac=1, random_state=42)

# Process Image

In [None]:
def process_image(file_path):
    """Load an image and return it as JPEG bytes together with its size.

    Images whose smaller side exceeds IMG_SIZE are downscaled so that the
    smaller side becomes exactly IMG_SIZE (aspect ratio preserved) and are
    re-encoded as JPEG. Other images are returned as their original file
    bytes. Grayscale images are expanded to three channels and re-encoded, so
    they no longer fail on the missing channel axis.

    Args:
        file_path: Path of the image file to read.

    Returns:
        Tuple ``(img_jpeg, height, width)``: the JPEG bytes and the pixel
        dimensions of the returned image.
    """
    # Read Image
    img = imageio.imread(file_path)
    # Only the two leading axes are spatial; a grayscale image has no channel axis
    h, w = img.shape[:2]

    # Expand grayscale to three identical channels so it can be encoded as RGB
    is_grayscale = img.ndim == 2
    if is_grayscale:
        img = np.stack((img,) * 3, axis=-1)

    # Check whether image is bigger than IMG_SIZE
    needs_resize = min(h, w) > IMG_SIZE
    if needs_resize:
        r = IMG_SIZE / min(w, h)
        # Pin the smaller side to exactly IMG_SIZE: int(side * r) could truncate
        # to IMG_SIZE - 1 because of floating point error. Round the other side.
        if w <= h:
            w, h = IMG_SIZE, round(h * r)
        else:
            w, h = round(w * r), IMG_SIZE
        # Resize using high quality LANCZOS algorithm
        img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LANCZOS4)

    if needs_resize or is_grayscale:
        # Save as JPEG with quality set to 70, just as original images
        img_jpeg = tf.io.encode_jpeg(img, quality=70, optimize_size=True).numpy()
    else:
        # Otherwise use original image bytes unchanged
        with open(file_path, 'rb') as f:
            img_jpeg = f.read()

    return img_jpeg, h, w

# Split in Chunks

In [None]:
# Split Training Data in Chunks
def split_in_chunks(data, chunk_size):
    """Split a 2D array column-wise into consecutive chunks.

    Args:
        data: 2D array whose columns (axis 1) are the samples to distribute.
        chunk_size: Maximum number of columns per chunk.

    Returns:
        List of 2D arrays, each holding all rows and at most ``chunk_size``
        columns; the last chunk may be shorter.
    """
    n_samples = data.shape[1]
    # Use the chunk_size argument; the global CHUNK_SIZE is not consulted
    return [data[:, i:i + chunk_size] for i in range(0, n_samples, chunk_size)]

In [None]:
# Number of images stored in each TFRecords file
CHUNK_SIZE = 3_000

# Stack file paths and labels into one 2-row array, then cut it into chunks
paths_and_labels = np.array((train['file_path'], train['label']))
train_split = split_in_chunks(paths_and_labels, CHUNK_SIZE)

print(f'train_split chunks: {len(train_split)}')

# Make TFRecords

In [None]:
# Makes the actual TFRecords
def to_tf_records(data_split, name):
    """Write one TFRecords file per chunk of ``data_split``.

    Each chunk unpacks into a sequence of file paths and a sequence of labels.
    The images of a chunk are processed in parallel with ``process_image`` and
    written, together with label, height and width, to a file named
    ``{VERSION}_EXTRA_DATA_{name}_batch_{index}.tfrecords``.

    Args:
        data_split: Iterable of chunks, each unpackable into (file paths, labels).
        name: Tag inserted into the output file names.
    """
    n_workers = cpu_count()

    def int64_feature(value):
        # Wrap a single integer value as an int64 feature
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))

    for chunk_idx, (file_paths, labels) in enumerate(tqdm(data_split)):
        # Create image processing jobs and execute them in parallel
        worker_pool = joblib.Parallel(
            n_jobs=n_workers,
            verbose=0,
            batch_size=64,
            pre_dispatch=64*n_workers,
            require='sharedmem'
        )
        jobs = [joblib.delayed(process_image)(file_path) for file_path in file_paths]
        processed = worker_pool(jobs)

        # Create the actual TFRecords
        out_name = f'{VERSION}_EXTRA_DATA_{name}_batch_{chunk_idx}.tfrecords'
        with tf.io.TFRecordWriter(out_name) as writer:
            for (jpeg_bytes, height, width), label in zip(processed, labels):
                example = tf.train.Example(features=tf.train.Features(feature={
                    # Image as JPEG bytes
                    'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[jpeg_bytes])),
                    # Label of image
                    'label': int64_feature(label),
                    # Height of image
                    'height': int64_feature(height),
                    # Width of image
                    'width': int64_feature(width),
                }))
                writer.write(example.SerializeToString())

# Create TFRecords
# Writes one TFRecords file per chunk of train_split, tagged 'train' in the file name
to_tf_records(train_split, 'train')

# Check TFRecords

In [None]:
# Imagenet mean and standard deviation per channel
IMAGENET_MEAN = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
IMAGENET_STD = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)

# Number of channels, 3 for RGB images
# NOTE(review): this rebinds N_CHANNELS, previously a plain Python int, to an int64 tensor
N_CHANNELS = tf.constant(3, dtype=tf.int64)

In [None]:
# Function to decode the TFRecords
def decode_tfrecord(record_bytes):
    """Parse one serialized example into a normalized square image and its label.

    A random square with the side of the smaller image dimension is cut out,
    resized to IMG_SIZE x IMG_SIZE, scaled to the range 0-1 and normalized
    with the ImageNet mean and standard deviation.

    Args:
        record_bytes: Serialized tf.train.Example with the features 'image',
            'label', 'width' and 'height'.

    Returns:
        Tuple ``(image, label)``: the float32 image tensor and the int64 label.
    """
    features = tf.io.parse_single_example(record_bytes, {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
        'width': tf.io.FixedLenFeature([], tf.int64),
        'height': tf.io.FixedLenFeature([], tf.int64),
    })

    # Always decode to 3 channels so grayscale JPEGs also match N_CHANNELS below
    image = tf.io.decode_jpeg(features['image'], channels=3)
    label = features['label']
    height = features['height']
    width = features['width']

    # Side length of the largest square that fits inside the image
    size = tf.math.reduce_min([height, width])

    # Cutout Random Square; along the smaller dimension the offset is always 0.
    # The integer maxval is exclusive, so the +1 lets the square reach the far edge.
    offset_h = tf.random.uniform(shape=(), minval=0, maxval=height - size + 1, dtype=tf.int64)
    offset_w = tf.random.uniform(shape=(), minval=0, maxval=width - size + 1, dtype=tf.int64)
    image = tf.slice(image, [offset_h, offset_w, 0], [size, size, N_CHANNELS])

    # Reshape and Normalize
    # Explicit reshape needed for TPU, tell compiler dimensions of image
    image = tf.reshape(image, [size, size, N_CHANNELS])
    # Some images are smaller than 384x384 and need to be upscaled
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    # Convert to float32 and normalize to range 0-1
    image = tf.cast(image, tf.float32) / 255.0
    # Normalize according to ImageNet mean and standard deviation
    image = (image - IMAGENET_MEAN) / IMAGENET_STD

    return image, label

In [None]:
# Shows a batch of images
def show_batch(dataset, rows=4, cols=3):
    """Plot the first images of the first batch of ``dataset`` in a grid.

    Each image is min-max scaled to the range 0-1 for display. Grid cells for
    which the batch holds no image are left blank.

    Args:
        dataset: Iterable yielding ``(images, labels)`` batches of tensors.
        rows: Number of grid rows.
        cols: Number of grid columns.
    """
    imgs, lbls = next(iter(dataset))
    # squeeze=False keeps axes two-dimensional even for a single row or column
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols*4, rows*4), squeeze=False)
    n_imgs = len(imgs)
    for r in range(rows):
        for c in range(cols):
            idx = r * cols + c
            ax = axes[r, c]
            if idx >= n_imgs:
                # Batch holds fewer images than grid cells
                ax.axis('off')
                continue
            img = imgs[idx].numpy().astype(np.float32)
            # Shift the minimum to 0; adding abs(min) is wrong when min is positive
            img -= img.min()
            peak = img.max()
            # A constant image has peak 0; skip the division to avoid NaNs
            if peak > 0:
                img /= peak
            ax.imshow(img)
            # int() shows the plain label value instead of the tensor repr
            ax.set_title(f'Label: {int(lbls[idx])}')

In [None]:
# Makes a TFRecordDataset iterator
def get_train_dataset():
    """Build a batched dataset from all TFRecords files in the working directory.

    Every ``./*.tfrecords`` file is read, each record is decoded with
    ``decode_tfrecord`` and the results are grouped in batches of 32.
    """
    tfrecord_paths = tf.io.gfile.glob('./*.tfrecords')
    return (
        tf.data.TFRecordDataset(tfrecord_paths, num_parallel_reads=1)
        .map(decode_tfrecord, num_parallel_calls=1)
        .batch(32)
    )

In [None]:
# Sanity Check, plot some images from the freshly created TFRecords
# Reads the TFRecords files written to the working directory and shows the first batch
train_dataset = get_train_dataset()
show_batch(train_dataset)