Hello fellow Kagglers,

This notebook demonstrates how to create TFRecords for the Google Landmark Recognition 2021 competition. Since the dataset is huge, with over 1.5 million images, it has to be split into 3 parts.
The datasets are made public and can be found here:

[Part 1](https://www.kaggle.com/markwijkhuizen/landmark-recognition-2021-tfrecords-384-part-1)

[Part 2](https://www.kaggle.com/markwijkhuizen/landmark-recognition-2021-tfrecords-384-part-2)

[Part 3](https://www.kaggle.com/markwijkhuizen/landmark-recognition-2021-tfrecords-384-part-3)

The resolution is set to 384 with 384 being the smaller side of the image, thus images can for example have a resolution of 384x512 or 512x384.

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from kaggle_datasets import KaggleDatasets
from multiprocessing import cpu_count

import sys
import cv2
import imageio
import joblib
import pickle

# Activate pandas progress apply bar
tqdm.pandas()

# Report the library and interpreter versions used by this run
print(f'tensorflow version: {tf.__version__}')
print(f'tensorflow keras version: {tf.keras.__version__}')
# A stray 'P' was printed in front of the version string before
print(f'python version: {sys.version}')

In [None]:
# Target length in pixels of the smaller side of the image, can be adjusted
IMG_SIZE = 384
# Number of image channels
N_CHANNELS = 3
# Version tag, used as prefix of the TFRecord file names
VERSION = '1A'

# Total number of images
N_ROWS = 1580470
# Higher resolution will require more splits due to the 20GB dataset limit
N_SPLITS = 3
# Number of the part created by this run, counted from 1 up to N_SPLITS
PART_N = 1

In [None]:
# Load the train CSV, parsing ids as strings and landmark ids as unsigned 32-bit integers
dtype = dict(id='string', landmark_id=np.uint32)
train = pd.read_csv(
    '/kaggle/input/landmark-recognition-2021/train.csv',
    dtype=dtype,
)

The landmark ids are not contiguous, as shown below. To get contiguous labels in a single line of code, the landmark ids are converted to categories, where the label is the ordinal encoded category.

In [None]:
# Show the first distinct landmark ids, in order of appearance
print(f'First 10 landmark ids: {train["landmark_id"].unique()[:10]}')

In [None]:
# Ordinal-encode the landmark ids: every id is replaced by the code of its category
train['label'] = train['landmark_id'].astype('category').cat.codes

Create mappings between the labels and landmark ids. This can be needed when converting predicted model labels back to landmark ids for example.

In [None]:
# One row per distinct (label, landmark id) pair
label_pairs = train[['label', 'landmark_id']].drop_duplicates()
# Forward mapping: label -> landmark id
label2landmark_id = label_pairs.set_index('label').squeeze().to_dict()
# Reverse mapping: landmark id -> label
landmark_id2label = {
    landmark_id: label
    for label, landmark_id in label2landmark_id.items()
}

In [None]:
# Persist both mappings as pickle files in the working directory
for pkl_name, mapping in (
    ('label2landmark_id.pkl', label2landmark_id),
    ('landmark_id2label.pkl', landmark_id2label),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(mapping, f)

In [None]:
# Preview the first rows, now including the label column
display(train.head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in display(), which only added a stray "None"
train.info()

# Google Cloud File Paths

In [None]:
def to_gcs_file_path(i):
    """Return the Google Cloud Storage path of the train image with id ``i``.

    The image is located in nested folders named after the first three
    characters of its id, below ``GCS_DS_PATH``.
    """
    gcs_file_path = f'{GCS_DS_PATH}/train/{i[0]}/{i[1]}/{i[2]}/{i}.jpg'
    return gcs_file_path

# Look up the Google Cloud Storage path of the competition dataset
GCS_DS_PATH = KaggleDatasets().get_gcs_path('landmark-recognition-2021')

# Build the GCS path of every image (with progress bar) and store it as a string column
gcs_file_paths = train['id'].progress_apply(to_gcs_file_path)
train['gcs_file_path'] = gcs_file_paths.astype('string')

# File Path

In [None]:
def to_file_path(i):
    """Return the local Kaggle input path of the train image with id ``i``.

    The image is located in nested folders named after the first three
    characters of its id.
    """
    file_path = f'/kaggle/input/landmark-recognition-2021/train/{i[0]}/{i[1]}/{i[2]}/{i}.jpg'
    return file_path

# Build the local file path of every image (with progress bar) and store it as a string column
train['file_path'] = train['id'].progress_apply(to_file_path).astype('string')

In [None]:
# Preview the first rows, now including the file path columns
display(train.head())

In [None]:
# Save the updated train DataFrame. This happens before the part selection,
# so the pickle still contains the rows of all parts
train.to_pickle('train.pkl.xz')

# Part Selection

The dataset has to be split into multiple parts due to limited disk space. In the next cell the start and end index of the DataFrame are computed.

In [None]:
# Half-open row range [START_IDX, END_IDX) of this part.
# Integer arithmetic keeps the boundaries exact; truncating a float product
# with int() can land one row below the true boundary for some settings.
START_IDX = N_ROWS * (PART_N - 1) // N_SPLITS
END_IDX = N_ROWS * PART_N // N_SPLITS

print(f'START_IDX: {START_IDX}, END_IDX: {END_IDX}')

In [None]:
# Rows outside the half-open range [START_IDX, END_IDX) do not belong to this part
in_part = (train.index >= START_IDX) & (train.index < END_IDX)
DROP_IDXS = train.index[~in_part]
train.drop(index=DROP_IDXS, inplace=True)

print(f'DataFrame idx min: {train.index.min()}, idx max: {train.index.max()}')

# Renumber the remaining rows, discarding the old index
train.reset_index(drop=True, inplace=True)

Shuffling the DataFrame is important, as otherwise batches could consist of a single class. Shuffling the DataFrame makes sure every TFRecord will consist of a random set of classes.

In [None]:
# Shuffle DataFrame: sample all rows (frac=1) with a fixed seed
train = train.sample(frac=1, random_state=42)

In [None]:
# Preview the first rows of the selected and shuffled part
display(train.head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in display(), which only added a stray "None"
train.info()

# Process Image

Processing the image is rather easy. The image is resized to have a smaller side of size 384. The computationally intensive LANCZOS algorithm is used for resizing. I am not an expert on resizing algorithms, however the [PIL documentation](https://pillow.readthedocs.io/en/stable/handbook/concepts.html#filters-comparison-table) on filters states this algorithm performs the best.

In [None]:
def process_image(file_path):
    """Load an image and return it as JPEG bytes with its height and width.

    An image whose smaller side exceeds IMG_SIZE is downscaled, keeping the
    aspect ratio, so that its smaller side becomes exactly IMG_SIZE, and is
    re-encoded as JPEG. Any other image is returned as its original file bytes.

    Args:
        file_path: Path of the image file.

    Returns:
        Tuple (img_jpeg, height, width): the JPEG bytes and the size in pixels
        of the image they contain.
    """
    img = imageio.imread(file_path)
    # NOTE(review): the unpacking requires a 3-dimensional array, presumably
    # (height, width, channels); a 2-dimensional array would raise here
    h, w, _ = img.shape

    short_side = min(h, w)
    # Check whether image is bigger than IMG_SIZE
    if short_side > IMG_SIZE:
        # Integer arithmetic: the smaller side becomes exactly IMG_SIZE and the
        # larger side is floored without floating point error. Truncating
        # side * (IMG_SIZE / short_side) could yield IMG_SIZE - 1 instead.
        w_resize = w * IMG_SIZE // short_side
        h_resize = h * IMG_SIZE // short_side
        # Resize using LANCZOS algorithm, cv2 expects the size as (width, height)
        img = cv2.resize(img, (w_resize, h_resize), interpolation=cv2.INTER_LANCZOS4)
        # Save as JPEG with quality set to 70, just as original images
        img_jpeg = tf.io.encode_jpeg(img, quality=70, optimize_size=True).numpy()
        return img_jpeg, h_resize, w_resize
    # Otherwise use original image
    else:
        # Return the file bytes unchanged, so no re-encoding takes place
        with open(file_path, 'rb') as f:
            img_jpeg = f.read()
        return img_jpeg, h, w

# Sample Size

This helper function computes the estimated part size, which should be less than 20GB due to disk space limitations.

In [None]:
def output_size(N):
    """Print the estimated total TFRecord size of this part.

    The mean processed image size is measured on a random sample of ``N``
    images and multiplied by the number of rows in ``train``.

    Args:
        N: Number of images to sample.
    """
    sampled_paths = train['file_path'].sample(N, random_state=42)

    mean_size = 0
    for sample_path in tqdm(sampled_paths):
        # Only the size of the JPEG bytes matters, height and width are ignored
        jpeg_bytes, _, _ = process_image(sample_path)
        mean_size += len(jpeg_bytes) / N

    print(f'Estimated TFRecord output size: {len(train) * mean_size / 2**30:.2f}GB')

# Estimate the output size of this part from a sample of 1000 images
output_size(int(1e3))

# Split in Data Chunks

Split the dataset into chunks of 3000 images to get TFRecords of approximately 100MB.

In [None]:
def split_in_chunks(data, chunk_size):
    """Split a 2D array column-wise into consecutive chunks.

    Args:
        data: 2D array with one column per sample.
        chunk_size: Maximum number of columns per chunk.

    Returns:
        List of column slices of ``data`` with at most ``chunk_size`` columns
        each; only the last chunk can be smaller. The list is empty when
        ``data`` has no columns.
    """
    # Use the chunk_size argument: the global CHUNK_SIZE was read here before,
    # which silently ignored the argument
    n_samples = data.shape[1]
    return [data[:, start:start + chunk_size] for start in range(0, n_samples, chunk_size)]

In [None]:
# Number of images per TFRecord file
CHUNK_SIZE = 3000

# Stack file paths and labels into one array with a row for each, then cut it into chunks
paths_and_labels = np.array((train['file_path'], train['label']))
train_split = split_in_chunks(paths_and_labels, CHUNK_SIZE)

print(f'train_split chunks: {len(train_split)}')

# Make TFRecords

This next function creates the actual TFRecords. The images are processed using my favourite parallelism package [joblib](https://joblib.readthedocs.io/en/latest/#).

In [None]:
def to_tf_records(data_split, name):
    """Write every chunk of ``data_split`` to its own TFRecord file.

    Each chunk is a pair (file paths, labels). The images of a chunk are
    processed with ``process_image`` through joblib and written, together with
    their label, height and width, to
    ``{VERSION}_PART_{PART_N}_{name}_batch_{idx}.tfrecords``.

    Args:
        data_split: Sequence of (file paths, labels) chunks.
        name: Name inserted into the TFRecord file names.
    """
    def int64_feature(value):
        # Wrap a single integer as an int64 feature
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))

    for idx, (file_paths, labels) in enumerate(tqdm(data_split)):
        n_workers = cpu_count()

        # Queue one image processing job per file path and run them through joblib
        image_jobs = [joblib.delayed(process_image)(path) for path in file_paths]
        run_parallel = joblib.Parallel(
            n_jobs=n_workers,
            verbose=0,
            batch_size=64,
            pre_dispatch=64*n_workers,
            require='sharedmem'
        )
        processed_images = run_parallel(image_jobs)

        tfrecord_name = f'{VERSION}_PART_{PART_N}_{name}_batch_{idx}.tfrecords'

        # Write one serialized example per image; zip pairs every processed
        # image with the label at the same position of the chunk
        with tf.io.TFRecordWriter(tfrecord_name) as writer:
            for (jpeg_bytes, height, width), label in zip(processed_images, labels):
                example = tf.train.Example(features=tf.train.Features(feature={
                    # Image as JPEG bytes
                    'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[jpeg_bytes])),
                    # Label, height and width of the image
                    'label': int64_feature(label),
                    'height': int64_feature(height),
                    'width': int64_feature(width),
                }))
                writer.write(example.SerializeToString())

# Create TFRecords for all chunks of this part, with 'train' in the file names
to_tf_records(train_split, 'train')

# Check TFRecords

Check whether the TFRecords were successfully created.

In [None]:
# Imagenet mean and standard deviation per channel
IMAGENET_MEAN = tf.constant([0.485, 0.456, 0.406], dtype=tf.float32)
IMAGENET_STD = tf.constant([0.229, 0.224, 0.225], dtype=tf.float32)

# Number of channels, 3 for RGB images
# NOTE: this rebinds the plain integer N_CHANNELS from the settings cell to an
# int64 tensor, the same dtype as the height/width features it is combined
# with in decode_tfrecord
N_CHANNELS = tf.constant(3, dtype=tf.int64)

In [None]:
# Function to decode the TFRecords
def decode_tfrecord(record_bytes):
    features = tf.io.parse_single_example(record_bytes, {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
        'width': tf.io.FixedLenFeature([], tf.int64),
        'height': tf.io.FixedLenFeature([], tf.int64),
    })

    image = tf.io.decode_jpeg(features['image'])
    label = features['label']
    height = features['height']
    width = features['width']
    
    # Cutout Random Square if image is not square
    if height != width:
        if height > width:
            offset = tf.random.uniform(shape=(), minval=0, maxval=height-width, dtype=tf.int64)
            image = tf.slice(image, [offset, 0, 0], [width, width, N_CHANNELS])
        else:
            offset = tf.random.uniform(shape=(), minval=0, maxval=width-height, dtype=tf.int64)
            image = tf.slice(image, [0, offset, 0], [height, height, N_CHANNELS])
    
    # Reshape and Normalize
    size = tf.math.reduce_min([height, width])
    # Explicit reshape needed for TPU, tell cimpiler dimensions of image
    image = tf.reshape(image, [size, size, N_CHANNELS])
    # Some images are smaller than 384x384 and need to be upscaled
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    # Convert to float32 and normalize to range 0-1
    image = tf.cast(image, tf.float32)  / 255.0
    # Normalize according to ImageNet mean and standard deviation
    image = (image - IMAGENET_MEAN) / IMAGENET_STD
    
    return image, label

In [None]:
# Shows a batch of images
def show_batch(dataset, rows=4, cols=3):
    """Plot the first ``rows * cols`` images of the first batch of ``dataset``.

    Every image is min-max scaled to the range 0-1 for display and titled with
    its label. Grid cells without an image are hidden.

    Args:
        dataset: Iterable whose first element is a batch (images, labels).
        rows: Number of rows of the image grid.
        cols: Number of columns of the image grid.
    """
    imgs, lbls = next(iter(dataset))
    n_imgs = imgs.shape[0]
    # squeeze=False keeps axes two-dimensional, also when rows or cols is 1
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols*4, rows*4), squeeze=False)
    for r in range(rows):
        for c in range(cols):
            idx = r * cols + c
            # The batch can contain fewer images than the grid has cells
            if idx >= n_imgs:
                axes[r, c].axis('off')
                continue
            img = imgs[idx].numpy().astype(np.float32)
            # Shift the minimum to 0, also when the minimum is positive
            img -= img.min()
            # Scale the maximum to 1, a constant image is left at 0
            peak = img.max()
            if peak > 0:
                img /= peak
            axes[r, c].imshow(img)
            # .numpy() shows the plain label instead of the tensor representation
            axes[r, c].set_title(f'Label: {lbls[idx].numpy()}')

In [None]:
def get_train_dataset():
    """Build a dataset of decoded image batches from the local TFRecords.

    Reads all ``*.tfrecords`` files in the working directory, decodes every
    record with ``decode_tfrecord`` and groups the results in batches of 32.

    Returns:
        Batched dataset of (image, label) pairs.
    """
    tfrecord_paths = tf.io.gfile.glob('./*.tfrecords')
    records = tf.data.TFRecordDataset(tfrecord_paths, num_parallel_reads=1)
    decoded = records.map(decode_tfrecord, num_parallel_calls=1)
    return decoded.batch(32)

In [None]:
# Read the TFRecords in the working directory and plot one batch as a visual check
train_dataset = get_train_dataset()
show_batch(train_dataset)