# Shopee product matching triplet data conversion to TFRecords

[To avoid data bottlenecks in TPU, convert data to TFRecords](https://www.kaggle.com/docs/tpu)

This notebook demonstrates how to encode triplet sets into TFRecords. It is based on [mattbast's work in the Google Landmark Retrieval 2020 competition](https://www.kaggle.com/mattbast/google-landmarks-2020-create-a-tfrecord-dataset/notebook).

For the construction of triplet sets, refer to [xhlulu's excellent notebook](https://www.kaggle.com/xhlulu/shopee-generate-data-for-triplet-loss)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import io
from PIL import Image

from tqdm.notebook import tqdm
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from kaggle_datasets import KaggleDatasets

In [None]:
# Notebook-wide settings.
# NOTE(review): NUM_SHARDS is not referenced anywhere in the visible code —
# confirm whether the TFRecord writer was meant to use it.
NUM_SHARDS = 16
# Target size handed to tf.image.resize for every image.
IMAGE_SIZE = (224, 224)
# Seeds both the triplet sampling and the train/test split.
SEED = 42

In [None]:
# Training metadata; the triplet sampler below reads its 'image' and
# 'label_group' columns.
train_csv_path = '../input/shopee-product-matching/train.csv'
df = pd.read_csv(train_csv_path)
display(df)

In [None]:
def generate_triplets(df, seed=None):
    """Build a row-wise sampler that turns one row of ``df`` into a triplet.

    The returned function is intended for ``df.apply(sampler, axis=1)``. For
    each row it keeps the row's image as the anchor, draws a positive from
    the same ``label_group`` and a negative from a different, randomly chosen
    ``label_group``.

    Args:
        df: DataFrame with an ``image`` column and a ``label_group`` column.
        seed: Value passed to ``random.seed``. ``None`` (the default) falls
            back to the notebook-level ``SEED`` constant.

    Returns:
        A function mapping a row to an ``(anchor, positive, negative)`` tuple
        of ``image`` values.

    Raises:
        IndexError: raised by the returned function (via ``random.choice``)
            when the anchor's group holds no other image, or when ``df``
            contains a single group only.
    """
    random.seed(SEED if seed is None else seed)

    # Materialize each group's image list once. Doing this per row (as a
    # DataFrame lookup followed by .tolist()) is loop-invariant work.
    images_by_group = {
        key: frame['image'].tolist()
        for key, frame in df.groupby('label_group')
    }
    group_keys = list(images_by_group)

    def aux(row):
        anchor = row['image']
        own_group = row['label_group']

        # Sample a positive from the same group. Work on a copy so the shared
        # list is untouched, and drop one occurrence of the anchor itself.
        positives = list(images_by_group[own_group])
        positives.remove(anchor)
        positive = random.choice(positives)

        # Sample a group among all the other groups, then sample an image
        # from that group.
        other_groups = list(group_keys)
        other_groups.remove(own_group)
        neg_group = random.choice(other_groups)
        negative = random.choice(images_by_group[neg_group])

        return anchor, positive, negative

    return aux

In [None]:
# Draw one (anchor, positive, negative) triplet for every row of df.
sample_triplet = generate_triplets(df)
triplet_sets = pd.DataFrame(
    df.apply(sample_triplet, axis=1).tolist(),
    columns=['anchor', 'positive', 'negative'],
)
display(triplet_sets)

In [None]:
# Alternative source for the triplets (disabled):
#triplet_sets = pd.read_csv('../input/generate-triplet-data/triplet_sets.csv')

# Turn every image file name into a path inside the training image folder.
train_image_dir = '../input/shopee-product-matching/train_images'
triplet_paths = triplet_sets.applymap(
    lambda file_name: os.path.join(train_image_dir, file_name)
)
display(triplet_paths)

In [None]:
# 80% of the triplet rows go to train_paths and the rest to test_paths;
# random_state=SEED keeps the split reproducible.
train_paths, test_paths = train_test_split(triplet_paths, train_size=0.8, random_state=SEED)

## Data pipeline prototyping

### What method should be used to resize images?

In [None]:
# Compare the resize methods side by side with the untouched original.
filepath = train_paths.iloc[1,0]

methods = ['bilinear', 'lanczos3', 'lanczos5', 'bicubic', 'gaussian', 'nearest', 'area', 'mitchellcubic']
fig = plt.figure(figsize=(30, 30))
ax = fig.subplots(3,3)

image_string = tf.io.read_file(filepath)
original = tf.image.decode_jpeg(image_string, channels=3)

# The top-left panel shows the image exactly as decoded.
ax[0, 0].imshow(original)
ax[0, 0].set_title('original', fontsize=24)

# Must convert dtype to float32 for most resizing methods to work. The
# conversion does not depend on the method, so it is done once up front.
original_float = tf.image.convert_image_dtype(original, tf.float32)

for slot, method in enumerate(methods, start=1):
    # Slot 0 is taken by the original; the methods fill the remaining
    # panels of the 3x3 grid row by row.
    grid_pos = divmod(slot, 3)
    image = tf.image.resize(original_float, IMAGE_SIZE, method=method, antialias=True)

    ax[grid_pos].imshow(image)
    ax[grid_pos].set_title(method, fontsize=24)

All methods look fairly comparable.

Subjective ranking: "bilinear" > ... > "area" > "nearest"

### When should resizing occur in the data pipeline?

In [None]:
def encode(image, method, resize=True):
    """JPEG-encode ``image``, optionally resizing it to IMAGE_SIZE first.

    Args:
        image: Decoded image tensor.
        method: Resize method name forwarded to ``tf.image.resize``; unused
            when ``resize`` is False.
        resize: When True, resize before encoding; when False, encode the
            image as given.

    Returns:
        The result of ``tf.image.encode_jpeg``.
    """
    if not resize:
        return tf.image.encode_jpeg(image, optimize_size=True)

    # Must convert dtype to float32 for most resizing methods to work.
    as_float = tf.image.convert_image_dtype(image, tf.float32)
    resized = tf.image.resize(as_float, IMAGE_SIZE, method=method, antialias=True)
    # Back to uint8 so the image can be encoded to a bytestring for the
    # tfrec. convert_image_dtype is called without saturate=True here, so
    # resized values outside [0, 1] are not clipped before the cast.
    as_uint8 = tf.image.convert_image_dtype(resized, tf.uint8)
    return tf.image.encode_jpeg(as_uint8, optimize_size=True)

# Example decoding function, the counterpart of encode() for this experiment.
def decode(image, method, resize=True):
    """Decode JPEG bytes, optionally rescaling and resizing to IMAGE_SIZE.

    Args:
        image: JPEG-encoded bytes.
        method: Resize method name forwarded to ``tf.image.resize``; unused
            when ``resize`` is False.
        resize: When True, divide the pixel values by 255 and resize; when
            False, return the decoded image untouched.

    Returns:
        The decoded 3-channel image, resized when ``resize`` is True.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    if not resize:
        return decoded

    scaled = tf.cast(decoded, tf.uint8) / 255
    return tf.image.resize(scaled, IMAGE_SIZE, method=method, antialias=True)

In [None]:
# Compare where in the pipeline the resize happens, for every method.
filepath = train_paths.iloc[1,0]

methods = ['bilinear', 'lanczos3', 'lanczos5', 'bicubic', 'gaussian', 'nearest', 'area', 'mitchellcubic']
fig = plt.figure(figsize=(24, 9))
ax = fig.subplots(3,8)

image_string = tf.io.read_file(filepath)
original = tf.image.decode_jpeg(image_string, channels=3)

# Mimic the data pipeline as shown in
# Encoding to tfrec - https://www.kaggle.com/mattbast/google-landmarks-2020-create-a-tfrecord-dataset/notebook
# Decoding from tfrec - https://www.kaggle.com/mattbast/google-landmark-retrieval-triplet-loss/data
#
# One grid row per (resize while encoding, resize while decoding) setup.
resize_stages = [(True, True), (True, False), (False, True)]

for j, (encode_resize, decode_resize) in enumerate(resize_stages):
    # The suffix only depends on the row, so build it once per row.
    title = ' resize' + (' encode' if encode_resize else '') + (' decode' if decode_resize else '')

    for i, method in enumerate(methods):
        # Round trip: encode to tfrec bytes, then decode them again.
        encoded = encode(original, method, resize=encode_resize)
        image = decode(encoded, method, resize=decode_resize)

        panel = ax[j, i]
        panel.imshow(image)
        panel.set_title(method + title)
        panel.axis('off')

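It seems that only "bilinear", "gaussian", "nearest", and "area" survive the original pipeline implementation. A likely explanation: the "lanczos3", "lanczos5", "bicubic", and "mitchellcubic" kernels can overshoot outside the [0, 1] range, and `tf.image.convert_image_dtype` does not saturate by default when converting back to uint8, so out-of-range values wrap around instead of being clipped.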

In [None]:
# Same comparison as before, restricted to the methods that survived.
filepath = train_paths.iloc[1,0]

methods = ['bilinear', 'gaussian', 'nearest', 'area']
fig = plt.figure(figsize=(32, 24))
ax = fig.subplots(3,4)

image_string = tf.io.read_file(filepath)
original = tf.image.decode_jpeg(image_string, channels=3)

# One grid row per (resize while encoding, resize while decoding) setup.
resize_stages = [(True, True), (True, False), (False, True)]

for j, (encode_resize, decode_resize) in enumerate(resize_stages):
    # The suffix only depends on the row, so build it once per row.
    title = ' resize' + (' encode' if encode_resize else '') + (' decode' if decode_resize else '')

    for i, method in enumerate(methods):
        # Round trip: encode to tfrec bytes, then decode them again.
        encoded = encode(original, method, resize=encode_resize)
        image = decode(encoded, method, resize=decode_resize)

        panel = ax[j, i]
        panel.imshow(image)
        panel.set_title(method + title, fontsize=24)
        panel.axis('off')

Best results seem to occur when images are resized during preprocessing. 

The remainder of this notebook resizes the images to 224x224 with the "bilinear" method, matching the input size of EfficientNetB0.

## Triplet encoding to tfrecs

In [None]:
def encode_image(filepath, method='bilinear'):
    """Read a JPEG file, resize it to IMAGE_SIZE and re-encode it as JPEG.

    Args:
        filepath: Path of the JPEG file to read.
        method: Resize method name forwarded to ``tf.image.resize``.

    Returns:
        The result of ``tf.image.encode_jpeg`` for the resized image.
    """
    image_string = tf.io.read_file(filepath)
    image = tf.image.decode_jpeg(image_string, channels=3)
    # Must convert dtype to float32 for most resizing methods to work
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, IMAGE_SIZE, method=method, antialias=True)
    # Convert dtype to uint8 to be encoded to bytestring for tfrec.
    # saturate=True clips values that a resize kernel pushed outside [0, 1];
    # without it the cast to uint8 wraps them around and corrupts the image
    # for methods such as lanczos or bicubic.
    image = tf.image.convert_image_dtype(image, tf.uint8, saturate=True)
    image = tf.image.encode_jpeg(image, optimize_size=True)
    return image

In [None]:
def visualize(df_paths):
    """Plot the first three triplets of ``df_paths`` in a 3x3 grid.

    Each grid row is one row of ``df_paths`` and each grid column is one of
    its first three columns. Every image goes through ``encode_image`` and is
    decoded again, so the plot shows what ends up in the TFRecords.

    Args:
        df_paths: DataFrame whose first three columns hold image file paths;
            it needs at least three rows.
    """

    def show(ax, image):
        ax.imshow(image)
        ax.axis('off')

    fig = plt.figure(figsize=(9, 9))

    axs = fig.subplots(3, 3)
    for i in range(3):
        for j in range(3):
            # Purely positional lookup. Indexing a string-labelled row with
            # an integer (row[j]) relies on a deprecated pandas fallback.
            image = encode_image(df_paths.iloc[i, j])
            image = tf.image.decode_jpeg(image, channels=3)
            show(axs[i,j], image)

visualize(train_paths)

In [None]:
def _bytes_feature(value):
    """Wrap a bytes value in a ``tf.train.Feature`` holding a BytesList.

    Args:
        value: The bytes to store, or a tensor of the type returned by
            ``tf.constant``, which is unwrapped with ``.numpy()`` first.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains the one value.
    """
    tensor_type = type(tf.constant(0))
    # BytesList cannot take the tensor itself, only its underlying value.
    payload = value.numpy() if isinstance(value, tensor_type) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

In [None]:
def serialize_example(row):
    """Serialize one triplet row into a ``tf.train.Example`` byte string.

    Args:
        row: Mapping with 'anchor', 'positive' and 'negative' image paths.

    Returns:
        The serialized Example, with the encoded images stored under
        'anchor_img', 'positive_img' and 'negative_img'.
    """
    # (column of the row, feature key in the Example)
    columns_to_keys = (
        ('anchor', 'anchor_img'),
        ('positive', 'positive_img'),
        ('negative', 'negative_img'),
    )
    feature = {
        key: _bytes_feature(encode_image(row[column]))
        for column, key in columns_to_keys
    }
    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()

In [None]:
def write_tfrecord_file(df, filepath, filename, file_index, file_size, image_indexes):
    """Write one TFRecord shard holding a contiguous slice of the triplets.

    The shard is written to ``{filepath}/{filename}{file_index}.tfrec``, with
    ``file_index`` zero-padded to two digits.

    Args:
        df: DataFrame of triplet paths, looked up by label with ``df.loc``.
        filepath: Directory the shard is written to.
        filename: File name prefix of the shard.
        file_index: Zero-based index of the shard.
        file_size: Number of rows per shard.
        image_indexes: Index labels of ``df``; the shard covers the slice
            ``[file_size * file_index, file_size * (file_index + 1))``.
    """
    # Build the path in a single f-string. Applying %-formatting on top of an
    # f-string breaks as soon as filepath or filename contains a '%'.
    record_path = f'{filepath}/{filename}{file_index:02d}.tfrec'
    start = file_size * file_index
    end = start + file_size
    with tf.io.TFRecordWriter(record_path) as writer:
        for i in tqdm(image_indexes[start:end]):
            writer.write(serialize_example(df.loc[i]))

def df_to_tfrecords(df, filepath, filename, full_shards=15):
    """Write every row of ``df`` into TFRecord shards under ``filepath``.

    The rows are cut into ``full_shards`` equally sized shards; rows left
    over by the integer division go into one additional, smaller shard.

    Args:
        df: DataFrame of triplet paths.
        filepath: Output directory; created when missing.
        filename: File name prefix of the shards.
        full_shards: Number of equally sized shards. The default of 15 is the
            value that used to be hard-coded.

    Raises:
        ValueError: if ``full_shards`` is smaller than 1.
    """
    if full_shards < 1:
        raise ValueError('full_shards must be at least 1')

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(filepath, exist_ok=True)

    image_indexes = df.index.values
    total = len(image_indexes)
    # With fewer rows than shards the integer division yields 0, which used
    # to end in a ZeroDivisionError below; keep at least one row per shard.
    file_size = max(1, total // full_shards)
    file_count = total // file_size + int(total % file_size != 0)
    for file_index in range(file_count):
        print('Writing TFRecord %i of %i...'%(file_index, file_count))
        write_tfrecord_file(df, filepath, filename, file_index, file_size, image_indexes)

In [None]:
# Write the shards of each split into a folder of its own.
for split_paths, out_dir, prefix in (
    (train_paths, './train', 'train'),
    (test_paths, './test', 'test'),
):
    df_to_tfrecords(split_paths, out_dir, prefix)