# Shopee product matching triplet data conversion to TFRecords

Kaggle lets competitors speed up deep learning model training with tensor processing units (TPUs). To use them, however, the data must be converted to the TFRecord format and served from Google Cloud Storage (GCS). [Conversion to TFRecords enables one to fully take advantage of the extra processing power TPUs provide by avoiding data bottlenecks](https://www.kaggle.com/docs/tpu).

This notebook demonstrates the encoding of the Shopee image data into TFRecords. It is based on [mattbast's work in the Google Landmark Retrieval 2020 competition](https://www.kaggle.com/mattbast/google-landmarks-2020-create-a-tfrecord-dataset/notebook).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import io
from PIL import Image

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from kaggle_datasets import KaggleDatasets

For large datasets, it is good practice to split TFRecords into "shards". 

In [None]:
# Intended number of TFRecord files ("shards") for the dataset.
NUM_SHARDS = 16
# Size every image is resized to before being re-encoded (passed to tf.image.resize).
IMAGE_SIZE = (600, 600)
# Random seed (not referenced by the cells visible in this notebook section).
SEED = 42

# Directory holding the competition's training images.
IMAGE_DIR = '../input/shopee-product-matching/train_images'

## Basic EDA for confirmation

In [None]:
# Load the competition's training metadata and show it for a quick visual check.
df = pd.read_csv('../input/shopee-product-matching/train.csv')
display(df)

In [None]:
# Count distinct posting_ids per label_group, sorted ascending so the largest groups are listed last.
df.groupby(['label_group'])['posting_id'].nunique().sort_values()

This confirms that the maximum size of a label group is 50, as stated in the competition rules.

## Detecting matches

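The following two cells attach to each row the list of `posting_id`s that share its `label_group` (its matches) and then re-encode `label_group` as consecutive integers. This is not required, but may be useful for model training.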

In [None]:
# For every label_group, gather the distinct posting_ids that belong to it.
postings_per_group = df.groupby(['label_group'])['posting_id'].unique()
match_map = postings_per_group.to_dict()
# Give each row the full posting_id array of its own group (the row's own id included).
df['matches'] = df['label_group'].map(match_map)
df

In [None]:
# Re-encode the raw label_group ids as consecutive integers 0..n_groups-1.
distinct_groups = df['label_group'].unique()
label_mapper = dict(zip(distinct_groups, np.arange(len(distinct_groups))))
df['label_group'] = df['label_group'].map(label_mapper)
display(df)

## Convert dataframe to TFRecords

In [None]:
def encode_image(filepath, method='bilinear'):
    """Read a JPEG file, resize it to ``IMAGE_SIZE`` and re-encode it as JPEG.

    Args:
        filepath: Path of the JPEG image to load.
        method: Resize method forwarded to ``tf.image.resize``.

    Returns:
        The output of ``tf.image.encode_jpeg`` for the resized 3-channel image.
    """
    raw_bytes = tf.io.read_file(filepath)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    # Most resize methods need floating-point input, so go through float32.
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    resized = tf.image.resize(as_float, IMAGE_SIZE, method=method, antialias=True)
    # JPEG encoding needs uint8 pixels again.
    as_uint8 = tf.image.convert_image_dtype(resized, tf.uint8)
    return tf.image.encode_jpeg(as_uint8, optimize_size=True)

def featurize(val):
    """Wrap a single value in the matching ``tf.train.Feature``.

    Dispatch, in order:
      * ``tf.Tensor``          -> unwrapped with ``.numpy()`` and then handled
                                  like the resulting value
      * ``str``                -> encoded to bytes, stored in a ``BytesList``
      * ``bytes``              -> stored in a ``BytesList``
      * ``int``/``np.integer`` -> stored in an ``Int64List``
      * ``float``/``np.floating`` -> stored in a ``FloatList``

    Args:
        val: The value to wrap.

    Returns:
        A ``tf.train.Feature`` holding ``val`` as its single element.

    Raises:
        TypeError: If ``val`` (after tensor unwrapping) is none of the
            supported types.
    """
    if isinstance(val, tf.Tensor):
        # NOTE(review): .numpy() is expected to exist on eager tensors only, and a
        # scalar string tensor is expected to come back as `bytes` - see the
        # tf.Tensor.numpy documentation.
        val = val.numpy()
    if isinstance(val, str):
        val = val.encode()
    if isinstance(val, bytes):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[val]))
    if isinstance(val, (int, np.integer)):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[val]))
    if isinstance(val, (float, np.floating)):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[val]))
    # TypeError is a subclass of Exception, so `except Exception` callers still work.
    raise TypeError(f'Cannot featurize due to type {type(val)}')

In [None]:
def serialize_example(row):
    """Turn one dataframe row into a serialized ``tf.train.Example``.

    The row's ``image`` entry (a file name under ``IMAGE_DIR``) is replaced by
    the encoded image from ``encode_image``; its ``matches`` entry is converted
    to a tensor and serialized with ``tf.io.serialize_tensor``. Every entry is
    then wrapped with ``featurize``.

    Args:
        row: A row of the dataframe; must provide ``image`` and ``matches``.

    Returns:
        The ``SerializeToString()`` output of the resulting ``tf.train.Example``.
    """
    record = row.to_dict()
    record['image'] = encode_image(os.path.join(IMAGE_DIR, record['image']))
    matches_tensor = tf.convert_to_tensor(record['matches'])
    record['matches'] = tf.io.serialize_tensor(matches_tensor)

    wrapped = {name: featurize(value) for name, value in record.items()}
    proto = tf.train.Example(features=tf.train.Features(feature=wrapped))
    return proto.SerializeToString()

In [None]:
def write_tfr(df, filepath, filename, file_index, file_size, image_indexes):
    """Write one TFRecord shard.

    The shard contains the rows whose index labels are
    ``image_indexes[file_size * file_index : file_size * (file_index + 1)]``
    and is written to ``{filepath}/{filename}{file_index:02d}.tfrec``.

    Args:
        df: Dataframe whose rows are serialized with ``serialize_example``.
        filepath: Output directory.
        filename: Shard file-name prefix.
        file_index: Zero-based, non-negative shard number.
        file_size: Number of records per shard.
        image_indexes: Index labels of ``df`` in the order they are written.
    """
    # Build the path with a single f-string: the previous f-string + '%'
    # combination broke whenever filepath or filename contained a literal '%'.
    shard_path = f'{filepath}/{filename}{file_index:02d}.tfrec'
    start = file_size * file_index
    end = start + file_size
    with tf.io.TFRecordWriter(shard_path) as writer:
        for i in tqdm(image_indexes[start:end]):
            writer.write(serialize_example(df.loc[i]))

def to_tfr(df, filepath, filename, num_shards=None):
    """Write ``df`` to disk as a set of evenly sized TFRecord shards.

    Args:
        df: Dataframe to serialize; each row becomes one record.
        filepath: Output directory (created if missing).
        filename: Shard file-name prefix passed on to ``write_tfr``.
        num_shards: Maximum number of shards to write. Defaults to the
            module-level ``NUM_SHARDS``.

    Raises:
        ValueError: If ``num_shards`` is smaller than 1.
    """
    os.makedirs(filepath, exist_ok=True)
    image_indexes = df.index.values
    total = len(image_indexes)
    if total == 0:
        # Nothing to shard; the old size computation divided by zero here.
        print('No rows to write; no TFRecord files created.')
        return
    if num_shards is None:
        num_shards = NUM_SHARDS
    if num_shards < 1:
        raise ValueError(f'num_shards must be >= 1, got {num_shards}')
    # Ceil division keeps the shards balanced. The former hard-coded
    # `// 15` left a final shard holding only the remainder (at most 14 records).
    file_size = -(-total // num_shards)
    file_count = -(-total // file_size)
    for file_index in range(file_count):
        print(f'Writing TFRecord {file_index + 1} of {file_count}...')
        write_tfr(df, filepath, filename, file_index, file_size, image_indexes)

In [None]:
# Write the whole dataframe as TFRecord shards named train<NN>.tfrec under ./train.
to_tfr(df, './train', 'train')