Hi!
<br>It's no secret that using TFRecords provides better efficiency with the tf.data.* API.
<br>I've tried to write code that is as compact as possible for generating TFRecords for the Shopee product matching competition.
<br>Of course, splitting the dataset is not mandatory.
<br>You can find the resulting dataset here: https://www.kaggle.com/alturutin/shopee-product-match-tfrecords
<br>I hope it helps someone.

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

In [None]:
# make tfrecords

def image_feature(value):
    """Encode an image tensor as JPEG and wrap the bytes in a ``tf.train.Feature``."""
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    payload = tf.train.BytesList(value=[jpeg_bytes])
    return tf.train.Feature(bytes_list=payload)


def bytes_feature(value):
    """Return a single-element bytes_list feature built from a ``str`` or ``bytes`` value.

    Args:
        value: Text (encoded with the default UTF-8 codec) or raw ``bytes``
            (stored unchanged).

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` holds exactly one entry.
    """
    # Only text needs encoding; ``bytes`` has no ``.encode()`` and is already
    # in the form ``BytesList`` expects.
    if isinstance(value, str):
        value = value.encode()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def float_feature(value):
    """Wrap one float / double in a single-element float_list feature."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)


def int64_feature(value):
    """Wrap one bool / enum / int / uint in a single-element int64_list feature."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)


def float_feature_list(value):
    """Wrap a sequence of floats / doubles in one float_list feature."""
    wrapped = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=wrapped)


def preprocess_img(file_path, img_size=512):
    """Load an image file and return it as a square ``uint8`` RGB tensor.

    Args:
        file_path: Path of the image file to read.
        img_size: Height and width of the output, in pixels. The image is
            resized to ``img_size x img_size`` (aspect ratio is not kept).

    Returns:
        A ``tf.uint8`` tensor with 3 channels, ready for ``tf.io.encode_jpeg``.
    """
    raw = tf.io.read_file(file_path)
    # By default ``decode_image`` yields a 4-D tensor for GIF input, which the
    # JPEG encoder used downstream cannot handle. ``expand_animations=False``
    # keeps the result 3-D for every supported format (first frame for GIFs).
    image = tf.image.decode_image(raw, channels=3, expand_animations=False)
    # ``resize`` returns floats, so cast back to uint8 afterwards.
    image = tf.image.resize(image, [img_size, img_size])
    return tf.cast(image, tf.uint8)

def create_example(image, label, label_name):
    """Build a ``tf.train.Example`` with ``image``, ``label`` and ``label_name`` features.

    The image is stored as JPEG bytes; both ``label`` and ``label_name`` are
    stored as int64 values.
    """
    features = tf.train.Features(
        feature={
            "image": image_feature(image),
            "label": int64_feature(label),
            "label_name": int64_feature(label_name),
        }
    )
    return tf.train.Example(features=features)

def parse_tfrecord_fn(example):
    """Parse one serialized example and decode its JPEG image to 3 channels.

    Returns a dict with the keys ``image`` (decoded tensor), ``label_name``
    and ``label`` (both scalar int64).
    """
    scalar_int = tf.io.FixedLenFeature([], tf.int64)
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label_name": scalar_int,
        "label": scalar_int,
    }
    parsed = tf.io.parse_single_example(example, schema)
    # Replace the raw JPEG bytes with the decoded image.
    parsed["image"] = tf.io.decode_jpeg(parsed["image"], channels=3)
    return parsed


def validation_split(df, test_size=0.5, random_state=2021):
    """Shuffle ``df`` and split it into train and validation arrays.

    Args:
        df: Frame with an ``image`` column (inputs) and a ``label_group``
            column (targets).
        test_size: Share (or absolute number) of rows put in the validation
            part; forwarded to ``train_test_split``. Defaults to the original
            50/50 split.
        random_state: Seed for the shuffle, so the split is reproducible.

    Returns:
        Tuple ``(x_train, x_val, y_train, y_val)`` of 1-D NumPy arrays.
    """
    # Split the ``image`` Series directly. Going through a one-column
    # DataFrame and ``squeeze()`` collapses a single-row split to a scalar,
    # which has no ``.values`` attribute.
    images_train, images_val, labels_train, labels_val = train_test_split(
        df['image'],
        df['label_group'],
        shuffle=True,
        random_state=random_state,
        test_size=test_size,
    )
    return (
        images_train.values,
        images_val.values,
        labels_train.values,
        labels_val.values,
    )

def preprocess_train(df):
    """Add a ``matches`` column and integer-encode ``label_group``.

    ``df`` is modified in place: ``matches`` receives the space-joined unique
    ``posting_id`` values of the row's label group, and ``label_group`` is
    overwritten with the encoded labels.

    Returns:
        A tuple of the ``posting_id`` / ``image`` / ``label_group`` columns and
        the fitted ``LabelEncoder``.
    """
    ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
    df['matches'] = df['label_group'].map(ids_by_group).apply(' '.join)

    encoder = LabelEncoder()
    encoded_groups = encoder.fit_transform(df['label_group'])
    df['label_group'] = encoded_groups

    selected = df[['posting_id', 'image', 'label_group']]
    return selected, encoder



In [None]:
# Load the competition metadata, encode the label groups, then split the
# image names / encoded labels into train and validation arrays.
train, label_encoder = preprocess_train(
    pd.read_csv('../input/shopee-product-matching/train.csv')
)
x_train, x_val, y_train, y_val = validation_split(train)

In [None]:
# Create the output folders for both image resolutions. Unlike a bare
# `mkdir`, this also succeeds when the cell is re-run and the folders exist.
for size_dir in ("tfrec_512x512", "tfrec_300x300"):
    for split_dir in ("train", "valid"):
        os.makedirs(os.path.join(size_dir, split_dir), exist_ok=True)

In [None]:
def write_tfrecords(x, y, label_encoder, sample_size, path, img_size,
                    image_dir="../input/shopee-product-matching/train_images"):
    """Write images and labels to sharded TFRecord files ``{path}/{i}.tfrec``.

    Args:
        x: Sequence of image file names, relative to ``image_dir``.
        y: Sequence of encoded integer labels, aligned with ``x``.
        label_encoder: Fitted encoder; its ``inverse_transform`` recovers the
            original label stored under ``label_name``.
        sample_size: Maximum number of examples per shard.
        path: Output directory for the shards (created if missing).
        img_size: Side length, in pixels, the images are resized to.
        image_dir: Directory that holds the source images.
    """
    # The shard count must follow the data actually passed in; using the
    # notebook-level ``x_train`` here miscounts shards for any other input.
    n_shards = int(np.ceil(len(x) / sample_size))
    # ``TFRecordWriter`` does not create missing directories itself.
    tf.io.gfile.makedirs(path)
    for i in tqdm(range(n_shards)):
        start, stop = i * sample_size, (i + 1) * sample_size
        images = x[start:stop]
        labels = y[start:stop]
        source_labels = label_encoder.inverse_transform(labels)
        with tf.io.TFRecordWriter(f"{path}/{i}.tfrec") as writer:
            for im, lb, sl in zip(images, labels, source_labels):
                image = preprocess_img(f"{image_dir}/{im}", img_size)
                example = create_example(image, lb, sl)
                writer.write(example.SerializeToString())
                
# One entry per output folder, processed in order:
# (image names, labels, destination, image side).
tfrecord_jobs = [
    (x_train, y_train, "tfrec_300x300/train", 300),
    (x_val, y_val, "tfrec_300x300/valid", 300),
    (x_train, y_train, "tfrec_512x512/train", 512),
    (x_val, y_val, "tfrec_512x512/valid", 512),
]
for job_images, job_labels, job_path, job_size in tfrecord_jobs:
    write_tfrecords(job_images, job_labels, label_encoder, 1024, job_path, job_size)


In [None]:
# List the 300x300 training shards, with file sizes shown in MB units.
!ls -l --block-size=MB tfrec_300x300/train/

In [None]:
# List the 300x300 validation shards, with file sizes shown in MB units.
!ls -l --block-size=MB tfrec_300x300/valid/

In [None]:
# List the 512x512 training shards, with file sizes shown in MB units.
!ls -l --block-size=MB tfrec_512x512/train/

In [None]:
# List the 512x512 validation shards, with file sizes shown in MB units.
!ls -l --block-size=MB tfrec_512x512/valid/

In [None]:
# read tfrecords example 

# import matplotlib.pyplot as plt

# dataset = tf.data.TFRecordDataset(sorted(tf.io.gfile.glob('tfrec_512x512/train/*.tfrec')))
# dataset = dataset.map(parse_tfrecord_fn)

# for features in dataset.take(1):
#     for key in features.keys():
#         if key != "image":
#             print(f"{key}: {features[key]}")
#     print(f"Image shape: {features['image'].shape}")
#     plt.figure(figsize=(7, 7))
#     plt.imshow(features["image"].numpy())
#     plt.show()
