Hello everyone, I am a new Kaggler and happy to join this competition with all of you.

I wrote this notebook to convert the training data to TFRecord files and save them to my Google Cloud Storage bucket.
I will share this dataset once the conversion has finished.

In this first version, each image is resized to 224x224x3 and the results are saved into 128 TFRecord files.

In [None]:
import pandas as pd
import os
import os, sys, math
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from google.cloud import storage
from PIL import Image
# Passed as num_parallel_calls to Dataset.map so tf.data picks the parallelism level at runtime.
AUTO = tf.data.experimental.AUTOTUNE 

In [None]:
# Competition metadata CSV; read into train_df with pd.read_csv.
TRAIN_META_PATH = '../input/landmark-recognition-2021/train.csv'
# Service-account key file handed to gcloud below.
CRE_PATH = '../input/dtjson/datalab-284706-d4063c8726d8.json'
# IPython shell escape: {CRE_PATH} is interpolated by the notebook before the command runs.
# Presumably needed so the later writes to the gs:// output prefix are authorized — confirm.
!gcloud auth activate-service-account --key-file '{CRE_PATH}'

In [None]:
# Load the training metadata; later cells read its 'id' and 'landmark_id' columns.
train_df = pd.read_csv(TRAIN_META_PATH)
# Notebook display only: preview the first three rows.
train_df.head(3)

In [None]:
def _train_image_path(image_id):
    """Return the on-disk path of a training image.

    Images are stored in nested directories named after the first three
    characters of the image id: ``train/<c0>/<c1>/<c2>/<id>.jpg``.
    """
    nested_dirs = (image_id[0], image_id[1], image_id[2])
    return os.path.join('../input/landmark-recognition-2021/train', *nested_dirs, image_id + '.jpg')


train_df['path'] = train_df['id'].apply(_train_image_path)
# Pack "<path>,<landmark_id>" into one string so a single string tensor can carry both
# fields through tf.data; it is split on ',' again when the image is decoded.
train_df['path_label'] = (
    train_df['path']
    + ','
    + train_df['landmark_id'].astype(str)
)

In [None]:
# Notebook display only: check that the new 'path' and 'path_label' columns look right.
train_df.head(2)

In [None]:
# Sanity check: open the image of row 11 and display it resized to 224x224.
Image.open(train_df.iloc[11].path).resize((224,224))

In [None]:
# Prefix for output file names; each shard is written as "<prefix><shard>-<count>.tfrec".
GCS_OUTPUT = 'gs://challenge_ngoan/gglm21_kaggle/tfrecords-224x224/train_'
# Number of TFRecord files to split the training set into.
SHARDS = 128
# [height, width] of the stored images (index 0 is read as height, index 1 as width).
TARGET_SIZE = [224, 224]
# Sorted so that the landmark_id -> class index mapping is stable and reproducible;
# a bare list(set(...)) leaves the order up to set iteration order.
CLASSES = sorted(set(train_df.landmark_id))
nb_images = len(train_df['path_label'])
# Round up so that SHARDS batches are enough to cover every image.
shard_size = math.ceil(nb_images / SHARDS)
print("Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each.".format(nb_images, SHARDS, shard_size))

In [None]:
def decode_jpeg_and_label(filename_label):
    """Read and decode one training image and return it with its label.

    Args:
        filename_label: scalar string tensor of the form
            "<image path>,<landmark id>".

    Returns:
        A tuple ``(image, label)``: the decoded uint8 image tensor with three
        channels, and the label as a scalar string tensor (the text after the
        comma).
    """
    # Wrap the scalar in a length-1 vector so the split yields a ragged tensor
    # whose flat values are the comma-separated fields.
    fields = tf.strings.split(tf.expand_dims(filename_label, axis=-1), sep=',').values
    filename = fields[0]
    label = fields[1]
    bits = tf.io.read_file(filename)
    # Force three output channels: without this a grayscale JPEG decodes to a
    # single channel and the stored records would not all be HxWx3.
    image = tf.image.decode_jpeg(bits, channels=3)
    return image, label

def resize_and_crop_image(image, label):
    """Resize an image to TARGET_SIZE and re-encode it as JPEG.

    Despite the name, no cropping is performed: the image is resized straight
    to the target height and width, so the aspect ratio is not preserved.

    Args:
        image: decoded image tensor of shape [height, width, channels].
        label: label tensor, passed through unchanged.

    Returns:
        A tuple ``(jpeg_bytes, label, height, width)`` where ``jpeg_bytes`` is
        the JPEG-encoded resized image and ``height``/``width`` are the
        dimensions of the resized image.
    """
    target_height = TARGET_SIZE[0]
    target_width = TARGET_SIZE[1]
    # Resize to [height, width]; the previous [tw, tw] only worked because the
    # target happened to be square.
    image = tf.image.resize(image, [target_height, target_width])
    resized_shape = tf.shape(image)
    height = resized_shape[0]
    width = resized_shape[1]
    # tf.image.resize returns floats; encode_jpeg requires uint8.
    image = tf.cast(image, tf.uint8)
    image = tf.image.encode_jpeg(image, optimize_size=False, chroma_downsampling=False)
    return image, label, height, width

# NOTE: from_tensor_slices keeps the rows in train_df order; nothing here shuffles the images.
filenames = tf.data.Dataset.from_tensor_slices(list(train_df['path_label']))
# File reads and JPEG decoding dominate the cost, so run them in parallel too
# (map keeps element order by default).
dataset1 = filenames.map(decode_jpeg_and_label, num_parallel_calls=AUTO)

dataset2 = dataset1.map(resize_and_crop_image, num_parallel_calls=AUTO)
# One batch of shard_size elements becomes one TFRecord file.
dataset2 = dataset2.batch(shard_size)

In [None]:
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature`` (bytes_list)."""
    bytes_payload = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=bytes_payload)

def _int_feature(list_of_ints):
    """Wrap a list of integers in a ``tf.train.Feature`` (int64_list)."""
    int_payload = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=int_payload)

def _float_feature(list_of_floats):
    """Wrap a list of floats in a ``tf.train.Feature`` (float_list)."""
    float_payload = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=float_payload)
  

def to_tfrecord(tfrec_filewriter, img_bytes, label, height, width):
    """Build the ``tf.train.Example`` for one image.

    Args:
        tfrec_filewriter: unused; kept so existing callers keep working.
        img_bytes: JPEG-encoded image bytes.
        label: landmark id of the image, as text (bytes or str) or as an int.
        height: height of the stored image.
        width: width of the stored image.

    Returns:
        A ``tf.train.Example`` with the features "image", "class", "label",
        "size" and "one_hot_class".

    Raises:
        ValueError: if ``label`` is not an integer or is not in CLASSES.
    """
    # The label arrives as text (it was split out of the "path,label" string),
    # while CLASSES holds the landmark_id values themselves. Comparing the two
    # directly never matches, which silently gave every record class 0.
    landmark_id = int(label)
    class_num = CLASSES.index(landmark_id)  # position in CLASSES defines the class index

    # Build only the one-hot row; np.eye(len(CLASSES))[class_num] allocated a
    # full n x n matrix for every record.
    one_hot_class = np.zeros(len(CLASSES))
    one_hot_class[class_num] = 1.0

    # The bytes feature needs bytes; accept str/int labels as well.
    label_bytes = label if isinstance(label, bytes) else str(label).encode()

    feature = {
        "image": _bytestring_feature([img_bytes]),               # one image in the list
        "class": _int_feature([class_num]),                      # one class in the list
        "label": _bytestring_feature([label_bytes]),             # fixed length (1) list of strings, the text label
        "size": _int_feature([height, width]),                   # fixed length (2) list of ints
        "one_hot_class": _float_feature(one_hot_class.tolist()), # list of floats, n=len(CLASSES)
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
print("Writing TFRecords")
# Each batch of dataset2 is written out as one TFRecord file.
for shard, (image, label, height, width) in enumerate(dataset2):
    # Convert each batch tensor to NumPy once, not once per record.
    images_np = image.numpy()
    labels_np = label.numpy()
    heights_np = height.numpy()
    widths_np = width.numpy()
    # Use a fresh name so the global shard_size used to batch dataset2 is not overwritten
    # (the last batch may be smaller than the others).
    records_in_shard = images_np.shape[0]
    filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard, records_in_shard)
    with tf.io.TFRecordWriter(filename) as out_file:
        for img_bytes, label_text, img_height, img_width in zip(images_np, labels_np, heights_np, widths_np):
            example = to_tfrecord(out_file, img_bytes, label_text, img_height, img_width)
            out_file.write(example.SerializeToString())
    # Report once per file, after it has been closed, with the real record count.
    print("Wrote file {} containing {} records".format(filename, records_in_shard))