# Settings

In [None]:
import pathlib

import numpy as np
import pandas as pd
import cv2
import tqdm
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
from tqdm.notebook import tqdm as note_book_tqdm

# Hook the notebook flavour of tqdm into pandas, labelling its bars "progress: ".
note_book_tqdm.pandas(desc="progress: ")

FOLDS = 2  # number of stratified folds; one TFRecord file is written per fold
SEED = 1  # random_state used for the shuffled StratifiedKFold split
IMAGE_SIZE = (384, 384)  # target size handed to cv2.resize for every image

In [None]:
def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature``.

    Eager tensors are converted to their NumPy value first, because
    ``BytesList`` cannot unpack a string directly from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` (float_list)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` (int64_list)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def serialize_example(posting_id, image, title, label_group):
    """Pack one training row into a ``tf.train.Example`` message.

    ``posting_id``, ``image`` and ``title`` are stored as bytes features and
    ``label_group`` as an int64 feature, each keyed by its own name.

    Despite the function's name, the returned message is not serialized;
    the caller is expected to call ``SerializeToString()`` on it before
    writing it to a TFRecord file.
    """
    features = tf.train.Features(
        feature={
            'posting_id': _bytes_feature(posting_id),
            'image': _bytes_feature(image),
            'title': _bytes_feature(title),
            'label_group': _int64_feature(label_group),
        }
    )
    return tf.train.Example(features=features)

# Data Loading

In [None]:
# Load the training metadata; the cells below read its posting_id, image,
# title and label_group columns.
df_train = pd.read_csv('../input/shopee-product-matching/train.csv')

In [None]:
# Re-index the raw label_group ids to consecutive integers 0..n_groups-1,
# numbered in order of first appearance in the dataframe.
unique_groups = df_train['label_group'].unique()
consecutive_ids = np.arange(len(unique_groups))
label_mapper = dict(zip(unique_groups, consecutive_ids))
df_train['label_group'] = df_train['label_group'].map(label_mapper)

In [None]:
# Preview the first rows to check the remapped label_group values.
df_train.head()

In [None]:
# Assign every row to exactly one fold, stratified on label_group.
kfold = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
# `split` yields *positional* row indices, so the fold ids are collected in a
# plain array addressed by position instead of through label-based `.loc`,
# which would only be correct for a default RangeIndex. Building an integer
# array up front also avoids creating a float column and casting it afterwards.
fold_ids = np.full(len(df_train), -1, dtype=int)
for fold, (_, val_ind) in enumerate(kfold.split(df_train, df_train['label_group'])):
    # A row's fold is the split in which it lands in the validation part.
    fold_ids[val_ind] = fold
df_train['fold'] = fold_ids

In [None]:
# Preview the first rows to check the newly added fold column.
df_train.head()

# Convert to TFRecords

In [None]:
def write_tfrecord(df_train, image_dir='../input/shopee-product-matching/train_images/'):
    """Write one TFRecord file per fold from the training dataframe.

    For every fold ``i`` in ``range(FOLDS)``, the rows whose ``fold`` column
    equals ``i`` are written to ``train_{i}.tfrecord`` in the current working
    directory. Each image is read with OpenCV, resized to ``IMAGE_SIZE`` and
    re-encoded as a quality-100 JPEG, then stored together with the row's
    posting id, title and label group.

    Args:
        df_train: DataFrame with ``posting_id``, ``image``, ``title``,
            ``label_group`` and ``fold`` columns.
        image_dir: Directory that the file names in the ``image`` column are
            relative to. Defaults to the original hard-coded location.

    Raises:
        OSError: If an image file is missing or cannot be decoded.
    """
    image_root = pathlib.Path(image_dir)
    # Loop-invariant encoder settings.
    jpeg_params = (cv2.IMWRITE_JPEG_QUALITY, 100)

    for i in range(FOLDS):
        df_fold_train = df_train[df_train['fold'] == i]
        with tf.io.TFRecordWriter(f"train_{i}.tfrecord") as writer:
            for row in df_fold_train.itertuples():
                image_path = str(image_root / row.image)
                image = cv2.imread(image_path)
                # cv2.imread signals failure by returning None rather than
                # raising; fail here with the offending path instead of with
                # an opaque error inside cv2.resize.
                if image is None:
                    raise OSError(
                        f"could not read image (missing or undecodable): {image_path}"
                    )
                image = cv2.resize(image, IMAGE_SIZE)
                image = cv2.imencode('.jpg', image, jpeg_params)[1].tobytes()

                tf_example = serialize_example(
                    row.posting_id.encode('utf-8'),
                    image,
                    row.title.encode('utf-8'),
                    row.label_group,
                )
                writer.write(tf_example.SerializeToString())
        # Report only after the writer has been closed, i.e. once the file is
        # completely written.
        print(f'finish convert train_{i}.tfrecord')

In [None]:
# Generate train_{i}.tfrecord for every fold in the working directory.
write_tfrecord(df_train)

In [None]:
# Save the dataframe, including the remapped label_group and the fold
# column, without writing the index.
df_train.to_csv('train.csv', index = False)