Previous notebooks:
1. https://www.kaggle.com/cdeotte/how-to-create-tfrecords
2. https://www.kaggle.com/lhagiimn/shopee-tf-with-tpu-256x256-tfrecs


Split the dataset, then create TFRecords for the train and validation sets.

In [None]:
# LOAD LIBRARIES
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt, cv2
import tensorflow as tf, re, math
import random
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import tensorflow_hub as hub

In [None]:
# Directory that holds the raw training images.
PATH = '../input/shopee-product-matching/train_images/'

# Set your image size: every image is resized to img_size x img_size
# before it is encoded into a TFRecord.
img_size = 512

IMGS = os.listdir(PATH)
print('There are %i train images' % len(IMGS))

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and TensorFlow with ``seed`` and
    stores the same value in the ``PYTHONHASHSEED`` environment variable.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

def get_data(rate = 0.2):
    """Load the training table and split it into train and validation sets.

    The CSV is read from the competition input directory, then:

    * ``image_name`` keeps the bare file name and ``image`` becomes the
      full path to the file,
    * ``label_group`` is label-encoded to consecutive integers,
    * ``matches`` holds the space-joined ``posting_id`` values of every
      row sharing the same ``label_group``.

    Training rows are drawn round-robin over the label groups: each pass
    picks one random, not-yet-used row from every group that still has
    rows left, until ``int(n_rows * (1 - rate))`` rows are selected. All
    remaining rows form the validation set.

    Args:
        rate: Fraction of the rows to hold out for validation.

    Returns:
        A ``(train_set, val_set)`` tuple of DataFrames, each with a fresh
        0-based index.

    Raises:
        ValueError: If ``rate`` asks for more training rows than the table
            contains (the selection loop could never finish).
    """
    df = pd.read_csv('../input/shopee-product-matching/train.csv')
    df['image_name'] = df['image']
    df['image'] = ('../input/shopee-product-matching/train_images/' + df['image'])
    le = preprocessing.LabelEncoder()
    df['label_group'] = le.fit_transform(df['label_group'].values)
    tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
    df['matches'] = df['label_group'].map(tmp)
    df['matches'] = df['matches'].apply(lambda x: ' '.join(x))

    target = int(df.shape[0] * (1-rate))
    if target > df.shape[0]:
        # Without this guard the sampling loop below spins forever once
        # every label group has been exhausted.
        raise ValueError(
            'rate=%r requests %i training rows but only %i rows exist'
            % (rate, target, df.shape[0]))

    labels = set(df['label_group'])
    # Collect the row indices of each label in one pass over the table
    # (row order inside each list matches the table order), instead of
    # building a boolean mask over the whole table for every label.
    label_dic = {label: [] for label in labels}
    for row_index, label in zip(df.index, df['label_group']):
        label_dic[label].append(row_index)

    train_index = []
    while len(train_index) < target:
        for label in labels:
            candidates = label_dic[label]
            if candidates:
                index = random.choice(candidates)
                train_index.append(index)
                # Remove the pick so a row is never selected twice.
                candidates.remove(index)
                if len(train_index) >= target:
                    break

    train_set = df.loc[train_index,:].reset_index(drop = True)
    val_set = df.drop(train_index).reset_index(drop = True)
    return train_set, val_set

In [None]:
def _bytes_feature(value):
    """Return a ``tf.train.Feature`` holding ``value`` as a one-item BytesList.

    A value that is an eager tensor is converted with ``.numpy()`` first,
    because BytesList won't unpack a string from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
    """Return a ``tf.train.Feature`` holding a bool / enum / int / uint as a one-item Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def serialize_example(feature0, feature1, feature2, feature3, 
                     feature4, feature5, feature6):
    """Serialize one sample into a ``tf.train.Example`` byte string.

    The positional arguments map, in order, to the feature keys ``image``,
    ``image_name``, ``posting_id``, ``image_phash``, ``title``,
    ``label_group`` and ``matches``. ``label_group`` is stored as an int64
    feature; every other field is stored as a bytes feature.
    """
    leading_bytes_fields = (
        ('image', feature0),
        ('image_name', feature1),
        ('posting_id', feature2),
        ('image_phash', feature3),
        ('title', feature4),
    )
    feature = {key: _bytes_feature(raw) for key, raw in leading_bytes_fields}
    feature['label_group'] = _int64_feature(feature5)
    feature['matches'] = _bytes_feature(feature6)

    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()

In [None]:
def create_train_recored(x_train):
    """Write the training split to sharded TFRecord files.

    Rows are written in order, at most ``SIZE`` per shard, to files named
    ``train<seed>-<shard>-<count>.tfrec`` in the working directory. Each
    image is read with OpenCV, resized to ``img_size`` x ``img_size`` and
    stored as a quality-94 JPEG together with the row's metadata.

    Reads the module-level names ``SIZE``, ``seed`` and ``img_size``.

    Args:
        x_train: DataFrame with a 0-based integer index and the columns
            ``image`` (file path), ``image_name``, ``posting_id``,
            ``image_phash``, ``title``, ``label_group`` and ``matches``.

    Raises:
        OSError: If an image file cannot be read by ``cv2.imread``.
    """
    n_rows = x_train.shape[0]
    # Number of shards: ceil(n_rows / SIZE).
    CT = n_rows//SIZE + int(n_rows%SIZE!=0)
    counter = 0
    for j in range(CT):
        print(); print('Writing TFRecord %i of %i...'%(j,CT))
        # The last shard may hold fewer than SIZE rows.
        CT2 = min(SIZE,n_rows-j*SIZE)
        with tf.io.TFRecordWriter('train%.4i-%.2i-%i.tfrec'%(seed,j,CT2)) as writer:
            for k in range(CT2):
                row = x_train.loc[counter]
                counter += 1

                img = cv2.imread(row['image'])
                if img is None:
                    # imread signals failure by returning None; fail here
                    # with the offending path instead of inside cvtColor.
                    raise OSError('Could not read image: %s' % row['image'])

                # NOTE(review): imread presumably already returns BGR, the
                # channel order imencode expects, so this swap may store
                # R/B-swapped JPEGs. Kept unchanged because downstream
                # readers may rely on it; confirm before removing.
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
                img = cv2.resize(img,(img_size,img_size),interpolation = cv2.INTER_AREA)
                # tobytes() replaces the deprecated ndarray.tostring() alias.
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()

                example = serialize_example(
                    img,
                    str.encode(row['image_name']),
                    str.encode(row['posting_id']),
                    str.encode(row['image_phash']),
                    str.encode(row['title']),
                    row['label_group'],
                    str.encode(row['matches']),)
                writer.write(example)
                if k%100==0: print(k,', ',end='')

In [None]:
def create_val_recored(x_val):
    """Write the validation split to sharded TFRecord files.

    Rows are written in order, at most ``SIZE`` per shard, to files named
    ``val<seed>-<shard>-<count>.tfrec`` in the working directory. Each
    image is read with OpenCV, resized to ``img_size`` x ``img_size`` and
    stored as a quality-94 JPEG together with the row's metadata.

    Reads the module-level names ``SIZE``, ``seed`` and ``img_size``.

    Args:
        x_val: DataFrame with a 0-based integer index and the columns
            ``image`` (file path), ``image_name``, ``posting_id``,
            ``image_phash``, ``title``, ``label_group`` and ``matches``.

    Raises:
        OSError: If an image file cannot be read by ``cv2.imread``.
    """
    n_rows = x_val.shape[0]
    # Number of shards: ceil(n_rows / SIZE).
    CT = n_rows//SIZE + int(n_rows%SIZE!=0)
    counter = 0
    for j in range(CT):
        print(); print('Writing TFRecord %i of %i...'%(j,CT))
        # The last shard may hold fewer than SIZE rows.
        CT2 = min(SIZE,n_rows-j*SIZE)
        with tf.io.TFRecordWriter('val%.4i-%.2i-%i.tfrec'%(seed,j,CT2)) as writer:
            for k in range(CT2):
                row = x_val.loc[counter]
                counter += 1

                img = cv2.imread(row['image'])
                if img is None:
                    # imread signals failure by returning None; fail here
                    # with the offending path instead of inside cvtColor.
                    raise OSError('Could not read image: %s' % row['image'])

                # NOTE(review): imread presumably already returns BGR, the
                # channel order imencode expects, so this swap may store
                # R/B-swapped JPEGs. Kept unchanged because downstream
                # readers may rely on it; confirm before removing.
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
                img = cv2.resize(img,(img_size,img_size),interpolation = cv2.INTER_AREA)
                # tobytes() replaces the deprecated ndarray.tostring() alias.
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tobytes()

                example = serialize_example(
                    img,
                    str.encode(row['image_name']),
                    str.encode(row['posting_id']),
                    str.encode(row['image_phash']),
                    str.encode(row['title']),
                    row['label_group'],
                    str.encode(row['matches']),)
                writer.write(example)
                if k%100==0: print(k,', ',end='')

In [None]:
def get_record(seed):
    """Build one seeded train/val split and write both sets as TFRecords.

    NOTE: the record writers take the seed in their file names from the
    module-level ``seed`` variable, not from this parameter.
    """
    seed_everything(seed)
    train_split, val_split = get_data()
    create_train_recored(train_split)
    create_val_recored(val_split)
    # Reclaim memory before the next split is built.
    gc.collect()

In [None]:
# Maximum number of examples written to a single TFRecord shard.
SIZE = 2740
# One train/val split, and one set of TFRecord files, is produced per seed.
seed_list = [2107,2021,8964,9527]
# NOTE: the loop variable must stay named `seed`: the record writers read the
# module-level `seed` when they build their output file names.
for seed in seed_list:
    get_record(seed)