# create-tfrecord

Produces the TFRecord files used for training.

Reference: https://www.kaggle.com/ragnar123/shopee-tf-records-512

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import re
import math
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
import gc


# Number of cross-validation folds; one TFRecord file is written per fold
FOLDS = 15
# Random seed for the shuffled stratified fold split
SEED = 123
# Target size passed to cv2.resize for every training image
IMAGE_SIZE = (512, 512)

In [3]:
# Function to read and preprocess our data
def preprocess(csv_path='../../data/train.csv'):
    """Load the training CSV and attach encoded labels and ground-truth matches.

    Args:
        csv_path: Location of the CSV to read. It must provide the
            ``label_group`` and ``posting_id`` columns. Defaults to the path
            this notebook has always used, so ``preprocess()`` keeps working.

    Returns:
        The loaded DataFrame with ``label_group`` replaced by consecutive
        integer codes (0, 1, ... in order of first appearance) and a new
        ``matches`` column holding the space-separated posting ids that share
        the row's label group (the row's own id included).
    """
    train = pd.read_csv(csv_path)

    # Compute the distinct groups once and reuse them for both the keys and
    # the length of the code range.
    groups = train['label_group'].unique()
    label_mapper = dict(zip(groups, np.arange(len(groups))))
    train['label_group'] = train['label_group'].map(label_mapper)

    # Ground-truth format: every row lists all posting ids of its own group.
    members = train.groupby(['label_group'])['posting_id'].unique().to_dict()
    train['matches'] = train['label_group'].map(members)
    train['matches'] = train['matches'].apply(lambda ids: ' '.join(ids))

    return train

# Load the training table at module level; the cells below read `train`.
train = preprocess()

# Assign every row to exactly one validation fold, stratified on label_group.
# kfold.split yields *positional* indices, so they are written into a plain
# integer array instead of going through label-based `.loc` (which is only
# correct for a default RangeIndex and creates a float/NaN column on the way).
kfold = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
fold_ids = np.full(len(train), -1, dtype=int)
for fold, (_train_ind, val_ind) in enumerate(kfold.split(train, train['label_group'])):
    fold_ids[val_ind] = fold
train['fold'] = fold_ids


def _bytes_feature(value):
    """Wrap a single string / bytes value in a tf.train.Feature (BytesList).

    Values that are instances of the type returned by ``tf.constant`` are
    converted with ``.numpy()`` first.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
    """Wrap a single float / double in a tf.train.Feature (FloatList)."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a tf.train.Feature (Int64List)."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def serialize_array(array):
    """Convert `array` to a tensor and return tf.io.serialize_tensor's output for it."""
    return tf.io.serialize_tensor(tf.convert_to_tensor(array))

def serialize_example(posting_id, image, title, label_group, matches):
    """Pack one training row into a serialized tf.train.Example.

    `posting_id`, `image`, `title` and `matches` are stored as bytes
    features; `label_group` is stored as an int64 feature. Returns the
    result of the Example's SerializeToString().
    """
    fields = dict(
        posting_id=_bytes_feature(posting_id),
        image=_bytes_feature(image),
        title=_bytes_feature(title),
        label_group=_int64_feature(label_group),
        matches=_bytes_feature(matches),
    )
    features = tf.train.Features(feature=fields)
    example = tf.train.Example(features=features)
    return example.SerializeToString()



In [7]:
def _load_jpeg_bytes(image_path):
    """Read an image, resize it to IMAGE_SIZE and return it as JPEG bytes (quality 100)."""
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None;
        # fail here with the offending path instead of inside cv2.resize.
        raise FileNotFoundError(f'Could not read image: {image_path}')
    image = cv2.resize(image, IMAGE_SIZE)
    ok, encoded = cv2.imencode('.jpg', image, (cv2.IMWRITE_JPEG_QUALITY, 100))
    if not ok:
        raise ValueError(f'Could not JPEG-encode image: {image_path}')
    return encoded.tobytes()


# Make sure the output directory exists before any writer is opened.
os.makedirs('../../data/tfrecords-new', exist_ok=True)

# Write one TFRecord file per fold; the file name carries the fold number
# and the number of examples it contains.
for fold in range(FOLDS):
    print('\n')
    print('-'*50)
    print(f'Writing TFRecord {fold} of {FOLDS - 1}...')
    train_ = train[train['fold'] == fold]
    record_columns = ['posting_id', 'image', 'title', 'label_group', 'matches']
    with tf.io.TFRecordWriter('../../data/tfrecords-new/train%.2i-%i.tfrec'%(fold, train_.shape[0])) as writer:
        rows = train_[record_columns].itertuples(index=False, name=None)
        for k, (posting_id, image_name, title, label_group, matches) in enumerate(rows):
            image = _load_jpeg_bytes('../../data/train_images/' + image_name)
            example = serialize_example(str.encode(posting_id),
                                        image,
                                        str.encode(title),
                                        int(label_group),
                                        str.encode(matches))
            writer.write(example)
            # Lightweight progress indicator: one tick every 100 examples.
            if k%100==0: print(k,', ',end='')



--------------------------------------------------
Writing TFRecord 0 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 

--------------------------------------------------
Writing TFRecord 1 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 

--------------------------------------------------
Writing TFRecord 2 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 

--------------------------------------------------
Writing TFRecord 3 of 14...
0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 

--------------------------------------------------
Writing TFRecord 4 of 14...