
## Creating TF record files 
This notebook is my attempt to learn how to create TFRecord files. TFRecord files are very useful if you want to use a TPU for model training. A major bottleneck when using a TPU is feeding data into it, and TFRecords help us overcome that constraint.

I am going to follow Chris's [notebook](https://www.kaggle.com/cdeotte/how-to-create-tfrecords) to create the TFRecord files for the Shopee dataset.


In [None]:
import pandas as pd 
import numpy as np 
import os 
import re 
import tensorflow as tf 
import cv2
from sklearn import model_selection,preprocessing
import math
import matplotlib.pyplot as plt 

%matplotlib inline 

In [None]:
# Load the Shopee training metadata.
df_train = pd.read_csv('../input/shopee-product-matching/train.csv')

# Remove rows that point at an image file already seen, so that every image
# appears once; the index is reset so positions and labels coincide.
df_train = df_train.drop_duplicates(subset='image').reset_index(drop=True)

# Path of each image file, built from the image file name.
root = '../input/shopee-product-matching/train_images'
df_train['fullpaths'] = df_train['image'].apply(lambda x: os.path.join(root, x))

# For every row, the array of posting_ids that share its label_group.
# NOTE: once written to CSV below, these arrays are stored as plain strings.
targets = df_train.groupby('label_group')['posting_id'].unique().to_dict()
df_train['targets'] = df_train['label_group'].map(targets)

# For a similarity task we cannot use plain KFold or StratifiedKFold.
# GroupKFold distributes whole groups between the train and validation sides,
# so the model can be evaluated on label groups it has not seen in training.
gf = model_selection.GroupKFold(n_splits=10)

df_train['fold'] = -1
for fold, (train_idx, valid_idx) in enumerate(gf.split(df_train, None, df_train['label_group'])):
    # valid_idx holds positions; they equal index labels because of reset_index above.
    df_train.loc[valid_idx, 'fold'] = fold

# Label-encode label_group into consecutive integers.
# LabelEncoder expects a 1-D array; passing a column vector (reshape(-1, 1))
# makes scikit-learn emit a DataConversionWarning.
lb = preprocessing.LabelEncoder()
df_train['label_group_encoded'] = lb.fit_transform(df_train['label_group'].values)

df_train.to_csv('./train_encoded.csv', index=False)

In [None]:
# Reload the encoded metadata from disk and preview its first rows.
df_train = pd.read_csv('./train_encoded.csv')
df_train.head(5)

In [None]:
def _bytes_feature(value):
    """Wrap a single byte string in a `tf.train.Feature` holding a BytesList.

    If `value` is an eager tensor, its `.numpy()` content is used instead,
    because BytesList won't unpack a string from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a `tf.train.Feature` holding a FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a `tf.train.Feature` holding an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def serialize_example(feature0, feature1, feature2):
    """Serialise one record into a `tf.train.Example` byte string.

    feature0 is stored under 'image' (bytes), feature1 under 'image_name'
    (bytes) and feature2 under 'target' (int64).
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'image_name': _bytes_feature(feature1),
        'target': _int64_feature(feature2),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write one TFRecord file per fold. The file name carries the fold id and the
# number of examples, so the count can be recovered from the name later on.
for groupfold in df_train['fold'].unique():
    images = df_train[df_train['fold'] == groupfold].reset_index(drop=True)
    with tf.io.TFRecordWriter(f'train_{groupfold}-{images.shape[0]}.tfrec') as writer:
        for row in images.itertuples(index=False):
            img = cv2.imread(row.fullpaths)
            if img is None:
                # cv2.imread signals an unreadable/missing file by returning None.
                raise FileNotFoundError(f'Could not read image: {row.fullpaths}')
            # cv2.imread yields BGR and cv2.imencode expects BGR, so no colour
            # conversion is applied before re-encoding to JPEG.
            img = cv2.imencode('.jpg', img)[1].tobytes()
            example = serialize_example(
                img,
                str.encode(row.image),
                # The label must come from the same per-fold row as the image.
                # Indexing df_train with the per-fold position would attach the
                # label of an unrelated row.
                int(row.label_group_encoded),
            )
            writer.write(example)
    print("Record written for fold {} out of 9 fold".format(groupfold) )

In [None]:
# Defaults used by the display helpers below.
# Abbreviate printed arrays that hold more than 15 elements; wrap at 80 columns.
np.set_printoptions(linewidth=80, threshold=15)
# Lookup table used to turn a predicted / true label index into a title.
CLASSES = [0, 1]

def batch_to_numpy_images_and_labels(data):
    """Split an (images, labels) pair and convert both parts with `.numpy()`.

    The labels are returned exactly as `.numpy()` yields them; no special
    handling is applied when they are byte-string ids rather than class labels.
    """
    image_batch, label_batch = data
    return image_batch.numpy(), label_batch.numpy()

def title_from_label_and_target(label, correct_label):
    """Build a plot title for a prediction and report whether it is correct.

    Returns a `(title, correct)` pair. Without a reference label the title is
    just `CLASSES[label]` and the prediction counts as correct. Otherwise the
    title shows the predicted class followed by `OK`, or by `NO`, an arrow and
    the expected class.
    """
    if correct_label is None:
        return CLASSES[label], True

    is_match = (label == correct_label)
    if is_match:
        title = "{} [{}{}{}]".format(CLASSES[label], 'OK', '', '')
    else:
        title = "{} [{}{}{}]".format(CLASSES[label], 'NO', u"\u2192", CLASSES[correct_label])
    return title, is_match

def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image in the given subplot slot and return the next slot.

    `subplot` is a `(rows, cols, index)` triple passed to `plt.subplot`. The
    title is drawn only when `str(title)` is non-empty; it is red and slightly
    smaller when `red` is true. The returned triple has `index` advanced by one.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)

    if str(title):
        if red:
            font_size, font_color = int(titlesize/1.2), 'red'
        else:
            font_size, font_color = int(titlesize), 'black'
        plt.title(
            title,
            fontsize=font_size,
            color=font_color,
            fontdict={'verticalalignment':'center'},
            pad=int(titlesize/1.5),
        )

    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, position + 1)
    
def display_batch_of_images(databatch, predictions=None):
    """Show a batch of images in a square-ish grid.

    This will work with:
    display_batch_of_images((images, labels))
    display_batch_of_images((images, labels), predictions)

    `databatch` must be an (images, labels) pair, because it is unpacked by
    batch_to_numpy_images_and_labels. When `predictions` is given, each title
    is built with title_from_label_and_target and wrong predictions are drawn
    in red; otherwise the label itself is used as the title.

    An empty batch is a no-op (nothing is drawn).
    """
    # data
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None for _ in enumerate(images)]

    n_images = len(images)
    if n_images == 0:
        # Without this guard rows would be 0 and the cols computation below
        # would raise ZeroDivisionError.
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(n_images))
    cols = n_images//rows
    shown_images = images[:rows*cols]
    shown_labels = labels[:rows*cols]

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols, FIGSIZE))

    # magic formula tested to work from 1x1 to 10x10 images (loop-invariant)
    dynamic_titlesize = FIGSIZE*SPACING/max(rows, cols)*40+3

    # display
    for i, (image, label) in enumerate(zip(shown_images, shown_labels)):
        title = label
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        subplot = display_one_flower(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: pack the images tightly only when there is nothing to caption.
    # Decided from the labels themselves rather than from the loop variable
    # left over after the loop.
    no_captions = predictions is None and all(lbl is None for lbl in shown_labels)
    plt.tight_layout()
    if no_captions:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB image resized to IMAGE_SIZE.

    Pixel values are scaled into the [0, 1] range before resizing. The explicit
    target size is needed for TPU.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.image.resize(scaled, size=[*IMAGE_SIZE])

def read_labeled_tfrecord(example):
    """Parse one serialised example into an (image, image_name) pair.

    Only the 'image' and 'image_name' features are read; the second element of
    the returned pair is the image name byte string.
    """
    feature_spec = {
        # tf.string means bytestring; shape [] means a single element
        "image": tf.io.FixedLenFeature([], tf.string),
        "image_name": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['image_name']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed records from the given TFRecord files.

    Files are read in parallel. Unless `ordered` is true, deterministic
    ordering is switched off so records are used as soon as they stream in.
    Every record is parsed with read_labeled_tfrecord; the `labeled` argument
    is accepted but not used here.
    """
    read_options = tf.data.Options()
    if not ordered:
        # disable order, increase speed
        read_options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .with_options(read_options)
        .map(read_labeled_tfrecord)
    )

def get_training_dataset():
    """Return the repeating, shuffled, batched and prefetched training dataset.

    Built from TRAINING_FILENAMES; batches have exactly BATCH_SIZE elements
    because the remainder is dropped.
    """
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()                                 # repeat for several epochs
        .shuffle(2048)
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(AUTO)                           # autotuned prefetch buffer
    )

def count_data_items(filenames):
    """Sum the item counts embedded in TFRecord file names.

    The count is the run of digits between a '-' and the following '.',
    i.e. flowers00-230.tfrec = 230 data items.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for path in filenames:
        counts.append(int(count_pattern.search(path).group(1)))
    return np.sum(counts)

In [None]:
# INITIALIZE VARIABLES
IMAGE_SIZE = [512, 512]                       # size passed to tf.image.resize in decode_image
BATCH_SIZE = 32
AUTO = tf.data.experimental.AUTOTUNE
TRAINING_FILENAMES = tf.io.gfile.glob('train*.tfrec')
print('There are %i train images' % count_data_items(TRAINING_FILENAMES))

In [None]:
# DISPLAY TRAIN IMAGES
# Flatten the batched training dataset back into single examples and keep six of them.
training_dataset = get_training_dataset().unbatch().take(6)
train_batch = iter(training_dataset)

In [None]:
# Collect the image part of every example in the preview dataset.
img = [image for image, _ in training_dataset]
    


In [None]:
# Show the six collected images in a 3x2 grid without axes.
fig, axes = plt.subplots(3, 2, figsize=(10,10))
for i, ax in enumerate(axes.flat):
    ax.imshow(img[i])
    ax.axis('off')