The purpose of this notebook is to create TFRecords that can be used to train TensorFlow models on TPU.  
It was inspired by https://www.kaggle.com/rahulchauhan3j/tfrecords-creation.

## Contents in tfrecord
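- "image" in tfrecord : resized image, JPEG-encoded with cv2
- "image_name" in tfrecord : image Id (the file name without the ".jpg" extension)
- "feature" in tfrecord : the 12 metadata features of the image ('Subject Focus', 'Eyes', ..., 'Blur'), stored as the raw bytes of an int64 array
- "Pawpularity" in tfrecord : regression target (raw Pawpularity)
- "PawpularityCls" in tfrecord : classification label (= Pawpularity / 100.0)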

## Dataset
The datasets I created with this notebook are available here:
- 128 pixel : https://www.kaggle.com/motono0223/petfinder-pawpularity-tfrecords-size128
- 224 pixel : https://www.kaggle.com/motono0223/petfinder-pawpularity-tfrecords-size224
- 256 pixel : https://www.kaggle.com/motono0223/petfinder-pawpularity-tfrecords-size256
- 384 pixel : https://www.kaggle.com/motono0223/petfinder-pawpularity-tfrecords-size384
- 512 pixel : https://www.kaggle.com/motono0223/petfinder-pawpularity-tfrecords-size512
- 768 pixel : https://www.kaggle.com/motono0223/petfinder-pawpularity-tfrecords-size768

The secret dataset attached to this notebook contains the credentials file for the Kaggle API.
If you want to create the dataset yourself, please refer to the following site:
https://github.com/Kaggle/kaggle-api

In [None]:
import numpy as np
import pandas as pd, os
from datetime import datetime
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import re, math
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

In [None]:
# Edge length in pixels; every image is resized to RESIZE x RESIZE.
RESIZE = 768
# Spatial size used to reshape images decoded back from the TFRecords.
IMAGE_SIZE = [RESIZE] * 2
# Batch size of the dataset built to verify the written TFRecords.
BATCH_SIZE = 32
# Number of stratified folds; one TFRecord file is written per fold.
N_FOLDS = 30

In [None]:
# Directory containing the training images (files are addressed later as PATH + <Id> + ".jpg").
PATH = '../input/petfinder-pawpularity-score/train/'
# Names of all entries found in the training image directory.
IMGS = os.listdir(PATH)

# Sanity check: report how many entries were found.
print('There are %i train images '%(len(IMGS)))

In [None]:
# Load the training metadata; the 'Id' column is exposed as 'image_name'.
df = pd.read_csv('../input/petfinder-pawpularity-score/train.csv').rename(columns={'Id': 'image_name'})

# Bin the continuous target (number of bins = 1 + log2(n), Sturges' rule) so that
# the folds can be stratified on the binned Pawpularity.
num_bins = int(np.floor(1 + np.log2(len(df))))
df["PawpularityBins"] = pd.cut(df['Pawpularity'].values.reshape(-1), bins=num_bins, labels=False)

# A fixed seed makes the fold assignment reproducible. Without it every run
# (e.g. one run per RESIZE value) shuffles differently, so the same image can
# land in different folds across the generated datasets.
FOLD_SEED = 42
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=FOLD_SEED)
df["fold"] = 0
for fold, (train_index, test_index) in enumerate(skf.split(df["PawpularityBins"].values, df["PawpularityBins"].values)):
    # test_index holds positional indices; they match the labels because
    # read_csv produced the default 0..n-1 index.
    df.loc[test_index, "fold"] = fold
    print(f"fold={fold}, num_images={len(test_index)}")

df.head()

## Write TFRecords - Train

In [None]:
def _bytes_feature(value):
    """Wrap one string / bytes value in a ``tf.train.Feature`` holding a bytes_list.

    Eager tensors are converted with ``.numpy()`` first, because BytesList
    won't unpack a string from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    packed = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=packed)

def _float_feature(value):
    """Wrap one float / double value in a ``tf.train.Feature`` holding a float_list."""
    packed = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=packed)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint value in a ``tf.train.Feature`` holding an int64_list."""
    packed = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=packed)

In [None]:
def serialize_example(feature0, feature1, feature2, feature3, feature4):
    """Build a ``tf.train.Example`` from one record and return it serialized.

    Args:
        feature0: bytes stored under 'image'.
        feature1: bytes stored under 'image_name'.
        feature2: bytes stored under 'feature'.
        feature3: integer stored under 'Pawpularity'.
        feature4: float stored under 'PawpularityCls'.

    Returns:
        The serialized Example as a byte string.
    """
    fields = dict(
        image=_bytes_feature(feature0),
        image_name=_bytes_feature(feature1),
        feature=_bytes_feature(feature2),
        Pawpularity=_int64_feature(feature3),
        PawpularityCls=_float_feature(feature4),
    )
    features = tf.train.Features(feature=fields)
    proto = tf.train.Example(features=features)
    return proto.SerializeToString()

In [None]:
# Metadata columns packed, in this order, into the 'feature' field of each record.
cnames = ['Subject Focus', 'Eyes', 'Face', 'Near',
          'Action', 'Accessory', 'Group','Collage','Human',
          'Occlusion', 'Info', 'Blur']

# Write one TFRecord file per fold; the file name carries the fold number and
# the number of records it contains (parsed back by count_data_items).
for fold in range(N_FOLDS):
    print(); print('Writing TFRecord %i of %i...'%(fold,N_FOLDS))
    df_fold = df[ df["fold"] == fold ]
    num_contents = df_fold.shape[0]

    # Extract the per-image values once per fold instead of scanning the whole
    # DataFrame for every image.
    image_names = df_fold["image_name"].to_numpy()
    features = df_fold[cnames].to_numpy(dtype=np.int64)
    pawpularities = df_fold["Pawpularity"].to_numpy()

    with tf.io.TFRecordWriter('train%.2i-%i.tfrec'%(fold, num_contents)) as writer:
        for k, image_name in enumerate(tqdm(image_names)):
            image_path = PATH + image_name + ".jpg"
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread signals a missing/unreadable file by returning None;
                # fail here with the path instead of a cryptic resize error.
                raise IOError(f"cv2.imread could not load image: {image_path}")
            img = cv2.resize(img, (RESIZE,RESIZE))
            ok, encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 95))
            if not ok:
                raise RuntimeError(f"cv2.imencode failed for image: {image_path}")

            # Plain Python int so both the int64 and the float fields get native scalars.
            pawpularity = int(pawpularities[k])

            example = serialize_example(
                encoded.tobytes(), str.encode(image_name),
                features[k].tobytes(),   # raw int64 bytes, decoded with tf.io.decode_raw
                pawpularity,
                pawpularity/100.0
            )
            writer.write(example)

In [None]:
# List the working directory to confirm the TFRecord files were written.
!ls -l

# Verify TFRecords

In [None]:
# numpy display defaults: summarize arrays longer than 15 elements, wrap at 80 columns
np.set_printoptions(linewidth=80, threshold=15)
# Class names looked up by title_from_label_and_target
CLASSES = [0, 1]

def batch_to_numpy_images_and_labels(data):
    """Convert a 4-element batch to numpy by calling ``.numpy()`` on each element.

    Args:
        data: an iterable unpacking to exactly
            ``(images, features, targets, labels)``.

    Returns:
        A tuple ``(numpy_images, numpy_features, numpy_targets, numpy_labels)``.
    """
    images, features, targets, labels = data
    return tuple(part.numpy() for part in (images, features, targets, labels))

def title_from_label_and_target(label, correct_label):
    """Build a plot title for a predicted label and report whether it is correct.

    Returns:
        ``(CLASSES[label], True)`` when ``correct_label`` is None; otherwise
        ``(title, correct)`` where the title is marked 'OK' on a match, or
        'NO' followed by an arrow and the expected class on a mismatch.
    """
    predicted_name = CLASSES[label]
    if correct_label is None:
        return predicted_name, True

    correct = (label == correct_label)
    if correct:
        return "{} [{}{}{}]".format(predicted_name, 'OK', '', ''), correct
    return "{} [{}{}{}]".format(predicted_name, 'NO', u"\u2192", CLASSES[correct_label]), correct

def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image in the given subplot slot and return the next slot.

    Args:
        image: image passed to ``plt.imshow``.
        title: title text; no title is drawn when it is empty.
        subplot: ``(rows, cols, index)`` triple passed to ``plt.subplot``.
        red: draw the title in red with a slightly smaller font.
        titlesize: base font size of the title.

    Returns:
        ``(rows, cols, index + 1)``.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            font_size, font_color = int(titlesize/1.2), 'red'
        else:
            font_size, font_color = int(titlesize), 'black'
        plt.title(title, fontsize=font_size, color=font_color,
                  fontdict={'verticalalignment':'center'}, pad=int(titlesize/1.5))
    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, position + 1)
    
def display_batch_of_images(databatch, predictions=None):
    """Plot a batch of images in a square-ish grid, one title per image.

    Args:
        databatch: a 4-element batch ``(images, features, targets, labels)``
            whose elements expose ``.numpy()`` (it is converted with
            ``batch_to_numpy_images_and_labels``).
        predictions: not used for the titles (prediction-based titles are
            disabled); it only takes part in the subplot-spacing decision.

    Each title has the form ``Paw:<target>,f:<feature values as digits>``.
    Images that do not fit the rows x cols grid are dropped. For an empty
    batch a message is printed and nothing is drawn.
    """
    # data
    images, features, targets, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None for _ in enumerate(images)]

    # Guard: with zero images rows would be 0 and the column computation
    # below would divide by zero.
    if len(images) == 0:
        print('No images to display.')
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images)//rows
    shown = rows*cols

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot=(rows,cols,1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE,FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols,FIGSIZE))

    # Title size depends only on the grid shape, so compute it once.
    dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols)*40+3 # magic formula tested to work from 1x1 to 10x10 images

    # display
    last_label = None  # tracked explicitly instead of relying on the leaked loop variable
    for image, feat, target, label in zip(images[:shown], features[:shown], targets[:shown], labels[:shown]):
        feat_str = ''.join(str(int(val)) for val in feat)
        title = f'Paw:{target},f:{feat_str}'
        # Titles are never flagged as wrong: comparison against predictions is disabled.
        subplot = display_one_flower(image, title, subplot, False, titlesize=dynamic_titlesize)
        last_label = label

    #layout
    plt.tight_layout()
    if last_label is None and predictions is None:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes to a float32 image scaled by 1/255 and reshaped to IMAGE_SIZE + 3 channels."""
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # explicit size needed for TPU
    return tf.reshape(scaled, list(IMAGE_SIZE) + [3])

def decode_feature(feature_data):
    """Decode raw bytes as int64 values and return them cast to float32."""
    as_int64 = tf.io.decode_raw(feature_data, tf.int64)
    return tf.cast(as_int64, tf.float32)

def read_labeled_tfrecord(example):
    """Parse one serialized Example into ``(image, feature, target, label)``.

    The image and feature bytes go through ``decode_image`` / ``decode_feature``;
    'Pawpularity' (int64) is returned as the target and 'PawpularityCls'
    (float32) as the label. 'image_name' is parsed but not returned.
    """
    # Every field is a single element (shape []); tf.string means bytestring.
    field_dtypes = (
        ('image', tf.string),
        ('image_name', tf.string),
        ('feature', tf.string),
        ('Pawpularity', tf.int64),
        ('PawpularityCls', tf.float32),
    )
    schema = {name: tf.io.FixedLenFeature([], dtype) for name, dtype in field_dtypes}
    parsed = tf.io.parse_single_example(example, schema)
    return (
        decode_image(parsed['image']),
        decode_feature(parsed['feature']),
        parsed["Pawpularity"],
        parsed["PawpularityCls"],
    )

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed records from TFRecord files.

    Args:
        filenames: TFRecord file names to read.
        labeled: accepted but not used by this function.
        ordered: when False, deterministic ordering is switched off so data
            is used as soon as it streams in.

    Returns:
        A dataset whose elements are produced by ``read_labeled_tfrecord``.
    """
    read_options = tf.data.Options()
    if not ordered:
        # disable order, increase speed
        read_options.experimental_deterministic = False

    # num_parallel_reads interleaves reads from multiple files
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(read_options).map(read_labeled_tfrecord)

def get_training_dataset():
    """Return the repeated, shuffled, batched and prefetched dataset over TRAINING_FILENAMES."""
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .repeat()            # the training dataset must repeat for several epochs
        .shuffle(512)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)      # prefetch next batch while training (autotuned buffer size)
    )

def count_data_items(filenames):
    """Sum the record counts embedded in TFRecord file names.

    The count is the run of digits between a '-' and the following '.',
    e.g. ``train00-331.tfrec`` contributes 331.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for name in filenames:
        counts.append(int(count_pattern.search(name).group(1)))
    return np.sum(counts)

In [None]:
# INITIALIZE VARIABLES
# AUTO and TRAINING_FILENAMES are module-level names read by load_dataset /
# get_training_dataset, so this cell must run before those functions are called.
AUTO = tf.data.experimental.AUTOTUNE
# All TFRecord files written to the working directory by the cell above.
TRAINING_FILENAMES = tf.io.gfile.glob('train*.tfrec')
# The total is derived from the counts embedded in the file names.
print('There are %i train images'%count_data_items(TRAINING_FILENAMES))

In [None]:
# DISPLAY TRAIN IMAGES
# Re-batch the training dataset into groups of 20 images for display.
training_dataset = get_training_dataset()
training_dataset = training_dataset.unbatch().batch(20)
train_batch = iter(training_dataset)

# Pull a single batch and use it for both the shape check and the plot, so the
# figure shows exactly the records whose shapes were printed.
imgs, feats, targets, labels = next(train_batch)
print("check shape", imgs.shape, feats.shape, targets.shape, labels.shape)

display_batch_of_images((imgs, feats, targets, labels))

# Upload the TFRecords as a Kaggle Dataset

In [None]:
# Kaggle account that owns the dataset; also exported as KAGGLE_USERNAME below.
USER_NAME = "motono0223"
# Dataset slug/title; the suffix is the image edge length taken from IMAGE_SIZE.
DATASET_NAME = f'petfinder-pawpularity-tfrecords-size{IMAGE_SIZE[0]}'

In [None]:
%%time
import json

### Create the Kaggle Dataset (staged under /tmp/{DATASET_NAME})

# Start from a clean staging directory.
!rm -r /tmp/{DATASET_NAME}

os.makedirs(f'/tmp/{DATASET_NAME}', exist_ok=True)

# Read the Kaggle API credentials from the attached credentials dataset.
with open('/kaggle/input/kaggle-api-creds/kaggle.json') as f:
    kaggle_creds = json.load(f)
    
# Export the credentials as environment variables for the kaggle CLI calls below.
# Note: the user name comes from USER_NAME, not from the credentials file.
os.environ['KAGGLE_USERNAME'] = USER_NAME
os.environ['KAGGLE_KEY'] = kaggle_creds['key']

# Generate a template dataset-metadata.json in the staging directory.
!kaggle datasets init -p /tmp/{DATASET_NAME}

# Fill in the dataset id and title in the generated metadata file.
with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json') as f:
    dataset_meta = json.load(f)
dataset_meta['id'] = f'{USER_NAME}/{DATASET_NAME}'
dataset_meta['title'] = DATASET_NAME
with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json', "w") as outfile:
    json.dump(dataset_meta, outfile)
print(dataset_meta)

# NOTE(review): presumably meta.json is copied so the folder holds a data file
# to upload at creation time (the TFRecords are copied in a later cell) - confirm.
!cp /tmp/{DATASET_NAME}/dataset-metadata.json /tmp/{DATASET_NAME}/meta.json
!ls /tmp/{DATASET_NAME}

# Create the dataset from the staging directory (-p); see the kaggle-api docs for -u.
!kaggle datasets create -u -p /tmp/{DATASET_NAME} 

In [None]:
# Copy the generated TFRecord files into the dataset staging directory.
!cp /kaggle/working/*tfrec /tmp/{DATASET_NAME}/

In [None]:
# Use the current timestamp as the version message (-m) of the new dataset version.
version_name = datetime.now().strftime("%Y%m%d-%H%M%S")
# Upload the staging directory as a new version; see the kaggle-api docs for -r / -q.
!kaggle datasets version -m {version_name} -p /tmp/{DATASET_NAME}/ -r zip -q