<br>
<h2 style = "font-size:50px; font-family:Monaco ; font-weight : normal; background-color: #f6f5f5 ; color : #031cfc; text-align: center; border-radius: 100px 100px;">[TensorFlow] Creating TFRecords</h2>
<br>

For more information on **TFRecord Creation**, refer to the [Quick Keras Recipe](https://keras.io/examples/keras_recipes/creating_tfrecords/) by [Dimitre Oliveira](https://www.linkedin.com/in/dimitre-oliveira-7a1a0113a/).

In [None]:
import os
import wandb
import numpy as np
import pandas as pd
from glob import glob
import tensorflow as tf
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from kaggle_secrets import UserSecretsClient

In [None]:
# Notebook-wide settings: where the raw data lives, where the TFRecords are
# written, and the options passed to the image resize step.
CONFIGS = dict(
    data_dir="../input/happy-whale-and-dolphin",
    tfrecord_dir="happy_whale_tfrecords",
    target_size=512,
    anit_aliasing=True,
)

# Make sure an output sub-directory exists for each split before writing.
for split_name in ("train", "test"):
    split_dir = os.path.join(CONFIGS["tfrecord_dir"], split_name)
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)

In [None]:
# Show the directory structure under the TFRecord output folder
# (shell escape; needs the `tree` command to be available).
!tree {CONFIGS["tfrecord_dir"]}

In [None]:
# Load the training metadata table and preview its first rows.
train_csv_path = os.path.join(CONFIGS["data_dir"], "train.csv")
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [None]:
# Map every species name to an integer class index, in order of first
# appearance in the training table.
unique_labels = list(train_data.species.unique())
CONFIGS["label_map"] = {label: idx for idx, label in enumerate(unique_labels)}
# Maximum number of examples stored in a single TFRecord shard.
CONFIGS["shard_size"] = 1024

# train_data = train_data.head(1000) # For Testing
# glob returns paths in arbitrary order; sort so shard contents are reproducible.
test_image_list = sorted(
    glob(os.path.join(CONFIGS["data_dir"], "test_images", "*"))
)

num_train_samples = len(train_data)
num_test_samples = len(test_image_list)

CONFIGS["num_train_tfrecords"] = num_train_samples // CONFIGS["shard_size"]
if num_train_samples % CONFIGS["shard_size"]:
    CONFIGS["num_train_tfrecords"] += 1  # add one record if there are any remaining samples

# The test shard count must be derived from the number of test images; using
# the train count here would either drop test images or create empty shards.
CONFIGS["num_test_tfrecords"] = num_test_samples // CONFIGS["shard_size"]
if num_test_samples % CONFIGS["shard_size"]:
    CONFIGS["num_test_tfrecords"] += 1  # add one record if there are any remaining samples

In [None]:
def image_feature(value):
    """JPEG-encode an image tensor and wrap the bytes in a bytes_list Feature."""
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[jpeg_bytes]))


def bytes_feature(value):
    """Encode a string to bytes and wrap it in a bytes_list Feature."""
    encoded = value.encode()
    payload = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=payload)


def int64_feature(value):
    """Wrap a single integer-like value in an int64_list Feature."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)


def create_example_train(image, species, individual_id):
    """Build a tf.train.Example for one training image.

    The example stores the JPEG-encoded image, the species name, the integer
    index of that species looked up in CONFIGS["label_map"], and the
    individual id.
    """
    features = tf.train.Features(
        feature=dict(
            image=image_feature(image),
            species=bytes_feature(species),
            species_label=int64_feature(CONFIGS["label_map"][species]),
            individual_id=bytes_feature(individual_id),
        )
    )
    return tf.train.Example(features=features)


def create_example_test(image):
    """Build a tf.train.Example that holds only the JPEG-encoded image."""
    feature = {"image": image_feature(image)}
    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features)


def get_samples(data_list, tfrec_num):
    """Return the slice of ``data_list`` that belongs to shard ``tfrec_num``.

    Shards are consecutive windows of CONFIGS["shard_size"] items; the last
    window may be shorter.
    """
    shard_size = CONFIGS["shard_size"]
    start = tfrec_num * shard_size
    return data_list[start : start + shard_size]

In [None]:
# Write the training set into TFRecord shards. Each example stores the resized
# JPEG image together with its species and individual id.
# Column values are pulled out as plain lists so they can be sliced per shard.
image_list = list(train_data["image"])
species_list = list(train_data["species"])
individual_id_list = list(train_data["individual_id"])
for tfrec_num in range(CONFIGS["num_train_tfrecords"]):
    image_samples = get_samples(image_list, tfrec_num)
    species_samples = get_samples(species_list, tfrec_num)
    individual_id_samples = get_samples(individual_id_list, tfrec_num)
    num_samples = len(image_samples)
    # File name encodes the shard index and how many examples it contains.
    tfrecord_path = os.path.join(
        CONFIGS["tfrecord_dir"], "train",
        "file_%.2i-%i.tfrec" % (tfrec_num, num_samples)
    )
    print(f"\nWriting Data to {tfrecord_path}, tfrecord ({tfrec_num + 1}/{CONFIGS['num_train_tfrecords']})...\n")
    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        for idx in tqdm(range(num_samples)):
            image_path = os.path.join(
                CONFIGS["data_dir"], "train_images", image_samples[idx]
            )
            image = tf.io.decode_jpeg(tf.io.read_file(image_path))
            image = tf.image.resize(
                image, (CONFIGS["target_size"], CONFIGS["target_size"]),
                antialias=CONFIGS["anit_aliasing"]
            )
            # resize returns float values; cast back to uint8 for JPEG encoding.
            image = tf.cast(image, dtype=tf.uint8)
            # idx is relative to the current shard, so the labels must come
            # from the per-shard slices. Indexing the full lists here would
            # attach the first shard's labels to every later shard's images.
            species = species_samples[idx]
            individual_id = individual_id_samples[idx]
            example = create_example_train(image, species, individual_id)
            writer.write(example.SerializeToString())

In [None]:
# Write the test images into TFRecord shards; each example holds only the image.
resize_shape = (CONFIGS["target_size"], CONFIGS["target_size"])
for tfrec_num in range(CONFIGS["num_test_tfrecords"]):
    image_samples = get_samples(test_image_list, tfrec_num)
    num_samples = len(image_samples)
    # File name encodes the shard index and how many examples it contains.
    shard_name = "file_%.2i-%i.tfrec" % (tfrec_num, num_samples)
    tfrecord_path = os.path.join(CONFIGS["tfrecord_dir"], "test", shard_name)
    print(f"\nWriting Data to {tfrecord_path}, tfrecord ({tfrec_num + 1}/{CONFIGS['num_test_tfrecords']})...\n")
    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        for image_path in tqdm(image_samples):
            raw_bytes = tf.io.read_file(image_path)
            decoded = tf.io.decode_jpeg(raw_bytes)
            resized = tf.image.resize(
                decoded, resize_shape, antialias=CONFIGS["anit_aliasing"]
            )
            # resize returns float values; cast back to uint8 for JPEG encoding.
            image = tf.cast(resized, dtype=tf.uint8)
            writer.write(create_example_test(image).SerializeToString())

In [None]:
# Read the W&B API key from Kaggle secrets and expose it via the environment.
os.environ["WANDB_API_KEY"] = UserSecretsClient().get_secret("wandb_api_key")

# Upload the whole TFRecord output directory as a dataset artifact; the run is
# opened as a context manager and CONFIGS is attached as the run config.
with wandb.init(project="happy-whale", entity="geekyrakshit", config=CONFIGS):
    tfrecords_artifact = wandb.Artifact("happy-whale-tfrecords", type="dataset")
    tfrecords_artifact.add_dir(CONFIGS["tfrecord_dir"])
    wandb.log_artifact(tfrecords_artifact)