# How To Create TFRecords
In this notebook I am going to show you how I created my own TFRecords for the cassava leaf [competition](https://www.kaggle.com/c/cassava-leaf-disease-classification/data). I did this because I want to learn more about TFRecords and TPU processing. This notebook covers how to create TFRecords and how to upload them to Google Cloud Storage.

> # Load Meta Data

In [None]:
# LOAD LIBRARIES
from sklearn.model_selection import StratifiedKFold
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt, cv2
import tensorflow as tf, re, math
import pandas_profiling as pp
from PIL import Image

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

def plot_distribution(df_table):
    """Show a bar chart with the number of rows per value of the ``label`` column.

    Args:
        df_table: pandas DataFrame that contains a ``label`` column.
    """
    # groupby(...).size() counts the rows of each label directly, so no helper
    # column of ones has to be added to a copy of the frame and summed.
    class_count = df_table.groupby("label").size().reset_index(name="count")
    fig = px.bar(class_count, x="label", y="count", title="Count per leaf disease", text=class_count["count"])
    # Print each count above its bar, abbreviated to two significant digits.
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(template="plotly_dark",
                      xaxis=dict(title="Cassava Leaf Disease"))
    fig.show()

In [None]:
# PATHS TO IMAGES
# Directory with the training images and the file names found inside it.
PATH = '../input/cassava-leaf-disease-merged/train/'
IMGS = os.listdir(PATH)
n_train_images = len(IMGS)
print('There are {} train images '.format(n_train_images))

In [None]:
# LOAD TRAIN META DATA
# Read the merged metadata and rename the 'image_id' column to 'image_name',
# the column name the rest of the notebook refers to.
df = (
    pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
    .rename(columns={'image_id': 'image_name'})
)
df.tail()

> # Connect to GCS

You have to create a project on Google Cloud Platform; you only pay for storage.
* https://cloud.google.com/storage/docs/

Then you need to enable "Google Cloud Services" from the Add-ons menu button at the top of the notebook.

In [None]:
from google.cloud import storage

# Google Cloud project that the storage client below is bound to.
ID_PROJECT = 'spring-paratext-297605'
storage_client = storage.Client(project=ID_PROJECT)

def create_bucket(dataset_name):
    """Create a bucket named `dataset_name` and print a confirmation.

    Reference: https://cloud.google.com/storage/docs/
    """
    new_bucket = storage_client.create_bucket(dataset_name)
    message = 'Bucket {} created'.format(new_bucket.name)
    print(message)

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload the local file `source_file_name` to `bucket_name` as `destination_blob_name`.

    Reference: https://cloud.google.com/storage/docs/
    """
    target_blob = storage_client.get_bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}.'.format(source_file_name, destination_blob_name))
    
def list_blobs(bucket_name):
    """Print the name of every blob stored in `bucket_name`.

    Reference: https://cloud.google.com/storage/docs/
    """
    for entry in storage_client.list_blobs(bucket_name):
        print(entry.name)
        
def download_to_kaggle(bucket_name,destination_directory,file_name):
    """Download one blob from a GCS bucket into a local directory.

    Args:
        bucket_name: name of the bucket that holds the blob.
        destination_directory: local directory to write into; created if missing.
        file_name: name of the blob inside the bucket; also used as the local file name.
    """
    os.makedirs(destination_directory, exist_ok = True)
    full_file_path = os.path.join(destination_directory, file_name)
    # Fetch only the requested blob. Looping over every blob in the bucket and
    # writing each one to the same local path left the file holding whichever
    # blob happened to be listed last.
    bucket = storage_client.get_bucket(bucket_name)
    bucket.blob(file_name).download_to_filename(full_file_path)

### Create a New Bucket
Run this only if you need a new bucket; otherwise just set `BUCKET_NAME` to the name of your existing bucket.

In [None]:
BUCKET_NAME = 'cassava_tfrecords_merge_kfold'
try:
    create_bucket(BUCKET_NAME)
except Exception as err:
    # Stay best-effort so the notebook keeps running when creation fails, but
    # report the reason and let KeyboardInterrupt / SystemExit propagate, which
    # a bare `except:` would have swallowed.
    print("Error: trying to create the bucket {}: {}".format(BUCKET_NAME, err))

> # Write TFRecords - Train
All the code below comes from TensorFlow's docs [here][1]

[1]: https://www.tensorflow.org/tutorials/load_data/tfrecord

In [None]:
def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature holding a bytes_list."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList won't unpack a string from an EagerTensor, so extract it first.
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature holding a float_list."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _float_feature_array(value):
  """Wrap a whole array of floats in a tf.train.Feature holding a float_list.

  Unlike `_float_feature`, `value` is passed through as the list itself
  rather than being wrapped as a single element.
  """
  wrapped = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an int64_list."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

In [None]:
def serialize_example(feature0, feature1, feature2, feature3,):
  """Build a tf.train.Example from the four inputs and return it serialized.

  Args:
    feature0: stored under 'image' as a bytes feature.
    feature1: stored under 'image_name' as a bytes feature.
    feature2: stored under 'label' as a single-float feature.
    feature3: stored under 'hotencode' as a float-list feature.

  Returns:
    The serialized Example as produced by `SerializeToString()`.
  """
  features = tf.train.Features(feature={
      'image': _bytes_feature(feature0),
      'image_name': _bytes_feature(feature1),
      'label': _float_feature(feature2),
      'hotencode': _float_feature_array(feature3),
  })
  return tf.train.Example(features=features).SerializeToString()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit a one-hot encoder on the label column; it is used later to turn a
# label into its one-hot vector when the TFRecords are written.
label_column = df[["label"]]
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(label_column)

### Sorting the images in IMGS for stratified k-fold

In [None]:
n_splits = 50
skf = StratifiedKFold(n_splits=n_splits)

# Concatenate the validation indices of every fold, fold after fold, so that
# consecutive slices of the reordered frame keep a stratified label mix.
sort_list = []
for _, index_val in skf.split(df["image_name"], df["label"]):
    sort_list.extend(index_val.tolist())

df2 = df.iloc[sort_list].reset_index()
IMGS = df2.image_name.tolist()

# Label distribution of the last fold produced by the loop above.
plot_distribution(df.iloc[index_val])

In [None]:
# Number of images per TFRecord file, and number of files needed to hold them
# all (one extra file when the images do not divide evenly).
SIZE = len(IMGS)//n_splits
CT = len(IMGS)//SIZE + int(len(IMGS)%SIZE!=0)

# Loop-invariant lookups, built once instead of scanning the whole DataFrame
# and calling the encoder for every single image.
# keep="first" mirrors taking `.values[0]` from a per-name filter.
_first_rows = df.drop_duplicates(subset="image_name", keep="first")
label_by_name = dict(zip(_first_rows["image_name"], _first_rows["label"]))
one_hot_by_label = {
    label: enc.transform([[label]]).toarray()[0]
    for label in set(label_by_name.values())
}

for j in range(CT):
    print(); print('Writing TFRecord {} of {}...'.format(j,CT))
    # The last file may hold fewer than SIZE images.
    CT2 = min(SIZE,len(IMGS)-j*SIZE)
    tfrecord_name = 'train{:.0f}-{:.0f}.tfrec'.format(j,CT2)
    source_tfrecord = "./" + tfrecord_name
    with tf.io.TFRecordWriter(tfrecord_name) as writer:
        for k in range(CT2):
            name = IMGS[SIZE*j+k]
            img = cv2.imread(PATH+name)
            if img is None:
                # cv2.imread returns None instead of raising; fail with the
                # offending path rather than an opaque error inside imencode.
                raise FileNotFoundError('Could not read image {}'.format(PATH+name))
            # imread yields BGR and imencode expects BGR, so no channel swap is needed.
            img = cv2.imencode('.jpeg', img,(cv2.IMWRITE_JPEG_QUALITY, 100 ) )[1].tobytes()
            label = label_by_name[name]
            example = serialize_example(
                                        img,
                                        str.encode(name),
                                        label,
                                        one_hot_by_label[label],
            )
            writer.write(example)
            if k%100==0: print(k,', ',end='')
    # Optional: upload each file as soon as it is complete. This must happen
    # after the `with` block so the writer has been closed first.
    #upload_blob(BUCKET_NAME, source_tfrecord, tfrecord_name)

### Maybe you want to upload all the TFRecords once they are ready

In [None]:
import glob

# Sorted so the upload order is deterministic across runs.
filenames = sorted(glob.glob("./*.tfrec"))

# Each file is uploaded under its base name. The blob name is derived inside
# the loop so that no loop variable shadows the list being iterated.
for source in filenames:
    upload_blob(BUCKET_NAME, source, os.path.basename(source))
print('Data inside of',BUCKET_NAME,':')
#list_blobs(BUCKET_NAME)

#### See all the TFRecords created

In [None]:
# Shell command: long listing of the current working directory.
! ls -l

> # Verify TFRecords
We will verify the TFRecords we just made by using code from the Flower Comp starter notebook [here][1] to display the TFRecords below.

[1]: https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu

In [None]:
AUTO = tf.data.experimental.AUTOTUNE

# Image, class and batching settings used by the dataset functions below.
IMAGE_SIZE = [456, 456]
CLASSES = 5
batch_size = 32
AUG_BATCH = batch_size

# Split the local TFRecord files into a training part and a validation part.
path_ = './*.tfrec'
validation_split = 0.50
filenames = tf.io.gfile.glob(path_)
n_validation_files = int(len(filenames) * validation_split)
split = len(filenames) - n_validation_files
train_fns, validation_fns = filenames[:split], filenames[split:]

In [None]:
def parse_tfrecord(example):
  """Decode one serialized tf.train.Example into an (image, one-hot label) pair.

  Args:
    example: a serialized Example carrying the 'image', 'label' and
      'hotencode' features.

  Returns:
    Tuple `(normalized, one_hot_class)`: the JPEG decoded with 3 channels,
    cast to float32 and divided by 255, at its stored resolution; and the
    'hotencode' feature as a dense tensor reshaped to `[CLASSES]`.
  """
  features = {
    "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
    "label": tf.io.VarLenFeature(tf.float32),
    "hotencode": tf.io.VarLenFeature(tf.float32),
  }
  example = tf.io.parse_single_example(example, features)
  decoded = tf.io.decode_jpeg(example['image'], channels=3)
  normalized = tf.cast(decoded, tf.float32) / 255.0 # convert each 0-255 value to floats in [0, 1] range
  # NOTE(review): an aspect-preserving bilinear resize to [600, 800] used to be
  # computed here but its result was never returned or used, so it has been
  # removed as dead code. If the resized image was the intended output, return
  # that instead of `normalized` -- confirm with the author.
  one_hot_class = tf.reshape(tf.sparse.to_dense(example['hotencode']), [CLASSES])
  return normalized, one_hot_class

def load_dataset(filenames_):
  """Return a dataset of parsed examples read from the given TFRecord files."""
  # Reads are interleaved across multiple files for better throughput.
  raw_records = tf.data.TFRecordDataset(filenames_, num_parallel_reads=AUTO)
  parsed = raw_records.map(parse_tfrecord, num_parallel_calls=AUTO)
  return parsed

def get_val_dataset(filenames_val):
    """Build the validation pipeline: resize to IMAGE_SIZE, batch, prefetch."""

    def catch_image_size(image, one_hot_class):
        """Bilinearly resize `image` to IMAGE_SIZE; the label is passed through."""
        resized = tf.image.resize(image, list(IMAGE_SIZE), method="bilinear")
        return resized, one_hot_class

    pipeline = load_dataset(filenames_val)
    pipeline = pipeline.map(catch_image_size, num_parallel_calls=AUTO)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(AUTO)
        
def get_training_dataset(filenames_train):
  """Build the training pipeline: augment, repeat, shuffle, batch, prefetch."""

  def data_augment(image, one_hot_class):
    """Randomly crop, flip and colour-jitter `image`; the label is passed through."""
    out = tf.image.random_crop(image, size=[*IMAGE_SIZE, 3])
    out = tf.image.random_flip_left_right(out)
    out = tf.image.random_saturation(out, 0, 2)
    out = tf.image.random_brightness(out, 0.1)
    out = tf.image.random_flip_up_down(out)
    out = tf.image.random_jpeg_quality(out, 80, 100)
    return out, one_hot_class

  pipeline = load_dataset(filenames_train).map(data_augment, num_parallel_calls=AUTO)
  pipeline = pipeline.repeat()
  pipeline = pipeline.shuffle(2048)
  pipeline = pipeline.batch(batch_size)
  # Prefetch the next batch while training (autotune prefetch buffer size).
  return pipeline.prefetch(AUTO)

# Build both pipelines from the file split computed earlier in the notebook.
training_dataset = get_training_dataset(train_fns)
validation_dataset = get_val_dataset(validation_fns)

In [None]:
# Display names for the classes; the visualisation cells index this list with
# the argmax of each one-hot label vector.
CLASSES_ = ['CBB', 'CBSD', 'CGM', 'CMD', 'Healthy']

def display_one_leaf(image, title, subplot, color):
  """Draw `image` in the given subplot slot, axes hidden, with a coloured title."""
  ax = plt.subplot(subplot)
  ax.axis('off')
  ax.imshow(image)
  ax.set_title(title, fontsize=16, color=color)
  
# Show up to nine leaf images with their titles in a 3x3 grid.
def display_nine_leafs(images, titles, title_colors=None):
  """Plot up to nine images in a 3x3 grid.

  Args:
    images: indexable collection of images accepted by `display_one_leaf`.
    titles: indexable collection of titles, one per image.
    title_colors: optional indexable collection of title colours; every
      title is black when omitted.
  """
  plt.figure(figsize=(13,13))
  # A final, partial batch can hold fewer than nine examples; only draw what
  # is there instead of failing with an IndexError.
  n_shown = min(9, len(images), len(titles))
  for i in range(n_shown):
    color = 'black' if title_colors is None else title_colors[i]
    # 331 + i addresses the i-th slot of the 3x3 grid.
    display_one_leaf(images[i], titles[i], 331+i, color)
  plt.tight_layout()
  plt.subplots_adjust(wspace=0.1, hspace=0.1)
  plt.show()

def get_dataset_iterator(dataset, n_examples):
  """Re-batch `dataset` into groups of `n_examples` and return a NumPy iterator over it."""
  single_examples = dataset.unbatch()
  regrouped = single_examples.batch(n_examples)
  return regrouped.as_numpy_iterator()

# Iterators that yield groups of nine examples for the visualisation cells below.
training_viz_iterator = get_dataset_iterator(training_dataset, 9)
val_viz_iterator = get_dataset_iterator(validation_dataset, 9)

## Training Viz

In [None]:
# Re-run this cell to show a new batch of images
images, classes = next(training_viz_iterator)
# Each row of `classes` is a one-hot vector; argmax recovers the class number.
class_idxs = classes.argmax(axis=-1)
labels = [CLASSES_[class_id] for class_id in class_idxs]
display_nine_leafs(images, labels)

## Validation Viz

In [None]:
# Re-run this cell to show a new batch of images
images, classes = next(val_viz_iterator)
# Each row of `classes` is a one-hot vector; argmax recovers the class number.
class_idxs = classes.argmax(axis=-1)
labels = [CLASSES_[class_id] for class_id in class_idxs]
display_nine_leafs(images, labels)