## Cassava Stratified K Folds TFRecords maker

### Version history:

Version 6: First Successful Augmented TFRecords   
Version 10: Save description files along with TFRecords   
Version 11: Merged images

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import random
import shutil
from sklearn.model_selection import train_test_split,StratifiedKFold
import gc 
import tensorflow_probability as tfp
tfd = tfp.distributions


In [None]:
# Load the merged label table, then keep handles on the two columns that the
# stratified split consumes.
df = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
df_images, df_labels = df['image_id'], df['label']


In [None]:
# Display the first rows of the label table as a quick sanity check.
df.head()

In [None]:
# Shuffle all rows and rebuild a 0..n-1 index. This rebinds `df` to a new
# frame; `df_images` / `df_labels` were taken from the frame as it was before
# this line and are therefore not reordered by it.
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
skf = StratifiedKFold(n_splits=5,shuffle=True)

# FOLDS_LIST[k] is the held-out (image_id, label) frame of fold k. Only the
# held-out side of each split is needed: the five held-out sets partition the
# data, so the training side is never materialised.
FOLDS_LIST = []
for fold_idx, (_, test_index) in enumerate(skf.split(df_images, df_labels)):
    # split() yields positional indices, so select with .iloc rather than
    # relying on the index labels happening to equal the positions.
    df_test = pd.concat(
        [df_images.iloc[test_index], df_labels.iloc[test_index]],
        axis=1,
    )

    # Keep a CSV description of the fold next to its TFRecord file.
    df_test.to_csv('fold_' + str(fold_idx) + '.csv')
    FOLDS_LIST.append(df_test)
    


In [None]:

# Side length, in pixels, of the square that resize_image() scales images to.
IMG_SIZE = 299


### Image generator

Images should be read, processed and written to the TFRecords one at a time, so that RAM consumption stays low.

In [None]:
# Two independent shuffles of the full frame; row i of the first shuffle is
# paired with row i of the second.
df1 = df.sample(frac=1).reset_index(drop=True)  #Shuffle dataframe
df2 = df.sample(frac=1).reset_index(drop=True)  #Shuffle dataframe

# Each entry is ((image_id, label), (image_id, label)).
pair_list = [
    ((name_a, label_a), (name_b, label_b))
    for name_a, label_a, name_b, label_b in zip(
        df1['image_id'].to_numpy(),
        df1['label'].to_numpy(),
        df2['image_id'].to_numpy(),
        df2['label'].to_numpy(),
    )
]

In [None]:

@tf.function
def resize_image(image):
    """Resize `image` to IMG_SIZE x IMG_SIZE with `tf.image.resize` defaults."""
    target_size = [IMG_SIZE, IMG_SIZE]
    return tf.image.resize(image, target_size)


In [None]:
#@tf.function
def read_img(image_name):
    """Read one training JPEG from disk and return it as a float32 tensor.

    Args:
        image_name: File name of the image inside the merged `train/` folder.

    Returns:
        The decoded image cast to `tf.float32`. Pixel values are not rescaled
        here, so they keep the range produced by the JPEG decoder.
    """
    filepath = '../input/cassava-leaf-disease-merged/train/'+image_name

    # Force three channels: with the default (channels=0) a grayscale JPEG
    # decodes to a single channel, which the hue/saturation augmentations and
    # any fixed-shape parsing of the stored raw bytes cannot handle.
    image = tf.io.decode_jpeg(tf.io.read_file(filepath), channels=3)

    return tf.cast(image,tf.float32)

#@tf.function
# def image_generator(image_name):
#     img = read_img(image_name)
#     img = augment_img_randomly(img)
#     return img

def image_generator(idx):
    """Build one mixup sample from the `idx`-th entry of `pair_list`.

    Both images of the pair are read from disk, their labels are one-hot
    encoded over 5 classes, and the two samples are blended with `mixup`.

    NOTE(review): the images are blended exactly as decoded, so this presumably
    relies on both images of a pair having the same shape - confirm.

    Returns:
        The `(image, label)` tuple produced by `mixup`.
    """
    first, second = pair_list[int(idx)]
    name_a, class_a = first
    name_b, class_b = second

    sample_a = (read_img(name_a), tf.one_hot(class_a, 5))
    sample_b = (read_img(name_b), tf.one_hot(class_b, 5))

    blended, soft_label = mixup(sample_a, sample_b)
    return blended, soft_label

### Augmentations

In [None]:
#@tf.function
def get_augment_list():
    """Return a length-7 boolean array of independent coin flips.

    Each entry is drawn from {0, 1} with `np.random.randint`; a draw of 0 maps
    to True and a draw of 1 maps to False.
    """
    coin_flips = np.random.randint(2, size=7)
    return coin_flips < 1


@tf.function
def gaussian_blur(image, kernel_size=23, padding='SAME'):
    """Blur `image` with a separable Gaussian whose sigma is drawn at random.

    Args:
        image: Image tensor. A rank-3 input gets a temporary leading batch
            dimension for the convolutions; any other rank is passed to the
            convolutions unchanged.
        kernel_size: Requested kernel width; the width actually used is
            `2 * int(kernel_size / 2) + 1`.
        padding: Padding mode forwarded to `tf.nn.depthwise_conv2d`.

    Returns:
        The blurred image, with the temporary batch dimension removed again
        if one was added.
    """
    # Sigma is uniform in [0.1, 2.0).
    sigma = tf.random.uniform((1,)) * 1.9 + 0.1

    half_width = tf.cast(kernel_size / 2, tf.int32)
    kernel_size = half_width * 2 + 1
    offsets = tf.cast(tf.range(-half_width, half_width + 1), tf.float32)

    # 1-D Gaussian weights, normalised so they sum to one.
    two_sigma_sq = 2.0 * tf.pow(tf.cast(sigma, tf.float32), 2.0)
    weights = tf.exp(-tf.pow(offsets, 2.0) / two_sigma_sq)
    weights /= tf.reduce_sum(weights)

    # The same 1-D kernel is laid out once as a row and once as a column and
    # repeated per channel, so every channel is filtered independently.
    num_channels = tf.shape(image)[-1]
    per_channel = [1, 1, num_channels, 1]
    kernel_h = tf.tile(tf.reshape(weights, [1, kernel_size, 1, 1]), per_channel)
    kernel_v = tf.tile(tf.reshape(weights, [kernel_size, 1, 1, 1]), per_channel)

    needs_batch_dim = image.shape.ndims == 3
    batched = tf.expand_dims(image, axis=0) if needs_batch_dim else image

    # Horizontal pass followed by vertical pass.
    batched = tf.nn.depthwise_conv2d(
        batched, kernel_h, strides=[1, 1, 1, 1], padding=padding)
    batched = tf.nn.depthwise_conv2d(
        batched, kernel_v, strides=[1, 1, 1, 1], padding=padding)

    return tf.squeeze(batched, axis=0) if needs_batch_dim else batched


def mixup(a, b, alpha=0.2):
  """Blend two (image, label) samples with a Beta-distributed weight.

  Args:
    a: First sample as an `(image, label)` pair.
    b: Second sample as an `(image, label)` pair. Its image and label must be
      broadcast-compatible with those of `a`, since they are combined
      element-wise.
    alpha: Concentration used for both parameters of the Beta distribution.
      The default of 0.2 is the value that used to be hard-coded.

  Returns:
    `(img, lab)` where each is `w * first + (1 - w) * second` for a single
    weight `w` sampled from Beta(alpha, alpha).
  """
  (image1, label1), (image2, label2) = a, b

  # The distribution has a length-1 batch, so sample(1) has shape (1, 1);
  # [0][0] extracts the scalar mixing weight.
  dist = tfd.Beta([alpha], [alpha])
  weight = dist.sample(1)[0][0]

  img = weight*image1+(1-weight)*image2
  lab = weight*label1+(1-weight)*label2

  return img, lab
    
    
@tf.function
def augment_img_randomly(img):
    '''
    Resize an image and apply a random subset of augmentations.

    Each of the following is applied independently with probability 0.5:

    Random saturation (0.7,1.3)
    Random contrast  (0.8,1.2)
    Random brightness (0.3)
    Random hue (0.2)
    Random left/right flip
    Random up/down flip
    Gaussian blur (random sigma)

    Args:
        img: Image tensor with pixel values on a 0-255 scale.

    Returns:
        The resized, augmented image with values scaled by 1/255. Values are
        not clipped, so augmentations may push them slightly outside [0, 1].
    '''
    # Normalise before augmenting: for float images random_brightness adds its
    # delta directly to the pixel values, so a max delta of 0.3 only has the
    # intended strength on a [0, 1] scale (on 0-255 data it is negligible).
    # NOTE(review): the other ops used here are expected to commute with this
    # rescaling - confirm if exact parity with older records matters.
    image = tf.math.divide(resize_image(img), 255)

    # Draw the on/off flags with a TF op so they are re-sampled on every call.
    # Python/NumPy randomness inside a tf.function is evaluated only while the
    # function is being traced and would be frozen into the graph.
    apply = tf.random.uniform([7]) < 0.5

    if apply[0]:
        image = tf.image.random_saturation(image,0.7,1.3)
    if apply[1]:
        image = tf.image.random_contrast(image,0.8,1.2)
    if apply[2]:
        image = tf.image.random_brightness(image,0.3)
    if apply[3]:
        image = tf.image.random_hue(image,0.2)
    if apply[4]:
        image = tf.image.random_flip_left_right(image)
    if apply[5]:
        image = tf.image.random_flip_up_down(image)
    if apply[6]:
        image = gaussian_blur(image)

    return image

### Writing TFRecords


In [None]:
def _bytes_feature(value):
  """Wrap a string / byte value in a tf.train.Feature holding a bytes_list."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList cannot unpack a string from an EagerTensor, so take the
    # underlying value out of the tensor first.
    value = value.numpy()
  payload = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
  """Wrap a float / double in a tf.train.Feature holding a float_list."""
  payload = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=payload)

def _int64_feature(value):
  """Wrap a bool / enum / int / uint in a tf.train.Feature holding an int64_list."""
  payload = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=payload)


def generate_example(image_name,fold_df):
    """Build the serialized tf.train.Example for one image.

    Args:
        image_name: File name of the image; also used to look up its label.
        fold_df: DataFrame with `image_id` and `label` columns that contains
            a row for `image_name`.

    Returns:
        The serialized Example as bytes, with features `image` (raw bytes of
        the augmented image array), `target` (integer label) and `image_id`
        (UTF-8 encoded file name).

    Raises:
        KeyError: If `image_name` has no row in `fold_df`.
        ValueError: If `image_name` has several rows with conflicting labels.
    """
    # Resolve the label before the expensive decode/augment step so that a bad
    # name fails fast. Calling int() on the filtered Series directly is
    # deprecated in pandas and gives an opaque error for zero or multiple
    # matches, so take the scalar out explicitly.
    matches = fold_df.loc[fold_df['image_id'] == image_name, 'label']
    if matches.empty:
        raise KeyError('no label found for image ' + repr(image_name))
    if matches.nunique() > 1:
        raise ValueError('conflicting labels for image ' + repr(image_name))
    label = int(matches.iloc[0])

    img = augment_img_randomly(read_img(image_name))

    feature={
        'image':_bytes_feature(img.numpy().tobytes()),
        'target':_int64_feature(label),
        'image_id':_bytes_feature(bytes(image_name,encoding='utf8'))
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()


So far, we have written code for the following steps:
    1. Take an image
    2. Augment it
    3. Make an example protobuf from it

Now we need to take all the images in each FOLDS_LIST[i] and put them in a single TFRecord file. The same applies to its validation data.
In this way, we get 5 folds, each containing different images. Note that the validation images are augmented as well, so we
will be using test-time augmentation (TTA).



In [None]:
# Write one TFRecord file per fold, one serialized example per image.
for i, fold_df in enumerate(FOLDS_LIST):
    record_file = 'fold_'+str(i)+'.tfrecords'

    print('Writing ',record_file)

    image_names = list(fold_df['image_id'])
    num_files = len(image_names)

    with tf.io.TFRecordWriter(record_file) as writer:
        for a, k in enumerate(image_names, start=1):
            print('Writing image ',a,' of ',num_files)
            writer.write(generate_example(k,fold_df))
            # Collect after every image to keep peak memory low while
            # streaming the whole fold through the writer.
            gc.collect()

    # The writer is closed by the with-block; collect once more per fold.
    gc.collect()
    

In [None]:
# record_file = 'images.tfrecords'
# with tf.io.TFRecordWriter(record_file) as writer:
#   for filename, label in image_labels.items():
#     image_string = open(filename, 'rb').read()
#     tf_example = image_example(image_string, label)
#     writer.write(tf_example.SerializeToString())