In [None]:
import json
import math, re, os, random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import albumentations as A
import cv2

import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, BatchNormalization, GlobalAveragePooling2D, Flatten, Input, Activation, Conv2D, Add
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau

from sklearn.model_selection import KFold

# Kaggle input locations for the cassava competition data.
BASE_DIR = "/kaggle/input/cassava-leaf-disease-classification/"
TRAIN_DIR = "/kaggle/input/cassava-leaf-disease-classification/train_images/"
TEST_DIR = "/kaggle/input/cassava-leaf-disease-classification/test_images/"

# Competition metadata: the submission template and the labelled training index.
sub = pd.read_csv(BASE_DIR + 'sample_submission.csv')
train = pd.read_csv(BASE_DIR + 'train.csv')

In [None]:
# First two image ids of every class, in label order 0..4 (10 ids in total).
l = [
    image_id
    for label in range(5)
    for image_id in train[train['label'] == label].iloc[:2]['image_id'].values
]

# Load the selected images scaled to [0, 1]; each one must be 600x800x3 to fit its slot.
ims = np.empty((10, 600, 800, 3), dtype=np.float32)
for index, image_id in enumerate(l):
    ims[index] = plt.imread(f'{TRAIN_DIR}{image_id}') / 255

In [None]:
# Class-balance overview: one horizontal bar per label.
count_ax = plt.figure(figsize=(8, 4)).gca()
sns.countplot(data=train, y="label", ax=count_ax);

# Augmentation
- Label 3 accounts for roughly 60% of the data
- The currently trained model is not focused on predicting labels 0 and 1, hence they are going to be the main focus
- Label 0 needs heavy augmentation: its data should be increased eightfold
- Labels 1, 2 and 4 have roughly the same number of images, and it would be useful to increase them fourfold

## Procedure
- The image_ids will be obtained from the train df and a pre-defined quantity will be chosen from each label:
    - 200 for labels 2,3,4, 25 for label 0, 50 for label 1
    - The list containing the image ids will be shuffled to diversify the tfrec file.
    
- Apply the transforms to augment the images, then write the results to the tfrec file.

In [None]:
# Eight albumentations pipelines. Each one is applied independently to a source
# image to produce one augmented variant.

# 1: blur (blur_limit=3) followed by a horizontal flip; both steps are forced on.
transform1 = A.Compose([
    A.Blur(blur_limit=3, always_apply=True),
    A.HorizontalFlip(always_apply=True)
])

# 2: channel dropout (dropped channel filled with 10, p=1) plus a forced horizontal flip.
transform2 = A.Compose([
    A.ChannelDropout(fill_value=10, p=1),
    A.HorizontalFlip(always_apply=True),
])

# 3: blur without an explicit p/always_apply (so the library's default probability
#    applies), then 150-500 square 5x5 holes filled with [100, 250, 100], forced on.
transform3 = A.Compose([
    A.Blur(blur_limit=3),
    A.CoarseDropout(max_holes=500, max_height=5, max_width=5, 
                    min_holes=150, min_height=5, min_width=5, 
                    fill_value=[100,250,100], always_apply=True)
])

# 4: zero-mean Gaussian noise with variance in [0.1, 0.2], then a horizontal flip (p=1).
# NOTE(review): if the images are uint8 in 0-255, a variance this small is probably
# imperceptible — confirm the intended scale.
transform4 = A.Compose([
    A.GaussNoise(var_limit=(1e-1, 2e-1), mean=0, always_apply=True),
    A.HorizontalFlip(p=1),
])

# 5: moderate brightness/contrast jitter (limits 0.2 / 0.5, p=1); flip with p=0.51.
transform5 = A.Compose([
    A.RandomBrightnessContrast(p=1, brightness_limit=0.2, contrast_limit=0.5),
    A.HorizontalFlip(p=0.51),
])

# 6: light blur, synthetic rain (grey 2x2 drops, slant in [-1, 1], brightness x0.9),
#    then a flip with p=0.5.
# NOTE(review): some albumentations releases may require blur_limit >= 3 — confirm
# against the installed version.
transform6 = A.Compose([
    A.Blur(blur_limit=2, always_apply=True),
    A.RandomRain(slant_lower=-1, slant_upper=1, 
                 drop_length=2, drop_width=2,
                 drop_color=(150, 150, 150), 
                 blur_value=2,
                 brightness_coefficient=0.9,
                 rain_type=None, always_apply=True),
    A.HorizontalFlip(p=0.5),
])

# 7: stronger blur (blur_limit=5), downscale by a factor in [0.65, 0.9], flip (p=1).
transform7 = A.Compose([
    A.Blur(blur_limit=5, always_apply=True),
    A.Downscale(scale_min=0.65, scale_max=0.9, interpolation=0, always_apply=True),
    A.HorizontalFlip(p=1),
])

# 8: same recipe as transform5 with wider limits (brightness 0.5, contrast 0.7).
transform8 = A.Compose([
    A.RandomBrightnessContrast(p=1,brightness_limit=0.5,contrast_limit=0.7),
    A.HorizontalFlip(p=0.51),
])
    

# Pool of pipelines; used as the default `transformers` argument of augmentate().
transformers = [
    transform1, transform2, transform3, transform4, transform5, transform6, transform7, transform8
]

# Utilities
- Some of these utilities and techniques have been copied from <a href='https://www.tensorflow.org/tutorials/load_data/tfrecord#walkthrough_reading_and_writing_image_data'>here</a>
- The rest of the functions have been written based on the augmentation procedure used in this notebook

In [None]:
def _bytes_feature(value):
  """Wrap a single string / bytes value in a bytes_list Feature."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList cannot unpack a string held by an EagerTensor, so pull out the raw value first.
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a float_list Feature."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in an int64_list Feature."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def serialize(image, target):
    """Pack one encoded image and its label into a serialized tf.train.Example.

    `image` is stored under the 'image' key as a bytes feature and `target`
    under the 'target' key as an int64 feature. The return value is the
    serialized message, ready to be handed to a TFRecord writer.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'target': _int64_feature(target),
    })
    return tf.train.Example(features=features).SerializeToString()
    
# Feature spec for reading records back with tf.io.parse_single_example.
# The keys match those written by serialize(): a scalar string "image"
# (the encoded picture) and a scalar int64 "target" (the label).
tfrecord_format = {
    "image": tf.io.FixedLenFeature([], tf.string),
    "target": tf.io.FixedLenFeature([], tf.int64)
}

In [None]:
def get_aug_df(iter_num, label_quantity=None, random_state=None):
    """Select the `iter_num`-th batch of rows per label from `train` and shuffle them.

    For every label in `label_quantity`, the rows of `train` with that label are
    sliced as [q * (iter_num - 1), q * iter_num), where q is the quantity requested
    for the label. The slices are concatenated, shuffled and re-indexed from 0.
    A slice that runs past the available rows is simply shorter (possibly empty).

    Args:
        iter_num: 1-based batch number.
        label_quantity: mapping of label (as a string, e.g. "0") to the number of
            rows to take per batch. Defaults to
            {"0": 25, "1": 50, "2": 200, "3": 200, "4": 200}.
        random_state: optional seed forwarded to DataFrame.sample so the shuffle
            can be reproduced; None keeps the shuffle unseeded.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: if `iter_num` is smaller than 1 (a non-positive batch number
            would otherwise select an empty or unintended slice silently).
    """
    if label_quantity is None:
        # Built per call so the default mapping is never shared between calls.
        label_quantity = {"0": 25, "1": 50, "2": 200, "3": 200, "4": 200}
    if iter_num < 1:
        raise ValueError(f'iter_num is 1-based and must be >= 1, got {iter_num}')

    per_label = [
        train[train['label'] == int(label)].iloc[q * (iter_num - 1):q * iter_num]
        for label, q in label_quantity.items()
    ]

    return (
        pd.concat(per_label)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

def augmentate(df, tfrec_name, transformers=transformers):
    """Augment the images listed in `df` and write them to a TFRecord file.

    Every image is read from TRAIN_DIR, passed through one or more of the
    `transformers`, JPEG-encoded (quality 94) and written with its label to
    /kaggle/working/<tfrec_name>.tfrec via serialize().

    Augmented copies produced per source image:
      - label 0: one per transformer (all of them)
      - label 1: four transformers sampled without replacement
        (or all of them if fewer than four are supplied)
      - any other label: a single randomly chosen transformer

    Args:
        df: DataFrame with an `image_id` and a `label` column.
        tfrec_name: file name (without extension) of the TFRecord to create.
        transformers: list of callables invoked as transformer(image=img)['image'].

    Raises:
        FileNotFoundError: if an image cannot be read from TRAIN_DIR.
        RuntimeError: if OpenCV fails to JPEG-encode an augmented image.
    """
    if len(df) == 0:
        # Warning only: the (empty) TFRecord file is still created below.
        print('Dataframe is empty')

    with tf.io.TFRecordWriter(f'/kaggle/working/{tfrec_name}.tfrec') as writer:
        for image_id, label in zip(df['image_id'], df['label']):
            image_path = f'{TRAIN_DIR}{image_id}'
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise FileNotFoundError(f'Could not read image: {image_path}')
            # OpenCV loads images as BGR; the augmentations are run on RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            if label == 0:
                transform_agents = transformers
            elif label == 1:
                transform_agents = random.sample(transformers, min(4, len(transformers)))
            else:
                transform_agents = [random.choice(transformers)]

            for transformer in transform_agents:
                aug_img = transformer(image=img)['image']
                # cv2.imencode interprets its input as BGR, so convert back first;
                # otherwise the stored JPEG has its red and blue channels swapped.
                aug_bgr = cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR)
                ok, encoded = cv2.imencode('.jpg', aug_bgr, (cv2.IMWRITE_JPEG_QUALITY, 94))
                if not ok:
                    raise RuntimeError(f'JPEG encoding failed for image: {image_path}')

                # tobytes() replaces the deprecated ndarray.tostring().
                writer.write(serialize(encoded.tobytes(), int(label)))

# Goal
- The goal is to augment the first 500 images with label == 0.
- At each iteration 
    - 25 label-0 images are chosen and augmented eightfold
    - 50 label-1 images are chosen and augmented fourfold
    - 200 images each of labels 2, 3 and 4 are chosen and transformed only once

In [None]:
# Write eight TFRecord shards, cassava_aug0_1 ... cassava_aug0_8. Shard i is built
# from the i-th slice of 125 label-0 rows of `train`; because label_quantity is
# overridden with only "0", no other labels are included.
# NOTE(review): the "Goal" text mentions 25 label-0 images per iteration, while
# this passes 125 — confirm which is intended.
for i in range(1,9):
    aug_df = get_aug_df(i, label_quantity={"0": 125})
    augmentate(aug_df, f'cassava_aug0_{i}')

In [None]:
# Read back the first shard produced by the augmentation loop (cassava_aug0_1)
# and decode its first 30 records for a visual sanity check.
dataset = tf.data.TFRecordDataset(['/kaggle/working/cassava_aug0_1.tfrec'])
# float32 matches the dtype of the decoded images and halves the buffer size.
imgs = np.empty((30, 600, 800, 3), dtype=np.float32)
for index, raw_record in enumerate(dataset.take(30)):
    example = tf.io.parse_single_example(raw_record, tfrecord_format)
    image = tf.image.decode_jpeg(example['image'], channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    # The reshape doubles as a check that every decoded image is 600x800x3.
    imgs[index] = tf.reshape(image, [600, 800, 3]).numpy()

In [None]:
# Quick look at the first image decoded from the TFRecord file.
plt.imshow(imgs[0])

In [None]:
# Show the 30 decoded images on a 5 x 6 grid (n_row * n_col must not exceed len(imgs)).
n_row, n_col = 5, 6
# plt.subplots creates the figure itself, so no separate plt.figure() call is
# needed (it would only emit an extra, empty figure).
fig, axarr = plt.subplots(n_row, n_col, figsize=(20, 20))

# Fill the grid row by row: cell (row, col) shows image number row * n_col + col.
for row in range(n_row):
    for col in range(n_col):
        axarr[row, col].imshow(imgs[row * n_col + col])

# Explicit margins and spacing; these settings fully determine the final layout.
plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.2)
plt.show()