# Create TFRecords

--- Hamed --- Jan 12
- 1) cut 0.5
- 2) shuffle output dataframe

--- Hamed/Ali --- Jan 10
- 1) oversample with cutmix
- 2) cut ratio 0.4

--- Hamed --- Jan 02
- 1) not copying images to working directory
- 2) no undersampling for major class
- 3) cut off 70%

--- Hamed --- Jan 01
- 1) shuffled images in each file

--- Hamed --- Dec 31
- 1) fixed the empty TFR files
- 2) changed N_FILES to 30 to work around the HDD problem
- 3) fixed size 600x800
- 4) cut-off to 0.6 for HDD problem

--- Hamed --- Dec 27
- 1) added merged dataset 2019-20 from Tom's notebook
- 2) the only image removed from merged_data.csv is: ['train-cmd-2399.jpg']
- 3) "N_FILES" TFRecord files are ready (over/undersampled with no common images between files, no duplicates)
- 4) sizes vary! not 600*800!

--- Hamed --- Dec 24
- 1) kfold
- 2) Random over/undersampling on each fold (cut_ratio can be defined)


- Based on: https://www.kaggle.com/dimitreoliveira/cassava-leaf-disease-stratified-tfrecords-256x256/data
- Discussion [thread](https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/198744)
- Reference: [How To Create TFRecords](https://www.kaggle.com/cdeotte/how-to-create-tfrecords)

In [None]:
import shutil
import re, math, os, cv2, random, warnings
import math
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

import glob, torch, imagehash
from tqdm.auto import tqdm
from PIL import Image

def seed_everything(seed=0):
    """Seed the Python, NumPy and TensorFlow RNGs and set hash/determinism env vars.

    Args:
        seed: Integer seed applied to every random source touched here.
    """
    # Same seed for each random number generator, in the same order as before.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

    # Environment flags are stored as strings.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Which set of TFRecords this run produces: 'train' or 'valid'.
#GOAL = 'valid'
GOAL = 'train'

# Target image size, stored as (width, height).
DIMS = (800, 600)
WIDTH = DIMS[0]
HEIGHT = DIMS[1]

# Number of stratified splits, i.e. number of TFRecord files written.
N_FILES = 30

# JPEG quality handed to cv2.imencode when images are re-encoded.
IMG_QUALITY = 100

seed = 2020
seed_everything(seed)
warnings.filterwarnings('ignore')

In [None]:
# Input locations (read-only) and output locations (created below).
base_path = '../input/cassavapreprocessed'
images_dir = '../input/cassavapreprocessed/train_images/train_images/'
gen_dir = './gen_images/'
#working_dir = './working/'
train_tfrecords_dir = './train_data_tf/'
valid_tfrecords_dir = './valid_data_tf/'

if GOAL == 'train':
    # makedirs(exist_ok=True) is idempotent, so re-running the cell is safe.
    os.makedirs(train_tfrecords_dir, exist_ok=True)

    # One sub-folder per output file for the generated cutmix images.
    # Created unconditionally: previously they were only made when gen_dir
    # itself was missing, leaving a pre-existing gen_dir without sub-folders.
    os.makedirs(gen_dir, exist_ok=True)
    for i in range(N_FILES):
        os.makedirs(gen_dir+f'{i}/', exist_ok=True)

    # Resampling target as a fraction of the majority class size:
    # with 0.6, classes are oversampled up to 0.6 of the major class.
    cut_ratio = 0.5

elif GOAL == 'valid':
    os.makedirs(valid_tfrecords_dir, exist_ok=True)

# LOAD DATA

In [None]:
# Read the merged metadata table that sits next to the images.
merged_csv_path = base_path + '/merged_data.csv'
train = pd.read_csv(merged_csv_path)
train.head()
print('Train samples: %d' % len(train))

In [None]:
# Remove known-bad / duplicate images from the train data.
remove_pd = ['train-cmd-2399.jpg']
train = train[~train['image_id'].isin(remove_pd)]
# drop=True renumbers the rows 0..n-1 without keeping the old index as a
# stray 'index' column in the frame.
train.reset_index(drop=True, inplace=True)
print('Train samples: %d' % len(train))

# Helper functions

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel float32 tensor divided by 255.

    No resizing or reshaping is applied here; the decoded size is kept.
    """
    rgb = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(rgb, tf.float32) / 255.0
    return scaled

def read_tfrecord(example):
    """Parse one serialized example into (image, target, image_name).

    The image bytes are passed through decode_image; target and image_name
    are returned as parsed.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
        'image_name': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['target'], parsed['image_name']

def load_dataset(filenames, HEIGHT, WIDTH, CHANNELS=3):
    """Build a dataset of (image, target, image_name) tuples from TFRecord files.

    Args:
        filenames: TFRecord file path(s) passed to tf.data.TFRecordDataset.
        HEIGHT, WIDTH, CHANNELS: Currently unused; kept so existing callers
            keep working.

    Returns:
        A tf.data.Dataset whose elements are produced by read_tfrecord.
    """
    dataset = tf.data.TFRecordDataset(filenames)
    # Use the TensorFlow constant directly instead of a module-level AUTO
    # alias, so the function does not depend on a later cell having run.
    dataset = dataset.map(read_tfrecord,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset

def display_samples(ds, row, col):
    """Plot up to row*col samples from a batched dataset on a row x col grid.

    Args:
        ds: Iterable yielding (image, label, name) batches; only element 0 of
            each batch is shown.
        row: Number of grid rows.
        col: Number of grid columns.
    """
    plt.figure(figsize=(15, int(15*row/col)))
    # zip() stops at whichever runs out first, so a dataset with fewer than
    # row*col batches yields a partially filled grid instead of raising
    # StopIteration.
    for j, (image, label, name) in zip(range(row*col), ds):
        plt.subplot(row,col,j+1)
        plt.axis('off')
        plt.imshow(image[0])
        plt.title(f"{label[0]}: {name[0].numpy().decode('utf-8')}", fontsize=12)
    plt.show()

def count_data_items(filenames):
    """Sum the sample counts embedded in TFRecord file names.

    Each name is searched for the first '-<digits>.' group (for example
    'Id_train00-713.tfrec' contributes 713) and the counts are added up.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for path in filenames:
        counts.append(int(count_pattern.search(path).group(1)))
    return np.sum(counts)


# Create TF Records
def _bytes_feature(value):
    """Wrap a bytes value (or an eager tensor holding one) in a bytes_list Feature."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList cannot unpack an EagerTensor, so pull out the raw value.
        value = value.numpy()
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint value in an int64_list Feature."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

def serialize_example(image, target, image_name):
    """Serialize one sample into a tf.train.Example byte string.

    Args:
        image: Encoded image bytes, stored under 'image'.
        target: Integer label, stored under 'target'.
        image_name: Image file name as bytes, stored under 'image_name'.

    Returns:
        The serialized Example.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'target': _int64_feature(target),
        'image_name': _bytes_feature(image_name),
    })
    return tf.train.Example(features=features).SerializeToString()

## Split samples into "N_FILES" different files

In [None]:
# Stratified split: every sample lands in exactly one of the N_FILES splits,
# and its split number is stored in the 'file' column.
folds = StratifiedKFold(n_splits=N_FILES, shuffle=True, random_state=seed)
train['file'] = -1
file_col = train.columns.get_loc('file')

for fold_n, (_, val_idx) in enumerate(folds.split(train, train['label'])):
    print('File: %s has %s samples' % (fold_n+1, len(val_idx)))
    # split() yields positional indices, so assign by position in one step.
    # The chained form train['file'].loc[...] = ... may write to a temporary
    # copy and is a no-op under pandas copy-on-write.
    train.iloc[val_idx, file_col] = fold_n
    
display(train.head())
display(train.describe())
#train.to_csv('train.csv', index=False)

# Cutmix function

In [None]:
def _read_resized(path):
    """Read an image with OpenCV and resize it to (WIDTH, HEIGHT) if needed."""
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f'Could not read image: {path}')
    if img.shape[:2] != (HEIGHT, WIDTH):
        img = cv2.resize(img, (WIDTH, HEIGHT))
    return img


def _paste_random_patch(img1, img2, P):
    """Return a copy of img1 with a random rectangle replaced by img2 pixels.

    The rectangle is centred on a random point, spans a fraction P of each
    image dimension and is clipped to the image borders.
    """
    # CHOOSE RANDOM LOCATION
    x = (np.random.rand() * WIDTH)//1
    y = (np.random.rand() * HEIGHT)//1
    width = (WIDTH * P)//1
    height = (HEIGHT * P)//1

    # Clip to the image: the lower bound with max(0, .), the upper bound with
    # min(size, .). Using max() for the upper bound always stretched the patch
    # to the bottom-right corner.
    ya = int(max(0, y - height//2))
    yb = int(min(HEIGHT, y + height//2))
    xa = int(max(0, x - width//2))
    xb = int(min(WIDTH, x + width//2))

    # MAKE CUTMIX IMAGE
    img = img1.copy()
    img[ya:yb, xa:xb, :] = img2[ya:yb, xa:xb, :]
    return img


def gen_cutmix(df, label = None, file = None, probability = 0.2):
    """Oversample minority classes by writing cutmix images to gen_dir.

    For every (file, label) group that has fewer than the per-file target
    number of samples, pairs of images from that same group are mixed and
    written to '<gen_dir><file>/' until the target is reached.

    The per-file target is the majority class count of the full ``df`` times
    the module-level ``cut_ratio``, floor-divided by the number of files.

    Args:
        df: DataFrame with 'image_id', 'label' and 'file' columns.
        label: If given, only this label is oversampled (0 is a valid value).
        file: If given, only this file is processed (0 is a valid value).
        probability: Threshold used in the patch-size rejection test below.

    Returns:
        DataFrame with columns ['image_id', 'label', 'file'], one row per
        generated image.

    Raises:
        FileNotFoundError: If a source image cannot be read.
    """
    n_files = len(df.file.unique())
    print(f'DataFrame has {n_files} files!')
    sampling_num = max(df.label.value_counts())*cut_ratio // n_files
    print('sampling_num is', sampling_num)
    rows = []

    # Compare against None so that label 0 / file 0 are not silently ignored.
    if label is not None:
        df = df.loc[df.label == label]

    if file is not None:
        df = df.loc[df.file == file]

    # Iterate only over what survived the filters.
    f_uni = df.file.unique()
    l_uni = df.label.unique()

    for f_counter, f in enumerate(f_uni, start=1):
        path = f"{gen_dir}{f}/"
        # cv2.imwrite does not create missing directories.
        os.makedirs(path, exist_ok=True)
        for l in l_uni:
            # Candidate source images: same file AND same class, so generated
            # samples keep a correct label and never mix images across files.
            pool = df.loc[(df.file == f) & (df.label == l)]
            n = len(pool)
            if n == 0:
                # Nothing to mix for this group.
                continue

            i = 0
            while (n + i) < sampling_num:

                P = np.random.rand()
                # NOTE(review): for probability <= 0.5 this condition can never
                # be true, so no draw is rejected. It looks like it was meant
                # to reject very small / very large patches - confirm intent
                # before changing it.
                if (P < probability) and (P > (1 - probability)):
                    continue

                # CHOOSE RANDOM PHOTOS OF THE SAME FILE AND CLASS
                img1_name = pool.iloc[np.random.randint(n)].image_id
                img1 = _read_resized(f'{images_dir}{img1_name}')

                img2_name = pool.iloc[np.random.randint(n)].image_id
                img2 = _read_resized(f'{images_dir}{img2_name}')

                img = _paste_random_patch(img1, img2, P)

                i = i + 1
                img_name = f"x{i}_{img1_name}"
                cv2.imwrite(f"{path}{img_name}",img)

                rows.append([img_name, l, f])
        print(f"{f_counter} files are done!")
    new_df = pd.DataFrame(rows, columns = ['image_id','label','file'])
    return new_df

In [None]:
if GOAL == 'train':
    # Always start from an empty gen_dir with one sub-folder per output file.
    if os.path.exists(gen_dir):
        shutil.rmtree(gen_dir)
    os.mkdir(gen_dir)
    for i in range(N_FILES):
        os.mkdir(gen_dir+f'{i}/')

    # Generate the cutmix images and keep their metadata table.
    imgs = gen_cutmix(train)
    imgs.head()

In [None]:
# Disabled preview of the first 10 generated images: the code is wrapped in a
# string literal, so it is not executed.
'''
if GOAL == 'train':
    _, ax = plt.subplots(10, figsize = (10,50))
    for i in range(10):
        img = plt.imread(f"{gen_dir}{imgs.file[i]}/{imgs.image_id[i]}")
        ax[i].imshow(img)
'''

In [None]:
if GOAL == 'train':
    # Generated rows first, then the original rows, then shuffle the lot.
    bl_train = (
        pd.concat([imgs, train[['image_id','label','file']]], ignore_index=True)
        .sample(frac=1)
        .reset_index(drop=True)
    )
    bl_train.to_csv('train.csv', index=False)
    bl_train.label.value_counts()

# Generate TF records

In [None]:
if GOAL == 'train':
    # One TFRecord per file number; the sample count is embedded in the file
    # name so count_data_items can recover it later.
    for tfrec_num in range(N_FILES):
        print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
        samples = bl_train[bl_train['file'] == tfrec_num]
        n_samples = len(samples)
        print(f'{n_samples} samples')
        with tf.io.TFRecordWriter(train_tfrecords_dir + 'Id_train%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
            for row in samples.itertuples():
                label = row.label
                image_name = row.image_id
                # Generated cutmix images are named 'x<i>_<original name>' and
                # live in the per-file folder under gen_dir.
                if image_name.startswith('x'):
                    img_path = f'{gen_dir}{tfrec_num}/{image_name}'
                else:
                    img_path = f'{images_dir}{image_name}'

                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread returns None on failure; fail with a clear
                    # message instead of an opaque error inside cv2.resize.
                    raise FileNotFoundError(f'Could not read image: {img_path}')
                img = cv2.resize(img, (WIDTH, HEIGHT))
                # tobytes() replaces the deprecated ndarray.tostring(), which
                # was removed in NumPy 2.0.
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))[1].tobytes()

                example = serialize_example(img, label, str.encode(image_name))
                writer.write(example)
        # The generated images of this file are no longer needed; free the disk.
        shutil.rmtree(f"{gen_dir}{tfrec_num}")

In [None]:
# Drop whatever is left of the generated-image folder.
gen_dir_present = os.path.exists(gen_dir)
if gen_dir_present:
    shutil.rmtree(gen_dir)

In [None]:
if GOAL == 'valid':
    # One TFRecord per file number, built from the original images only; the
    # sample count is embedded in the file name.
    for tfrec_num in range(N_FILES):
        print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
        samples = train[train['file'] == tfrec_num]
        n_samples = len(samples)
        print(f'{n_samples} samples')
        with tf.io.TFRecordWriter(valid_tfrecords_dir + 'Id_valid%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
            for row in samples.itertuples():
                label = row.label
                image_name = row.image_id
                img_path = f'{images_dir}{image_name}'

                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread returns None on failure; fail with a clear
                    # message instead of an opaque error inside cv2.resize.
                    raise FileNotFoundError(f'Could not read image: {img_path}')
                img = cv2.resize(img, (WIDTH, HEIGHT))
                # tobytes() replaces the deprecated ndarray.tostring(), which
                # was removed in NumPy 2.0.
                img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))[1].tobytes()

                example = serialize_example(img, label, str.encode(image_name))
                writer.write(example)

# Visualize created TF records

## Class map

```
0: Cassava Bacterial Blight (CBB)
1: Cassava Brown Streak Disease (CBSD)
2: Cassava Green Mottle (CGM)
3: Cassava Mosaic Disease (CMD)
4: Healthy
```

In [None]:
# Pick the TFRecord folder that matches the current goal.
if GOAL == 'train':
    path = train_tfrecords_dir
elif GOAL == 'valid':  # was misspelled 'GAL', which raised NameError
    path = valid_tfrecords_dir
    
AUTO = tf.data.experimental.AUTOTUNE
FILENAMES = tf.io.gfile.glob(path + '*.tfrec')
print(f'TFRecords files: {FILENAMES}')
print(f'Created image samples: {count_data_items(FILENAMES)}')

# load_dataset's signature is (filenames, HEIGHT, WIDTH); pass them in that order.
display_samples(load_dataset(FILENAMES, HEIGHT, WIDTH).batch(1), 6, 6)

# Complete set label distribution

In [None]:
# Class names, indexed by integer label.
CLASSES = [
    'Cassava Bacterial Blight',
    'Cassava Brown Streak Disease',
    'Cassava Green Mottle',
    'Cassava Mosaic Disease',
    'Healthy',
]

# Plot whichever frame was written to TFRecords for the current goal.
if GOAL == 'train':
    df = bl_train
elif GOAL == 'valid':
    df = train

# Rows per label, with the label replaced by its class name.
label_count = df.groupby('label', as_index=False).count()
label_count = label_count.rename(columns={'image_id': 'Count', 'label': 'Label'})
label_count['Label'] = label_count['Label'].map(lambda idx: CLASSES[idx])

fig, ax = plt.subplots(1, 1, figsize=(14, 8))
ax = sns.barplot(x=label_count['Count'], y=label_count['Label'], palette='viridis')
ax.tick_params(labelsize=16)

plt.show()

## Labels distribution for each file

In [None]:
# One bar chart of label counts per file.
for fold_n in range(folds.n_splits):
    in_this_file = df[df['file'] == fold_n]
    label_count = in_this_file.groupby('label', as_index=False).count()
    label_count = label_count.rename(columns={'image_id': 'Count', 'label': 'Label'})
    label_count['Label'] = label_count['Label'].map(lambda idx: CLASSES[idx])

    fig, ax = plt.subplots(1, 1, figsize=(14, 8))
    fig.suptitle(f'File {fold_n+1}', fontsize=22)
    ax = sns.barplot(x=label_count['Count'], y=label_count['Label'], palette='viridis')
    ax.tick_params(labelsize=16)

    plt.show()