In [None]:
import re, math, os, cv2, random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

## Notes

- In this notebook I am going to create tfrecords of all the images. 

- V11: reverted to 512x512 no albumentations

### Seed

In [None]:
# Setting seeds for reproducibility
SEED = 31415

def seed_everything(seed):
    """Seed every random number generator this notebook has in scope.

    Sets PYTHONHASHSEED in the process environment and seeds Python's
    built-in `random` module, NumPy's global generator and TensorFlow's
    global generator with the same value.

    Args:
        seed: integer seed applied to every generator.
    """
    # Exported for subprocesses; the hash seed of the interpreter that is
    # already running was fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # `random` is imported at the top of the notebook, so seed it as well.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(SEED)

### TFrecord Functions

In [None]:
def _bytes_feature(value):
    """Wrap `value` in a tf.train.Feature holding a one-element BytesList.

    If `value` is an instance of the type returned by `tf.constant(0)`, it is
    replaced by `value.numpy()` before being wrapped.
    """
    tensor_type = type(tf.constant(0))
    if isinstance(value, tensor_type):
        # BytesList won't unpack a string from an EagerTensor.
        value = value.numpy()
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Wrap `value` (a float / double) in a tf.train.Feature holding a one-element FloatList."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Wrap `value` (a bool / enum / int / uint) in a tf.train.Feature holding a one-element Int64List."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

The following cell should be adapted to the fields we want to encode and the data type of each field.

In [None]:
def serialize_example(image, target, image_name):
    """Pack one sample into a serialized tf.train.Example.

    Args:
        image: stored under the 'image' key via _bytes_feature.
        target: stored under the 'target' key via _int64_feature.
        image_name: stored under the 'image_name' key via _bytes_feature.

    Returns:
        The result of SerializeToString() on the assembled tf.train.Example.
    """
    features = tf.train.Features(
        feature={
            'image': _bytes_feature(image),
            'target': _int64_feature(target),
            'image_name': _bytes_feature(image_name),
        }
    )
    return tf.train.Example(features=features).SerializeToString()

### Variables

In [None]:
# Number of TFRecord files the training images are split across.
N_FILES = 50
# Edge length in pixels; the resize step uses it for both width and height.
NEW_SIZE = 512
# Value passed with cv2.IMWRITE_JPEG_QUALITY when images are re-encoded.
IMG_QUALITY = 100

# Directory the training images are read from.
IMG_PATH = '../input/plant-pathology-2021-fgvc8/train_images'

### Reading Meta-Data

In [None]:
# Load the training metadata table and report how many rows it has.
train_df = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')
print("train_images: {}".format(len(train_df)))

# Cell output: number of rows per distinct value of the 'labels' column.
train_df['labels'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the 'labels' column and keep it as `labelencoder`; the
# integer code of each row's label string is stored in 'encoded_labels'.
labelencoder = LabelEncoder()
train_df['encoded_labels'] = labelencoder.fit_transform(train_df['labels'])

# Cell output: the first 20 rows, now including the new column.
train_df.head(n=20)

### Sampling Images

- Sampling the three test_images
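- We can see from these images alone that the images come in different sizes, so every image is resized to one common square size before it is written. I originally planned to resize to 2000 x 2000; this version (V11) resizes to 512 x 512 (see `NEW_SIZE`).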

In [None]:
from PIL import Image

# Open three of the test images so their dimensions can be compared.
img1 = Image.open('../input/plant-pathology-2021-fgvc8/test_images/85f8cb619c66b863.jpg')
img2 = Image.open('../input/plant-pathology-2021-fgvc8/test_images/ad8770db05586b59.jpg')
img3 = Image.open('../input/plant-pathology-2021-fgvc8/test_images/c7b03e718489f3ca.jpg')

# Print the size of each opened image, one per line.
for sample_img in (img1, img2, img3):
    print(sample_img.size)

### Stratified Split

In [None]:
# StratifiedKFold is used as a partitioner here: with n_splits=N_FILES, the
# held-out part of each split becomes the set of rows written to one TFRecord
# file, stratified on the 'labels' column.
folds = StratifiedKFold(n_splits=N_FILES, shuffle=True, random_state=SEED)
train_df['file'] = -1  # -1 marks rows that have not been assigned to a file yet
file_col = train_df.columns.get_loc('file')

# folds.split yields (train positions, held-out positions) per fold; only the
# held-out positions are used.
for fold_n, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['labels'])):
    # The printed number is 1-based; the value stored in 'file' is 0-based.
    print('File: %s has %s samples' % (fold_n+1, len(val_idx)))
    # val_idx holds row positions, so assign through a single .iloc call.
    # The chained form train_df['file'].loc[...] = ... writes to an
    # intermediate Series and is not guaranteed to update train_df.
    train_df.iloc[val_idx, file_col] = fold_n

# Every row is held out in exactly one fold, so none may keep the -1 marker.
assert (train_df['file'] >= 0).all(), "some rows were not assigned to a file"

### Writing Tfrecords

In [None]:
# Rename the 'image' column to 'image_id', then save the table to train.csv
# in the working directory without the index column.
train_df = train_df.rename(
    columns={'image': 'image_id'},
)
train_df.to_csv('train.csv', index=False)

In [None]:
# Write one TFRecord file per value of the 'file' column. Each example stores
# the resized, JPEG-encoded image, its encoded label and its file name.
for tfrec_num in range(N_FILES):
    print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
    samples = train_df[train_df['file'] == tfrec_num]
    n_samples = len(samples)
    print(f'{n_samples} samples')
    # The output name carries the file index and the number of examples inside.
    with tf.io.TFRecordWriter('Id_train%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
        for row in samples.itertuples():
            label = row.encoded_labels
            image_name = row.image_id
            img_path = f'{IMG_PATH}/{image_name}'

            # cv2.imread reports failure by returning None instead of raising,
            # so fail here with the offending path rather than further down.
            img = cv2.imread(img_path)
            if img is None:
                raise FileNotFoundError(f'Could not read image: {img_path}')

            # No BGR->RGB conversion: cv2.imread returns BGR and cv2.imencode
            # expects BGR. Converting to RGB before encoding would store JPEGs
            # whose red and blue channels are swapped when decoded.
            img = cv2.resize(img, (NEW_SIZE, NEW_SIZE))

            # Encode the resized image as JPEG bytes to be stored in the record.
            encoded_ok, jpeg_buf = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))
            if not encoded_ok:
                raise RuntimeError(f'JPEG encoding failed for image: {img_path}')

            # tobytes() replaces the deprecated ndarray.tostring().
            example = serialize_example(jpeg_buf.tobytes(), label, str.encode(image_name))
            writer.write(example)