In [None]:
import os
import re
from sklearn.model_selection import StratifiedKFold
import cv2
import pandas as pd
import numpy as np
import tensorflow as tf
import shutil
from functools import partial
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from kaggle_datasets import KaggleDatasets

# Reading Metadata

In [None]:
# Locations of the competition metadata files.
train_csv_path = '../input/cassava-leaf-disease-classification/train.csv'
label_map_path = '../input/cassava-leaf-disease-classification/label_num_to_disease_map.json'

# Training table; later cells read its 'label' and 'image_id' columns.
data = pd.read_csv(train_csv_path)

# Print the raw JSON text of the label map so the numeric labels can be interpreted.
with open(label_map_path) as file:
    text = file.read()
print(text)

# Last expression of the cell: preview of the first rows.
data.head()

# Original Distribution

In [None]:
# Share of every label in the full training table, expressed as a percentage of all rows.
class_share_pct = data['label'].value_counts() / len(data) * 100

figure = plt.figure(figsize=(8, 4))
class_share_pct.plot(kind='bar')
plt.ylabel('% count of classes')
plt.title("Distribution of Classes")
plt.show()

# Functions for converting images to TFRecords
Taken from the TensorFlow website.

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 image of shape [HEIGHT, WIDTH, 3].

    Args:
        image_data: JPEG-encoded bytes, as accepted by ``tf.image.decode_jpeg``.

    Returns:
        The decoded 3-channel image, cast to float32, divided by 255.0 and
        resized to the module-level ``HEIGHT`` x ``WIDTH``.
    """
    decoded = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(decoded, tf.float32) / 255.0

    target_size = [HEIGHT, WIDTH]
    resized = tf.image.resize(scaled, target_size)
    # Reshape to the explicit [HEIGHT, WIDTH, 3] shape before returning.
    return tf.reshape(resized, target_size + [3])

def read_tfrecord(example):
    """Parse one serialized ``tf.train.Example`` into ``(image, target, name)``.

    Args:
        example: A serialized example carrying the scalar features
            'image' (string), 'target' (int64) and 'image_name' (string).

    Returns:
        Tuple of the image decoded by ``decode_image``, the int64 target and
        the image name as a string tensor.
    """
    feature_dtypes = (
        ('image', tf.string),
        ('target', tf.int64),
        ('image_name', tf.string),
    )
    # Every feature is a scalar (shape []), only the dtype differs.
    feature_spec = {key: tf.io.FixedLenFeature([], dtype) for key, dtype in feature_dtypes}

    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['target'], parsed['image_name']

def load_dataset(filenames, HEIGHT, WIDTH, CHANNELS=3):
    """Build a ``tf.data`` pipeline that yields parsed TFRecord samples.

    Args:
        filenames: TFRecord file path(s) handed to ``tf.data.TFRecordDataset``.
        HEIGHT: Accepted for interface compatibility; not used in this function
            (``decode_image`` reads the module-level ``HEIGHT``).
        WIDTH: Accepted for interface compatibility; not used in this function
            (``decode_image`` reads the module-level ``WIDTH``).
        CHANNELS: Accepted for interface compatibility; not used in this
            function (``decode_image`` always decodes 3 channels).

    Returns:
        A ``tf.data.Dataset`` whose elements are the ``(image, target, name)``
        tuples produced by ``read_tfrecord``.
    """
    dataset = tf.data.TFRecordDataset(filenames)
    # Use TensorFlow's autotune constant directly so this function does not
    # depend on a separately defined global alias being present in the kernel.
    dataset = dataset.map(read_tfrecord, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset

def display_samples(ds, row, col):
    """Show the first ``row * col`` elements of ``ds`` on a ``row`` x ``col`` grid.

    Each element is unpacked as ``(image, label, name)`` and index 0 of each
    part is displayed, so the parts must be indexable along their first axis.

    Args:
        ds: Iterable of ``(image, label, name)`` elements.
        row: Number of grid rows.
        col: Number of grid columns.
    """
    n_cells = row * col
    elements = iter(ds)
    # Fixed 15-unit width; height follows the row/col aspect ratio.
    plt.figure(figsize=(15, int(15 * row / col)))
    for cell in range(1, n_cells + 1):
        image, label, name = next(elements)
        plt.subplot(row, col, cell)
        plt.axis('off')
        plt.imshow(image[0])
        caption = f"{label[0]}: {name[0].numpy().decode('utf-8')}"
        plt.title(caption, fontsize=12)
    plt.show()

def count_data_items(filenames):
    """Sum the sample counts embedded in TFRecord file names.

    Each name must contain ``-<digits>.`` (e.g. ``Id_train00-123.tfrec``);
    the digits are read as that file's number of samples.

    Args:
        filenames: Iterable of file name / path strings.

    Returns:
        ``np.sum`` over the per-file counts.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    per_file_counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        per_file_counts.append(int(match.group(1)))
    return np.sum(per_file_counts)


# Create TF Records
def _bytes_feature(value):
  """Wrap a string / byte value in a bytes_list ``tf.train.Feature``.

  If ``value`` is an eager tensor, its underlying value is extracted first.
  """
  eager_tensor_type = type(tf.constant(0))
  # BytesList won't unpack a string from an EagerTensor, so unwrap it here.
  raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw_value])
  return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in an int64_list ``tf.train.Feature``."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def serialize_example(image, target, image_name):
  """Serialize one sample into a ``tf.train.Example`` byte string.

  Args:
    image: Image bytes, stored under the 'image' key.
    target: Integer label, stored under the 'target' key.
    image_name: Image name bytes, stored under the 'image_name' key.

  Returns:
    The serialized example, ready to be written with a TFRecord writer.
  """
  features = tf.train.Features(feature={
      'image': _bytes_feature(image),
      'target': _int64_feature(target),
      'image_name': _bytes_feature(image_name),
  })
  return tf.train.Example(features=features).SerializeToString()

# Class Downsampling (Balancing the Dataset)

In [None]:
# Creating a balanced data set
# Label 3 is randomly under-sampled to 19.7% of its rows; every other label is
# kept in full. The result is shuffled and re-indexed.
SAMPLING_SEED = 123  # fixed seed so the sampled subset and the shuffle are reproducible

# Per-class counts of the original table, shown for reference.
display(data['label'].value_counts())
no_images_to_copy = 2580  # NOTE(review): not used below; presumably the intended size for label 3 — confirm

downsampled_label_3 = data[data['label'] == 3].sample(frac=0.197, random_state=SAMPLING_SEED)
kept_in_full = [data[data['label'] == label] for label in (0, 1, 2, 4)]
balanced_df = pd.concat([downsampled_label_3] + kept_in_full, axis=0, ignore_index=True)

# Shuffle all rows so the labels are interleaved, then rebuild a clean 0..n-1 index.
balanced_df = balanced_df.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)
display(balanced_df.head())

figure = plt.figure(figsize=(8,4))
# Normalise by the balanced table's own size so the bars sum to 100%.
(balanced_df['label'].value_counts()/len(balanced_df)*100).plot(kind='bar')
plt.title("Distribution of Classes")
plt.ylabel('% count of classes')
plt.show()


# Writing Images to TFRecords

In [None]:
# --- Configuration ---------------------------------------------------------
database_base_path = '/kaggle/input/cassava-leaf-disease-classification/'
PATH = f'{database_base_path}train_images/'  # directory holding the training images
IMGS = balanced_df['image_id'].tolist()
N_FILES = 10                 # number of TFRecord shards (one per stratified fold)
HEIGHT, WIDTH = (512, 512)   # size the images are resized to
IMG_QUALITY = 100            # JPEG quality used when re-encoding the images
print('No of Images - ', len(IMGS))

# Work on a copy so the 'file' column added below does not leak into balanced_df.
train = balanced_df.copy()

display(train.head())

# --- Assign every row to one shard ------------------------------------------
# Each stratified validation fold becomes one TFRecord file, so every shard
# has roughly the same label distribution.
folds = StratifiedKFold(n_splits=N_FILES, shuffle=True, random_state=123)
train['file'] = -1
file_col = train.columns.get_loc('file')

for fold_n, (train_idx, val_idx) in enumerate(folds.split(train, train['label'])):
    print('File: %s has %s samples' % (fold_n+1, len(val_idx)))
    # val_idx holds row positions. Assign through a single .iloc call on the
    # frame itself; chained `train['file'].loc[...] = ...` may write to a
    # temporary copy and leave `train` unchanged.
    train.iloc[val_idx, file_col] = fold_n

# Every row must have been placed in exactly one shard.
assert (train['file'] >= 0).all(), "some rows were not assigned to a TFRecord file"

train.to_csv('train.csv', index=False)




# Writing to TFRecords
# One TFRecord file is written per value of train['file'].
for tfrec_num in range(N_FILES):
    # Report progress 1-based ("1 of 10" ... "10 of 10").
    print('\nWriting TFRecord %i of %i...'%(tfrec_num + 1, N_FILES))
    samples = train[train['file'] == tfrec_num]
    n_samples = len(samples)
    print(f'{n_samples} samples')
    # The sample count is embedded in the file name ("-<n>.tfrec"), which is the
    # pattern count_data_items() parses back out.
    with tf.io.TFRecordWriter('Id_train%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
        for row in samples.itertuples():
            label = row.label
            image_name = row.image_id
            img_path = f'{PATH}{image_name}'

            img = cv2.imread(img_path)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise FileNotFoundError(f'Could not read image: {img_path}')
            # cv2.resize expects dsize as (width, height).
            img = cv2.resize(img, (WIDTH, HEIGHT))
            encoded_ok, encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))
            if not encoded_ok:
                raise RuntimeError(f'Could not JPEG-encode image: {img_path}')
            # tobytes() replaces the deprecated ndarray.tostring() alias.
            img = encoded.tobytes()

            example = serialize_example(img, label, str.encode(image_name))
            writer.write(example)