In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import cv2
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import albumentations as album
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

# Select the '_mpl-gallery' matplotlib style sheet for the figures drawn in this notebook.
plt.style.use('_mpl-gallery')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Config

In [None]:
# --- Dataset location -------------------------------------------------------
JOB_NAME = 'sorghum-id-fgvc-9'
PATH = f'../input/{JOB_NAME}/'

# --- Cross-validation -------------------------------------------------------
N_SPLITS = 4

# --- Image preprocessing ----------------------------------------------------
IMG_HEIGHT, IMG_WIDTH = 512, 512

# --- Training loop ----------------------------------------------------------
BATCH_SIZE = 12
EPOCHS = 100
PATIENCE = 15  # passed to the EarlyStopping callback

# Lower / upper bound handed to the cyclical learning-rate schedule.
INIT_LR, MAX_LR = 1e-4, 1e-3

# Switch Keras to the global 'mixed_float16' precision policy.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

## Data

In [None]:
# Load the image -> cultivar mapping and discard every row that has a missing value.
df = pd.read_csv(PATH + 'train_cultivar_mapping.csv').dropna()

In [None]:
# Turn the bare file names from the CSV into paths inside the training image folder.
image_dir = PATH+'train_images/'
images = np.array([image_dir + file_name for file_name in df['image'].to_numpy()])
images

In [None]:
# Total number of labelled training images (all folds together).
num_images = images.shape[0]
num_images

In [None]:
# Cultivar name of every training image, in the same row order as `images`.
cultivars = df['cultivar'].to_numpy()
cultivars

In [None]:
# Sorted, de-duplicated cultivar names; a class index is a position in this array.
class_names = np.unique(cultivars)
num_classes = class_names.size
num_classes

In [None]:
def get_onehot(label):
    """Return the integer class index of ``label`` within ``class_names``.

    Despite the name, the result is not a one-hot vector: it is the position
    of ``label`` in the module-level ``class_names`` array.

    Args:
        label: Cultivar name to look up.

    Returns:
        NumPy integer index of the first entry of ``class_names`` equal to
        ``label``.

    Raises:
        ValueError: If ``label`` is not present in ``class_names``. The
            previous implementation silently returned index 0 in that case,
            because ``argmax`` of an all-False mask is 0.
    """
    matches = np.flatnonzero(class_names == label)
    if matches.size == 0:
        raise ValueError(f"Unknown cultivar label: {label!r}")
    return matches[0]

# Replace every cultivar name by its integer class index.
# NOTE: this rebinds `cultivars` from names to indices, so the cell is not safe to run twice.
cultivars = np.fromiter(map(get_onehot, cultivars), dtype=np.intp, count=len(cultivars))

## EDA

In [None]:
# Number of images per class index. np.bincount does in one vectorised pass
# what the previous per-sample loop did with a fresh comparison array per image.
# Cast to float so the dtype matches the original np.zeros-based accumulator.
counts_each_class = np.bincount(cultivars, minlength=num_classes).astype(np.float64)

plt.subplots(figsize=(20, 5))
plt.bar(np.arange(num_classes)+0.5, counts_each_class)
# Red horizontal line marks the size of the smallest class.
plt.hlines(counts_each_class.min(), 0, num_classes, colors='r')
plt.show()

## StratifiedKFold

In [None]:
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Walk through every fold but keep only the last (train, valid) index pair;
# the earlier folds are generated and discarded.
*_, (train_index, valid_index) = skf.split(images, cultivars)

train_images, train_cultivars = images[train_index], cultivars[train_index]
valid_images, valid_cultivars = images[valid_index], cultivars[valid_index]

print(f"train size: {len(train_images)}")
print(f"valid size: {len(valid_images)}")

In [None]:
# Split name -> [image paths, class indices].
ds_splits = dict(
    train=[train_images, train_cultivars],
    valid=[valid_images, valid_cultivars],
)

## Writing TFRecords

In [None]:
def _bytes_feature(value):
    """Wrap a string / byte value in a bytes_list ``tf.train.Feature``."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so pull out the raw value first.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in an int64_list ``tf.train.Feature``."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def image_example(image_string, label):
    """Build a ``tf.train.Example`` holding a resized, JPEG-encoded image and its label.

    Args:
        image_string: Raw bytes of a PNG-encoded image.
        label: Integer class index, stored under the ``'label'`` key.

    Returns:
        A ``tf.train.Example`` with two features: ``'label'`` (int64) and
        ``'image_raw'`` (bytes of the JPEG re-encoded image, resized to
        ``IMG_HEIGHT`` x ``IMG_WIDTH``).
    """
    # Force three channels so grayscale / RGBA inputs end up with the same
    # layout the reader (`decode_jpeg(..., channels=3)`) expects.
    image = tf.io.decode_png(image_string, channels=3)
    image = tf.image.resize(image, size=[IMG_HEIGHT, IMG_WIDTH])
    # resize yields float values; round to nearest before the uint8 cast
    # instead of truncating toward zero.
    image = tf.cast(tf.round(image), 'uint8')
    image = tf.io.encode_jpeg(image)

    feature = {
        'label': _int64_feature(label),
        'image_raw': _bytes_feature(image),
    }

    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
# for ds_split in ds_splits:
#     record_file = f'./tfrecords/sorghum-id-fgvc-9_{ds_split}.tfrecords'
#     subset = ds_splits[ds_split]
#     filenames, labels = subset
#     with tf.io.TFRecordWriter(record_file) as writer:
#         for filename, label in tqdm(list(zip(filenames, labels))):
#             image_string = open(filename, 'rb').read()
#             tf_example = image_example(image_string, label)
#             writer.write(tf_example.SerializeToString())

## Reading TFRecords

In [None]:
# Schema used to parse each serialized Example: one scalar int64 label and
# one scalar string holding the encoded image bytes.
image_feature_description = dict(
    label=tf.io.FixedLenFeature([], tf.int64),
    image_raw=tf.io.FixedLenFeature([], tf.string),
)

def _parse_image_function(example_proto):
    """Decode one serialized Example into an ``(image, label)`` pair.

    The image bytes are JPEG-decoded with three channels and reshaped to
    ``[IMG_HEIGHT, IMG_WIDTH, 3]``; the label is returned as parsed.
    """
    parsed = tf.io.parse_single_example(example_proto, image_feature_description)
    decoded = tf.io.decode_jpeg(parsed['image_raw'], channels=3)
    image = tf.reshape(decoded, shape=[IMG_HEIGHT, IMG_WIDTH, 3])
    return (image, parsed['label'])

In [None]:
def get_dataset(filename):
    """Open a TFRecord file and return a dataset of parsed ``(image, label)`` pairs."""
    options = tf.data.Options()
#     options.deterministic = False
    records = tf.data.TFRecordDataset(filename).with_options(options)
    return records.map(_parse_image_function, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Pre-built TFRecords come from this Kaggle dataset:
# https://www.kaggle.com/datasets/waynewhying/sorghum-512512-jpge-tfrecords
train_record_file = f'../input/sorghum-512512-jpge-tfrecords/{JOB_NAME}_train.tfrecords'
valid_record_file = f'../input/sorghum-512512-jpge-tfrecords/{JOB_NAME}_valid.tfrecords'

train_ds = get_dataset(train_record_file)
valid_ds = get_dataset(valid_record_file)

## Data augmentation

In [None]:
# Brightness/contrast or gamma adjustment group.
_tone_group = album.OneOf([
    album.RandomBrightnessContrast(p=0.5),
    album.RandomGamma(p=0.5),
], p=0.5)

# Blur group.
_blur_group = album.OneOf([
    album.Blur(p=0.1),
    album.GaussianBlur(p=0.1),
    album.MotionBlur(p=0.1),
], p=0.1)

# Noise / dropout group.
_noise_group = album.OneOf([
    album.GaussNoise(p=0.1),
    album.ISONoise(p=0.1),
    album.GridDropout(ratio=0.5, p=0.2),
    album.CoarseDropout(max_holes=16, max_height=16, max_width=16, min_holes=8, min_height=8, min_width=8, p=0.2),
], p=0.2)

# Full training augmentation pipeline: geometric transforms first, then
# colour changes, then the optional blur and noise groups.
transforms = album.Compose([
    album.RandomResizedCrop(height=IMG_HEIGHT, width=IMG_WIDTH),
    album.Flip(p=0.5),
    album.ShiftScaleRotate(border_mode=cv2.BORDER_CONSTANT, p=0.5),
    album.HueSaturationValue(p=0.5),
    _tone_group,
    _blur_group,
    _noise_group,
])

In [None]:
def aug_fn(image):
    """Apply the albumentations pipeline ``transforms`` to one image.

    Intended to run inside ``tf.numpy_function``, so it takes and returns
    NumPy arrays rather than creating TensorFlow tensors inside the callback.

    Args:
        image: NumPy image array, passed to ``transforms`` as ``image=...``.

    Returns:
        The augmented image as a ``float32`` NumPy array.
    """
    aug_data = transforms(image=image)
    return np.asarray(aug_data["image"]).astype(np.float32)

In [None]:
def process_data_train(img, label):
    """Augment one training image and pass its label through unchanged.

    Args:
        img: Image tensor handed to ``aug_fn`` via ``tf.numpy_function``.
        label: Label tensor, returned as-is.

    Returns:
        ``(aug_img, label)`` where ``aug_img`` is a float32 tensor with
        static shape ``[IMG_HEIGHT, IMG_WIDTH, 3]``.
    """
    aug_img = tf.numpy_function(func=aug_fn, inp=[img], Tout=tf.float32)
    # tf.numpy_function returns a tensor of unknown static shape; restore it so
    # downstream batching and the Keras model see a fully defined image shape.
    # The augmentation pipeline starts with a crop to IMG_HEIGHT x IMG_WIDTH.
    aug_img.set_shape([IMG_HEIGHT, IMG_WIDTH, 3])
    return aug_img, label

def process_data_test(path):
    """Read a PNG file from ``path``, decode it to 3 channels and resize it to the model input size."""
    png_bytes = tf.io.read_file(path)
    decoded = tf.io.decode_png(png_bytes, channels=3)
    return tf.image.resize(decoded, size=[IMG_HEIGHT, IMG_WIDTH])

In [None]:
# Number of decoded (uint8) images held in the shuffle buffer. Shuffling is done
# before augmentation so the buffer stores the smaller uint8 tensors.
SHUFFLE_BUFFER_SIZE = 1024

# Training pipeline: shuffle -> augment -> batch -> repeat -> prefetch.
# Previously the records were consumed in file order on every epoch; shuffling
# placed before repeat() yields a new order on each pass over the data.
# prefetch is kept as the final stage so it overlaps with the training step.
train_ds = (
    train_ds
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .map(process_data_train, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .repeat()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# Validation data is deliberately NOT shuffled: later cells compare predictions
# against `valid_cultivars` by position.
valid_ds = valid_ds.batch(BATCH_SIZE).repeat().prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Pull one augmented batch and show its first six images with their class names.
image_batch, label_batch = next(iter(train_ds))

fig, axes = plt.subplots(2, 3, figsize=(9, 6))
for i, ax in enumerate(axes.flat):
    ax.imshow(image_batch[i].numpy().astype(np.uint8))
    label = label_batch[i].numpy()
    ax.set_title(class_names[label])
    ax.axis("off")

## Build Model

In [None]:
input_shape = (IMG_HEIGHT, IMG_WIDTH, 3)

# Transfer learning: EfficientNetB0 backbone without its classification head.
base_model = tf.keras.applications.efficientnet.EfficientNetB0(input_shape=input_shape, include_top=False)

inputs = tf.keras.Input(shape=input_shape)
backbone_features = base_model(inputs)
pooled_features = tf.keras.layers.GlobalAveragePooling2D()(backbone_features)

# Classification head: dropout followed by a softmax layer. The Dense layer is
# pinned to float32 while the global policy is 'mixed_float16'.
dropped_features = tf.keras.layers.Dropout(0.5)(pooled_features)
outputs = tf.keras.layers.Dense(num_classes, activation="softmax", dtype='float32')(dropped_features)

model = tf.keras.Model(inputs, outputs)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

## Define CLR schedule

In [None]:
steps_per_epoch = len(train_images) // BATCH_SIZE


def _clr_scale(x):
    """Amplitude scale for the schedule: 1, 1/2, 1/4, ... for x = 1, 2, 3, ..."""
    return 1/(2.**(x-1))


# Cyclical learning rate between INIT_LR and MAX_LR with a step size of two epochs.
clr = tfa.optimizers.CyclicalLearningRate(
    initial_learning_rate=INIT_LR,
    maximal_learning_rate=MAX_LR,
    scale_fn=_clr_scale,
    step_size=2 * steps_per_epoch,
)

## Train Model

In [None]:
# Adam driven by the cyclical schedule; labels are integer class indices,
# hence the sparse categorical cross-entropy.
optimizer = tf.keras.optimizers.Adam(learning_rate=clr)
loss_fn = tf.losses.SparseCategoricalCrossentropy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Stop once val_loss has not improved for PATIENCE epochs and restore the best weights seen.
my_callbacks = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True,
)

In [None]:
# Per-class sample counts in the TRAINING fold only.
counts_each_class_train = np.bincount(train_cultivars, minlength=num_classes).astype(np.float64)

# "Balanced" class weights: n_samples / (n_classes * count_per_class).
# n_samples must be the size of the same fold the counts come from; the earlier
# version used the total image count across all folds, which scaled every
# weight up by a constant factor.
# np.maximum guards against a division by zero for a class absent from the fold.
weight = len(train_cultivars) / (num_classes * np.maximum(counts_each_class_train, 1))
class_weight = {class_index: class_w for class_index, class_w in enumerate(weight)}

In [None]:
# Both datasets repeat indefinitely, so the length of an epoch is set explicitly.
validation_steps = len(valid_images)//BATCH_SIZE

history = model.fit(
    train_ds,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=valid_ds,
    validation_steps=validation_steps,
    class_weight=class_weight,
    callbacks=[my_callbacks],
)

## Validate the Model

In [None]:
# Training (red, circles) vs. validation (green, crosses) curves per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

for ax, title, metric in ((ax1, 'Accuracy', 'accuracy'), (ax2, 'Loss', 'loss')):
    ax.set_title(title)
    ax.plot(history.history[metric], 'r', marker="o", label=metric)
    ax.plot(history.history[f'val_{metric}'], 'g', marker="x", label=f'val_{metric}')
    ax.legend()

plt.show()

In [None]:
# Plot the learning rate produced by the schedule over the first 50 epochs' worth of steps.
step = np.arange(0, 50 * steps_per_epoch)
lr = clr(step)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(step, lr)
ax.set_xlabel("Steps")
ax.set_ylabel("Learning Rate")
plt.show()

In [None]:
# Predict on the whole batches of the validation set and keep the most likely class per image.
val_steps = len(valid_images)//BATCH_SIZE
val_probs = model.predict(valid_ds, steps=val_steps)
val_preds = np.argmax(val_probs, axis=1)

In [None]:
# Only whole batches were predicted, so trim the ground truth to the same length.
# NOTE(review): assumes the validation TFRecord stores samples in the same order
# as `valid_cultivars` - confirm against how the records were written.
num_evaluated = len(valid_images)//BATCH_SIZE * BATCH_SIZE
true_labels = valid_cultivars[:num_evaluated]
matrix = tf.math.confusion_matrix(true_labels, val_preds).numpy()

plt.figure(figsize=(6, 6))
plt.imshow(matrix)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Class")
plt.ylabel("True Class")
plt.grid(0)
plt.show()

## Submit the Result

In [None]:
# Load the submission template.
# NOTE: this rebinds `df`, which earlier in the notebook held the training mapping.
df = pd.read_csv(PATH + 'sample_submission.csv')

In [None]:
# Build full paths for the test images listed in the submission template,
# then map each path to a decoded, resized image.
path = PATH + 'test/'
test_images = np.array([path + file_name for file_name in df['filename']])

test_ds = (
    tf.data.Dataset.from_tensor_slices(test_images)
    .map(process_data_test, num_parallel_calls=tf.data.AUTOTUNE)
)

In [None]:
# Batch the test images, predict, and translate the arg-max class index back to a cultivar name.
test_ds = test_ds.batch(BATCH_SIZE)
test_probs = model.predict(test_ds)
preds = [class_names[class_index] for class_index in np.argmax(test_probs, axis=1)]

In [None]:
# Fill in the predicted cultivar names and write the submission file without the index column.
submission_file = f'./{JOB_NAME}_submission.csv'
df['cultivar'] = preds
df.to_csv(submission_file, index=False)