In [None]:
# Print the resolved absolute path of the sorghum-id-fgvc-9 input directory,
# then list its contents.
!readlink -f ../input/sorghum-id-fgvc-9
!ls ../input/sorghum-id-fgvc-9

In [None]:
# Show the NVIDIA GPU status report for this session.
!nvidia-smi

In [None]:
# Data generator class with image augmentation via albumentations.
#ref https://www.kaggle.com/meaninglesslives/unet-with-efficientnet-encoder-in-keras
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
from PIL import Image
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [None]:
# Display the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
# Fix the NumPy and TensorFlow global seeds so runs are repeatable.
tf.random.set_seed(42)
np.random.seed(42)

# Dataset locations (proj_dir keeps its trailing slash; later cells append to it).
proj_dir = "/kaggle/input/sorghum-id-fgvc-9/"
base_path = f"{proj_dir}train_images/"

# One row per training image: file name in "image", class name in "cultivar".
df = pd.read_csv(f"{proj_dir}train_cultivar_mapping.csv")

# Absolute path of every training image.
df["fullpath"] = df["image"].radd(base_path)

In [None]:
# Remove rows whose image file does not exist on disk.
#
# Series.map returns a result that carries df's own index, so the flag always
# lands on the right row. Building a fresh pd.Series from a Python list would
# instead align on 0..n-1, which silently mis-assigns flags as soon as df's
# index has gaps (e.g. when this cell is re-run on the already-filtered frame).
df["exist"] = df["fullpath"].map(os.path.exists).astype(bool)

# .copy() detaches the filtered frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[df["exist"]].copy()

In [None]:
# ---- Constants -------------------------------------------------------------

# Input geometry used for both the generators and the model input layer.
WIDTH, HEIGHT = 512, 512

# Data pipeline.
BATCH_SIZE = 16
TRAIN_SIZE = 0.9  # passed to train_test_split as train_size
AUTOTUNE = tf.data.experimental.AUTOTUNE
test_data_paths = f"{proj_dir}test/*.png"  # glob pattern for the test images

# Training length.
EPOCHS = 20

# Pixel preprocessing applied to every batch, and a flag telling the preview
# cell that the preprocessed pixels need shifting before display.
PREPROCESS_INPUT = tf.keras.applications.resnet_v2.preprocess_input
IS_SCALED_NEG1_TO_POS1 = True

In [None]:
# Inputs (image paths) and string targets (cultivar names).
labels_str = df["cultivar"]
paths = df["fullpath"]

# Assign each cultivar an integer id in order of first appearance.
label_to_index = {name: index for index, name in enumerate(labels_str.unique())}
labels_idx = labels_str.map(label_to_index.__getitem__)

# Stratified, seeded train/validation split.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    paths,
    labels_idx,
    train_size=TRAIN_SIZE,
    stratify=labels_idx,
    shuffle=True,
    random_state=42,
)

In [None]:
from albumentations import (
    Compose, HorizontalFlip, VerticalFlip, CLAHE, HueSaturationValue,
    RandomBrightness, RandomContrast, RandomGamma, OneOf, ToFloat, ShiftScaleRotate, GridDistortion, ElasticTransform, JpegCompression, HueSaturationValue,
    RGBShift, RandomBrightnessContrast, Blur, MotionBlur, MedianBlur, GaussNoise, CenterCrop, IAAAdditiveGaussianNoise, OpticalDistortion, RandomSizedCrop)

# Training-time augmentation pipeline (always applied: outer p=1).
AUGMENTATIONS_TRAIN = Compose(
    [
        # Random mirror along each axis, each with probability 0.5.
        HorizontalFlip(p=0.5),
        VerticalFlip(p=0.5),
        # Photometric jitter, applied to half of the samples.
        OneOf(
            [
                RandomBrightnessContrast(
                    brightness_limit=0.2,
                    contrast_limit=0.2,
                    brightness_by_max=False,
                ),
            ],
            p=0.5,
        ),
        # Geometric distortions; p=0 means this group is currently never applied.
        OneOf(
            [
                ElasticTransform(alpha=120, sigma=120*0.05, alpha_affine=120*0.03),
                GridDistortion(),
                OpticalDistortion(distort_limit=2, shift_limit=0.5),
            ],
            p=0,
        ),
        # RandomGamma and RandomSizedCrop were tried here and are left disabled.
    ],
    p=1,
)

In [None]:
def load_and_crop(path, img_size):
    """Load an image from disk as 3-channel RGB and resize it.

    Despite the name, no cropping takes place: the image is only resized.

    Args:
        path: Filesystem path of the image to read.
        img_size: Two-element sequence handed to ``cv2.resize`` as ``dsize``,
            which OpenCV interprets as ``(width, height)``.

    Returns:
        ``np.uint8`` array of shape ``(img_size[1], img_size[0], 3)``.
    """
    # The context manager releases the file handle as soon as the pixels are
    # read. convert("RGB") guarantees three channels, so grayscale, palette or
    # RGBA files cannot break the fixed (H, W, 3) batch buffer they are
    # copied into by DataGenerator.
    with Image.open(path) as img:
        image = np.array(img.convert("RGB"))

    # Pass dsize as a tuple of ints; callers in this notebook supply a list.
    width, height = (int(v) for v in img_size)
    image = cv2.resize(image, (width, height))

    return np.uint8(image)

In [None]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of preprocessed images and labels.

    Each batch is built by loading the images with ``load_and_crop``, optionally
    running every image through an albumentations-style callable, and finally
    applying the notebook-level ``PREPROCESS_INPUT`` function.

    NOTE: the batch buffer is allocated as ``(img_size[0], img_size[1])`` while
    ``load_and_crop`` hands ``img_size`` to ``cv2.resize``, which reads it as
    ``(width, height)``. The two agree only for square sizes.
    """

    def __init__(self, im_paths, labels, batch_size, augmentations=None, img_size=[256, 256], n_channels=3, shuffle=True):
        """
        Args:
            im_paths: ``pd.Series`` of image file paths.
            labels: ``pd.Series`` of integer class ids, positionally aligned
                with ``im_paths``.
            batch_size: Number of samples per batch (the last batch may be
                smaller).
            augmentations: Optional callable invoked as
                ``augmentations(image=img)['image']`` on every image.
            img_size: Two-element size forwarded to ``load_and_crop``.
                The default list is never mutated.
            n_channels: Number of channels in the loaded images.
            shuffle: Whether to reshuffle the sample order after each epoch.
        """
        # Newer Keras versions expect Sequence/PyDataset subclasses to call
        # the base constructor; on older versions this is a harmless no-op.
        super().__init__()

        self.batch_size = batch_size
        self.im_paths = im_paths
        self.labels = labels
        self.img_size = img_size
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.augment = augmentations

        # Build the initial sample order (shuffled if requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, counting a partial last batch."""
        return int(np.ceil(len(self.im_paths)/self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(images, labels)``.

        ``images`` is the output of ``PREPROCESS_INPUT`` for the batch and
        ``labels`` is an ``int32`` array of shape ``(batch, 1)``.
        """
        # Slicing clamps at the end of the array, so the last batch is
        # simply shorter.
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]

        X, y = self.data_generation(indexes)

        if self.augment is not None:
            # Augment on the raw uint8 images, before any scaling.
            X = np.stack([self.augment(image=x)['image'] for x in X])

        # Preprocess the whole batch in one call.
        return PREPROCESS_INPUT(X), y

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.im_paths))

        if self.shuffle:
            np.random.shuffle(self.indexes)

    def data_generation(self, indexes):
        """Load the raw images and labels for the given positional indexes.

        Returns:
            ``(X, y)`` where ``X`` is ``uint8`` with shape
            ``(len(indexes), img_size[0], img_size[1], n_channels)`` and ``y``
            is ``int32`` with shape ``(len(indexes), 1)``.
        """
        # Allocate with the final dtypes directly: a float64 image buffer
        # would be 8x larger only to be cast back to uint8, and uint8 labels
        # would silently wrap around for class ids above 255.
        X = np.empty((len(indexes), self.img_size[0], self.img_size[1], self.n_channels), dtype=np.uint8)
        y = np.empty((len(indexes), 1), dtype=np.int32)

        for i, index in enumerate(indexes):
            # .iloc: `indexes` are positions, not pandas index labels.
            X[i] = load_and_crop(self.im_paths.iloc[index], self.img_size)
            y[i] = self.labels.iloc[index]

        return X, y

In [None]:
# Training batches are augmented; validation batches are only preprocessed.
train_generator = DataGenerator(
    im_paths=train_paths,
    labels=train_labels,
    batch_size=BATCH_SIZE,
    img_size=[WIDTH, HEIGHT],
    augmentations=AUGMENTATIONS_TRAIN,
)
val_generator = DataGenerator(
    im_paths=val_paths,
    labels=val_labels,
    batch_size=BATCH_SIZE,
    img_size=[WIDTH, HEIGHT],
    augmentations=None,
)

In [None]:
# Preview the first (augmented) training batch as an image grid.
images, labels = train_generator[0]
# images, labels = val_generator[0]
max_images = 8
grid_width = 4
scale_factor = 4
# Round up so max_images that is not a multiple of grid_width still fits.
grid_height = int(np.ceil(max_images / grid_width))
# squeeze=False keeps axs two-dimensional even for a single row or column.
fig, axs = plt.subplots(
    grid_height,
    grid_width,
    figsize=(grid_width*scale_factor, grid_height*scale_factor),
    squeeze=False,
)

# Hide every axis up front so unused grid cells stay blank.
for ax in axs.ravel():
    ax.axis('off')

for ax, im, label in zip(axs.ravel()[:max_images], images, labels):
    # Each label is a length-1 array; show the bare class id.
    ax.set_title(str(int(np.ravel(label)[0])))
    if IS_SCALED_NEG1_TO_POS1:
        # Map [-1, 1] back to [0, 1]. A plain `im + 1` yields [0, 2], which
        # imshow clips at 1 and therefore renders washed out.
        ax.imshow(np.clip((im + 1) / 2, 0, 1))
    else:
        ax.imshow(im)

In [None]:
# ImageNet-pretrained ResNet50V2 backbone without its classification head.
basemodel = tf.keras.applications.resnet_v2.ResNet50V2(
    input_shape=(WIDTH, HEIGHT, 3),
    weights="imagenet",
    include_top=False,
)
image_input = tf.keras.layers.Input(shape=(WIDTH, HEIGHT, 3))

# Backbone -> global average pooling -> softmax over all cultivars.
out = basemodel(image_input)
for head_layer in (
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(len(label_to_index), activation="softmax"),
):
    out = head_layer(out)

model = tf.keras.Model(image_input, out)

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=["accuracy"],
)

In [None]:
# Number of batches per epoch as a plain Python int (a NumPy float from
# tf.math.ceil(...).numpy() is not a valid step count type).
steps_per_epoch = int(np.ceil(len(train_paths) / BATCH_SIZE))

# Early stopping should watch held-out performance: training accuracy keeps
# rising while the model overfits. NOTE(review): patience=20 equals EPOCHS=20,
# so the callback cannot trigger with the current settings; lower it if early
# stopping is actually wanted.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=20)

history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=val_generator,
    steps_per_epoch=steps_per_epoch,
    callbacks=[callback],  # Keras documents `callbacks` as a list
)

In [None]:
import glob


def load_and_preprocess_image(path):
    """Read one PNG test image and prepare it for the model.

    This function was referenced by the test pipeline but never defined in
    the notebook, so the cell failed with NameError on a fresh kernel.

    Args:
        path: Scalar string tensor holding the image file path.

    Returns:
        Float tensor of shape ``(WIDTH, HEIGHT, 3)`` after ``PREPROCESS_INPUT``.
    """
    image = tf.io.decode_png(tf.io.read_file(path), channels=3)
    # Mirrors the (WIDTH, HEIGHT, 3) input shape the model was built with.
    # NOTE(review): training images are resized with cv2.resize; bilinear
    # interpolation is used here as well, but the results may differ slightly.
    image = tf.image.resize(image, [WIDTH, HEIGHT])
    return PREPROCESS_INPUT(image)


# Sort for a deterministic file order across runs and filesystems.
test_image_paths = sorted(str(path) for path in glob.glob(test_data_paths))
print(f"Found {len(test_image_paths)} test images")

path_label_test = tf.data.Dataset.from_tensor_slices(test_image_paths)
test_ds = path_label_test.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE)
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

# The dataset is already batched; Keras documents that batch_size must not be
# passed together with a tf.data.Dataset.
predictions = model.predict(test_ds)

In [None]:
# Build the submission table: one row per test image with its predicted cultivar.

# Invert the label encoding once instead of scanning the dict for every row.
index_to_label = {index: name for name, index in label_to_index.items()}

# Compute the arg-max once; recomputing it inside the loop made this cell
# quadratic in the number of test images.
predicted_indices = predictions.argmax(axis=1)

submission = pd.DataFrame({
    "filename": [os.path.basename(path) for path in test_image_paths],
    "cultivar": [index_to_label[int(idx)] for idx in predicted_indices],
})
#%%
# Same file contents as before: a "filename,cultivar" header row, no index column.
submission.to_csv("submission.csv", index=False)