In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

import os
from os import listdir
import cv2


from sklearn.model_selection import train_test_split

In [None]:
!pip install adabelief-tf==0.2.0
!pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda110

In [None]:
from tqdm import tqdm
import shutil

# Stage the training images in /dev/shm. shutil.copytree raises FileExistsError
# when the destination already exists, so skip the copy when the cell is re-run.
# NOTE(review): a partially copied directory from an interrupted run is treated
# as complete; delete /dev/shm/train by hand in that case.
if not os.path.isdir("/dev/shm/train"):
    shutil.copytree("/kaggle/input/plant-seedlings-classification/train", "/dev/shm/train")

In [None]:
from tqdm import tqdm
train_path = "/dev/shm/train/"

# One sub-directory per class. os.listdir returns entries in arbitrary order,
# so sort them: otherwise the class -> index mapping (and the seeded
# train/validation split built from these lists) can differ between runs.
class_names = sorted(
    entry for entry in os.listdir(train_path)
    if os.path.isdir(os.path.join(train_path, entry))
)
label_dict = {k: i for i, k in enumerate(class_names)}

# Parallel lists: train_file_paths[i] is an image path, train_labels[i] its class index.
train_file_paths = []
train_labels = []
for label in class_names:
    for file in sorted(os.listdir(os.path.join(train_path, label))):
        train_file_paths.append(os.path.join(train_path, label, file))
        train_labels.append(label_dict[label])

In [None]:
# Hold out 20% of the labelled images for validation; the fixed random_state
# keeps the split identical for identical input lists.
dev_paths, val_paths, dev_labels, val_labels = train_test_split(
    train_file_paths,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Stateful generator that read_and_augment_image draws its augmentation seeds from.
rng = tf.random.Generator.from_seed(seed=2434, alg="philox")

def read_image(path, label):
    """Load one PNG file and one-hot encode its label (no augmentation).

    Args:
        path: file path of the image, as accepted by ``tf.io.read_file``.
        label: integer class index.

    Returns:
        A pair ``(image, label)``: the 3-channel image resized to 210x210 and
        the label as a one-hot vector of length 12.
    """
    side = 21 * 10
    pixels = tf.image.decode_png(tf.io.read_file(path), channels=3)
    resized = tf.image.resize(pixels, [side, side])
    return resized, tf.one_hot(label, 12)

def read_and_augment_image(path, label):
    """Load one image like ``read_image`` and apply random training augmentations.

    The image is decoded and resized to 210x210 by ``read_image``, then randomly
    cropped to 168x168, flipped horizontally and vertically, and jittered in
    saturation, hue and brightness.

    Args:
        path: file path of the image, as accepted by ``tf.io.read_file``.
        label: integer class index.

    Returns:
        A pair ``(image, label)``: the augmented 168x168x3 image and the label
        as a one-hot vector of length 12.
    """
    image, label = read_image(path, label)

    # Every stateless op is a pure function of its seed. Re-using one seed for
    # all of them ties their random draws together (for example both flips
    # would always fire together), so draw one independent seed per op.
    # make_seeds(6) has shape [2, 6]; each column is one shape-[2] seed.
    (crop_seed, flip_lr_seed, flip_ud_seed,
     saturation_seed, hue_seed, brightness_seed) = tf.unstack(rng.make_seeds(6), num=6, axis=1)

    image = tf.image.stateless_random_crop(value=image, size=(21*8, 21*8, 3), seed=crop_seed)
    image = tf.image.stateless_random_flip_left_right(image, flip_lr_seed)
    image = tf.image.stateless_random_flip_up_down(image, flip_ud_seed)
    image = tf.image.stateless_random_saturation(image, 0.5, 1., saturation_seed)
    image = tf.image.stateless_random_hue(image, 0.05, hue_seed)
    # NOTE(review): the image is float in the 0..255 range at this point, so a
    # brightness delta of 0.2 is close to a no-op -- confirm the intended scale.
    image = tf.image.stateless_random_brightness(image, 0.2, brightness_seed)
    return image, label

In [None]:
BATCH_SIZE = 32
# Shuffle buffer as large as the development split itself.
SHUFFLE_BUFFER_SIZE = len(dev_labels)

# Datasets of (path, label) pairs; images are only read when a map() is applied later.
dev_dataset = tf.data.Dataset.from_tensor_slices((dev_paths, dev_labels))
dev_dataset = dev_dataset.shuffle(SHUFFLE_BUFFER_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices((val_paths, val_labels))

In [None]:
# Pull a single augmented batch so the next cells can inspect it.
preview_batches = (
    dev_dataset
    .map(read_and_augment_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
for image, label in preview_batches:
    # Only the first batch is needed; `image` and `label` keep its tensors.
    break

In [None]:
import matplotlib.pyplot as plt
# Show the first image of the preview batch; imshow needs uint8 for 0..255 data.
first_image = image[0].numpy()
plt.imshow(first_image.astype(np.uint8))

In [None]:
# Display the raw tensor of the first preview image (value range / dtype check).
image[0]

In [None]:
class GeneralizedMeanPooling2D(tf.keras.layers.Layer):
    """Generalized-mean pooling over the two spatial axes of a 4-D feature map.

    Computes ``mean(x ** p) ** (1 / p)`` over the spatial axes, using a
    sign-preserving power (see ``safe_power``).

    Args:
        p: exponent of the generalized mean; must be > 0.
        data_format: ``"channels_last"`` reduces axes 1 and 2; any other value
            reduces axes 2 and 3.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).

    Raises:
        NotImplementedError: if ``p <= 0``.
    """

    def __init__(self, p, data_format="channels_last", **kwargs):
        super(GeneralizedMeanPooling2D, self).__init__(**kwargs)
        if p <= 0:
            raise NotImplementedError(
                "GeneralizedMeanPooling2D only supports p > 0, got {}".format(p))
        self.p = p
        self.data_format = data_format

    def safe_power(self, x, p):
        """Return ``sign(x) * max(|x|, epsilon) ** p``.

        Clamping the magnitude away from zero avoids raising 0 to a fractional
        power; the sign factor keeps negative inputs negative and maps exact
        zeros to zero.
        """
        return tf.sign(x) * tf.pow(tf.maximum(tf.abs(x), tf.keras.backend.epsilon()), p)

    def call(self, inputs):
        """Pool ``inputs`` over its spatial axes and return the reduced tensor."""
        spatial_axes = [1, 2] if self.data_format == "channels_last" else [2, 3]
        inputs_mean = tf.reduce_mean(self.safe_power(inputs, self.p), axis=spatial_axes)
        return self.safe_power(inputs_mean, 1 / self.p)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(GeneralizedMeanPooling2D, self).get_config()
        config.update({"p": self.p, "data_format": self.data_format})
        return config
    

In [None]:
import tensorflow_probability as tfp
class CurricularAdaCos(tf.keras.layers.Layer):
    """Cosine-similarity classification head with margin, CurricularFace-style
    re-weighting of hard negatives and an AdaCos-style adaptive scale.

    Inputs and class weights are L2-normalised, so the raw output is a cosine
    per class. In training mode a margin is added to the target angle, hard
    negatives are modulated with the running statistic ``t``, and the logit
    scale is re-estimated from the batch. The returned logits are
    ``scale * cos``.

    Args:
        out_dim: number of classes; required when ``builtin_weight`` is True.
        margin: angular margin (radians) added to the target class angle.
        init_scale: initial logit scale. When None it is derived from
            ``out_dim`` (or from 2**6 classes when ``out_dim`` is also None).
        init_t: initial value of the CurricularFace statistic ``t``.
        momentum: momentum of the moving averages for ``t`` and ``scale``.
        adaptive_scale: update ``scale`` from batch statistics while training.
        adaptive_cface: apply the CurricularFace modulation to hard negatives.
        adaptive_t: update ``t`` from the mean target cosine while training.
        builtin_weight: own the class-weight matrix (``kernel``); otherwise the
            caller passes ``W`` to ``call``.
        different_weight_per_sample: treat ``W`` as one weight vector per
            sample (element-wise product) instead of a class matrix (matmul).
        weight_kernel_initializer: initializer for the built-in kernel.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self,
                 out_dim=None,
                 margin=0.4,
                 init_scale=None,
                 init_t=0.,
                 momentum=0.9,
                 adaptive_scale=True,
                 adaptive_cface=True,
                 adaptive_t=True,
                 builtin_weight=True,
                 different_weight_per_sample=False,
                 weight_kernel_initializer="glorot_uniform",
                 **kwargs
                 ):
        super(CurricularAdaCos, self).__init__(**kwargs)
        self.out_dim = out_dim
        self.margin = margin
        self.cos_m = tf.cos(self.margin)
        self.sin_m = tf.sin(self.margin)

        if init_scale is None:
            if out_dim is None:
                # Float literal: tf.math.log does not accept integer inputs.
                self.init_scale = tf.sqrt(2.) * tf.math.log(2.**6 - 1.)
            else:
                self.init_scale = tf.sqrt(2.) * tf.math.log(tf.cast(out_dim, "float32") - 1)
        else:
            # An explicit value must be stored as well; it is read right below
            # and again as the fallback in call().
            self.init_scale = tf.cast(init_scale, "float32")
        self.scale = tf.Variable(self.init_scale, trainable=False)
        self.t = tf.Variable(init_t, trainable=False)
        self.momentum = momentum

        self.adaptive_scale = adaptive_scale
        self.adaptive_cface = adaptive_cface
        self.adaptive_t = adaptive_t
        self.builtin_weight = builtin_weight
        self.different_weight_per_sample = different_weight_per_sample
        self.weight_kernel_initializer = weight_kernel_initializer

    def build(self, input_shape):
        """Create the ``[out_dim, feature_dim]`` kernel when the layer owns it.

        Raises:
            ValueError: if ``builtin_weight`` is True but ``out_dim`` is unset.
        """
        if self.builtin_weight:
            if not self.out_dim:
                raise ValueError("out_dim must be set when builtin_weight=True")
            last_dim = input_shape[-1]
            self.kernel = self.add_weight(name="kernel",
                                          shape=[self.out_dim, last_dim],
                                          initializer=self.weight_kernel_initializer,
                                          trainable=True)
        self.built = True

    def call(self, X, y, W=None, training=False):
        """Return scaled cosine logits for ``X``.

        Args:
            X: feature tensor whose last axis is the embedding dimension.
            y: one-hot targets (float); only used when ``training`` is True.
            W: external weights, used only when ``builtin_weight`` is False.
            training: when True, apply the margin and update ``t`` / ``scale``.

        Returns:
            ``scale * cos`` with the training-time modifications applied when
            ``training`` is True.

        Raises:
            ValueError: if ``builtin_weight`` is False and ``W`` is None.
        """
        if not self.built:
            self.build(X.shape)

        if self.builtin_weight:
            W = self.kernel
        elif W is None:
            raise ValueError("W must be passed to call() when builtin_weight=False")

        X = tf.math.l2_normalize(X, axis=-1)
        W = tf.math.l2_normalize(W, axis=-1)

        if self.different_weight_per_sample:
            cos = tf.reduce_sum(X*W, axis=-1)
        else:
            cos = tf.matmul(X, W, transpose_b=True)

        if training:
            if self.adaptive_t:
                # Mean cosine of the target class, taken before the margin.
                new_t = tf.reduce_mean(tf.reduce_sum(cos*y, axis=-1))
            # Clamp the radicand: rounding can push 1 - cos^2 below zero (NaN)
            # and the gradient of sqrt is unbounded at exactly zero.
            sin = tf.math.sqrt(tf.maximum(1. - tf.math.square(cos), tf.keras.backend.epsilon()))

            # add margin: cos(theta + m) on the target class only
            cos = tf.where(y==1., cos*self.cos_m - sin*self.sin_m, cos)

            # curricularFace part
            if self.adaptive_cface:
                # Target cosine (with margin) per sample. y is one-hot, so the
                # sum selects that single entry; a mean would divide it by the
                # number of classes.
                positive_cos = tf.reduce_sum(cos*y, axis=-1, keepdims=True)
                new_negative_cos = tf.where(cos > positive_cos, cos * (self.t + cos), cos)
                cos = tf.where(y == 1., cos, new_negative_cos)

            if self.adaptive_t:
                self.t.assign(self.momentum * self.t + (1 - self.momentum) * new_t)

            # AdaCos part
            if self.adaptive_scale:
                B = (1 - y) * tf.exp(self.scale*cos)
                B_avg = tf.reduce_mean(tf.reduce_sum(B, axis=-1), axis=0)

                theta = tf.acos(tf.clip_by_value(cos, -(1. - tf.keras.backend.epsilon()), 1. - tf.keras.backend.epsilon()))
                theta_true = tf.reduce_sum(y*theta, axis=-1)
                theta_med = tfp.stats.percentile(theta_true, q=50)

                # Whether the margin should be subtracted here is still an open question.
                scale = tf.math.log(tf.maximum(B_avg, tf.keras.backend.epsilon())) / tf.maximum(tf.cos(tf.minimum(np.pi/4 , theta_med - self.margin)), tf.keras.backend.epsilon())
                scale = tf.maximum(tf.keras.backend.epsilon(), scale)
                # Fall back to the initial scale when the estimate is not
                # finite. tf.where does the selection inside the graph, so no
                # Python branching on a tensor is needed.
                scale = tf.where(tf.math.is_finite(scale), scale, self.init_scale)
                new_scale = self.scale * self.momentum + (1 - self.momentum) * tf.stop_gradient(scale)
                self.scale.assign(tf.stop_gradient(new_scale))

        logit = self.scale * cos
        return logit

In [None]:
from tensorflow.keras.applications import EfficientNetB3

class BaseModel(tf.keras.Model):
    """Baseline classifier: EfficientNetB3 backbone (average pooled), a ReLU
    projection layer and a plain dense logit layer.

    Args:
        projection_dim: width of the projection layer.
        out_dim: number of output logits (classes).
    """

    def __init__(self, projection_dim, out_dim):
        super(BaseModel, self).__init__()
        self.cnn = EfficientNetB3(weights = 'imagenet',
                                  include_top=False,
                                  pooling="avg",
                                  input_shape=None)
        self.dense = tf.keras.layers.Dense(projection_dim, activation="relu")
        self.top = tf.keras.layers.Dense(out_dim)

    def call(self, X, y, training=False):
        """Return class logits for the image batch ``X``.

        ``y`` is unused; it is accepted so that this model and ``MyModel`` can
        be called the same way.
        """
        # Forward the flag explicitly. The training step in this notebook
        # invokes ``model.call`` directly rather than ``model(...)``, so Keras
        # is not expected to hand the training mode down to the backbone.
        X = self.cnn(X, training=training)
        X = self.dense(X)
        pred = self.top(X)
        return pred

class MyModel(tf.keras.Model):
    """EfficientNetB3 backbone with generalized-mean pooling, a bias-free
    linear projection and a ``CurricularAdaCos`` head.

    Args:
        projection_dim: width of the embedding fed to the head.
        out_dim: number of classes of the head.
    """

    def __init__(self, projection_dim, out_dim):
        super(MyModel, self).__init__()
        self.cnn = EfficientNetB3(weights = 'imagenet',
                                  include_top=False,
                                  pooling=None,
                                  input_shape=None)
        self.pool = GeneralizedMeanPooling2D(p=3.)
        self.dense = tf.keras.layers.Dense(projection_dim, use_bias=False)
        # Honour the constructor argument instead of a hard-coded class count.
        self.top = CurricularAdaCos(out_dim=out_dim,
                                    margin=0.4)

    def call(self, X, y, training=False):
        """Return scaled cosine logits for the image batch ``X``.

        ``y`` (one-hot targets) is passed to the head, which uses it to apply
        the margin when ``training`` is True.
        """
        # Forward the flag explicitly. The training step in this notebook
        # invokes ``model.call`` directly rather than ``model(...)``, so Keras
        # is not expected to hand the training mode down to the backbone.
        X = self.cnn(X, training=training)
        X = self.pool(X)
        X = self.dense(X)
        pred = self.top(X, y, training=training)
        return pred


In [None]:
from adabelief_tf import AdaBeliefOptimizer
from tensorflow_addons.metrics import F1Score
from tqdm import tqdm

NUM_EPOCH = 10

# Baseline run: plain dense head trained with softmax cross-entropy on logits.
model = BaseModel(projection_dim=128, out_dim=12)
optimizer = AdaBeliefOptimizer(
    learning_rate=1e-3,
    weight_decay=1e-4,
    epsilon=1e-14,
    print_change_log=False,
)
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# Running metrics, one pair per split; the training loop resets them every epoch.
dev_loss = tf.keras.metrics.Mean(name='dev_loss')
dev_f1 = F1Score(num_classes=12, average="micro")

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_f1 = F1Score(num_classes=12, average="micro")

@tf.function
def train_step(X, y):
    """Run one optimisation step on a batch and update the dev metrics.

    Args:
        X: float image batch.
        y: float one-hot labels.
    """
    with tf.GradientTape() as tape:
        logit = model.call(X, y, training=True)
        loss = loss_object(y, logit)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    dev_loss.update_state(loss)
    # Softmax keeps the ranking of the logits (so the predicted class is the
    # same) but, unlike a raw exp, cannot overflow to inf or underflow to 0.
    dev_f1.update_state(y, tf.nn.softmax(logit, axis=-1))
    
@tf.function
def test_step(X, y):
    """Evaluate one batch in inference mode and update the validation metrics.

    Args:
        X: float image batch.
        y: float one-hot labels.
    """
    logit = model.call(X, y)
    loss = loss_object(y, logit)
    val_loss.update_state(loss)
    # Softmax keeps the ranking of the logits (so the predicted class is the
    # same) but, unlike a raw exp, cannot overflow to inf or underflow to 0.
    val_f1.update_state(y, tf.nn.softmax(logit, axis=-1))

# Metrics of the most recently finished phase, shown while the other phase runs.
last_dev_loss = np.inf
last_val_loss = np.inf
last_dev_f1 = 0.
last_val_f1 = 0.

# Build both input pipelines once instead of once per epoch. dev_dataset is a
# shuffled dataset, which by default reshuffles each time it is iterated.
dev_batches = dev_dataset.map(read_and_augment_image, num_parallel_calls=tf.data.AUTOTUNE)\
                         .batch(BATCH_SIZE)\
                         .prefetch(tf.data.AUTOTUNE)
val_batches = val_dataset.map(read_image, num_parallel_calls=tf.data.AUTOTUNE)\
                         .batch(BATCH_SIZE)\
                         .prefetch(tf.data.AUTOTUNE)
# len() of a batched dataset is its number of batches, including a final
# partial batch.
num_dev_steps = len(dev_batches)
num_val_steps = len(val_batches)

# The baseline model has no adaptive head parameters to report.
param_text = ""

with tqdm(total=NUM_EPOCH) as pbar:
    for epoch in range(NUM_EPOCH):
        # ---- training phase ----
        for step, (batch_X, batch_y) in enumerate(dev_batches, start=1):
            batch_X = tf.cast(batch_X, "float32")
            batch_y = tf.cast(batch_y, "float32")
            train_step(batch_X, batch_y)
            learning_text = "[{}/{}] ".format(str(step).zfill(4), num_dev_steps)
            progress_text = "dev | Loss: {:.5f} f1: {:.5f} val| Loss: {:.5f} f1 {:.5f}".format(dev_loss.result().numpy(),
                                                                                               dev_f1.result().numpy(),
                                                                                               last_val_loss,
                                                                                               last_val_f1)
            pbar.set_postfix_str(learning_text + param_text + progress_text)
        last_dev_loss = dev_loss.result().numpy()
        last_dev_f1 = dev_f1.result().numpy()
        dev_loss.reset_states()
        dev_f1.reset_states()

        # ---- validation phase ----
        for val_step, (batch_X, batch_y) in enumerate(val_batches, start=1):
            batch_X = tf.cast(batch_X, "float32")
            batch_y = tf.cast(batch_y, "float32")
            test_step(batch_X, batch_y)
            # Report validation progress against the validation batch count.
            learning_text = "[{}/{}] ".format(str(val_step).zfill(4), num_val_steps)
            progress_text = "dev | Loss: {:.5f} f1: {:.5f} val| Loss: {:.5f} f1 {:.5f}".format(last_dev_loss,
                                                                                               last_dev_f1,
                                                                                               val_loss.result().numpy(),
                                                                                               val_f1.result().numpy())
            pbar.set_postfix_str(learning_text + param_text + progress_text)
        # Keep one permanent line per epoch below the progress bar.
        print(progress_text)
        last_val_loss = val_loss.result().numpy()
        last_val_f1 = val_f1.result().numpy()
        val_loss.reset_states()
        val_f1.reset_states()
        pbar.update(1)

In [None]:
from adabelief_tf import AdaBeliefOptimizer
from tensorflow_addons.metrics import F1Score
from tqdm import tqdm

NUM_EPOCH = 10

# Second run: GeM pooling + CurricularAdaCos head. These assignments replace
# the model, optimizer, loss and metric objects created for the baseline run.
model = MyModel(projection_dim=64, out_dim=12)
optimizer = AdaBeliefOptimizer(
    learning_rate=1e-3,
    weight_decay=1e-4,
    epsilon=1e-14,
    print_change_log=False,
)
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# Running metrics, one pair per split; the training loop resets them every epoch.
dev_loss = tf.keras.metrics.Mean(name='dev_loss')
dev_f1 = F1Score(num_classes=12, average="micro")

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_f1 = F1Score(num_classes=12, average="micro")

@tf.function
def train_step(X, y):
    """Run one optimisation step with per-tensor gradient clipping and update
    the dev metrics.

    Args:
        X: float image batch.
        y: float one-hot labels.
    """
    with tf.GradientTape() as tape:
        logit = model.call(X, y, training=True)
        loss = loss_object(y, logit)
    variables = model.trainable_variables
    # Clip each gradient tensor to an L2 norm of at most 10. Variables that
    # received no gradient (None) are skipped, since clip_by_norm cannot
    # handle None.
    grads_and_vars = [(tf.clip_by_norm(g, 10), v)
                      for g, v in zip(tape.gradient(loss, variables), variables)
                      if g is not None]
    optimizer.apply_gradients(grads_and_vars)
    dev_loss.update_state(loss)
    # Softmax keeps the ranking of the logits (so the predicted class is the
    # same) but, unlike a raw exp, cannot overflow to inf or underflow to 0.
    dev_f1.update_state(y, tf.nn.softmax(logit, axis=-1))
    
@tf.function
def test_step(X, y):
    """Evaluate one batch in inference mode and update the validation metrics.

    Args:
        X: float image batch.
        y: float one-hot labels.
    """
    logit = model.call(X, y)
    loss = loss_object(y, logit)
    val_loss.update_state(loss)
    # Softmax keeps the ranking of the logits (so the predicted class is the
    # same) but, unlike a raw exp, cannot overflow to inf or underflow to 0.
    val_f1.update_state(y, tf.nn.softmax(logit, axis=-1))

# Metrics of the most recently finished phase, shown while the other phase runs.
last_dev_loss = np.inf
last_val_loss = np.inf
last_dev_f1 = 0.
last_val_f1 = 0.

# NOTE(review): `fin` is not referenced in the cells visible here; kept in case
# a later cell reads it.
fin = 0

# Build both input pipelines once instead of once per epoch. dev_dataset is a
# shuffled dataset, which by default reshuffles each time it is iterated.
dev_batches = dev_dataset.map(read_and_augment_image, num_parallel_calls=tf.data.AUTOTUNE)\
                         .batch(BATCH_SIZE)\
                         .prefetch(tf.data.AUTOTUNE)
val_batches = val_dataset.map(read_image, num_parallel_calls=tf.data.AUTOTUNE)\
                         .batch(BATCH_SIZE)\
                         .prefetch(tf.data.AUTOTUNE)
# len() of a batched dataset is its number of batches, including a final
# partial batch.
num_dev_steps = len(dev_batches)
num_val_steps = len(val_batches)

with tqdm(total=NUM_EPOCH) as pbar:
    for epoch in range(NUM_EPOCH):
        # ---- training phase ----
        for step, (batch_X, batch_y) in enumerate(dev_batches, start=1):
            batch_X = tf.cast(batch_X, "float32")
            batch_y = tf.cast(batch_y, "float32")
            train_step(batch_X, batch_y)

            # Current adaptive head parameters, updated inside train_step.
            param_text = "scale: {:.3f} t: {:.3f} ".format(model.top.scale.numpy(), model.top.t.numpy())
            learning_text = "[{}/{}] ".format(str(step).zfill(4), num_dev_steps)
            progress_text = "dev | Loss: {:.5f} f1: {:.5f} val| Loss: {:.5f} f1 {:.5f}".format(dev_loss.result().numpy(),
                                                                                               dev_f1.result().numpy(),
                                                                                               last_val_loss,
                                                                                               last_val_f1)
            pbar.set_postfix_str(learning_text + param_text + progress_text)

        last_dev_loss = dev_loss.result().numpy()
        last_dev_f1 = dev_f1.result().numpy()
        dev_loss.reset_states()
        dev_f1.reset_states()

        # ---- validation phase ----
        for val_step, (batch_X, batch_y) in enumerate(val_batches, start=1):
            batch_X = tf.cast(batch_X, "float32")
            batch_y = tf.cast(batch_y, "float32")
            test_step(batch_X, batch_y)
            param_text = "scale: {:.3f} t: {:.3f} ".format(model.top.scale.numpy(), model.top.t.numpy())
            # Report validation progress against the validation batch count.
            learning_text = "[{}/{}] ".format(str(val_step).zfill(4), num_val_steps)
            progress_text = "dev | Loss: {:.5f} f1: {:.5f} val| Loss: {:.5f} f1 {:.5f}".format(last_dev_loss,
                                                                                               last_dev_f1,
                                                                                               val_loss.result().numpy(),
                                                                                               val_f1.result().numpy())
            pbar.set_postfix_str(learning_text + param_text + progress_text)
        # Keep one permanent line per epoch below the progress bar.
        print(progress_text)
        last_val_loss = val_loss.result().numpy()
        last_val_f1 = val_f1.result().numpy()
        val_loss.reset_states()
        val_f1.reset_states()
        pbar.update(1)