In [None]:
# Accelerator setup: try to attach to a TPU and build the matching
# tf.distribute strategy; otherwise use the current default strategy.
import tensorflow as tf
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the cluster and initialise the TPU system before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

# Autotune constant passed to the tf.data parallelism/prefetch arguments further down.
AUTO = tf.data.experimental.AUTOTUNE
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
!pip install -U keras-cv-attention-models
!pip install -q efficientnet
!pip install tensorflow_addons

from kaggle_datasets import KaggleDatasets
from keras_cv_attention_models import nfnets, efficientnet
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import efficientnet.tfkeras as efn
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from tensorflow.keras import backend as K
import tensorflow_addons as tfa
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import pickle
import json
import tensorflow_hub as tfhub
from datetime import datetime

## Config

In [None]:
class config:
    # Run configuration, accessed as class attributes (config.X) throughout the notebook.
    # Every attribute whose name does not start with "_" is dumped to config.json below.
    
    # Model-head options read by get_model(): BatchNorm neck before the
    # embedding Dense layer, and GeM pooling instead of global average pooling.
    bnneck = False
    gem = False
    # When True, get_training_dataset() adds the `transform` (cutmix/mixup) map.
    MIX_UP = False
    # When True, training uses TRAINING_FILENAMES2 (fold files + second bucket).
    PSEUDO = True
    # Not referenced in the visible part of the notebook.
    TEST = True
    EVALUATE = True
    
    # Seed passed to seed_everything().
    SEED = 42
    # Shards whose index satisfies i % FOLDS == FOLD_TO_RUN form the validation split.
    FOLD_TO_RUN = 1
    FOLDS = 10
    # When True, the DEBUG cell keeps only the first train/val/test shard.
    DEBUG = False
    # When RESUME is True the LR schedule is offset by RESUME_EPOCH
    # (which must then be an integer, not None).
    RESUME = False
    RESUME_EPOCH = None
    
    
    ### Dataset
    # Global batch size: 16 per replica.
    BATCH_SIZE = 16 * strategy.num_replicas_in_sync
    # Images are resized to IMAGE_SIZE x IMAGE_SIZE in decode_image().
    IMAGE_SIZE = 512
    # IMAGE_SIZE = 32
    # One-hot depth of the labels and number of columns in the margin head.
    N_CLASSES = 15587
    
    ### Model
    model_type = 'effnetv1'  # nfnet effnetv1 effnetv2
    # Index into the EFNS list (EfficientNet-B0 ... B7); used when model_type == 'effnetv1'.
    EFF_NET = 7
    # EFF_NET = 0
    
    # Only used to build MODEL_NAME in the visible code.
    EFF_NETV2 = 's-21k-ft1k'
    # When True, get_model() calls freeze_BN() on the model before compiling.
    FREEZE_BATCH_NORM = True
    head = 'arcface' # arcface  curricular
    # Used for the LR-schedule preview plot in the visible code.
    EPOCHS = 35
    # Initial Adam learning rate passed to the optimizer in get_model().
    LR = 0.001
    
    ### Augmentations
    # Gates the cutout branch in data_augment().
    CUTOUT = False
    
    ### Save-Directory
    # Directory prefix for config.json, weight snapshots and the CSV training log.
    save_dir = '.'

    
# Record count embedded in a shard name: a "-", one or more digits, then a ".".
# Compiled once instead of once per file name.
_RECORD_COUNT_RE = re.compile(r"-([0-9]+)\.")


def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord shard file names.

    Each name is expected to contain ``-<digits>.`` (for instance
    ``train00-1234.tfrec``); the first such group is taken as the number of
    records stored in that shard.

    Args:
        filenames: iterable of shard paths (strings).

    Returns:
        The total as a NumPy integer. An empty iterable yields integer 0
        (a bare ``np.sum([])`` would return the float 0.0).

    Raises:
        ValueError: if a file name carries no ``-<digits>.`` group.
    """
    counts = []
    for filename in filenames:
        match = _RECORD_COUNT_RE.search(filename)
        if match is None:
            raise ValueError(
                f"Cannot read a record count from file name {filename!r}; "
                "expected it to contain '-<digits>.'"
            )
        counts.append(int(match.group(1)))
    # Explicit integer dtype keeps the result integral even when `counts` is empty.
    return np.sum(counts, dtype=np.int64)

# Function to seed everything
def seed_everything(seed):
    """Seed Python's `random`, NumPy and TensorFlow with the same value.

    Also stores the seed (as a string) in the PYTHONHASHSEED environment
    variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
def is_interactive():
    """Return True when the kernel's connection-file path contains 'runtime'.

    NOTE(review): presumably distinguishes an interactive session from a
    committed/batch run on the hosting platform - confirm.
    """
    connection_file = get_ipython().config.IPKernelApp.connection_file
    return 'runtime' in connection_file


IS_INTERACTIVE = is_interactive()
print(IS_INTERACTIVE)

In [None]:
# Human-readable model name used in the weight-file names written by Snapshot;
# EfficientNet variants get their size/tag appended, anything else keeps the
# bare model_type.
if config.model_type == 'effnetv1':
    MODEL_NAME = f'effnetv1_b{config.EFF_NET}'
elif config.model_type == 'effnetv2':
    MODEL_NAME = f'effnetv2_{config.EFF_NETV2}'
else:
    MODEL_NAME = config.model_type

config.MODEL_NAME = MODEL_NAME
print(MODEL_NAME)

In [None]:
# Persist every public (non-underscore) config attribute next to the other run outputs.
with open(config.save_dir+'/config.json', 'w') as fp:
    json.dump(
        {key: value for key, value in vars(config).items() if not key.startswith('_')},
        fp,
    )

In [None]:
# Hard-coded GCS bucket. In the visible code GCS_PATH is only printed below;
# the shard lists are globbed from GCS_PATH1.
GCS_PATH = 'gs://kds-d916c3252bf3bc5b3500b904f05f51ce57c8df85221d11b7711bcda9'  # Get GCS Path from kaggle notebook if GCS Path is expired
# if not IS_COLAB:
#     GCS_PATH1 = KaggleDatasets().get_gcs_path('randomdatasetc')
    
# Bucket holding the train/test TFRecord shards; sorted so the shard order
# (and therefore the fold assignment by index further down) is stable.
GCS_PATH1 = 'gs://kds-c021fa6ca054971f5d9333f955bf9fc12fa28d1ba64b2a91ae9fdf8e'
train_files = np.sort(np.array(tf.io.gfile.glob(GCS_PATH1 + '/happywhale-2022-train*.tfrec')))
test_files = np.sort(np.array(tf.io.gfile.glob(GCS_PATH1 + '/happywhale-2022-test*.tfrec')))
# NOTE(review): this prints GCS_PATH, not the GCS_PATH1 actually used for the globs - confirm intent.
print(GCS_PATH)
# Shard counts followed by the record totals parsed from the shard names.
print(len(train_files),len(test_files),count_data_items(train_files),count_data_items(test_files))

## Data

In [None]:
def cutmix(posting_id, image, label, matches, PROBABILITY):
    """Apply CutMix to one batch.

    For each image, with probability ``PROBABILITY`` a square patch taken from
    a randomly chosen image of the same batch is pasted at a random location,
    and the labels are blended in proportion to the pasted area.

    Args:
        posting_id: passed through unchanged.
        image: batch of images; it is indexed as ``[n, dim, dim, 3]`` and must
            hold exactly ``config.BATCH_SIZE`` images of side
            ``config.IMAGE_SIZE`` (the output is reshaped to that shape).
        label: per-image label vectors with ``config.N_CLASSES`` columns
            (one-hot, so they can be blended linearly).
        matches: passed through unchanged.
        PROBABILITY: chance in [0, 1] that a given image receives a patch.

    Returns:
        ``(posting_id, mixed_images, mixed_labels, matches)``.
    """
    # input image - is a batch of images of size [n,dim,dim,3] not a single image of [dim,dim,3]
    # output - a batch of images with cutmix applied
    DIM = config.IMAGE_SIZE
    CLASSES = config.N_CLASSES
    AUG_BATCH = config.BATCH_SIZE
    
    imgs = []; labs = []
    for j in range(AUG_BATCH):
        # DO CUTMIX WITH PROBABILITY DEFINED ABOVE
        P = tf.cast( tf.random.uniform([],0,1)<=PROBABILITY, tf.int32)
        # CHOOSE RANDOM IMAGE TO CUTMIX WITH
        k = tf.cast( tf.random.uniform([],0,AUG_BATCH),tf.int32)
        # CHOOSE RANDOM LOCATION
        x = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        y = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        b = tf.random.uniform([],0,1) # this is beta dist with alpha=1.0
        # Multiplying by P collapses the patch to width 0 when cutmix is skipped.
        WIDTH = tf.cast( DIM * tf.math.sqrt(1-b),tf.int32) * P
        # Patch bounds, clipped to the image.
        ya = tf.math.maximum(0,y-WIDTH//2)
        yb = tf.math.minimum(DIM,y+WIDTH//2)
        xa = tf.math.maximum(0,x-WIDTH//2)
        xb = tf.math.minimum(DIM,x+WIDTH//2)
        # MAKE CUTMIX IMAGE
        one = image[j,ya:yb,0:xa,:]
        two = image[k,ya:yb,xa:xb,:]
        three = image[j,ya:yb,xb:DIM,:]
        middle = tf.concat([one,two,three],axis=1)
        img = tf.concat([image[j,0:ya,:,:],middle,image[j,yb:DIM,:,:]],axis=0)
        imgs.append(img)
        # MAKE CUTMIX LABEL
        # Weight the partner label by the area that was actually pasted.
        # Using WIDTH*WIDTH over-states it whenever the patch is clipped at
        # an image border (and for odd widths, where 2*(WIDTH//2) < WIDTH).
        a = tf.cast((yb-ya)*(xb-xa), tf.float32) / float(DIM*DIM)

        lab1 = label[j,]
        lab2 = label[k,]
        labs.append((1-a)*lab1 + a*lab2)
            
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image2 = tf.reshape(tf.stack(imgs),(AUG_BATCH,DIM,DIM,3))
    label2 = tf.reshape(tf.stack(labs),(AUG_BATCH,CLASSES))
    return posting_id, image2,label2, matches


def mixup(posting_id, image, label, matches, PROBABILITY):
    """Apply MixUp to one batch.

    For each image, with probability ``PROBABILITY`` the image and its label
    are linearly blended with a randomly chosen image/label of the same batch
    using a Beta(0.5, 0.5)-distributed coefficient.

    Args:
        posting_id: passed through unchanged.
        image: batch of images; it is indexed as ``[n, dim, dim, 3]`` and must
            hold exactly ``config.BATCH_SIZE`` images of side
            ``config.IMAGE_SIZE`` (the output is reshaped to that shape).
        label: per-image label vectors with ``config.N_CLASSES`` columns
            (one-hot, so they can be blended linearly).
        matches: passed through unchanged.
        PROBABILITY: chance in [0, 1] that a given image is mixed.

    Returns:
        ``(posting_id, mixed_images, mixed_labels, matches)``.
    """
    # input image - is a batch of images of size [n,dim,dim,3] not a single image of [dim,dim,3]
    # output - a batch of images with mixup applied
    DIM = config.IMAGE_SIZE
    CLASSES = config.N_CLASSES
    AUG_BATCH = config.BATCH_SIZE
    MIXUP_ALPHA = 0.5  # both shape parameters of the Beta distribution
    imgs = []; labs = []
    for j in range(AUG_BATCH):
        # DO MIXUP WITH PROBABILITY DEFINED ABOVE
        P = tf.cast( tf.random.uniform([],0,1)<=PROBABILITY, tf.float32)
        # CHOOSE RANDOM
        k = tf.cast( tf.random.uniform([],0,AUG_BATCH),tf.int32)
        # Draw the coefficient with TensorFlow ops. This function runs inside
        # Dataset.map, which traces it once: a NumPy draw (np.random.beta)
        # would be baked into the graph as a constant and reused for every
        # batch. Beta(a, a) is sampled as G1 / (G1 + G2) with G1, G2 ~ Gamma(a).
        g1 = tf.random.gamma([], MIXUP_ALPHA)
        g2 = tf.random.gamma([], MIXUP_ALPHA)
        a = tf.math.divide_no_nan(g1, g1 + g2) * P
        # MAKE MIXUP IMAGE        
        img1 = image[j,]
        img2 = image[k,]
        
        imgs.append((1-a)*img1 + a*img2)

        lab1 = label[j,]
        lab2 = label[k,]
        
        labs.append((1-a)*lab1 + a*lab2)

    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image2 = tf.reshape(tf.stack(imgs),(AUG_BATCH,DIM,DIM,3))
    label2 = tf.reshape(tf.stack(labs),(AUG_BATCH,CLASSES))
    return posting_id, image2, label2, matches


def transform(posting_id, image, label, matches):
    """Blend a batch with either its cutmix or its mixup version, per image.

    Both `cutmix` and `mixup` are computed for the whole batch; then, for each
    image, a fresh uniform draw compared against SWITCH picks which of the two
    results is kept. The batch must hold exactly config.BATCH_SIZE images (the
    loop bound and the final reshape depend on it).

    Returns `(posting_id, images, labels, matches)` with posting_id and
    matches passed through unchanged.
    """
    # THIS FUNCTION APPLIES BOTH CUTMIX AND MIXUP
    DIM = config.IMAGE_SIZE
    CLASSES = config.N_CLASSES
    AUG_BATCH = config.BATCH_SIZE
    # With SWITCH = 0 the test `uniform <= SWITCH` only holds for a draw of
    # exactly 0, so in practice the mixup result is always the one selected.
    SWITCH = 0
    CUTMIX_PROB = 0.5
    MIXUP_PROB = 0.5
    # FOR SWITCH PERCENT OF TIME WE DO CUTMIX AND (1-SWITCH) WE DO MIXUP
    _, image2, label2, _ = cutmix(posting_id, image, label, matches, CUTMIX_PROB)
    _, image3, label3, _ = mixup(posting_id, image, label, matches, MIXUP_PROB)
    imgs = []; labs = []
    for j in range(AUG_BATCH):
        # P is 1.0 to keep the cutmix result for image j, 0.0 to keep the mixup result.
        P = tf.cast( tf.random.uniform([],0,1)<=SWITCH, tf.float32)
        imgs.append(P*image2[j,]+(1-P)*image3[j,])
        labs.append(P*label2[j,]+(1-P)*label3[j,])
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image4 = tf.reshape(tf.stack(imgs),(AUG_BATCH,DIM,DIM,3))
    label4 = tf.reshape(tf.stack(labs),(AUG_BATCH,CLASSES))
    return posting_id, image4, label4, matches



def onehot(posting_id, image, label, matches):
    """Replace the integer class id(s) in `label` by one-hot vectors of depth config.N_CLASSES.

    The other three elements are passed through unchanged.
    """
    encoded_label = tf.one_hot(label, config.N_CLASSES)
    return posting_id, image, encoded_label, matches


def arcface_format(posting_id, image, label_group, matches):
    """Pack image and label into the model's named inputs.

    The second element becomes ``{'inp1': image, 'inp2': label_group}`` (the
    names of the two Keras inputs); the label is also kept as the target.
    """
    model_inputs = dict(inp1=image, inp2=label_group)
    return posting_id, model_inputs, label_group, matches

def arcface_inference_format(posting_id, image, label_group, matches):
    """Keep only what inference needs: ``(image, posting_id)``."""
    inference_pair = (image, posting_id)
    return inference_pair

def arcface_eval_format(posting_id, image, label_group, matches):
    """Keep only what evaluation needs: ``(image, label_group)``."""
    eval_pair = (image, label_group)
    return eval_pair

# Data augmentation function
def data_augment(posting_id, image, label_group, matches):
    """Training-time augmentation: optional cutout, then flip and colour jitter.

    In get_training_dataset this is mapped AFTER `.batch(...)`, so `image` is
    a whole batch there and each cutout mask (shape (DIM, DIM, 1)) is
    broadcast over every image of the batch.
    posting_id, label_group and matches are returned unchanged.
    """

    ### CUTOUT
    # The random draw is evaluated first; the branch is only taken when
    # config.CUTOUT is also True.
    if tf.random.uniform([])>0.5 and config.CUTOUT:
      N_CUTOUT = 6
      for cutouts in range(N_CUTOUT):
        # Each of the N_CUTOUT candidate squares is applied when its own draw exceeds 0.5.
        if tf.random.uniform([])>0.5:
           DIM = config.IMAGE_SIZE
           CUTOUT_LENGTH = DIM//8
           # x1 is the row offset and x2 the column offset of the square.
           x1 = tf.cast( tf.random.uniform([],0,DIM-CUTOUT_LENGTH),tf.int32)
           x2 = tf.cast( tf.random.uniform([],0,DIM-CUTOUT_LENGTH),tf.int32)
           # Build a (DIM, DIM) mask holding ones inside the square: first the
           # column strip (zeros / ones / zeros stacked along rows), then pad
           # it with zero columns on the left and right.
           filter_ = tf.concat([tf.zeros((x1,CUTOUT_LENGTH)),tf.ones((CUTOUT_LENGTH,CUTOUT_LENGTH)),tf.zeros((DIM-x1-CUTOUT_LENGTH,CUTOUT_LENGTH))],axis=0)
           filter_ = tf.concat([tf.zeros((DIM,x2)),filter_,tf.zeros((DIM,DIM-x2-CUTOUT_LENGTH))],axis=1)
           # Invert the mask (zeros inside the square) and zero those pixels out.
           cutout = tf.reshape(1-filter_,(DIM,DIM,1))
           image = cutout*image

    image = tf.image.random_flip_left_right(image)
    # image = tf.image.random_flip_up_down(image)
    image = tf.image.random_hue(image, 0.01)
    image = tf.image.random_saturation(image, 0.70, 1.30)
    image = tf.image.random_contrast(image, 0.80, 1.20)
    image = tf.image.random_brightness(image, 0.10)
    return posting_id, image, label_group, matches

def data_augment_test(posting_id, image, label_group, matches):
    """Evaluation-time counterpart of `data_augment`: returns its inputs unchanged."""
    untouched = (posting_id, image, label_group, matches)
    return untouched

# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB image scaled by 1/255.

    The image is resized to config.IMAGE_SIZE x config.IMAGE_SIZE (no
    aspect-ratio handling) before scaling.
    """
    target_size = [config.IMAGE_SIZE,config.IMAGE_SIZE]
    pixels = tf.image.resize(tf.image.decode_jpeg(image_data, channels = 3), target_size)
    return tf.cast(pixels, tf.float32) / 255.0

# This function parse our images and also get the target variable
def read_labeled_tfrecord(example):
    """Parse one serialized TFRecord example.

    Returns ``(image_name, decoded_image, int32_target, 1)``. The trailing
    constant 1 fills the `matches` slot of the 4-tuple; no "matches" feature
    is parsed from the record.
    """
    feature_spec = {
        "image_name": tf.io.FixedLenFeature([], tf.string),
        "image": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return (
        parsed['image_name'],
        decode_image(parsed['image']),
        tf.cast(parsed['target'], tf.int32),
        1,
    )

# This function loads TF Records and parse them into tensors
def load_dataset(filenames, ordered = False):
    """Read TFRecord shards and parse every record with `read_labeled_tfrecord`.

    Args:
        filenames: TFRecord shard paths.
        ordered: when False, deterministic ordering is switched off in the
            dataset options; when True, the options are left at their defaults.

    Returns:
        A dataset of ``(posting_id, image, label_group, matches)`` tuples.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
        .with_options(read_options)
        .map(read_labeled_tfrecord, num_parallel_calls = AUTO)
    )

# This function is to get our training tensors
def get_training_dataset(filenames):
    """Build the infinite, shuffled, batched and augmented training dataset.

    One-hot encoding, augmentation and (when config.MIX_UP is set) the
    cutmix/mixup transform all run on whole batches. Each element is
    ``({'inp1': images, 'inp2': labels}, labels)``.
    """
    batches = (
        load_dataset(filenames, ordered = False)
        .repeat()
        .shuffle(2048)
        .batch(config.BATCH_SIZE)
        .map(onehot, num_parallel_calls=AUTO)
        .map(data_augment, num_parallel_calls = AUTO)
    )
    if config.MIX_UP:
        batches = batches.map(transform, num_parallel_calls=AUTO)
    return (
        batches
        .map(arcface_format, num_parallel_calls = AUTO)
        # After arcface_format the `image` slot holds the model-input dict.
        .map(lambda posting_id, image, label_group, matches: (image, label_group))
        .prefetch(AUTO)
    )

# This function is to get our validation tensors
def get_val_dataset(filenames):
    """Build the ordered, un-augmented validation dataset.

    Records are one-hot encoded and formatted individually, then batched.
    Each element is ``({'inp1': images, 'inp2': labels}, labels)``.
    """
    return (
        load_dataset(filenames, ordered = True)
        .map(onehot, num_parallel_calls=AUTO)
        .map(data_augment_test, num_parallel_calls = AUTO)
        .map(arcface_format, num_parallel_calls = AUTO)
        # After arcface_format the `image` slot holds the model-input dict.
        .map(lambda posting_id, image, label_group, matches: (image, label_group))
        .batch(config.BATCH_SIZE)
        .prefetch(AUTO)
    )

## Model

In [None]:
# class CurricularFace(tf.keras.layers.Layer):
#     def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
#                  ls_eps=0.0, **kwargs):
#         super(CurricularFace, self).__init__(**kwargs)

#         self.n_classes = n_classes
#         self.s = s
#         self.m = m
#         self.ls_eps = ls_eps
#         self.easy_margin = easy_margin
#         self.cos_m = tf.math.cos(m)
#         self.sin_m = tf.math.sin(m)
#         self.th = tf.math.cos(math.pi - m)
#         self.mm = tf.math.sin(math.pi - m) * m
#         self._USE_V2_BEHAVIOR = True

#     def _assign_new_value(self, variable, value):
#         with K.name_scope('AssignNewValue') as scope:
#           if tf.compat.v1.executing_eagerly_outside_functions():
#             return variable.assign(value, name=scope)
#           else:
#             with tf.compat.v1.colocate_with(variable):  # pylint: disable=protected-access
#               return tf.compat.v1.assign(variable, value, name=scope)


#     def _get_training_value(self, training=None):
#         if training is None:
#           training = K.learning_phase()
#         if self._USE_V2_BEHAVIOR:
#           if isinstance(training, int):
#             training = bool(training)
#           if not self.trainable:
#             # When the layer is not trainable, it overrides the value passed from
#             # model.
#             training = False
#         return training


#     def get_config(self):

#         config = super().get_config().copy()
#         config.update({
#             'n_classes': self.n_classes,
#             's': self.s,
#             'm': self.m,
#             'ls_eps': self.ls_eps,
#             'easy_margin': self.easy_margin,
#         })
#         return config

#     def build(self, input_shape):
#         super(CurricularFace, self).build(input_shape[0])

#         self.W = self.add_weight(
#             name='W',
#             shape=(int(input_shape[0][-1]), self.n_classes),
#             initializer='glorot_uniform',
#             dtype='float32',
#             trainable=True,
#             regularizer=None)
        
#         self.t = self.add_weight(
#             name='t',
#             shape=(1),
#             initializer=tf.zeros_initializer(),
#             dtype='float32',
#             trainable=False,
#             regularizer=None,
#             aggregation=tf.VariableAggregation.MEAN,
#             experimental_autocast=False,
#             synchronization=tf.VariableSynchronization.ON_READ)
        
#     def call(self, inputs, training=None):
#         X, y = inputs
#         y = tf.cast(y, dtype=tf.int32)

#         do_training = self._get_training_value(training)

#         if do_training:
#             cosine = tf.matmul(
#                 tf.math.l2_normalize(X, axis=1),
#                 tf.math.l2_normalize(self.W, axis=0)
#             )
#             sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
#             phi = cosine * self.cos_m - sine * self.sin_m

#             target_logit = tf.reduce_sum(cosine * tf.cast(tf.one_hot(y, depth=self.n_classes),dtype=cosine.dtype), axis=-1)
#             sin_theta = tf.math.sqrt(1.0 - tf.math.pow(target_logit, 2))
#             cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m

#             phi = tf.where(cosine > self.th, phi, cosine - self.mm)
#             one_hot = tf.cast(
#                 tf.one_hot(y, depth=self.n_classes),
#                 dtype=cosine.dtype
#             )
        
#             t = tf.reduce_mean(target_logit) * 0.01 + (1 - 0.01) * self.t
#             self._assign_new_value(self.t, t)
#             cosine = tf.where(cosine > tf.expand_dims(cos_theta_m, axis=-1), cosine*(self.t+cosine), cosine)

#             if self.ls_eps > 0:
#                 one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

#             output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
#             output *= self.s

#         else:
#             output = tf.matmul(
#                 tf.math.l2_normalize(X, axis=1),
#                 tf.math.l2_normalize(self.W, axis=0)
#             )

#         return output
    
    
# # Arcmarginproduct class keras layer
# class ArcMarginProduct(tf.keras.layers.Layer):
#     '''
#     Implements large margin arc distance.

#     Reference:
#         https://arxiv.org/pdf/1801.07698.pdf
#         https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
#             blob/master/src/modeling/metric_learning.py
#     '''
#     def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
#                  ls_eps=0.0, **kwargs):

#         super(ArcMarginProduct, self).__init__(**kwargs)

#         self.n_classes = n_classes
#         self.s = s
#         self.m = m
#         self.ls_eps = ls_eps
#         self.easy_margin = easy_margin
#         self.cos_m = tf.math.cos(m)
#         self.sin_m = tf.math.sin(m)
#         self.th = tf.math.cos(math.pi - m)
#         self.mm = tf.math.sin(math.pi - m) * m

#     def get_config(self):

#         config = super().get_config().copy()
#         config.update({
#             'n_classes': self.n_classes,
#             's': self.s,
#             'm': self.m,
#             'ls_eps': self.ls_eps,
#             'easy_margin': self.easy_margin,
#         })
#         return config

#     def build(self, input_shape):
#         super(ArcMarginProduct, self).build(input_shape[0])

#         self.W = self.add_weight(
#             name='W',
#             shape=(int(input_shape[0][-1]), self.n_classes),
#             initializer='glorot_uniform',
#             dtype='float32',
#             trainable=True,
#             regularizer=None)

#     def call(self, inputs):
#         X, y = inputs
#         y = tf.cast(y, dtype=tf.int32)
#         cosine = tf.matmul(
#             tf.math.l2_normalize(X, axis=1),
#             tf.math.l2_normalize(self.W, axis=0)
#         )
#         sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
#         phi = cosine * self.cos_m - sine * self.sin_m
#         if self.easy_margin:
#             phi = tf.where(cosine > 0, phi, cosine)
#         else:
#             phi = tf.where(cosine > self.th, phi, cosine - self.mm)
#         one_hot = tf.cast(
#             tf.one_hot(y, depth=self.n_classes),
#             dtype=cosine.dtype
#         )
#         if self.ls_eps > 0:
#             one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

#         output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
#         output *= self.s
#         return output
    
# class GeMPoolingLayer(tf.keras.layers.Layer):
#     def __init__(self, p=1., train_p=False):
#         super().__init__()
#         if train_p:
#             self.p = tf.Variable(p, dtype=tf.float32)
#         else:
#             self.p = p
#         self.eps = 1e-6

#     def call(self, inputs: tf.Tensor, **kwargs):
#         inputs = tf.clip_by_value(inputs, clip_value_min=1e-6, clip_value_max=tf.reduce_max(inputs))
#         inputs = tf.pow(inputs, self.p)
#         inputs = tf.reduce_mean(inputs, axis=[1, 2], keepdims=False)
#         inputs = tf.pow(inputs, 1./self.p)
#         return inputs

class CurricularFace(tf.keras.layers.Layer):
    """Margin head with an adaptive re-weighting of hard negative logits.

    ``call`` takes ``[embeddings, labels]``. ``labels`` must be one-hot style
    class targets with ``n_classes`` columns; soft targets (e.g. produced by
    mixup/cutmix) are supported. In training mode the layer returns
    scale-multiplied margin logits and updates the running statistic ``t``;
    otherwise it returns the plain cosine similarities.

    Args:
        n_classes: number of classes (columns of the class-weight matrix).
        s: scale applied to the training logits.
        m: angular margin added to the target class.
        easy_margin: stored and serialised, but not used by ``call``.
        ls_eps: label-smoothing factor; 0 disables smoothing.
    """

    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):
        super(CurricularFace, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Pre-computed margin terms for cos(theta + m).
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Below this cosine, theta + m would exceed pi; a linear penalty is used instead.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m
        self._USE_V2_BEHAVIOR = True

    def _assign_new_value(self, variable, value):
        """Assign `value` to `variable` in both eager/TF2 and legacy graph mode."""
        with K.name_scope('AssignNewValue') as scope:
          if tf.compat.v1.executing_eagerly_outside_functions():
            return variable.assign(value, name=scope)
          else:
            with tf.compat.v1.colocate_with(variable):  # pylint: disable=protected-access
              return tf.compat.v1.assign(variable, value, name=scope)


    def _get_training_value(self, training=None):
        """Resolve the effective training flag for this call."""
        if training is None:
          training = K.learning_phase()
        if self._USE_V2_BEHAVIOR:
          if isinstance(training, int):
            training = bool(training)
          if not self.trainable:
            # When the layer is not trainable, it overrides the value passed from
            # model.
            training = False
        return training


    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        base_config = super().get_config().copy()
        base_config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return base_config

    def build(self, input_shape):
        """Create the class-weight matrix `W` and the running statistic `t`.

        `input_shape` is the list of shapes of ``[embeddings, labels]``.
        """
        super(CurricularFace, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)
        
        # Non-trainable moving average of the mean target cosine.
        self.t = self.add_weight(
            name='t',
            shape=(1),
            initializer=tf.zeros_initializer(),
            dtype='float32',
            trainable=False,
            regularizer=None,
            aggregation=tf.VariableAggregation.MEAN,
            experimental_autocast=False,
            synchronization=tf.VariableSynchronization.ON_READ)
        
    def call(self, inputs, training=None):
        X, y = inputs

        do_training = self._get_training_value(training)

        # Cosine similarity between every embedding and every class centre.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )

        if do_training:
            # Labels arrive one-hot (possibly soft after mixup/cutmix). Cast
            # them straight to the logit dtype: going through int32 first
            # would truncate every fractional weight to 0.
            one_hot = tf.cast(y, dtype=cosine.dtype)

            # Float rounding can push cos^2 marginally above 1; clip so that
            # sqrt never receives a negative value (which would yield NaN).
            sine = tf.math.sqrt(
                tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0))
            phi = cosine * self.cos_m - sine * self.sin_m

            # Label-weighted cosine of each sample, and the same with the margin added.
            target_logit = tf.reduce_sum(cosine * one_hot, axis=-1)
            sin_theta = tf.math.sqrt(
                tf.clip_by_value(1.0 - tf.math.pow(target_logit, 2), 0.0, 1.0))
            cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m

            phi = tf.where(cosine > self.th, phi, cosine - self.mm)

            # Update the moving average (momentum 0.99), then re-weight the
            # logits that exceed the sample's margin-adjusted target logit.
            t = tf.reduce_mean(target_logit) * 0.01 + (1 - 0.01) * self.t
            self._assign_new_value(self.t, t)
            cosine = tf.where(cosine > tf.expand_dims(cos_theta_m, axis=-1), cosine*(self.t+cosine), cosine)

            if self.ls_eps > 0:
                one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

            output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
            output *= self.s

        else:
            output = cosine

        return output
    
# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    ``call`` takes ``[embeddings, labels]``. ``labels`` must be one-hot style
    class targets with ``n_classes`` columns; soft targets (e.g. produced by
    mixup/cutmix) are supported. The output is the scale-multiplied logits
    with the angular margin applied in proportion to the label weights.

    Args:
        n_classes: number of classes (columns of the class-weight matrix).
        s: scale applied to the logits.
        m: angular margin added to the target class.
        easy_margin: if True, apply the margin only where cosine > 0.
        ls_eps: label-smoothing factor; 0 disables smoothing.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Pre-computed margin terms for cos(theta + m).
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Below this cosine, theta + m would exceed pi; a linear penalty is used instead.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        base_config = super().get_config().copy()
        base_config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return base_config

    def build(self, input_shape):
        """Create the class-weight matrix `W`.

        `input_shape` is the list of shapes of ``[embeddings, labels]``.
        """
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        X, y = inputs
        # Cosine similarity between every embedding and every class centre.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Float rounding can push cos^2 marginally above 1; clip so that sqrt
        # never receives a negative value (which would yield NaN).
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        # Labels arrive one-hot (possibly soft after mixup/cutmix). Cast them
        # straight to the logit dtype: going through int32 first would
        # truncate every fractional weight to 0.
        one_hot = tf.cast(
            y,
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit where the label weight is, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output
    
    
class GeMPoolingLayer(tf.keras.layers.Layer):
    """Generalised-mean pooling over axes 1 and 2 of the input.

    Computes ``mean(max(x, eps) ** p) ** (1 / p)``; with the default
    ``p = 1`` this equals average pooling of the eps-floored activations.

    Args:
        p: pooling exponent.
        train_p: if True, `p` is held in a `tf.Variable` instead of a constant.
        **kwargs: standard Keras layer arguments (e.g. ``name``).
    """

    def __init__(self, p=1., train_p=False, **kwargs):
        super().__init__(**kwargs)
        self.train_p = train_p
        self._initial_p = p
        if train_p:
            self.p = tf.Variable(p, dtype=tf.float32)
        else:
            self.p = p
        # Lower bound applied to the activations before raising them to `p`.
        self.eps = 1e-6

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        base_config = super().get_config().copy()
        base_config.update({
            'p': self._initial_p,
            'train_p': self.train_p,
        })
        return base_config

    def call(self, inputs: tf.Tensor, **kwargs):
        # Floor at eps only. Clipping with max=reduce_max(inputs) inverts the
        # bounds (max < min) when every activation is below eps, which lets
        # non-positive values reach tf.pow.
        inputs = tf.maximum(inputs, self.eps)
        inputs = tf.pow(inputs, self.p)
        inputs = tf.reduce_mean(inputs, axis=[1, 2], keepdims=False)
        inputs = tf.pow(inputs, 1./self.p)
        return inputs

In [None]:
# EfficientNet-B0 ... B7 constructors, indexed by config.EFF_NET.
EFNS = [getattr(efn, f'EfficientNetB{variant}') for variant in range(8)]

def freeze_BN(model):
    """Make every layer trainable except BatchNormalization layers.

    Nested containers (anything exposing ``.layers``, such as a backbone model
    used as a single layer) are descended into. Looking only at the top-level
    layers would mark the backbone as trainable as a whole, and setting
    ``trainable`` on a container also sets it on every layer inside, so the
    BatchNormalization layers within the backbone would end up trainable.

    Args:
        model: a Keras model (or any object with a ``layers`` list).
    """
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = False
            continue
        # For a container this also flips all nested layers to trainable ...
        layer.trainable = True
        if getattr(layer, 'layers', None):
            # ... so walk into it afterwards to re-freeze its BatchNorm layers.
            freeze_BN(layer)

# Function to create our model (backbone, pooling and margin head are selected via config)
def get_model():
    """Build and compile the training model plus the embedding-only model.

    The backbone (config.model_type), pooling (config.gem), neck
    (config.bnneck) and margin head (config.head) are selected from `config`.
    Everything is created inside ``strategy.scope()``.

    Returns:
        ``(model, embed_model)``: ``model`` maps ``[inp1 (image), inp2
        (label)]`` to softmax class probabilities and is compiled with Adam
        and categorical cross-entropy; ``embed_model`` maps the image input to
        the 512-d embedding and shares the same layers.

    Raises:
        ValueError: if config.head or config.model_type is not supported.
    """
    heads = {'arcface': ArcMarginProduct, 'curricular': CurricularFace}
    if config.head not in heads:
        # Explicit exception: an `assert` would be stripped under `python -O`.
        raise ValueError(f"INVALID HEAD: {config.head!r}")
    head = heads[config.head]

    if config.model_type not in ('effnetv1', 'effnetv2', 'nfnet'):
        # Fail here instead of with a NameError on `embed` further down.
        raise ValueError(f"INVALID MODEL TYPE: {config.model_type!r}")
    
    with strategy.scope():
        
        margin = head(
            n_classes = config.N_CLASSES, 
            s = 30, 
            m = 0.3, 
            name=f'head/{config.head}', 
            dtype='float32'
            )

        inp = tf.keras.layers.Input(shape = [config.IMAGE_SIZE, config.IMAGE_SIZE, 3], name = 'inp1')
        # NOTE(review): declared as a scalar-per-sample input, while the
        # datasets above feed one-hot vectors of length config.N_CLASSES -
        # consider shape=(config.N_CLASSES,) after confirming nothing relies on this.
        label = tf.keras.layers.Input(shape = (), name = 'inp2')
        
        if config.model_type == 'effnetv1':
            backbone = EFNS[config.EFF_NET](weights = 'noisy-student', include_top = False)
        elif config.model_type == 'effnetv2':
            # NOTE(review): always EfficientNetV2M; config.EFF_NETV2 only
            # affects MODEL_NAME in the visible code - confirm this is intended.
            backbone = efficientnet.EfficientNetV2M(input_shape=[config.IMAGE_SIZE, config.IMAGE_SIZE, 3], pretrained="imagenet", num_classes=0)
        else:  # 'nfnet'
            backbone = nfnets.NFNetL0(input_shape=[config.IMAGE_SIZE, config.IMAGE_SIZE, 3], pretrained="imagenet", num_classes=0)
        x = backbone(inp)

        # Spatial pooling is the same for every backbone.
        if config.gem:
            embed = GeMPoolingLayer(train_p=False)(x)
        else:
            embed = tf.keras.layers.GlobalAveragePooling2D()(x)
        
        # Neck: BatchNorm + bias-free Dense, or Dropout + Dense.
        if config.bnneck:
            bnneck = tf.keras.layers.BatchNormalization()(embed)
            embed = tf.keras.layers.Dense(512, use_bias=False)(bnneck)
        else:
            embed = tf.keras.layers.Dropout(0.2)(embed)
            embed = tf.keras.layers.Dense(512)(embed)
        x = margin([embed, label])
        
        output = tf.keras.layers.Softmax(dtype='float32')(x)
        
        model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
        embed_model = tf.keras.models.Model(inputs = inp, outputs = embed)  
        
        opt = tf.keras.optimizers.Adam(learning_rate = config.LR)
        # opt = tfa.optimizers.AdamW(learning_rate = config.LR, weight_decay=0.01)
        if config.FREEZE_BATCH_NORM:
            freeze_BN(model)

        # Categorical (not sparse) loss/metrics because the labels are fed as one-hot vectors.
        model.compile(
            optimizer = opt,
            loss = [tf.keras.losses.CategoricalCrossentropy()],
            metrics = [tf.keras.metrics.CategoricalAccuracy(),tf.keras.metrics.TopKCategoricalAccuracy(k=5)]
        )
        return model,embed_model

In [None]:
def get_lr_callback(plot=False):
    """Create the per-epoch learning-rate scheduler callback.

    Schedule: linear warm-up from the start rate to the peak rate over the
    warm-up epochs, an optional hold at the peak, then exponential decay
    towards the floor rate. The peak scales with config.BATCH_SIZE. When
    config.RESUME is set, the epoch index is shifted by config.RESUME_EPOCH.

    Args:
        plot: if True, scatter-plot the schedule for config.EPOCHS epochs.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler``.
    """
    start_lr = 0.000001
    peak_lr = 0.000005 * config.BATCH_SIZE
    floor_lr = 0.000001
    warmup_epochs = 4
    hold_epochs = 0
    decay_rate = 0.9

    def lrfn(epoch):
        if config.RESUME:
            epoch = epoch + config.RESUME_EPOCH
        if epoch < warmup_epochs:
            return (peak_lr - start_lr) / warmup_epochs * epoch + start_lr
        if epoch < warmup_epochs + hold_epochs:
            return peak_lr
        return (peak_lr - floor_lr) * decay_rate**(epoch - warmup_epochs - hold_epochs) + floor_lr

    if plot:
        epochs = list(range(config.EPOCHS))
        plt.scatter(epochs, [lrfn(epoch) for epoch in epochs])
        plt.show()

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

# Plot the schedule as a visual sanity check; the callback returned here is not stored.
get_lr_callback(plot=True)

In [None]:
class Snapshot(tf.keras.callbacks.Callback):
    """Keras callback that writes model weights at the end of every epoch.

    After each epoch the weights are saved to
    ``<save_dir>/<MODEL_NAME>_last.h5`` (overwritten every time). For epochs
    listed in ``snapshot_epochs`` an extra, epoch-stamped copy is written to
    ``<save_dir>/EF<MODEL_NAME>_epoch<epoch>.h5``.

    Parameters
    ----------
    fold : int
        Fold identifier; stored on the instance but not used in file names.
    snapshot_epochs : iterable of int, optional
        Epoch indices (as passed to ``on_epoch_end``) that get an extra
        epoch-stamped copy. Defaults to none.
    """

    def __init__(self,fold,snapshot_epochs=None):
        super(Snapshot, self).__init__()
        # Default is None rather than a shared `[]`: a mutable default would
        # be the same list object on every instance. Copy the caller's value
        # so later changes to their list do not alter this callback.
        self.snapshot_epochs = list(snapshot_epochs) if snapshot_epochs is not None else []
        self.fold = fold

    def on_epoch_end(self, epoch, logs=None):
        """Save the epoch-stamped snapshot (if requested) and the rolling 'last' weights."""
        if epoch in self.snapshot_epochs:
            self.model.save_weights(config.save_dir+f"/EF{config.MODEL_NAME}_epoch{epoch}.h5")
        self.model.save_weights(config.save_dir+f"/{config.MODEL_NAME}_last.h5")

## Train

In [None]:
# Fold split by shard position: shard i is held out for validation when
# i % config.FOLDS == config.FOLD_TO_RUN; every other shard is used for training.
TRAINING_FILENAMES = [
    shard for pos, shard in enumerate(train_files)
    if pos % config.FOLDS != config.FOLD_TO_RUN
]
VALIDATION_FILENAMES = [
    shard for pos, shard in enumerate(train_files)
    if pos % config.FOLDS == config.FOLD_TO_RUN
]
print(
    len(TRAINING_FILENAMES),
    len(VALIDATION_FILENAMES),
    count_data_items(TRAINING_FILENAMES),
    count_data_items(VALIDATION_FILENAMES),
)

# Extra shards from a second bucket, appended to the training shards.
# TRAINING_FILENAMES2 is the list training reads when config.PSEUDO is set.
GCS_PATH2 = 'gs://kds-d79d083c75d25699e49640c8c5dcb58dd05c8b23a3d0ada2fe03b96e'
extra_shard_pattern = GCS_PATH2 + '/happywhale-2022-train*.tfrec'
train_files2 = np.sort(np.array(tf.io.gfile.glob(extra_shard_pattern)))
TRAINING_FILENAMES2 = np.concatenate((TRAINING_FILENAMES, train_files2))

In [None]:
if config.DEBUG:
    # Smoke-test mode: shrink every file list to a single shard.
    TRAINING_FILENAMES = [TRAINING_FILENAMES[0]]
    VALIDATION_FILENAMES = [VALIDATION_FILENAMES[0]]
    # TRAINING_FILENAMES2 was built from the full lists before this point and
    # is what training reads when config.PSEUDO is set, so rebuild it from
    # the truncated list plus one extra shard. Without this, DEBUG would not
    # shorten a PSEUDO run at all.
    TRAINING_FILENAMES2 = np.concatenate((TRAINING_FILENAMES, train_files2[:1]))
    print(len(TRAINING_FILENAMES),len(VALIDATION_FILENAMES),count_data_items(TRAINING_FILENAMES),count_data_items(VALIDATION_FILENAMES))
    test_files = [test_files[0]]

In [None]:
seed_everything(config.SEED)
VERBOSE = 1

# config.PSEUDO switches training to the extended shard list.
train_source = TRAINING_FILENAMES2 if config.PSEUDO else TRAINING_FILENAMES
train_dataset = get_training_dataset(train_source)
STEPS_PER_EPOCH = count_data_items(train_source) // config.BATCH_SIZE

val_dataset = get_val_dataset(VALIDATION_FILENAMES)

# Per-epoch metrics are appended to a CSV named after the fold.
log_path = config.save_dir+'/training-log-fold-%i.h5.csv'%config.FOLD_TO_RUN
train_logger = tf.keras.callbacks.CSVLogger(log_path)

# Keep only the weights from the epoch with the lowest validation loss.
best_weights_path = config.save_dir+f"/{config.MODEL_NAME}_loss.h5"
sv_loss = tf.keras.callbacks.ModelCheckpoint(
    best_weights_path,
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode='min',
    save_freq='epoch',
)

# Build the model after clearing the Keras session.
K.clear_session()
model,embed_model = get_model()
snap = Snapshot(fold=config.FOLD_TO_RUN,snapshot_epochs=[5,8])
model.summary()

# Optionally continue from previously saved weights.
if config.RESUME:
    model.load_weights(config.resume_model_wts)

In [None]:
# Run banner: image size, EfficientNet variant number and global batch size.
banner = '#### Image Size %i with EfficientNet B%i and batch_size %i'
print(banner % (config.IMAGE_SIZE, config.EFF_NET, config.BATCH_SIZE))

# Callbacks run in this order: periodic snapshots, LR schedule, CSV log,
# best-val-loss checkpoint.
fit_callbacks = [snap, get_lr_callback(), train_logger, sv_loss]

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=config.EPOCHS,
    callbacks=fit_callbacks,
    verbose=VERBOSE,
)

In [None]:
def get_eval_dataset(filenames, get_targets = True):
    """Build the batched, ordered evaluation pipeline for the given files.

    Records are loaded in order, passed through ``data_augment_test`` and
    ``arcface_eval_format``, then batched by ``config.BATCH_SIZE`` and
    prefetched.

    Parameters
    ----------
    filenames : list
        Files handed to ``load_dataset``.
    get_targets : bool
        When False, only the first element (the image) of each
        ``(image, target)`` pair is kept.
    """
    pipeline = (
        load_dataset(filenames, ordered = True)
        .map(data_augment_test, num_parallel_calls = AUTO)
        .map(arcface_eval_format, num_parallel_calls = AUTO)
    )
    if not get_targets:
        pipeline = pipeline.map(lambda image, target: image)
    return pipeline.batch(config.BATCH_SIZE).prefetch(AUTO)
def get_test_dataset(filenames, get_names = True):
    """Build the batched, ordered inference pipeline for the given files.

    Records are loaded in order, passed through ``data_augment_test`` and
    ``arcface_inference_format``, then batched by ``config.BATCH_SIZE`` and
    prefetched.

    Parameters
    ----------
    filenames : list
        Files handed to ``load_dataset``.
    get_names : bool
        When False, only the first element (the image) of each two-element
        record is kept; the second element (treated as the image name by
        ``get_ids``) is dropped.
    """
    pipeline = (
        load_dataset(filenames, ordered = True)
        .map(data_augment_test, num_parallel_calls = AUTO)
        .map(arcface_inference_format, num_parallel_calls = AUTO)
    )
    if not get_names:
        pipeline = pipeline.map(lambda image, image_name: image)
    return pipeline.batch(config.BATCH_SIZE).prefetch(AUTO)

def get_ids(filename):
    """Return the image names stored in one file as a unicode NumPy array.

    The file is read through ``get_test_dataset`` and the number of records
    is taken from ``count_data_items``.
    """
    names_only = (
        get_test_dataset([filename], get_names=True)
        .map(lambda image, image_name: image_name)
        .unbatch()
    )
    n_items = count_data_items([filename])
    # A single batch sized to the whole file pulls all names out in one tensor.
    whole_file = next(iter(names_only.batch(n_items)))
    return whole_file.numpy().astype('U')

def get_targets(filename):
    """Return the targets stored in one file as a NumPy array.

    The file is read through ``get_eval_dataset`` and the number of records
    is taken from ``count_data_items``.
    """
    targets_only = (
        get_eval_dataset([filename], get_targets=True)
        .map(lambda image, target: target)
        .unbatch()
    )
    n_items = count_data_items([filename])
    # A single batch sized to the whole file pulls all targets out in one tensor.
    whole_file = next(iter(targets_only.batch(n_items)))
    return whole_file.numpy()

def get_embeddings(filename):
    """Embed every image of one file, averaged over all loaded models.

    Each entry of the module-level ``embed_models`` list is indexed at
    position 1 to get the model used for prediction. The per-model outputs
    are stacked and averaged element-wise.
    """
    images = get_test_dataset([filename], get_names=False)
    per_model = [entry[1].predict(images, verbose=0) for entry in embed_models]
    return np.mean(np.stack(per_model), axis=0)

def get_predictions(test_df,threshold=0.2):
    """Turn the neighbour table into up to five ranked guesses per image.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows with ``image``, ``target`` and ``confidence`` columns. Rows are
        consumed in the order given, so the caller is expected to pass them
        sorted by descending confidence.
    threshold : float
        For the first row seen for an image: if its confidence is above the
        threshold the target is ranked first and ``'new_individual'``
        second; otherwise the two are swapped.

    Returns
    -------
    dict
        Maps image name to a list of at most five predictions. Lists shorter
        than five are padded from the module-level ``sample_list``.
    """
    predictions = {}
    for i,row in tqdm(test_df.iterrows()):
        if row.image in predictions:
            if len(predictions[row.image])==5:
                continue
            predictions[row.image].append(row.target)
        elif row.confidence>threshold:
            predictions[row.image] = [row.target,'new_individual']
        else:
            predictions[row.image] = ['new_individual',row.target]

    for x in tqdm(predictions):
        if len(predictions[x])<5:
            # Pad only with fallback ids not already guessed for THIS image.
            # Checking against the dict `predictions` itself would compare
            # ids with image names and let duplicates into the top five.
            remaining = [y for y in sample_list if y not in predictions[x]]
            predictions[x] = (predictions[x]+remaining)[:5]

    return predictions

def map_per_image(label, predictions):
    """Score one image by the reciprocal rank of its true label.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Ranked guesses, best first; only the first five are considered

    Returns
    -------
    score : double
            1 / rank of ``label`` among the first five guesses (rank starts
            at 1), or 0.0 when it is not among them
    """
    top_five = predictions[:5]
    if label not in top_five:
        return 0.0
    return 1 / (top_five.index(label) + 1)


if config.EVALUATE:

    # Rebuild the network and restore the best-validation-loss weights.
    # Read them from the same location the training checkpoint callback
    # writes to (config.save_dir + "/<MODEL_NAME>_loss.h5"), so evaluation
    # still finds the file when save_dir is not the working directory.
    embed_models=[]
    model,embed_model = get_model()
    model.load_weights(config.save_dir+f"/{config.MODEL_NAME}_loss.h5")
    # get_embeddings() reads index 1 of each entry. Index 0 now holds the
    # full model instead of the return value of load_weights().
    embed_models.append((model,embed_model))

    # Mapping used below to translate encoded targets back to individual ids.
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
    target_encodings = np.load('../input/happywhaleyolo/individual_id_map_backfins.npy', allow_pickle=True).item()
    # Fallback ids used by get_predictions() to pad a prediction list to five entries.
    sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

    # Embed every training shard, cache the embeddings on disk, and fit a
    # cosine nearest-neighbour index over all training embeddings.
    train_targets = []
    train_embeddings = []
    train_data_list=[] 

    for filename in tqdm(TRAINING_FILENAMES):
        embeddings = get_embeddings(filename)
        # The "/" after save_dir was missing, which produced names such as
        # ".train_<shard>_<fold>.npy" instead of files inside save_dir.
        with open(config.save_dir+f'/train_{filename.split("/")[-1]}_{config.FOLD_TO_RUN}.npy', 'wb') as f:
            np.save(f, embeddings)    
        targets = get_targets(filename)
        train_embeddings.append(embeddings)
        train_targets.append(targets)
        train_data_list.append([filename,embeddings])
    # NOTE(review): the 'embeddings' column holds whole arrays; to_csv will
    # most likely store only their string form. The .npy files written above
    # are the usable copy - confirm whether this CSV is still needed.
    train_embeddings_df = pd.DataFrame(train_data_list, columns=['filename', 'embeddings'])
    train_embeddings_df['FOLD_TO_RUN']=config.FOLD_TO_RUN
    train_embeddings_df.to_csv(config.save_dir+f"/train_embeddings_{config.FOLD_TO_RUN}.csv",index=False)
    train_embeddings = np.concatenate(train_embeddings)
    train_targets = np.concatenate(train_targets)
    from sklearn.neighbors import NearestNeighbors
    neigh = NearestNeighbors(n_neighbors=50,metric='cosine')
    neigh.fit(train_embeddings)


    # Embed every validation shard, cache the embeddings on disk, and look up
    # the 50 nearest training embeddings for each validation image.
    test_ids = []
    test_nn_distances = []
    test_nn_idxs = []
    val_targets = []
    val_embeddings = []
    val_data_list=[] 
    for filename in tqdm(VALIDATION_FILENAMES):
        embeddings = get_embeddings(filename)
        # The "/" after save_dir was missing, which produced names such as
        # ".val_<shard>_<fold>.npy" instead of files inside save_dir.
        with open(config.save_dir+f'/val_{filename.split("/")[-1]}_{config.FOLD_TO_RUN}.npy', 'wb') as f:
            np.save(f, embeddings) 
        val_data_list.append([filename,embeddings])
        targets = get_targets(filename)
        ids = get_ids(filename)
        distances,idxs = neigh.kneighbors(embeddings, 50, return_distance=True)
        test_ids.append(ids)
        test_nn_idxs.append(idxs)
        test_nn_distances.append(distances)
        val_embeddings.append(embeddings)
        val_targets.append(targets)
    val_embeddings_df = pd.DataFrame(val_data_list, columns=['filename', 'embeddings'])
    val_embeddings_df['FOLD_TO_RUN']=config.FOLD_TO_RUN
    val_embeddings_df.to_csv(config.save_dir+f"/val_embeddings_{config.FOLD_TO_RUN}.csv",index=False)

    # Flatten the per-shard results into single arrays aligned by row.
    test_nn_distances = np.concatenate(test_nn_distances)
    test_nn_idxs = np.concatenate(test_nn_idxs)
    test_ids = np.concatenate(test_ids)
    val_embeddings = np.concatenate(val_embeddings)
    val_targets = np.concatenate(val_targets)

    # Decoded targets that occur in the training split. A validation image
    # whose target is not among them is relabelled 'new_individual'.
    allowed_targets = {target_encodings[class_id] for class_id in np.unique(train_targets)}

    val_targets_df = pd.DataFrame(
        np.stack([test_ids,val_targets],axis=1),
        columns=['image','target'],
    )
    # Stacking with the image names turns the targets into strings, so cast
    # back to int before decoding them through target_encodings.
    decoded_targets = val_targets_df['target'].astype(int).map(target_encodings)
    val_targets_df['target'] = decoded_targets
    unseen_in_train = ~val_targets_df.target.isin(allowed_targets)
    val_targets_df.loc[unseen_in_train,'target'] = 'new_individual'
    val_targets_df.target.value_counts()

    # One small frame per validation image: the targets of its nearest
    # training neighbours next to their cosine distances.
    neighbor_frames = []
    for image_id, nn_idxs, nn_dists in tqdm(zip(test_ids, test_nn_idxs, test_nn_distances), total=len(test_ids)):
        frame = pd.DataFrame(
            np.stack([train_targets[nn_idxs], nn_dists], axis=1),
            columns=['target','distances'],
        )
        frame['image'] = image_id
        neighbor_frames.append(frame)

    test_df = pd.concat(neighbor_frames).reset_index(drop=True)
    test_df['confidence'] = 1-test_df['distances']
    # Keep the best confidence per (image, target) pair, highest first.
    test_df = (
        test_df.groupby(['image','target']).confidence.max().reset_index()
        .sort_values('confidence',ascending=False).reset_index(drop=True)
    )
    test_df['target'] = test_df['target'].map(target_encodings)
    test_df.to_csv('val_neighbors.csv')
    test_df.image.value_counts().value_counts()

    ## Compute CV
    # Sweep the confidence threshold passed to get_predictions() over
    # 0.02 * 20 .. 0.02 * 32 and keep the one with the highest mean
    # map_per_image score on the validation fold.
    best_th = 0
    best_cv = 0
    for th in [0.02 * x for x in range(20, 33)]:
        all_preds = get_predictions(test_df,threshold=th)
        # Score all validation images in one column assignment instead of
        # writing one cell at a time through .loc inside iterrows(). The
        # column is named after the threshold.
        val_targets_df[th] = [
            map_per_image(target, all_preds[image])
            for image, target in zip(val_targets_df['image'], val_targets_df['target'])
        ]
        cv = val_targets_df[th].mean()
        print(f"CV at threshold {th}: {cv}")
        if cv>best_cv:
            best_th = th
            best_cv = cv

    print("Best threshold",best_th)
    print("Best cv",best_cv)
    # NOTE: this value is not displayed - the expression sits inside the `if` block.
    val_targets_df.describe()

    ## Adjustment: Since Public lb has nearly 10% 'new_individual' (Be Careful for private LB)
    # Re-weight the per-threshold scores as 10% 'new_individual' images and
    # 90% known individuals, then pick the threshold with the best weighted score.
    val_targets_df['is_new_individual'] = val_targets_df.target=='new_individual'
    print(val_targets_df.is_new_individual.value_counts().to_dict())
    # numeric_only=True restricts the mean to the per-threshold score columns.
    # 'image' and 'target' hold strings, and pandas >= 2.0 raises a TypeError
    # when asked to average them.
    val_scores = val_targets_df.groupby('is_new_individual').mean(numeric_only=True).T
    val_scores['adjusted_cv'] = val_scores[True]*0.1+val_scores[False]*0.9
    best_threshold_adjusted = val_scores['adjusted_cv'].idxmax()
    print("best_threshold",best_threshold_adjusted)
    print(val_scores)
    
    
if config.TEST:
    # Embed every test shard and cache the embeddings on disk.
    # NOTE(review): get_embeddings() reads `embed_models`, which is only
    # populated inside the `if config.EVALUATE:` block above; with
    # EVALUATE=False this loop would fail on the missing name.
    test_ids = []
    test_nn_distances = []
    test_nn_idxs = []
    test_data_list=[] 
    for filename in tqdm(test_files):
        embeddings = get_embeddings(filename)
        # The "/" after save_dir was missing, which produced names such as
        # ".test_<shard>_<fold>.npy" instead of files inside save_dir.
        with open(config.save_dir+f'/test_{filename.split("/")[-1]}_{config.FOLD_TO_RUN}.npy', 'wb') as f:
            np.save(f, embeddings)

In [None]:
## Arc + mixup nfnetl0
# Epoch 25/25
# 146/146 [==============================] - 163s 1s/step 
# - loss: 0.7047 - categorical_accuracy: 0.9365 - top_k_categorical_accuracy: 0.9892 
# - val_loss: 9.7082 - val_categorical_accuracy: 0.3736 - val_top_k_categorical_accuracy: 0.4502

# curricular nfnetl0
# Epoch 25/25
# 146/146 [==============================] - 115s 790ms/step 
# - loss: 0.0095 - sparse_categorical_accuracy: 1.0000 - sparse_top_k_categorical_accuracy: 1.0000 
# - val_loss: 9.1595 - val_sparse_categorical_accuracy: 0.6060 - val_sparse_top_k_categorical_accuracy: 0.6606