In [None]:
# Install the third-party packages imported by this notebook (quiet mode).
!pip install -q efficientnet
!pip install -q tensorflow_addons
# tfimm installation is disabled; it is only referenced by the fallback branch of get_model().
#!pip install tfimm

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">Import</p></div>

In [None]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import efficientnet.tfkeras as efn
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras import backend as K
from tensorflow.keras.layers import BatchNormalization, Input, GlobalAveragePooling2D, Dropout, Dense, Softmax
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy, SparseTopKCategoricalAccuracy
from tensorflow.keras.callbacks import LearningRateScheduler, CSVLogger, ModelCheckpoint
from tensorflow.keras.utils import plot_model
import tensorflow_addons as tfa
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import pickle
import json
import tensorflow_hub as tfhub
from datetime import datetime

from kaggle_datasets import KaggleDatasets

In [None]:
# Select the tf.distribute strategy for this session and publish the globals
# `strategy`, `N_REPLICAS` and `AUTO` that later cells rely on.
print(f"\n... ACCELERATOR SETUP STARTING ...\n")

# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # The resolver raised ValueError: treat it as "no TPU" and use the default strategy below.
    TPU = None

if TPU:
    print(f"\n... RUNNING ON TPU - {TPU.master()}...")
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
else:
    print(f"\n... RUNNING ON CPU/GPU ...")
    # Yield the default distribution strategy in Tensorflow
    #   --> Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# What Is a Replica?
#    --> A single Cloud TPU device consists of FOUR chips, each of which has TWO TPU cores.
#    --> Therefore, for efficient utilization of Cloud TPU, a program should make use of each of the EIGHT (4x2) cores.
#    --> Each replica is essentially a copy of the training graph that is run on each core and
#        trains a mini-batch containing 1/8th of the overall batch size
N_REPLICAS = strategy.num_replicas_in_sync

print(f"... # OF REPLICAS: {N_REPLICAS} ...\n")

# Shorthand used for every num_parallel_calls / prefetch argument in the input pipeline.
AUTO = tf.data.experimental.AUTOTUNE

print(f"\n... ACCELERATOR SETUP COMPLTED ...\n")

In [None]:
# Turn on XLA just-in-time compilation for TensorFlow graphs.
print("\n... XLA OPTIMIZATIONS STARTING ...\n")
print("\n... CONFIGURE JIT (JUST IN TIME) COMPILATION ...\n")

tf.config.optimizer.set_jit(True)

print("\n... XLA OPTIMIZATIONS COMPLETED ...\n")

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">Configuration and Versions</p></div>

In [None]:
# Timestamp identifying this run, formatted as YYYYMMDD-HHMMSS.
run_ts = f"{datetime.now():%Y%m%d-%H%M%S}"
print(run_ts)

In [None]:
class CFG:
    """Notebook-wide configuration constants, read elsewhere as ``CFG.<NAME>``."""

    SEED = 42
    FOLD_TO_RUN = 0   # Shards with index % FOLDS == FOLD_TO_RUN are held out for validation
    FOLDS = 5 # Number of folds; if FOLDS == FOLD_TO_RUN no shard is held out, so all data is used for training
    DEBUG = False  # Intended to run on a subset of the data; not referenced in the visible code
    EVALUATE = True  # Not referenced in the visible code

    ### Dataset
    # Name of the Kaggle dataset whose GCS bucket holds the TFRecords
    GCS_PATH = 'happywhale-cropped-removebackground-tfrecords-v1'
#     GCS_PATH = 'happywhale-cropped-tfrecords-v1'
    BATCH_SIZE = 16 * N_REPLICAS  # Global batch size: 16 examples per replica
    IMAGE_SIZE = 768  # Images are resized to IMAGE_SIZE x IMAGE_SIZE
    N_CLASSES = 15587  # Number of classes of the ArcFace head

    ### Model
    model_type = f'effnetv2'  # Backbone family: 'effnetv1' or 'effnetv2' (anything else falls back to tfimm in get_model)
    EFNS = [efn.EfficientNetB0, efn.EfficientNetB1, efn.EfficientNetB2, efn.EfficientNetB3, 
            efn.EfficientNetB4, efn.EfficientNetB5, efn.EfficientNetB6, efn.EfficientNetB7]
    EFF_NET = 7 # Index into EFNS, used when model_type == 'effnetv1'
    EFF_NETV2 = f'efficientnetv2-xl-21k-ft1k'  # TF-Hub model directory, used when model_type == 'effnetv2'
    FREEZE_BATCH_NORM = False # If True, get_model() freezes BatchNormalization layers
    head = 'arcface' # head layer in model (only 'arcface' is accepted by get_model)
    EPOCHS = 25
    LR = 0.001  # Learning rate passed to the Adam optimizer in get_model()
    message='baseline'
    RESUME = False # Resume learning from the checkpoint (shifts the LR schedule by RESUME_EPOCH)

    ### Augmentations
    CUTOUT = False  # Enables the random cutout augmentation
    N_CUTOUT = 6  # Intended number of cutout attempts per image
    ### Inference
    KNN = 100  # Number of nearest neighbours retrieved per query embedding

    ###Learning Rate Scheduler
    RESUME_EPOCH = 9  # Epoch offset added to the scheduler's epoch when RESUME is True
    
def count_data_items(filenames):
    """Return the total number of records encoded in TFRecord file names.

    Every name must contain ``-<digits>.`` (for example ``train00-1234.tfrec``);
    the first such digit group is read as the record count of that file.

    Parameters
    ----------
    filenames : iterable of str
        TFRecord file names or paths.

    Returns
    -------
    numpy.int64
        Sum of the per-file counts; 0 for an empty input.

    Raises
    ------
    ValueError
        If a name does not contain a ``-<digits>.`` group.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None or not match.group(1):
            raise ValueError(f"Cannot read a record count from file name {filename!r}")
        counts.append(int(match.group(1)))
    # An explicit integer dtype keeps the result integral even when `counts` is empty.
    return np.sum(counts, dtype=np.int64)
    
# Seed every random number generator used by the notebook
def seed_everything(seed):
    """Set PYTHONHASHSEED and seed ``random``, NumPy and TensorFlow with ``seed``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)
    
def is_interactive():
    """Return True when the kernel's connection-file path contains 'runtime'."""
    connection_file = get_ipython().config.IPKernelApp.connection_file
    return 'runtime' in connection_file

IS_INTERACTIVE = is_interactive()
print(IS_INTERACTIVE)

In [None]:
# Derive the run name from the selected backbone family; it stays None for any
# model_type other than 'effnetv1' / 'effnetv2'.
MODEL_NAME = (
    f'effnetv1_b{CFG.EFF_NET}' if CFG.model_type == 'effnetv1'
    else f'effnetv2_{CFG.EFF_NETV2}' if CFG.model_type == 'effnetv2'
    else None
)

CFG.MODEL_NAME = MODEL_NAME
print(MODEL_NAME)

In [None]:
print("\n... DATA ACCESS SETUP STARTED ...\n")

# Resolve the GCS bucket of the Kaggle dataset named in CFG.GCS_PATH, then list
# its train/test TFRecord shards in sorted order.
GCS_PATH = KaggleDatasets().get_gcs_path(CFG.GCS_PATH)
train_pattern = GCS_PATH + '/happywhale-2022-train*.tfrec'
test_pattern = GCS_PATH + '/happywhale-2022-test*.tfrec'
train_files = np.sort(np.array(tf.io.gfile.glob(train_pattern)))
test_files = np.sort(np.array(tf.io.gfile.glob(test_pattern)))

# Number of records in the train and test shards (read from the file names).
n_train_items = count_data_items(train_files)
n_test_items = count_data_items(test_files)
print(n_train_items, n_test_items)

print("\n\n... DATA ACCESS SETUP COMPLETED ...\n")

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">Dataset</p></div>

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:70%;text-align:left">EXTRA DATASET</p></div>
****
## 1. happywhale-splits
This dataset was created by @ks2019 with the following code:
```
train_df = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
train_df.species.replace({"globis": "short_finned_pilot_whale",
                          "pilot_whale": "short_finned_pilot_whale",
                          "kiler_whale": "killer_whale",
                          "bottlenose_dolpin": "bottlenose_dolphin"}, inplace=True)
train_df.to_csv('train_fixed.csv',index=False)

train_df = pd.read_csv('train_fixed.csv')
train_df['individual_id'] = train_df['individual_id'].map(individual_ids)
train_df['species'] = train_df['species'].map(species)
skf = StratifiedKFold(n_splits=5,random_state=123)
for fold,(train_index, test_index) in enumerate(skf.split(train_df, train_df.species)):
    train_df.loc[test_index,'fold'] = fold
print(train_df.groupby('fold').individual_id.nunique().to_dict())
print(train_df.groupby('fold').species.nunique().to_dict())
print(train_df.groupby('fold').image.nunique().to_dict())
train_df.to_csv('skf_species_5folds.csv',index=False)
```


## 2. TFRecords 
https://www.kaggle.com/datasets/phanttan/happywhale-cropped-removebackground-tfrecords-v1


# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">HELPER FUNCTIONS</p></div>

## ArcFace

In [None]:
def arcface_format(posting_id, image, label_group, matches):
    """Repack an example so that image and label form the model's named inputs.

    Returns ``(posting_id, {'Input1': image, 'Input2': label_group}, label_group, matches)``.
    """
    model_inputs = {'Input1': image, 'Input2': label_group}
    return posting_id, model_inputs, label_group, matches

def arcface_inference_format(posting_id, image, label_group, matches):
    """Keep only what inference needs: returns ``(image, posting_id)``."""
    inference_pair = (image, posting_id)
    return inference_pair

def arcface_eval_format(posting_id, image, label_group, matches):
    """Keep only what evaluation needs: returns ``(image, label_group)``."""
    eval_pair = (image, label_group)
    return eval_pair

## Data Augmentation 

In [None]:
def data_augment(posting_id, image, label_group, matches):
    """Apply random training-time augmentations to one parsed example.

    When ``CFG.CUTOUT`` is set, with probability 0.5 up to ``CFG.N_CUTOUT``
    square patches (side ``CFG.IMAGE_SIZE // 8``) are zeroed, each attempt
    being applied with probability 0.5. The image is then randomly flipped
    left/right and jittered in hue, saturation, contrast and brightness.
    No clipping is applied afterwards.

    Returns the example as ``(posting_id, image, label_group, matches)`` with
    only ``image`` modified.
    """
    ### CUTOUT
    if tf.random.uniform([])>0.5 and CFG.CUTOUT:
        DIM = CFG.IMAGE_SIZE
        CUTOUT_LENGTH = DIM//8
        # The number of attempts comes from the configuration instead of a
        # local constant that silently shadowed CFG.N_CUTOUT.
        for _ in range(CFG.N_CUTOUT):
            if tf.random.uniform([])>0.5:
                # Random offsets of the patch along the two image axes.
                x1 = tf.cast( tf.random.uniform([],0,DIM-CUTOUT_LENGTH),tf.int32)
                x2 = tf.cast( tf.random.uniform([],0,DIM-CUTOUT_LENGTH),tf.int32)
                # Build a (DIM, DIM) mask that is 1 inside the patch, 0 elsewhere:
                # first a column strip of height DIM, then pad it on both sides.
                filter_ = tf.concat([tf.zeros((x1,CUTOUT_LENGTH)),tf.ones((CUTOUT_LENGTH,CUTOUT_LENGTH)),tf.zeros((DIM-x1-CUTOUT_LENGTH,CUTOUT_LENGTH))],axis=0)
                filter_ = tf.concat([tf.zeros((DIM,x2)),filter_,tf.zeros((DIM,DIM-x2-CUTOUT_LENGTH))],axis=1)
                # Invert the mask and broadcast it over the channel axis.
                cutout = tf.reshape(1-filter_,(DIM,DIM,1))
                image = cutout*image

    image = tf.image.random_flip_left_right(image)
    # image = tf.image.random_flip_up_down(image)
    image = tf.image.random_hue(image, 0.01)
    image = tf.image.random_saturation(image, 0.70, 1.30)
    image = tf.image.random_contrast(image, 0.80, 1.20)
    image = tf.image.random_brightness(image, 0.10)
    return posting_id, image, label_group, matches

# Decode a JPEG byte string into a resized float image
def decode_image(image_data):
    """Decode JPEG bytes to 3 channels, resize to the configured square size and divide by 255."""
    target_size = [CFG.IMAGE_SIZE,CFG.IMAGE_SIZE]
    pixels = tf.image.resize(tf.image.decode_jpeg(image_data, channels = 3), target_size)
    return tf.cast(pixels, tf.float32) / 255.0

# Parse one serialized example into (name, image, label, matches)
def read_labeled_tfrecord(example):
    """Parse a serialized TFRecord example.

    Returns ``(posting_id, image, label_group, matches)`` where ``posting_id``
    is the stored ``image_name``, ``image`` is the decoded picture,
    ``label_group`` is the stored ``target`` cast to int32 and ``matches`` is
    the constant placeholder ``1`` (the records carry no ``matches`` feature).
    """
    feature_spec = {
        "image_name": tf.io.FixedLenFeature([], tf.string),
        "image": tf.io.FixedLenFeature([], tf.string),
        "target": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return (
        parsed['image_name'],
        decode_image(parsed['image']),
        tf.cast(parsed['target'], tf.int32),
        1,
    )

## TF Records to Tensors Functions

In [None]:
def load_dataset(filenames, ordered = False):
    """Read TFRecord files and parse every record with ``read_labeled_tfrecord``.

    Parameters
    ----------
    filenames : sequence of str
            TFRecord files to read.
    ordered : bool
            When False, deterministic ordering is switched off in the dataset
            options; when True the default options are kept.

    Returns
    -------
    dataset : tf.data.Dataset
            Unbatched dataset of ``(posting_id, image, label_group, matches)``.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
        .with_options(read_options)
        .map(read_labeled_tfrecord, num_parallel_calls = AUTO)
    )

# Training pipeline: augmented, repeated, shuffled and batched
def get_training_dataset(filenames):
    """Return the endlessly repeating training dataset.

    Elements are batches of ``({'Input1': image, 'Input2': label}, label)``.
    """
    def keep_inputs_and_label(posting_id, image, label_group, matches):
        # After arcface_format, `image` is the dict of named model inputs.
        return image, label_group

    return (
        load_dataset(filenames, ordered = False)
        .map(data_augment, num_parallel_calls = AUTO)
        .map(arcface_format, num_parallel_calls = AUTO)
        .map(keep_inputs_and_label)
        .repeat()
        .shuffle(2048)
        .batch(CFG.BATCH_SIZE)
        .prefetch(AUTO)
    )

# Get Dataset with some configurations for Validation
def get_val_dataset(filenames, augment = False):
    """Return the ordered, batched validation dataset.

    Parameters
    ----------
    filenames : sequence of str
        TFRecord files of the validation fold.
    augment : bool, default False
        Random training-time augmentation used to be applied unconditionally,
        which made validation metrics non-deterministic. It is now off by
        default; pass ``True`` to restore the previous behaviour.

    Returns
    -------
    tf.data.Dataset
        Batches of ``({'Input1': image, 'Input2': label}, label)``.
    """
    dataset = load_dataset(filenames, ordered = True)
    if augment:
        dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    dataset = dataset.map(arcface_format, num_parallel_calls = AUTO)
    # After arcface_format the second element is the dict of named model inputs.
    dataset = dataset.map(lambda posting_id, image, label_group, matches: (image, label_group))
    dataset = dataset.batch(CFG.BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

# Dataset of (image, target) pairs used to extract embeddings / targets for evaluation
def get_eval_dataset(filenames, get_targets = True, augment = False):
    """Return an ordered, batched dataset for evaluation.

    Parameters
    ----------
    filenames : sequence of str
        TFRecord files to read.
    get_targets : bool, default True
        When True elements are ``(image, target)``; when False only ``image``.
    augment : bool, default False
        Random training-time augmentation used to be applied unconditionally,
        so repeated passes over the same file produced different images. It is
        now off by default; pass ``True`` to restore the previous behaviour.
    """
    dataset = load_dataset(filenames, ordered = True)
    if augment:
        dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    dataset = dataset.map(arcface_eval_format, num_parallel_calls = AUTO)
    if not get_targets:
        dataset = dataset.map(lambda image, target: image)
    dataset = dataset.batch(CFG.BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

# Get Dataset with some configurations for Prediction
def get_test_dataset(filenames, get_names = True, augment = False):
    """Return an ordered, batched dataset for inference.

    Parameters
    ----------
    filenames : sequence of str
        TFRecord files to read.
    get_names : bool, default True
        When True elements are ``(image, posting_id)``; when False only ``image``.
    augment : bool, default False
        Random training-time augmentation used to be applied unconditionally,
        which made predicted embeddings differ from one pass to the next. It is
        now off by default; pass ``True`` to restore the previous behaviour.
    """
    dataset = load_dataset(filenames, ordered = True)
    if augment:
        dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    dataset = dataset.map(arcface_inference_format, num_parallel_calls = AUTO)
    if not get_names:
        dataset = dataset.map(lambda image, posting_id: image)
    dataset = dataset.batch(CFG.BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

In [None]:
# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes ``[X, y]`` (embeddings and integer class labels) and
    returns scaled cosine logits in which the entry of the labelled class has
    the additive angular margin ``m`` applied.

    Arguments:
        n_classes: number of classes (columns of the weight matrix).
        s: scale multiplied onto the output logits.
        m: angular margin in radians.
        easy_margin: if True the margin is only applied where cosine > 0;
            otherwise where cosine > cos(pi - m), with ``cosine - mm`` used
            elsewhere.
        ls_eps: label-smoothing factor applied to the one-hot mask (0 disables).

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Threshold and fallback offset used when easy_margin is False.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the (embedding_dim, n_classes) weight matrix from the shape of ``X``."""
        super(ArcMarginProduct, self).build(input_shape[0])
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Return scaled margin logits of shape (batch, n_classes) for ``inputs = [X, y]``."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine similarity between L2-normalised embeddings and class weights.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Rounding can push cosine**2 marginally above 1, which would make the
        # square root NaN; clip the argument to its valid range first.
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0)
        )
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin logit for the labelled class, plain cosine for all others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [None]:
# BatchNormalization Mode Function
def freeze_BatchNorm(model):
    """Make every layer trainable except BatchNormalization layers.

    Layers that expose their own ``layers`` attribute (nested models) are
    traversed recursively. Visiting only the top level is not enough: setting
    ``trainable = True`` on a nested model also switches its inner
    BatchNormalization layers back to trainable.

    Parameters
    ----------
    model : object with a ``layers`` attribute (e.g. a Keras model)

    Returns
    -------
    The same ``model``, modified in place.
    """
    for layer in model.layers:
        if isinstance(layer, BatchNormalization):
            layer.trainable = False # in inference mode
        else:
            layer.trainable = True
            # Descend after enabling the container so that inner BatchNorm
            # layers end up frozen again.
            if getattr(layer, 'layers', None):
                freeze_BatchNorm(layer)
    return model

## For Evaluation 

In [None]:
## Image names stored in one TFRecord file. Exp : 00021adfb725ed.jpg
def get_id(filename):
    """Return all image names of ``filename`` as a NumPy unicode array, in file order."""
    n_images = count_data_items([filename])
    names = (
        get_test_dataset([filename],get_names=True)
        .map(lambda image, image_name: image_name)
        .unbatch()
        .batch(n_images)  # one batch holding every record of the file
    )
    return next(iter(names)).numpy().astype('U')
## Encoded targets stored in one TFRecord file (encoding defined in happywhale-splits/individual_ids.json)
def get_target(filename):
    """Return all encoded targets of ``filename`` as a NumPy array, in file order."""
    n_images = count_data_items([filename])
    targets = (
        get_eval_dataset([filename],get_targets=True)
        .map(lambda image, target: target)
        .unbatch()
        .batch(n_images)  # one batch holding every record of the file
    )
    return next(iter(targets)).numpy()
## Average the embeddings predicted by every model in `embed_models`
def get_embedding(filename):
    """Return the element-wise mean, over all embedding models, of the embeddings of ``filename``."""
    ds = get_test_dataset([filename],get_names=False)
    per_model = [embed_model.predict(ds,verbose=0) for _, embed_model in embed_models]
    return np.mean(np.stack(per_model), axis=0)

def get_prediction(test_df,threshold=0.2):
    """Build up to five ranked predictions per image from neighbour candidates.

    Rows are consumed in the order given, so ``test_df`` is expected to be
    sorted by descending confidence. The first row seen for an image decides
    where ``'new_individual'`` goes: second if that row's confidence exceeds
    ``threshold``, first otherwise. Later rows of the same image are appended
    until the list holds five entries.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must have the columns ``image``, ``target`` and ``confidence``.
    threshold : float
        Confidence above which the best candidate outranks ``'new_individual'``.

    Returns
    -------
    dict
        Maps each image to its list of predictions (between 2 and 5 entries).
    """
    predictions = {}
    # itertuples avoids building one Series per row, as iterrows does.
    for row in tqdm(test_df.itertuples(index=False), total=len(test_df)):
        ranked = predictions.get(row.image)
        if ranked is not None:
            if len(ranked) < 5:
                ranked.append(row.target)
        elif row.confidence>threshold:
            predictions[row.image] = [row.target,'new_individual']
        else:
            predictions[row.image] = ['new_individual',row.target]

    return predictions

def map_per_image(label, predictions):
    """Score one image by the rank of its true label among the top five predictions.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Predicted labels, best first; only the first five are considered

    Returns
    -------
    score : double
            ``1 / rank`` of the first occurrence of ``label`` (rank starts at 1),
            or 0.0 when ``label`` is not among the first five predictions
    """
    top_five = predictions[:5]
    if label not in top_five:
        return 0.0
    rank = top_five.index(label) + 1
    return 1 / rank

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">BUILD MODEL</p></div>

In [None]:
def freeze_BN(model):
    """Make every layer trainable while leaving BatchNormalization layers frozen.

    Layers that expose their own ``layers`` attribute (nested models) are
    traversed recursively. Visiting only the top level is not enough: setting
    ``trainable = True`` on a nested model also switches its inner
    BatchNormalization layers back to trainable. The model is modified in
    place; nothing is returned.
    """
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = False
        else:
            layer.trainable = True
            # Descend after enabling the container so that inner BatchNorm
            # layers end up frozen again.
            if getattr(layer, 'layers', None):
                freeze_BN(layer)


# Build the ArcFace classification model and its embedding sub-model
def get_model():
    """Build and compile the training model together with its embedding model.

    The backbone is chosen by ``CFG.model_type``: 'effnetv1' (EfficientNet from
    ``CFG.EFNS`` with global average pooling), 'effnetv2' (TF-Hub feature
    vector loaded from a Kaggle dataset) or, for any other value, a tfimm
    model. A Dropout(0.3) and Dense(1024) layer produce the embedding, which
    is fed with the label into the ArcFace head followed by a softmax.

    Returns
    -------
    (model, embed_model)
        ``model`` takes the inputs ``Input1`` (image) and ``Input2`` (label)
        and outputs class probabilities; it is compiled with Adam and sparse
        categorical cross-entropy. ``embed_model`` maps an image to the
        1024-dimensional Dense output and shares its layers with ``model``.

    Raises
    ------
    ValueError
        If ``CFG.head`` is not 'arcface'.
    """
    if CFG.head == "arcface":
        head = ArcMarginProduct
    else:
        # An explicit exception instead of `assert`: asserts are stripped
        # under `python -O`, which would leave `head` undefined below.
        raise ValueError("INVALID HEAD IN MODEL")

    with strategy.scope():
        margin = head(n_classes=CFG.N_CLASSES, 
                      s=30, 
                      m=0.3, 
                      name=f'head/{CFG.head}', 
                      dtype='float32'
                     )
        inp = tf.keras.layers.Input(shape = [CFG.IMAGE_SIZE, CFG.IMAGE_SIZE, 3], name = 'Input1')
        label = Input(shape=(), name='Input2')
        if CFG.model_type == 'effnetv1':
            x = CFG.EFNS[CFG.EFF_NET](weights = 'noisy-student', include_top = False)(inp)
            embed = tf.keras.layers.GlobalAveragePooling2D()(x)
        elif CFG.model_type == 'effnetv2':
            GCS_WEIGHTS_PATH = KaggleDatasets().get_gcs_path('efficientnetv2-tfhub-weight-files')
            FEATURE_VECTOR = f'{GCS_WEIGHTS_PATH}/tfhub_models/{CFG.EFF_NETV2}/feature_vector'
            embed = tfhub.KerasLayer(FEATURE_VECTOR, trainable=True)(inp)
        else:
            # NOTE(review): `tfimm` is not imported in the visible notebook (its
            # pip install is commented out), so this branch needs that import
            # before it can run.
            embed = tfimm.create_model(CFG.model_type, pretrained="timm")(inp)

        embed = tf.keras.layers.Dropout(0.3)(embed)
        embed = tf.keras.layers.Dense(1024)(embed)
        x = margin([embed, label])

        output = tf.keras.layers.Softmax(dtype='float32')(x)

        model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
        embed_model = tf.keras.models.Model(inputs = inp, outputs = embed)  

        opt = tf.keras.optimizers.Adam(learning_rate = CFG.LR)
        if CFG.FREEZE_BATCH_NORM:
            freeze_BN(model)

        model.compile(
            optimizer = opt,
            loss = [tf.keras.losses.SparseCategoricalCrossentropy()],
            metrics = [tf.keras.metrics.SparseCategoricalAccuracy(),tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5)]
            ) 

        return model,embed_model

In [None]:
def get_lr_callback(plot=False):
    """Return a LearningRateScheduler with linear warm-up followed by exponential decay.

    The rate rises linearly from ``lr_start`` to ``lr_max`` over the warm-up
    epochs, is optionally held, then decays geometrically towards ``lr_min``.
    When ``CFG.RESUME`` is set, ``CFG.RESUME_EPOCH`` is added to the epoch
    first. With ``plot=True`` the schedule for ``CFG.EPOCHS`` epochs is drawn.
    """
    lr_start   = 0.000001
    lr_max     = 0.000005 * CFG.BATCH_SIZE
    lr_min     = 0.000001
    warmup_epochs  = 4
    sustain_epochs = 0
    decay_rate     = 0.9

    def lrfn(epoch):
        if CFG.RESUME:
            epoch = epoch + CFG.RESUME_EPOCH

        # Warm-up: linear ramp from lr_start to lr_max.
        if epoch < warmup_epochs:
            return (lr_max - lr_start) / warmup_epochs * epoch + lr_start

        # Optional plateau at the peak rate.
        if epoch < warmup_epochs + sustain_epochs:
            return lr_max

        # Exponential decay towards lr_min.
        return (lr_max - lr_min) * decay_rate**(epoch - warmup_epochs - sustain_epochs) + lr_min

    if plot:
        epochs = list(range(CFG.EPOCHS))
        plt.scatter(epochs, [lrfn(epoch) for epoch in epochs])
        plt.show()

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
class Snapshot(tf.keras.callbacks.Callback):
    """Keras callback that saves model weights at the end of epochs.

    After every epoch the weights are written to ``./{CFG.MODEL_NAME}_last.h5``;
    for epochs listed in ``snapshot_epochs`` an additional
    ``./EF{CFG.MODEL_NAME}_epoch{epoch}.h5`` file is written.

    Parameters
    ----------
    fold : int
        Stored on the instance; it is not part of the file names.
    snapshot_epochs : collection of int, optional
        Zero-based epoch indices to snapshot. Defaults to no snapshots.
    """

    def __init__(self,fold,snapshot_epochs=None):
        super(Snapshot, self).__init__()
        # None instead of a mutable `[]` default, which would be shared by
        # every instance created without the argument.
        self.snapshot_epochs = [] if snapshot_epochs is None else snapshot_epochs
        self.fold = fold


    def on_epoch_end(self, epoch, logs=None):
        """Save the per-epoch snapshot (if requested) and the rolling 'last' weights."""
        if epoch in self.snapshot_epochs: # your custom condition         
            self.model.save_weights(f"./EF{CFG.MODEL_NAME}_epoch{epoch}.h5")
        self.model.save_weights(f"./{CFG.MODEL_NAME}_last.h5")

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">INFERENCE</p></div>

In [None]:
# Shard i of train_files is held out for validation when
# i % CFG.FOLDS == CFG.FOLD_TO_RUN; every other shard is used for training.
training_filenames, validation_filenames = [], []
for shard_idx, shard_path in enumerate(train_files):
    held_out = shard_idx % CFG.FOLDS == CFG.FOLD_TO_RUN
    (validation_filenames if held_out else training_filenames).append(shard_path)
del train_files

## Callbacks

In [None]:
# Seed all RNGs, build the train/validation pipelines and create the callbacks.
seed_everything(CFG.SEED)
train_dataset = get_training_dataset(training_filenames)
val_dataset = get_val_dataset(validation_filenames)
# The training dataset repeats forever, so the epoch length is set explicitly.
STEPS_PER_EPOCH = count_data_items(training_filenames)//CFG.BATCH_SIZE

## Logging
train_logger = CSVLogger('./training-log-fold-%i.h5.csv'%CFG.FOLD_TO_RUN)

# SAVE BEST MODEL EACH FOLD
# The keyword was misspelled `save_weight_only`, which is not the documented
# ModelCheckpoint argument, so "weights only" was not actually requested.
sv_loss = ModelCheckpoint(f"./{MODEL_NAME}_loss_{CFG.FOLD_TO_RUN}.h5", 
                          monitor='val_loss', 
                          verbose=0, 
                          save_best_only=True, 
                          save_weights_only=True, mode='min', save_freq='epoch')
# Snapshot
snap = Snapshot(fold=CFG.FOLD_TO_RUN,snapshot_epochs=[5,8])
# Learning Rate Scheduler (plots the schedule; the returned callback is not stored)
get_lr_callback(plot=True)

In [None]:
# Build one model per checkpoint and restore its trained weights
K.clear_session()
embed_models = []
for i in range(5):
    model, embed_model = get_model()
    weights_path = f'../input/happywhale-efficientnetv2xl-removedbackground/effnetv2_efficientnetv2-xl-21k-ft1k_loss_v2_{i}.h5'
    # Weights are loaded into the full model; the embedding model is built
    # from the same layers inside get_model().
    load_result = model.load_weights(weights_path)
    # Only element [1] of each tuple is read later (by get_embedding).
    embed_models.append((load_result, embed_model))

In [None]:
# Draw the layer graph of the last full (ArcFace) model built above.
plot_model(model)

In [None]:
# Draw the layer graph of the last embedding model built above.
plot_model(embed_model)

In [None]:
# Load the label encoding and invert it, so that `target_encodings` maps the
# values stored in the JSON file back to their keys.
# NOTE(review): presumably the file maps individual_id -> integer code, making
# the inverted dict code -> individual_id; confirm against the dataset.
with open('../input/happywhale-splits/individual_ids.json', "r") as f:
    # The context manager closes the file, which was previously left open.
    individual_id_to_code = json.load(f)
target_encodings = {code: individual_id for individual_id, code in individual_id_to_code.items()}

In [None]:
# Embeddings and encoded targets of every training shard
per_file_embeddings, per_file_targets = [], []
for shard in tqdm(training_filenames):
    per_file_embeddings.append(get_embedding(shard))
    per_file_targets.append(get_target(shard))

# Merge the per-shard arrays into single arrays
train_embeddings = np.concatenate(per_file_embeddings)
train_targets = np.concatenate(per_file_targets)

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:70%;text-align:left">KNN in Validation</p></div>

In [None]:
# Nearest-neighbour index (cosine distance) over the training embeddings,
# queried below with the validation embeddings
neigh = NearestNeighbors(metric='cosine', n_neighbors=CFG.KNN)
neigh.fit(train_embeddings)

In [None]:
# For every validation shard: embed once, look up the CFG.KNN nearest training
# embeddings, and collect ids, targets and embeddings.
val_ids = []
val_knn_distances = []
val_knn_idxs = []
val_targets = []
val_embeddings = []
for file in tqdm(validation_filenames):
    embedding = get_embedding(file)
    distances,idxs = neigh.kneighbors(embedding, CFG.KNN, return_distance=True)
    val_knn_idxs.append(idxs)
    val_knn_distances.append(distances)
    val_ids.append(get_id(file))
    # Reuse the embedding computed above instead of running the whole model
    # ensemble over the shard a second time.
    val_embeddings.append(embedding)
    val_targets.append(get_target(file))
val_knn_distances = np.concatenate(val_knn_distances)
val_knn_idxs = np.concatenate(val_knn_idxs)
val_ids = np.concatenate(val_ids)
val_embeddings = np.concatenate(val_embeddings)
val_targets = np.concatenate(val_targets)

In [None]:
# Validation images whose individual never occurs in the training folds are
# relabelled 'new_individual'
exist_targets = {target_encodings[code] for code in np.unique(train_targets)}
val_targets_df = pd.DataFrame(np.stack([val_ids, val_targets], axis=1), columns=['image','target'])
# Stacking with the string ids turned the targets into strings; convert them
# back to integers before decoding.
val_targets_df['target'] = val_targets_df['target'].astype(int).map(target_encodings)
unseen_in_train = ~val_targets_df.target.isin(exist_targets)
val_targets_df.loc[unseen_in_train,'target'] = 'new_individual'
val_targets_df.target.value_counts()

In [None]:
# One row per (validation image, neighbour): the neighbour's encoded target and
# its cosine distance. Built in a single vectorised step rather than one small
# DataFrame per image; targets stay integers instead of being coerced to float
# by stacking them with the distances.
n_val_neighbors = val_knn_idxs.shape[1]
val_df = pd.DataFrame({
    'target': train_targets[val_knn_idxs].ravel(),
    'distances': val_knn_distances.astype(np.float64).ravel(),
    'image': np.repeat(val_ids, n_val_neighbors),
})
# Create Confidence columns to pick up Top individual_id based on max confidence
val_df['confidence'] = 1- val_df['distances']
# Keep the best confidence per (image, target), most confident rows first.
val_df = val_df.groupby(['image','target']).confidence.max().reset_index()
val_df = val_df.sort_values('confidence',ascending=False).reset_index(drop=True)
val_df['target'] = val_df['target'].map(target_encodings)
val_df.to_csv('val_neighbors.csv')
val_df.image.value_counts().value_counts()

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:70%;text-align:left">KNN in Prediction</p></div>

In [None]:
# Training rows first, validation rows appended after them
train_embeddings_infer = np.concatenate((train_embeddings, val_embeddings))
train_targets_infer = np.concatenate((train_targets, val_targets))
print(train_embeddings_infer.shape, train_targets_infer.shape)

In [None]:
# KNN
# NOTE(review): the index is fitted on the training-fold embeddings only; the
# train+validation variant (train_embeddings_infer) is left commented out below.
# Confirm which of the two is intended for the final submission.
neigh = NearestNeighbors(n_neighbors=CFG.KNN, metric='cosine')
# neigh.fit(train_embeddings_infer)
neigh.fit(train_embeddings)

In [None]:
# For every test shard: embed, look up the CFG.KNN nearest indexed embeddings
# and collect image ids, neighbour indices and distances.
ids_per_shard, distances_per_shard, idxs_per_shard = [], [], []
for shard in tqdm(test_files):
    print(shard)
    shard_distances, shard_idxs = neigh.kneighbors(get_embedding(shard), CFG.KNN, return_distance=True)
    ids_per_shard.append(get_id(shard))
    idxs_per_shard.append(shard_idxs)
    distances_per_shard.append(shard_distances)

test_knn_distances = np.concatenate(distances_per_shard)
test_knn_idxs = np.concatenate(idxs_per_shard)
test_ids = np.concatenate(ids_per_shard)

In [None]:
sample_submission = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv',index_col='image')
print(len(test_ids),len(sample_submission))

# One row per (test image, neighbour): the neighbour's encoded target and its
# cosine distance. Built in a single vectorised step rather than one small
# DataFrame per image; targets stay integers instead of being coerced to float
# by stacking them with the distances.
# Neighbour indices are looked up in train_targets_infer, whose leading rows
# are the training targets.
n_test_neighbors = test_knn_idxs.shape[1]
test_df = pd.DataFrame({
    'target': train_targets_infer[test_knn_idxs].ravel(),
    'distances': test_knn_distances.astype(np.float64).ravel(),
    'image': np.repeat(test_ids, n_test_neighbors),
})
test_df['confidence'] = 1 - test_df['distances']
# Keep the best confidence per (image, target), most confident rows first.
test_df = test_df.groupby(['image','target']).confidence.max().reset_index()
test_df = test_df.sort_values('confidence',ascending=False).reset_index(drop=True)
test_df['target'] = test_df['target'].map(target_encodings)
test_df.to_csv('test_neighbors.csv')
test_df.image.value_counts().value_counts()

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">SUBMISSION</p></div>

In [None]:
# Confidence above which the nearest known individual is ranked ahead of 'new_individual'.
best_threshold_adjusted = 0.5

# Reuse the shared helper instead of an inline copy of the same ranking loop.
predictions_by_image = get_prediction(test_df, threshold=best_threshold_adjusted)

# The submission format is one space-separated string of predictions per image.
predictions = pd.DataFrame({
    'image': list(predictions_by_image.keys()),
    'predictions': [' '.join(ranked) for ranked in predictions_by_image.values()],
})
predictions.to_csv('submission.csv',index=False)
predictions.head()

# <div style="color:white;display:fill;border-radius:5px;background-color:#75B7BF;letter-spacing:0.1px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;margin:0;font-size:100%;text-align:center">REFERENCE</p></div>
****

https://www.kaggle.com/code/aikhmelnytskyy/happywhale-effnet-b7-fork-with-detic-training

https://www.kaggle.com/code/phanttan/arcface-efficientnetv2-training