In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# implement our own triplet loss so that we can handle multiples of same class

# Upvotes are appreciated!

## I'd be happy to answer any questions I can in the comments! or talk off kaggle if you prefer so feel free to comment or reach out via the contact this user on kaggle
### I am also open to any tips or corrections on my work! Constructive criticism is always appreciated.


# Notebook Overview
* this notebook is a step two in this competition
* the goal is to create a feature extractor for both whales and dolphins
* this will help us pick out individuals in a species
* we will use the google pretrained efficient nets with fine tuning to see if we can learn better

# Imports

In [None]:
# DOWNLOADS
# !pip install -q efficientnet
!pip install tensorflow_addons
!pip install wandb --upgrade

# NORMAL IMPORTS
import os
import math
from tqdm.notebook import tqdm, trange
import gc
import random
import pickle
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn
print(sklearn.__version__)

# PREPROCESSING IMPORTS
from sklearn.model_selection import StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OrdinalEncoder

# MODEL BUILDING AND TRAINING IMPORTS
import tensorflow_hub as tfhub
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow_addons.losses import TripletSemiHardLoss,TripletHardLoss

# EVALUATION IMPORTS
from sklearn.neighbors import NearestNeighbors

# LOGGING IMPORTS
import wandb

# Root folder under which the input datasets are mounted; its contents are printed as a sanity check.
BASE_PATH = '/kaggle/input/'
print(os.listdir(BASE_PATH))
# Competition dataset folder.
MAIN_DATASET = os.path.join(BASE_PATH, 'happy-whale-and-dolphin')
# CSV with the cross-validation fold assignment (read into train_df further down).
CV_SPLITS_DATASET = os.path.join(BASE_PATH, 'happywhalefolds', 'folds.csv')
# Folder with the cropped images; the dataset classes read <folder>/train_images/train_images.
CROPPED_DATASET = os.path.join(BASE_PATH, 'happywhale-cropped-dataset-yolov5-ds')
# Destination for the saved models.
OUTPUT_FOLDER = '/kaggle/working'

In [None]:
# Smoke-test switch: when True the notebook trains for 2 epochs, keeps a 0.5% sample of the
# rows and queries 50 nearest neighbours instead of 1000.
DEBUG = False

# The Setup
* we want to set up the training strategy in case we use a TPU
* we want to define the model config class that holds parameters
* we want to seed the environment for reproducibility
* link to our wandb project

In [None]:
# Pick the tf.distribute strategy: a TPUStrategy when a TPU cluster can be resolved,
# TensorFlow's default strategy otherwise.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError raised in the block above is treated as "no TPU available".
    tpu = None
    
if tpu:
    # Connect to the cluster and initialise the TPU system before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Shorthand for tf.data's autotune constant.
AUTO = tf.data.experimental.AUTOTUNE
# The replica count is also used by Config to scale the global batch size.
print("REPLICAS: ", strategy.num_replicas_in_sync)

## Config

#### `tensorflow_hub` is a Python module we can use to download the pretrained EfficientNet model.

#### The models were trained on a lot of image data, and according to the internet (and the fact a lot of other people are using them) they seem like a good option to extract information from our images

#### We want to use a pretrained model that learned to extract meaningful features from a large set of images, because it has likely learned some good general strategies. By starting with a model that generally performs well, we can reuse its feature extraction on other images and just fine-tune the weights so that it learns to extract what we want from our images.

In [None]:
### These are for Efficient net version 2
# Model variant and task; both are substituted into the TF-Hub URL template below.
MODEL_TYPE = "imagenet21k_b1"
MODEL_TASK = 'feature_vector'
# TF-Hub handle that get_model() loads through tfhub.KerasLayer.
EFF_NET_PATH = "https://tfhub.dev/google/imagenet/efficientnet_v2_{model_type}/{task}/2".format(model_type=MODEL_TYPE, task=MODEL_TASK)
print(EFF_NET_PATH)

# Weights & Biases project / entity passed to wandb.init.
WANDB_PROJECT = "happy-whale"
WANDB_ENTITY = "all-off-nothing"

# Per-replica batch size; Config.BATCH_SIZE multiplies it by the number of replicas.
BATCHSIZE = 64

class Config():
    """Holds every tunable setting of the notebook in one place.

    The settings are declared as class attributes and copied onto each instance
    by ``__init__``. The copy matters because the notebook passes
    ``config.__dict__`` to ``wandb.init(config=...)``: an instance dictionary only
    contains attributes set on the instance, so without the copy the logged run
    configuration would be empty.

    Note that the class body is evaluated once, so ``RUN_GROUP`` is a single id
    shared by every instance created from this definition.
    """
    ### General
    COMPETITION = "Happy-Whale-And-Dolphin"
    AUTHOR = "Kiernan"
    APPROACH = "Vector-Embeddings" # we could also do a classification or something between individuals
    RUN_GROUP = wandb.util.generate_id()
    SPECIES = 'dolphin'

    ### Hyper parameters
    SEED = 123
    # Global batch size: the per-replica size scaled by the number of replicas.
    BATCH_SIZE = BATCHSIZE * strategy.num_replicas_in_sync
    EPOCHS = 100

    ### Model & Training Hyper Params
    VECTOR_SIZE = 32           # width of the embedding produced by the model
    MARGIN = 1.0               # margin handed to the triplet loss
    SAMPLES_PER_CLASS = 8      # images drawn per individual in a training batch
    LOSS = 'SEMIHARD'          # 'SEMIHARD' or 'HARD'
    ACTIVATION = 'relu'#'l2'

    ### LR Parameters
    LR_STYLE = "Ramp_Up_Then_Decay"
    LR_NOTES = "Basic"

    ### Where is the data from
    DATA_ORIGIN = "YOLOV5_Cropped_Dataset"
    DATA_METHOD = "Bounding_Box_Cropping"
    FOLD_DATA = "happywhalefolds"

    ### Data params
    IMAGE_SIZE = 240
    IMAGE_CHANNELS = 3
    ROTATION_FACTOR = 0.2
    ZOOM_FACTOR=0.2

    ### Efficient net params
    # Derived from MODEL_TYPE so the logged group cannot drift from the model actually loaded.
    MODEL_GROUP = MODEL_TYPE
    MODEL = f"efficientnet_v2_{MODEL_TYPE}/{MODEL_TASK}/2"

    def __init__(self):
        # Mirror the public class-level settings into the instance dictionary so
        # that `config.__dict__` exposes them (see the class docstring).
        for name, value in vars(type(self)).items():
            if name.startswith('_') or callable(value):
                continue
            setattr(self, name, value)
    
# Single shared settings object used by the rest of the notebook.
config = Config()

# Debug runs only train for two epochs (sets EPOCHS on this instance).
if(DEBUG):
    config.EPOCHS = 2

print(config.MODEL)
print(f"Batch size {config.BATCH_SIZE}")

In [None]:
# Log in to Weights & Biases with the API key stored in the Kaggle secret named "WANDB",
# so the key never appears in the notebook itself.
from wandb.keras import WandbCallback
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("WANDB")
wandb.login(key=api_key)

In [None]:
def seed_everything(seed):
    """Seed every random number source the notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow so that runs are
    easier to reproduce and compare.

    Args:
        seed: integer seed applied to all of the above.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
# Seed once, up front, with the seed stored in the config.
seed_everything(config.SEED) 

# Model Setup
* here we want to define the model helper functions
* we want to create the model itself

In [None]:
### COPIED FROM THIS NOTEBOOK
# https://www.kaggle.com/manojprabhaakr/effnet-b6-whale-comp

"""
I'm not well versed in why this is helpful, but aparently actually ramping up the learning rate at the start of training can perform very well.
"""

def get_lr_callback(plot=False):
    """Create the learning-rate scheduler callback used during training.

    The schedule has three phases: a linear ramp from the start rate to the
    peak rate, an optional plateau at the peak (zero epochs long with the
    current constants) and an exponential decay towards the minimum rate.
    The peak rate is proportional to ``config.BATCH_SIZE``.

    Args:
        plot: when True, scatter-plot the rate for every epoch in
            ``range(config.EPOCHS)`` before returning.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` wrapping the schedule.
    """
    start_rate = 0.00001
    peak_rate = 0.00005 * config.BATCH_SIZE
    floor_rate = 0.000001
    ramp_epochs = 4
    hold_epochs = 0
    decay_factor = 0.9

    def lrfn(epoch):
        # Phase 1: linear warm-up.
        if epoch < ramp_epochs:
            return (peak_rate - start_rate) / ramp_epochs * epoch + start_rate
        # Phase 2: hold the peak rate.
        if epoch < ramp_epochs + hold_epochs:
            return peak_rate
        # Phase 3: exponential decay towards the floor.
        return (peak_rate - floor_rate) * decay_factor**(epoch - ramp_epochs - hold_epochs) + floor_rate

    if plot:
        epoch_axis = list(range(config.EPOCHS))
        plt.scatter(epoch_axis, [lrfn(epoch) for epoch in epoch_axis])
        # Show plain decimals on the y axis instead of scientific notation.
        plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
        plt.show()

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

# Plot the schedule once as a visual sanity check; the callback returned here is not kept.
get_lr_callback(plot=True)

In [None]:
"""
Again I learned this on kaggle but apparently freezing the batch normalization layers from a pretrained model
can help prevent our model from overfitting. I learned this from other people in the competition

TLDR from stack overflow
the new data we train on will likely have very different batch normalization parameters (on average the images are different from the ones the model was trained on)
batch norm is calculated over the train set after training is complete (during training it can be the mean and std of that batch or a rolling average).
keeping this frozen means that the ranges and values from the data will stay the same, requiring the model to keep the general pretrained learnings, and 
not require it to relearn a ton of new information (in our case the parameter to data ratio is high so it could just memorize the training set)
Frozen batch norm keeps the gradients smaller and less drastic so we can fine tune more without worrying about overfitting and losing the generalization capabilities
we wanted this pretrained model for to begin with
(please correct me in comments if my understanding is bad, I'll update this section)


https://www.kaggle.com/manojprabhaakr/effnet-b6-whale-comp
https://stackoverflow.com/questions/63016740/why-its-necessary-to-frozen-all-inner-state-of-a-batch-normalization-layer-when#:~:text=This%20can%20help%20to%20minimize,in%20batch%20statistics%2C%20decreasing%20regularization.
https://towardsdatascience.com/batch-norm-explained-visually-why-does-it-work-90b98bcc58a0
https://towardsdatascience.com/batch-norm-explained-visually-how-it-works-and-why-neural-networks-need-it-b18919692739
"""

def freeze_BN(model):
    """Make every layer trainable except BatchNormalization layers.

    Only the entries of ``model.layers`` are examined: each one gets
    ``trainable = False`` when it is a ``tf.keras.layers.BatchNormalization``
    instance and ``trainable = True`` otherwise. The model is modified in place.

    Args:
        model: Keras model whose layers are updated.
    """
    for layer in model.layers:
        layer.trainable = not isinstance(layer, tf.keras.layers.BatchNormalization)

In [None]:
# def get_resizing_and_rescaling():
#     with strategy.scope():
#         resize_and_rescale = tf.keras.Sequential([
#             tf.keras.layers.Resizing(config.IMAGE_SIZE, config.IMAGE_SIZE),
#             tf.keras.layers.Rescaling(scale=1./127.5, offset=-1)
#         ])
#     return resize_and_rescale

# resizer = get_resizing_and_rescaling()

def get_augmentation():
    """Build the image augmentation pipeline inside the distribution strategy scope.

    The pipeline applies, in order, a random horizontal flip (seeded with
    ``config.SEED``), a random zoom of ``config.ZOOM_FACTOR`` and a random
    rotation of ``config.ROTATION_FACTOR``.

    Returns:
        A ``tf.keras.Sequential`` model containing the three augmentation layers.
    """
    with strategy.scope():
        flip = tf.keras.layers.RandomFlip(mode="horizontal", seed=config.SEED)
        zoom = tf.keras.layers.RandomZoom(config.ZOOM_FACTOR)
        rotate = tf.keras.layers.RandomRotation(config.ROTATION_FACTOR)
        return tf.keras.Sequential([flip, zoom, rotate])

# Shared augmentation pipeline; get_model() applies it to the input images.
augmenter = get_augmentation()

In [None]:
def angular_distances(embeddings):
    """Return the pairwise cosine-distance matrix of a batch of embeddings.

    Rows are L2-normalised first, so entry (i, j) is ``1 - cos(e_i, e_j)``,
    clipped below at 0. The diagonal (distance of a row to itself) is forced
    to exactly 0.

    Args:
        embeddings: 2-D tensor with one embedding per row.

    Returns:
        Square tensor with one row and one column per embedding.
    """
    unit_vectors = tf.math.l2_normalize(embeddings, axis=-1)
    cosine_dist = tf.maximum(1 - tf.matmul(unit_vectors, tf.transpose(unit_vectors)), 0.0)
    # Ones everywhere except on the diagonal.
    n_rows = tf.shape(cosine_dist)[0]
    off_diagonal = tf.ones_like(cosine_dist) - tf.linalg.diag(tf.ones([n_rows]))
    return tf.math.multiply(cosine_dist, off_diagonal)

def positive_angular(labels, embeddings):
    """Mean cosine distance between distinct samples that share a label.

    Used as a Keras metric next to the triplet loss: lower values mean that
    images of the same individual sit closer together in embedding space.

    Args:
        labels: class ids, one per sample; any shape with one value per sample
            (it is reshaped to a column, so both ``(batch,)`` and ``(batch, 1)``
            work).
        embeddings: 2-D tensor with one embedding per row, aligned with ``labels``.

    Returns:
        Scalar tensor with the mean distance over all ordered pairs (i, j),
        i != j, that have equal labels; 0.0 when the batch has no such pair.
    """
    # Force a column vector: with rank-1 labels `labels == transpose(labels)`
    # would compare element-wise instead of producing the pairwise matrix.
    labels = tf.reshape(labels, [-1, 1])
    same_label = tf.cast(tf.equal(labels, tf.transpose(labels)), tf.float32)
    # Remove the diagonal so a sample is never paired with itself.
    pair_mask = same_label - tf.linalg.diag(tf.ones([tf.shape(labels)[0]]))
    distances = tf.cast(angular_distances(embeddings), tf.float32)
    masked_total = tf.reduce_sum(tf.math.multiply(distances, pair_mask))
    # divide_no_nan returns 0 for an empty mask instead of a NaN that would
    # poison the running epoch average.
    return tf.math.divide_no_nan(masked_total, tf.reduce_sum(pair_mask))

def negative_angular(labels, embeddings):
    """Mean cosine distance between samples that have different labels.

    Used as a Keras metric next to the triplet loss: higher values mean that
    images of different individuals are pushed further apart.

    Args:
        labels: class ids, one per sample; any shape with one value per sample
            (it is reshaped to a column, so both ``(batch,)`` and ``(batch, 1)``
            work).
        embeddings: 2-D tensor with one embedding per row, aligned with ``labels``.

    Returns:
        Scalar tensor with the mean distance over all ordered pairs whose labels
        differ; 0.0 when the batch has no such pair.
    """
    # Force a column vector: with rank-1 labels `labels == transpose(labels)`
    # would compare element-wise instead of producing the pairwise matrix.
    labels = tf.reshape(labels, [-1, 1])
    different_label = tf.math.logical_not(tf.equal(labels, tf.transpose(labels)))
    pair_mask = tf.cast(different_label, tf.float32)
    distances = tf.cast(angular_distances(embeddings), tf.float32)
    masked_total = tf.reduce_sum(tf.math.multiply(distances, pair_mask))
    # divide_no_nan returns 0 for an empty mask instead of a NaN that would
    # poison the running epoch average.
    return tf.math.divide_no_nan(masked_total, tf.reduce_sum(pair_mask))

In [None]:
def get_model():
    """Build and compile the embedding model trained with a triplet loss.

    Pipeline: images of any spatial size -> shared augmentation pipeline ->
    pretrained EfficientNetV2 feature extractor from TF-Hub (fine-tuned) ->
    dropout -> dense projection to ``config.VECTOR_SIZE`` dimensions -> output
    activation selected by ``config.ACTIVATION``:

    * ``'l2'``   rescales every embedding to unit L2 norm,
    * ``'relu'`` applies a ReLU to the projection.

    The model is compiled with Adam and the triplet loss selected by
    ``config.LOSS`` (``'SEMIHARD'`` or ``'HARD'``, margin ``config.MARGIN``) and
    reports ``positive_angular`` / ``negative_angular`` as metrics.

    Returns:
        The compiled ``tf.keras`` model.

    Raises:
        ValueError: if ``config.ACTIVATION`` or ``config.LOSS`` holds an
            unsupported value.
    """
    tf.keras.backend.clear_session()

    # Build and compile inside the strategy scope because we might be running on a TPU.
    with strategy.scope():
        # input shape is (batch_size, any, any, image_channels)
        inputs = tf.keras.layers.Input(shape=(None, None, config.IMAGE_CHANNELS), name="images")

        x = augmenter(inputs)

        # Pretrained feature extractor; trainable=True so it is fine-tuned.
        x = tfhub.KerasLayer(EFF_NET_PATH, trainable=True)(x)

        # Dropout before the projection to reduce overfitting, then project the
        # feature vector down to the embedding size.
        x = tf.keras.layers.Dropout(0.2)(x)
        x = tf.keras.layers.Dense(config.VECTOR_SIZE)(x)
        if config.ACTIVATION == 'l2':
            outputs = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=-1))(x)
        elif config.ACTIVATION == 'relu':
            outputs = tf.keras.layers.Activation('relu')(x)
        else:
            raise ValueError(f'Invalid activation values {config.ACTIVATION}')

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

        optimizer = tf.keras.optimizers.Adam()
        if config.LOSS == 'SEMIHARD':
            loss = TripletSemiHardLoss(config.MARGIN)
        elif config.LOSS == 'HARD':
            loss = TripletHardLoss(config.MARGIN)
        else:
            raise ValueError(f'Invalid loss function specified {config.LOSS}')
        model.compile(loss=loss, optimizer=optimizer, metrics=[positive_angular, negative_angular])

    # NOTE(review): freeze_BN only looks at the top-level entries of model.layers
    # and runs after compile() — confirm this freezes what is intended.
    freeze_BN(model)
    return model

# Build the model once to inspect its architecture.
model = get_model()
model.summary()

In [None]:
# Drop the inspection model and reclaim its memory; train_model() builds its own model.
del model
gc.collect()

# Data
* here we want to load the data

In [None]:
# load the main data file
# Fold id reserved as the held-out test split; get_data() excludes it from training.
TEST_FOLD=0
train_df = pd.read_csv(CV_SPLITS_DATASET)
# Keep only the rows whose encoded_species equals the configured species.
# NOTE(review): config.SPECIES is the string 'dolphin' — confirm the column holds names, not numeric codes.
train_df = train_df.loc[train_df.encoded_species==config.SPECIES]
if(DEBUG):
    # Debug runs keep a random 0.5% of the rows.
    train_df = train_df.sample(frac=0.005)
print(train_df.shape)
train_df.head()

In [None]:
class CustomDataset(tf.keras.utils.Sequence):
    """Keras Sequence producing class-balanced random batches for triplet training.

    Every batch is made of ``classes_per_batch`` randomly chosen individuals
    with ``samples_per_class`` randomly chosen images each, so that each
    individual in a batch has several positives for the triplet loss. Images are
    loaded from the cropped dataset, resized to ``config.IMAGE_SIZE`` and
    rescaled to the range [-1, 1].

    Args:
        df: DataFrame with one row per image.
        x_col: column holding the image file name.
        y_col: column holding the individual (class) id.
    """
    def __init__(self, df, x_col='image', y_col='encoded_id'):
        super(CustomDataset, self).__init__()

        # save the meta info
        self.base_dir = os.path.join(CROPPED_DATASET,'train_images', 'train_images')
        self.x_col = x_col
        self.y_col = y_col

        # image meta
        self.dims = (config.IMAGE_SIZE, config.IMAGE_SIZE)
        self.channels = 3

        # save the meta on what we will be choosing
        self.batch_size = config.BATCH_SIZE
        self.samples_per_class = config.SAMPLES_PER_CLASS
        self.classes_per_batch = self.batch_size // self.samples_per_class
        self.batches_per_epoch = (df.shape[0] // self.batch_size) + 1

        # save the conversions
        # Every label needs its OWN list: dict.fromkeys(keys, []) would share a
        # single list between all labels, mapping each label to every image.
        self.labels = df[y_col].unique()
        self.labels_to_images = {label: [] for label in self.labels}
        for label, image in zip(df[y_col], df[x_col]):
            self.labels_to_images[label].append(image)

        self.normalizer = tf.keras.layers.Rescaling(scale=1./127.5, offset=-1)

    def __len__(self):
        """Number of batches that make up one epoch."""
        return self.batches_per_epoch

    def on_epoch_end(self):
        """Free memory between epochs."""
        gc.collect()
        return

    def __getitem__(self, idx):
        """Draw one random class-balanced batch.

        ``idx`` is ignored: every call samples a new batch.

        Returns:
            Tuple ``(images, labels)`` with the rescaled images and an integer
            array of class ids; both have ``classes_per_batch *
            samples_per_class`` rows.
        """
        # Allocate exactly the rows that get filled, so no uninitialised rows are
        # returned when batch_size is not a multiple of samples_per_class.
        n_rows = self.classes_per_batch * self.samples_per_class
        X = np.empty((n_rows, *self.dims, self.channels))
        y = np.empty((n_rows), dtype=int)
        # Small splits can hold fewer individuals than a batch needs; reuse
        # individuals in that case instead of failing.
        reuse_individuals = len(self.labels) < self.classes_per_batch
        individuals = np.random.choice(self.labels, self.classes_per_batch, replace=reuse_individuals)
        batch_index = 0
        for ind in individuals:
            # Sampling with replacement lets individuals with few images fill their quota.
            samples = np.random.choice(self.labels_to_images[ind], self.samples_per_class, replace=True)
            for samp in samples:
                X[batch_index, :, :, :] = tf.keras.utils.load_img(os.path.join(self.base_dir, samp), target_size=self.dims)
                y[batch_index] = ind
                batch_index += 1
        return self.normalizer(X),y
    
class TestCustomDataset(tf.keras.utils.Sequence):
    """Keras Sequence that walks through a DataFrame in order, batch by batch.

    Unlike the random training dataset, batch ``idx`` always contains the same
    rows, so predictions can be matched with ``self.labels`` afterwards.

    Args:
        df: DataFrame with one row per image.
        x_col: column holding the image file name.
        y_col: column holding the individual (class) id.
    """
    def __init__(self, df, x_col='image', y_col='encoded_id'):
        # save the meta info
        self.base_dir = os.path.join(CROPPED_DATASET,'train_images', 'train_images')
        self.x_col = x_col
        self.y_col = y_col

        # One [image name, label] pair per row, stored together in a single array.
        self.data = np.array([[row[self.x_col], row[self.y_col]] for _, row in df.iterrows()])

        # image meta
        self.dims = (config.IMAGE_SIZE, config.IMAGE_SIZE)
        self.channels = 3

        # batching meta
        self.batch_size = config.BATCH_SIZE
        self.batches_per_epoch = math.ceil(self.data.shape[0] / self.batch_size)

        # Label column in row order; callers convert it with astype(np.int32).
        self.labels = self.data[:, 1]

        self.normalizer = tf.keras.layers.Rescaling(scale=1./127.5, offset=-1)

        super(TestCustomDataset, self).__init__()

    def __len__(self):
        """Number of batches needed to cover every row once."""
        return self.batches_per_epoch

    def on_epoch_end(self):
        """Free memory between epochs."""
        gc.collect()
        return

    def __getitem__(self, idx):
        """Load batch number ``idx``.

        Returns:
            Tuple ``(images, labels)``; the last batch is shorter when the
            number of rows is not a multiple of the batch size.
        """
        start = idx * self.batch_size
        # Shrink the final batch to the rows that are actually left.
        batch_size = min(self.batch_size, self.data.shape[0] - start)
        X = np.empty((batch_size, *self.dims, self.channels))
        y = np.empty((batch_size), dtype=int)
        for offset in range(batch_size):
            image_name, label = self.data[start + offset]
            X[offset, :, :, :] = tf.keras.utils.load_img(os.path.join(self.base_dir, image_name), target_size=self.dims)
            y[offset] = label
        return self.normalizer(X),y

# Loss Function

# Train the model

In [None]:
def train_loop(model, train_ds, val_ds, fold):
    """Fit ``model`` with early stopping, the LR schedule and W&B logging.

    Training stops once ``val_loss`` has not improved for 3 epochs, and the best
    weights seen are restored. The active W&B run is deliberately left open:
    it is created and finished by the caller (``train_on_fold``), which still
    logs the test metrics after training. ``run`` is not defined in this
    function's scope either.

    Args:
        model: compiled Keras model.
        train_ds: training data passed to ``model.fit``.
        val_ds: validation data passed to ``model.fit``.
        fold: fold number being trained (not used by the loop itself).

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)
    lr_scheduler = get_lr_callback()
    hist = model.fit(train_ds,
                     validation_data=val_ds,
                     epochs=config.EPOCHS,
                     callbacks=[stopper, lr_scheduler, WandbCallback()])
    return hist
        
def get_data(fold):
    """Create the training and validation datasets for one fold.

    Rows of ``train_df`` whose fold equals ``fold`` form the validation split;
    rows belonging to neither ``fold`` nor ``TEST_FOLD`` form the training
    split. The sizes of the train / validation / test splits are printed.

    Args:
        fold: validation fold; must be greater than ``TEST_FOLD`` and present
            in ``train_df.fold``.

    Returns:
        Tuple ``(train_ds, val_ds)`` of ``CustomDataset`` objects.
    """
    assert(TEST_FOLD < fold and fold in train_df.fold.unique())
    in_val_fold = train_df.fold == fold
    in_test_fold = train_df.fold == TEST_FOLD
    fold_train_data = train_df.loc[~(in_val_fold | in_test_fold)]
    fold_val_data = train_df.loc[in_val_fold]
    print(fold_train_data.shape[0], fold_val_data.shape[0], train_df.loc[in_test_fold].shape[0])
    return CustomDataset(fold_train_data), CustomDataset(fold_val_data)
        
def train_model(fold):
    """Build a fresh model and train it on the given fold.

    Args:
        fold: validation fold handed to ``get_data`` and ``train_loop``.

    Returns:
        Tuple ``(model, hist)``: the trained model and the training history.
    """
    training_data, validation_data = get_data(fold)
    model = get_model()
    history = train_loop(model, training_data, validation_data, fold)
    return model, history

def test_model(model, fold, run):
    """Evaluate ``model`` on the held-out test fold and log the results to W&B.

    ``model.evaluate`` reports the loss together with every compiled metric.
    Each value is logged under its own ``test/<name>`` key, rather than logging
    the whole result list under the single key ``loss``.

    Args:
        model: trained Keras model.
        fold: fold the model was trained on (not used by the evaluation).
        run: active W&B run that receives the metrics.
    """
    test_ds = CustomDataset(train_df.loc[train_df.fold==TEST_FOLD])
    results = model.evaluate(test_ds, return_dict=True)
    run.log({f'test/{name}': value for name, value in results.items()})
    return

# def graph_training(hist):
#     plt.figure(figsize=(8,8), tight_layout=True)
#     plt.plot(hist.history['loss'])
#     plt.plot(hist.history['val_loss'])
#     plt.title('MSE Loss')
#     plt.ylabel('Error')
#     plt.xlabel('Epoch')
#     plt.legend(['train', 'val'], loc='upper right')
#     plt.show()

def train_on_fold(fold):
    """Train, test and save a model for one fold inside its own W&B run.

    Args:
        fold: validation fold to train with.

    Returns:
        The trained Keras model (also saved to ``OUTPUT_FOLDER/Model_<fold>``).
    """
    # open the run here
    run = wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, group=config.RUN_GROUP, name=f"{config.SPECIES}_fold{fold}_{config.RUN_GROUP}", config=config.__dict__, job_type="train")
    try:
        print(f"FOLD {fold}")
        print("===================================================")
        # train_model only takes the fold number
        model, hist = train_model(fold)
        test_model(model, fold, run)
        model.save(os.path.join(OUTPUT_FOLDER, f"Model_{fold}"))
        # custom test metrics on knn and on cosine
        # similrity
    finally:
        # close the run here, even when training or evaluation fails, so that
        # no half-open run leaks into the next call
        run.finish()
    plt.clf()
    gc.collect()
    return model

In [None]:
model = train_on_fold(fold=1)

In [None]:
model = train_on_fold(fold=2)

In [None]:
# NOTE(review): fold 1 was already trained two cells earlier; this call retrains it from
# scratch and overwrites `model` and the saved Model_1 — possibly a different fold was intended.
model = train_on_fold(fold=1)

In [None]:
# NOTE(review): another repeat of fold 1; the `model` left behind by this cell is the one
# used by the evaluation cells below.
model = train_on_fold(fold=1)

In [None]:
# Separate W&B run (job_type "eval") in the same run group; it receives the nearest-neighbour test accuracy.
run = wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, group=config.RUN_GROUP, name=f"{config.SPECIES}_test_fold{TEST_FOLD}_{config.RUN_GROUP}", config=config.__dict__, job_type="eval")

In [None]:
# Reference set: every row outside the test fold. Query set: the test fold itself.
evaluate_train_data = train_df.loc[train_df.fold!=TEST_FOLD]
evaluate_test_data = train_df.loc[train_df.fold==TEST_FOLD]
# Ordered (non-random) datasets so embeddings line up with the stored labels.
evaluate_train_ds = TestCustomDataset(evaluate_train_data)
evaluate_test_ds = TestCustomDataset(evaluate_test_data)

In [None]:
# Number of nearest neighbours retrieved per query embedding.
# NOTE(review): presumably must not exceed the number of reference embeddings — confirm for small splits.
NEIGHBORS = 1000
if(DEBUG):
    NEIGHBORS = 50

In [None]:
# Embed the reference images and index them for cosine-distance nearest-neighbour search.
train_embeddings = model.predict(evaluate_train_ds)
print(train_embeddings.shape)
neighbors = NearestNeighbors(n_neighbors=NEIGHBORS,metric='cosine')
neighbors.fit(train_embeddings, evaluate_train_ds.labels)
# Class ids present in the reference set; test ids outside this set are later mapped to -1.
allowed_classes = set(evaluate_train_ds.labels.astype(np.int32))
# del model
# gc.collect()

In [None]:
# Id predicted for "individual not present in the reference set".
NEW_INDIVIDUAL_ID=-1
test_embeddings = model.predict(evaluate_test_ds)

# kneighbors returns ROW INDICES into the fitted reference embeddings, not class
# ids, so they have to be translated through the reference labels before they
# can be compared with the test targets.
train_labels = evaluate_train_ds.labels.astype(np.int32)

# Query all test embeddings in one call; each row is ordered by increasing distance.
distances, indexes = neighbors.kneighbors(test_embeddings, NEIGHBORS, return_distance=True)

test_predictions = []
for neighbor_rows in indexes:
    # Distinct class ids in order of first appearance (closest first); keep the best 4.
    predictions = list(dict.fromkeys(train_labels[neighbor_rows]))[:4]
    # Fill up with the new-individual id: always at least one such entry, and
    # every row has exactly 5 predictions so the result is a rectangular array.
    predictions.extend([NEW_INDIVIDUAL_ID] * (5 - len(predictions)))
    test_predictions.append(predictions)
test_predictions = np.array(test_predictions)

In [None]:
# Test ids that never occur in the reference set count as "new individual" (-1).
test_targets = evaluate_test_ds.labels.astype(np.int32).copy()
test_targets = np.array([x if x in allowed_classes else -1 for x in test_targets])
# A test image is a hit when its target id appears anywhere in its prediction list.
top_k_accuracy = sum(
    1 for target, predictions in zip(test_targets, test_predictions) if target in predictions
)
accuracy = top_k_accuracy / test_targets.shape[0]
# The original f-string had no placeholder, so the accuracy was never printed.
print(f"Top K Accuracy: {accuracy}")
run.log({'test/top_k_acc':accuracy})

In [None]:
# Release the evaluation frames and datasets and reclaim their memory.
del evaluate_train_data
del evaluate_test_data
del evaluate_train_ds
del evaluate_test_ds
gc.collect()

In [None]:
# Close the evaluation W&B run opened above.
run.finish()