In [None]:
import numpy as np
import pandas as pd
import os, sys, cv2, math
from kaggle_datasets import KaggleDatasets
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers

# When True, image paths are built against the dataset's GCS bucket;
# when False, they point at the local ../input mount instead.
TPU = True

In [None]:
# Training table. Per the original note, data.csv has id_freq>150 excluded.
df = pd.read_csv('../input/dataframe-startnotebook/data.csv')

# Turn each individual_id into an integer class index stored in `id_label`.
Encoder = LabelEncoder()
df['id_label'] = Encoder.fit_transform(df['individual_id'])

# Persist the fitted class order so integer labels can be decoded later, e.g.:
#   enc = LabelEncoder()
#   enc.classes_ = np.load('classes.npy', allow_pickle=True)
#   enc.inverse_transform([y1, y2])
np.save('classes.npy', Encoder.classes_)

df.head()

In [None]:
# Submission template; its `image` column is used later to build test paths.
evaldf = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')

# Fill every row with the default prediction. Bracket assignment is used
# instead of `evaldf.predictions = ...`: attribute-style assignment only
# updates an existing column and would otherwise set a plain Python attribute
# without creating the column.
evaldf['predictions'] = 'new_individual '

evaldf.head()

In [None]:
# Run-wide settings.
n_classes = df['id_label'].max() + 1  # largest encoded label + 1
img_size = 456                        # images are resized to img_size x img_size
seed = 2001
batch_size = 45                       # multiplied by the replica count in a later cell
k = 4

n_classes

In [None]:
def auto_select_accelerator():
    """Pick a tf.distribute strategy, preferring a TPU when one can be resolved.

    The function tries to resolve a TPU cluster, connect to it and initialise
    the TPU system. If any of those steps raises ``ValueError``, it falls back
    to the strategy returned by ``tf.distribute.get_strategy()``.

    Returns:
        The selected ``tf.distribute`` strategy. The number of replicas in
        sync is printed before returning.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)

        # `tf.distribute.experimental.TPUStrategy` is deprecated in favour of
        # `tf.distribute.TPUStrategy`; only use the experimental alias when the
        # stable one is missing (older TensorFlow releases).
        strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
        if strategy_cls is None:
            strategy_cls = tf.distribute.experimental.TPUStrategy
        strategy = strategy_cls(tpu)
        print("Running on TPU:", tpu.master())
    except ValueError:
        strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

    return strategy

In [None]:
def readImg(target_size=(512, 512)):
    """Build an image-loading function bound to a fixed output size.

    Args:
        target_size: Size argument forwarded to ``tf.image.resize``.

    Returns:
        A function that takes a file path, reads it, decodes it as a
        3-channel JPEG, casts to bfloat16, divides by 255 and resizes the
        result to ``target_size``.
    """
    def readOnly(path):
        raw = tf.io.read_file(path)
        pixels = tf.image.decode_jpeg(raw, channels=3)
        # Scaling happens in bfloat16, before the resize.
        scaled = tf.cast(pixels, tf.bfloat16) / 255.0
        return tf.image.resize(scaled, target_size)

    return readOnly

def build_dataset(paths, bsize=20, decode_fn=None):
    """Create a batched, prefetched tf.data pipeline over image paths.

    Args:
        paths: Sequence of image file paths; order is preserved (no shuffle).
        bsize: Batch size.
        decode_fn: Function mapping one path to an image tensor. When None,
            ``readImg()`` with its default target size is used.

    Returns:
        A ``tf.data.Dataset`` yielding batches of decoded images.
    """
    decode = readImg() if decode_fn is None else decode_fn
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices(paths)
        .map(decode, num_parallel_calls=autotune)
        .batch(bsize)
        # Prefetching overlaps data preprocessing with model execution.
        .prefetch(autotune)
    )

In [None]:
DATASET_NAME = "happy-whale-and-dolphin"
strategy = auto_select_accelerator()

# Turn the per-replica batch size into the global batch size.
# NOTE: this rebinds `batch_size`, so re-running this cell without first
# re-running the config cell multiplies it again.
batch_size *= strategy.num_replicas_in_sync
print('batch size', batch_size)

In [None]:
# Resolve the root that holds the train_images/ and test_images/ folders:
# the dataset's GCS bucket on TPU, otherwise the local ../input mount.
if TPU:
    GCS_DS_PATH = KaggleDatasets().get_gcs_path(DATASET_NAME)
    print(GCS_DS_PATH)
    image_root = GCS_DS_PATH
else:
    image_root = '../input/' + DATASET_NAME

# Both branches share the same layout, so the paths are built once here.
# The separators are written explicitly: the previous local test path was
# missing the '/' after 'test_images', producing paths like
# '.../test_imagesabc.jpg'.
df['paths'] = image_root + '/train_images/' + df['image']
evaldf['paths'] = image_root + '/test_images/' + evaldf['image']

In [None]:
# One decoder shared by both pipelines so train and test images get the
# same img_size x img_size preprocessing.
decoder = readImg(target_size=(img_size, img_size))

# Build the tensorflow datasets (train first, then test), keeping row order.
dtrain, deval = (
    build_dataset(frame['paths'].values, bsize=batch_size, decode_fn=decoder)
    for frame in (df, evaldf)
)

In [None]:
class eluDistance(tf.keras.layers.Layer):
    """Layer that outputs squared L2 distances for a triplet of embeddings."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, anchor, positive, negative):
        """Return ``(anchor-positive, anchor-negative)`` squared distances.

        Each distance is the sum of squared differences over the last axis.
        """
        def squared_l2(other):
            return tf.reduce_sum(tf.square(anchor - other), -1)

        return (squared_l2(positive), squared_l2(negative))

def buildModel():
    """Assemble the triplet network that maps three images to two distances.

    A single ResNet50V2 backbone (average-pooled, no top), a Dropout(0.25)
    layer named 'dropout' and a linear Dense(512) layer named 'reduce' are
    shared by the anchor, positive and negative inputs. The three embeddings
    are fed to ``eluDistance``.

    Returns:
        A ``tf.keras.Model`` with inputs ``[anchor, positive, negative]`` and
        the ``eluDistance`` outputs as its output.
    """
    input_shape = (img_size, img_size, 3)
    anchor_input, positive_input, negative_input = (
        layers.Input(name=role, shape=input_shape)
        for role in ("anchor", "positive", "negative")
    )

    base = tf.keras.applications.ResNet50V2(
        input_shape=input_shape, include_top=False, pooling='avg'
    )
    # Only BatchNormalization layers are frozen; every other backbone layer
    # is left trainable.
    for layer in base.layers:
        layer.trainable = not isinstance(layer, layers.BatchNormalization)

    dropout = layers.Dropout(0.25, name='dropout')
    reduce = layers.Dense(512, activation='linear', name='reduce')

    def embed(image):
        # Shared embedding tower: backbone -> dropout -> 512-d projection.
        return reduce(dropout(base(image)))

    distance_layer = eluDistance()
    distances = distance_layer(
        embed(anchor_input),
        embed(positive_input),
        embed(negative_input),
    )

    return tf.keras.Model(
        inputs=[anchor_input, positive_input, negative_input],
        outputs=distances,
    )

class SiameseModel(tf.keras.Model):
    """Keras model wrapping a triplet network with a margin-based triplet loss.

    Args:
        siamese_network: Model returning ``(ap_distance, an_distance)`` for a
            triplet input.
        margin: Value added to ``ap_distance - an_distance`` before clamping
            the loss at zero.
    """

    def __init__(self, siamese_network, margin=0.5):
        super().__init__()
        self.siamese_network = siamese_network
        self.margin = margin
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    def call(self, inputs):
        """Forward the inputs straight to the wrapped network."""
        return self.siamese_network(inputs)

    def _compute_loss(self, data):
        """Return max(ap_distance - an_distance + margin, 0) for the batch."""
        pos_dist, neg_dist = self.siamese_network(data)
        return tf.maximum(pos_dist - neg_dist + self.margin, 0.0)

    def _report(self, loss):
        """Fold ``loss`` into the running mean and return the metrics dict."""
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def train_step(self, data):
        """Run one optimisation step on the wrapped network's trainable weights."""
        with tf.GradientTape() as tape:
            loss = self._compute_loss(data)
        weights = self.siamese_network.trainable_weights
        grads = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))
        return self._report(loss)

    def test_step(self, data):
        """Evaluate the loss on a batch without updating any weights."""
        return self._report(self._compute_loss(data))

    @property
    def metrics(self):
        # Expose the tracker through the `metrics` property.
        return [self.loss_tracker]

In [None]:
# Build the triplet network and its training wrapper inside the
# distribution strategy's scope.
with strategy.scope():
    model = buildModel()
    siamese_model = SiameseModel(siamese_network=model)

model.summary()

In [None]:
# Re-use the shared embedding tower (backbone -> dropout -> projection) as a
# standalone encoder and load its saved weights.
with strategy.scope():
    encoder = tf.keras.Sequential(
        list(map(
            siamese_model.siamese_network.get_layer,
            ('resnet50v2', 'dropout', 'reduce'),
        ))
    )
    encoder.load_weights('../input/learn-image-embedding-copy2/encoder.h5')

# The training wrapper is not used past this point.
del siamese_model

In [None]:
# Labels come straight from the dataframe; dtrain was built from the same
# dataframe's `paths` column without shuffling, so rows line up.
trainDataY = df['id_label'].values

# Embed every train and test image with the encoder.
with strategy.scope():
    trainDataX = encoder.predict(x=dtrain, verbose=1)
    testDataX = encoder.predict(x=deval, verbose=1)

In [None]:
# Shape check before writing the arrays out.
print(trainDataX.shape, testDataX.shape, trainDataY.shape)

# Persist embeddings and labels; np.save adds the ".npy" extension itself.
np.save(file='trainDataXv2', arr=trainDataX)
np.save(file='testDataXv2', arr=testDataX)
np.save(file='trainDataYv2', arr=trainDataY)