Submission notebook for [*Siamese ResNet-50 with triplet loss on TPU*](https://www.kaggle.com/xhlulu/shopee-siamese-resnet-50-with-triplet-loss-on-tpu)

## Acknowledgement

The training notebook linked above was derived from [this excellent Keras tutorial](https://keras.io/examples/vision/siamese_network/).

I added the `kt.accelerator.limit_gpu_memory` function to `keras-toolkit` based on [Chris Deotte's notebook](https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700). The function was taken from that notebook with very little modification; please go give him an upvote for finding that neat trick!

In [None]:
!pip install ../input/keras-toolkit -q

In [None]:
import os

import cupy as cp
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_toolkit as kt
from tqdm.auto import tqdm

In [None]:
# # Source: https://www.robots.ox.ac.uk/~albanie/notes/Euclidean_distance_trick.pdf
# # Not currently used
# def euclidean_dist_matrix(X):
#     G = X.T.dot(X)
#     diagG = np.diagonal(G).reshape(-1, 1)
#     D = diagG + diagG.T - 2*G
    
#     return D

In [None]:
def find_matches_cupy(X, posting_ids, threshold, limit=50):
    """
    For every column of X, list the posting ids of the columns that lie
    within a Euclidean radius of it.

    Args:
        X: 2-D array whose columns are the vectors to compare; column ``i``
            is labelled by ``posting_ids[i]``. It is copied to the GPU
            with CuPy.
        posting_ids: array of string ids that can be indexed with an integer
            index array (e.g. a NumPy array).
        threshold: a column matches when its distance to the query column is
            strictly below this value.
        limit: maximum number of matches kept per column. When more columns
            are in range, the ``limit`` nearest ones are kept. ``None``
            disables the cap.

    Returns:
        A list with one string per column of X: the matching posting ids
        joined by single spaces, in ascending column-index order.
    """
    X = cp.array(X)
    n_items = X.shape[1]
    matches = []

    for i in tqdm(range(n_items)):
        # Keep the query as a column vector so it broadcasts against X.
        query = X[:, i][..., np.newaxis]
        dists = cp.linalg.norm(query - X, axis=0)
        indices = cp.where(dists < threshold)[0]

        if limit is not None and indices.size > limit:
            # Truncating by position would keep arbitrary low-index columns
            # and could even drop the query itself; keep the nearest ones
            # instead, then restore index order so the output format is the
            # same as when the cap is not hit.
            nearest = cp.argsort(dists[indices])[:limit]
            indices = cp.sort(indices[nearest])

        # .get() moves the indices to host memory for indexing posting_ids.
        matches.append(" ".join(posting_ids[indices.get()]))

    return matches

In [None]:
# Restrict GPU memory through keras-toolkit before any model is loaded.
# NOTE(review): the argument (2 * 1024) is presumably in megabytes, i.e. about
# 2 GB, leaving the rest of the device for CuPy — confirm against keras-toolkit.
kt.accelerator.limit_gpu_memory(2 * 1024)

In [None]:
# Default (height, width) every image is resized to before encoding.
target_shape = (200, 200)


def preprocess_image(filename, target_shape=target_shape):
    """
    Read a JPEG file, resize it and apply ResNet-style normalisation.

    Args:
        filename: path of the JPEG file to read.
        target_shape: size passed to ``tf.image.resize``; defaults to the
            module-level ``target_shape``.

    Returns:
        The resized 3-channel image with its channel order reversed and a
        fixed per-channel mean subtracted.
    """
    raw_bytes = tf.io.read_file(filename)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, target_shape)

    # Resnet-style preprocessing, see: https://git.io/JYo77
    # Reverse the last (channel) axis first, then subtract the per-channel
    # means, which are listed in the reversed channel order.
    channel_means = [103.939, 116.779, 123.68]
    flipped = resized[..., ::-1]

    return flipped - channel_means

In [None]:
# Competition slug, used to build the Kaggle input paths.
COMPETITION_NAME = 'shopee-product-matching'

# Let keras-toolkit pick the strategy object for the available accelerator.
strategy = kt.accelerator.auto_select(verbose=True)

# Global batch size: 32 samples for each replica the strategy keeps in sync.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

In [None]:
# Competition tables: training metadata, test metadata and the submission template.
train = pd.read_csv('../input/shopee-product-matching/train.csv')
test = pd.read_csv('../input/shopee-product-matching/test.csv')
submission = pd.read_csv('../input/shopee-product-matching/sample_submission.csv')

# Turn each bare image file name into an absolute path inside the matching
# image directory of the competition dataset.
for frame, image_dir in ((train, 'train_images/'), (test, 'test_images/')):
    frame['path'] = os.path.join('/kaggle', 'input', COMPETITION_NAME, image_dir) + frame['image']

test.head()

In [None]:
# Saved encoder produced by the training notebook.
encoder_path = '../input/shopee-siamese-resnet-50-with-triplet-loss-on-tpu/encoder.h5'

# Load the model inside the strategy scope so it is created under the
# selected strategy.
with strategy.scope():
    encoder = tf.keras.models.load_model(encoder_path)

encoder.summary()

In [None]:
# dtrain = kt.image.build_dataset(
#     train['path'],
#     decode_fn=preprocess_image,
#     bsize=BATCH_SIZE,
# )

# train_embeds = encoder.predict(dtrain, verbose=1)

# train_matches = find_matches_cupy(
#     X=train_embeds.T,
#     posting_ids=train.posting_id.values,
#     threshold=3.25
# )

In [None]:
# Build the input pipeline over the test image paths with keras-toolkit,
# decoding each file with preprocess_image.
dtest = kt.image.build_dataset(
    test['path'],
    decode_fn=preprocess_image,
    bsize=BATCH_SIZE,
)

# Embed every test image with the trained encoder.
# NOTE(review): the matching below assumes build_dataset yields images in the
# same order as test['path'] (no shuffling by default) — confirm.
test_embeds = encoder.predict(dtest, verbose=1)

# find_matches_cupy compares columns, hence the transpose.
# NOTE(review): ids come from `submission` while embeddings come from `test`;
# this assumes both files list postings in the same order — confirm.
# Item assignment is used instead of `submission.matches = ...` because
# attribute assignment does not create a column when `matches` is missing.
submission['matches'] = find_matches_cupy(
    X=test_embeds.T,
    posting_ids=submission.posting_id.values,
    threshold=4.23,
)

In [None]:
# Write the predictions to submission.csv, omitting the DataFrame index.
submission.to_csv(path_or_buf='submission.csv', index=False)