In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


import matplotlib.pyplot as plt
import seaborn as sns
import cv2

In [None]:
#!pip download efficientnet 

In [None]:
!pip install ../input/modules/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/modules/efficientnet-1.1.1-py3-none-any.whl

In [None]:
import random
import math
import tensorflow as tf
import efficientnet.tfkeras as efn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
import tensorflow_addons as tfa
from scipy import spatial
from tqdm.notebook import tqdm

In [None]:
# Let tf.data choose parallelism / prefetch depth (used by get_test_dataset)
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# NOTE(review): EPOCHS is not referenced by the inference code visible in this
# notebook — presumably carried over from the training notebook.
EPOCHS = 12
# Number of images per batch fed to model.predict (see get_test_dataset)
BATCH_SIZE = 8
# Size every image is resized to; also fixes the model's input shape
IMAGE_SIZE = [384, 384]
# Seed
# NOTE(review): SEED is not referenced by the code visible in this notebook.
SEED = 42
# Learning rate handed to the Adam optimizer when the model is compiled
LR = 0.001
# Verbosity
# NOTE(review): VERBOSE is not referenced by the code visible in this notebook.
VERBOSE = 1

In [None]:
# Read the competition CSVs and derive the number of classes for the model head
def preprocess():
    """Load the train/test metadata and count the distinct label groups.

    Reads ``train.csv`` and ``test.csv`` from
    ``../input/shopee-product-matching/`` (relative to the working directory).

    Returns:
        tuple: ``(test, N_CLASSES)`` where ``test`` is the test DataFrame
        exactly as read from disk and ``N_CLASSES`` is the number of distinct
        ``label_group`` values left in the training data once rows repeating
        an ``image`` value have been dropped (first occurrence is kept).
    """
    train = pd.read_csv('../input/shopee-product-matching/train.csv')
    test = pd.read_csv('../input/shopee-product-matching/test.csv')

    # The class count is taken AFTER removing repeated images, because a label
    # group can disappear entirely when all of its rows share images with
    # earlier rows. get_model() sizes its softmax layer from this number.
    # NOTE(review): presumably this mirrors how the weights were trained —
    # confirm against the training notebook before changing the dedup rule.
    train_unique = train.drop_duplicates(subset = ['image'])

    # The labels themselves are never returned, so no label -> index mapping
    # is needed here; only the count matters.
    N_CLASSES = train_unique['label_group'].nunique()
    return test, N_CLASSES

# Turn raw JPEG bytes into a resized float image tensor
def decode_image(image_data):
    """Decode JPEG bytes to a 3-channel image, resize it and scale it by 1/255.

    Args:
        image_data: encoded JPEG contents, as produced by ``tf.io.read_file``.

    Returns:
        A float32 tensor resized to ``IMAGE_SIZE`` with values divided by 255.
    """
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Load one test image from disk and hand it to decode_image
def read_image_test(image):
    """Read the file at path ``image`` and return the decoded image tensor."""
    return decode_image(tf.io.read_file(image))

def get_test_dataset(image):
    """Build the batched, prefetched tf.data pipeline over image paths.

    Args:
        image: collection of image file paths; each element is passed to
            ``read_image_test``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` decoded images
        in the same order as ``image``.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image_test, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Function to create our EfficientNetB3 classification model
def get_model():
    """Build and compile the EfficientNetB3 classifier.

    The network is: input -> EfficientNetB3 backbone (no top, no pretrained
    weights) -> global average pooling -> batch norm -> Dense(512, relu) ->
    Dense(N_CLASSES, softmax). It reads the module-level ``IMAGE_SIZE``,
    ``N_CLASSES`` and ``LR``, so ``N_CLASSES`` must be assigned before calling.

    Returns:
        A compiled ``tf.keras`` model.
    """
    image_input = tf.keras.layers.Input(shape = tuple(IMAGE_SIZE) + (3,))

    # Layers are created in the same order as the architecture reads, so the
    # auto-generated layer ordering stays stable for weight loading.
    backbone = efn.EfficientNetB3(include_top = False, weights = None)
    features = backbone(image_input)
    pooled = tf.keras.layers.GlobalAveragePooling2D()(features)
    normalized = tf.keras.layers.BatchNormalization()(pooled)
    embedding = tf.keras.layers.Dense(512, activation = 'relu')(normalized)
    class_probs = tf.keras.layers.Dense(N_CLASSES, activation = 'softmax')(embedding)

    classifier = tf.keras.models.Model(inputs = [image_input], outputs = [class_probs])
    classifier.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
        loss = [tf.keras.losses.SparseCategoricalCrossentropy()],
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    )
    return classifier

# For every embedding, collect the postings within `threshold` cosine distance
def _match_postings(embeddings, posting_ids, threshold):
    """Return one space-joined string of matching posting ids per embedding.

    Matches are ordered by ascending cosine distance. The query posting is
    always part of its own match list, even when its distances are NaN.
    """
    posting_ids = np.asarray(posting_ids)
    predictions = []
    for query_index in tqdm(range(embeddings.shape[0])):
        distances = spatial.distance.cdist(
            embeddings[query_index:query_index + 1], embeddings, 'cosine')[0]
        # NaN never satisfies `<=` and argsort places NaN last, so the first
        # `n_close` sorted indices are exactly the entries within threshold.
        n_close = np.count_nonzero(distances <= threshold)
        top_k = list(np.argsort(distances)[:n_close])
        # An all-zero embedding (possible after ReLU) can make every distance
        # NaN, which would otherwise leave the posting without even itself.
        if query_index not in top_k:
            top_k.insert(0, query_index)
        predictions.append(' '.join(posting_ids[top_k]))
    return predictions


def inference(test, threshold = 0.11):
    """Embed the test images and group postings by cosine distance.

    Builds the classifier with ``get_model``, loads the trained weights from
    ``../input/weight2/EfficientNetB3_128_42.h5``, and uses the output of the
    second-to-last layer (the 512-unit ReLU dense layer) as the embedding.

    Args:
        test: DataFrame with an ``image`` column (file names under
            ``../input/shopee-product-matching/test_images/``) and a
            ``posting_id`` column.
        threshold: largest cosine distance at which two postings are treated
            as a match. Defaults to 0.11, the value previously hard-coded.

    Returns:
        DataFrame with columns ``posting_id`` and ``matches``; ``matches`` is
        a space-separated list of posting ids that always contains the row's
        own posting id.
    """
    print('\n')
    print('-'*50)
    model = get_model()
    model.load_weights('../input/weight2/EfficientNetB3_128_42.h5') #loading the trained model
    # Drop the softmax head: the penultimate layer's activations are the embeddings
    model = tf.keras.models.Model(inputs = model.input, outputs = model.layers[-2].output)
    test_image = '../input/shopee-product-matching/test_images/' + test['image']
    test_dataset = get_test_dataset(test_image)
    # Predict the test images and get embeddings
    embeddings = model.predict(test_dataset)

    predictions = _match_postings(embeddings, test['posting_id'].values, threshold)

    submission = pd.DataFrame({'posting_id' :test['posting_id'], 'matches': predictions})
    return submission
    


In [None]:
# N_CLASSES is deliberately bound at module level here: get_model() (called
# inside inference) reads it as a global to size the softmax layer, so this
# line must run before inference().
test, N_CLASSES = preprocess()
submission = inference(test)
# Save predictions to the Kaggle working directory
submission.to_csv('/kaggle/working/submission.csv', index = False)
# Last expression of the cell: show the first rows as a sanity check
submission.head()

### Important note
- Version 14 used a cosine-distance threshold of <= 0.05 for selecting matches.
- Version 14.1 uses a threshold of <= 0.11, the same as in the previously accepted versions.
