# Shopee Product Matching - KNN Threshold Searching

This notebook outlines the construction of a hybrid model to determine product similarity. This model consists of:
* Image similarity - CNN model. For construction of this model, please reference [this link](https://www.kaggle.com/sandersli/shopee-product-matching-efficientnet-gem-arcface)
* Title similarity - TF-IDF model

In [None]:
# Install tqdm quietly (-q); it supplies the progress bar imported below.
!pip install tqdm -q

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tqdm.notebook import tqdm

In [None]:
# Choose the distribution strategy used by the rest of the notebook: try to
# connect to a TPU first and fall back to MirroredStrategy otherwise.
# AUTO is handed to tf.data (map parallelism and prefetch) further down.
AUTO = tf.data.experimental.AUTOTUNE
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver().connect()
    # NOTE(review): newer TensorFlow releases expose this as
    # tf.distribute.TPUStrategy — confirm against the installed version.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print(f"Running on TPU {tpu.master()} with {strategy.num_replicas_in_sync} replicas")
except ValueError:
    # A ValueError from the TPU lookup is treated as "no TPU attached".
    print("Not connected to a TPU runtime. Using CPU/GPU strategy")
    strategy = tf.distribute.MirroredStrategy()

In [None]:
# Competition metadata
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
test_df = pd.read_csv('../input/shopee-product-matching/test.csv')

SEED = 42
# 32 samples per replica
BATCH_SIZE = strategy.num_replicas_in_sync * 32

# These must mirror the settings the saved model was trained with
IMAGE_SIZE = (300, 300)
N_CLASSES = train_df['label_group'].nunique()

# Switch to False when submitting
TRAIN = True
MODEL_FILENAME = '../input/models/EfficientNetB3_300_42_m0.5_s30.h5'

# The rules cap a group of matches at 50 postings. Outside training the test
# set may hold fewer rows than that, and asking sklearn for more neighbours
# than samples fails with "ValueError: Expected n_neighbors <= n_samples".
KNN_LIMIT = 50 if TRAIN else min(50, len(test_df))

## Create evaluation tools

In [None]:
# Builds the working dataframe (with ground-truth matches when training)
def construct_df():
    """Load the train or test CSV, depending on the TRAIN flag.

    An 'image_paths' column pointing at the corresponding image folder is
    always added. In TRAIN mode a 'matches' column is added as well: for each
    row, the space-separated posting_ids that share its label_group, which is
    the format f1_score() expects.
    """
    if not TRAIN:
        df = pd.read_csv('../input/shopee-product-matching/test.csv')
        df['image_paths'] = '../input/shopee-product-matching/test_images/' + df['image']
        return df

    df = pd.read_csv('../input/shopee-product-matching/train.csv')
    df['image_paths'] = '../input/shopee-product-matching/train_images/' + df['image']
    # label_group -> unique posting_ids in that group, joined into one string
    ids_per_group = df.groupby(['label_group'])['posting_id'].unique()
    joined_per_group = {label: ' '.join(ids) for label, ids in ids_per_group.items()}
    df['matches'] = df['label_group'].map(joined_per_group)
    return df

# Turns raw JPEG bytes into a model-ready float tensor
def decode_image(image_data):
    """Decode JPEG bytes as a 3-channel image, resize it to IMAGE_SIZE and
    return it as float32 divided by 255."""
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Loads one image file and decodes it
def read_image(image):
    """Read the file at path `image` and pass its contents to decode_image()."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

def build_dataset(image_paths):
    """Build the tf.data input pipeline for a collection of image paths.

    Each path is loaded through read_image(), then the images are grouped
    into batches of BATCH_SIZE and prefetched.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image_paths)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [None]:
# Row-wise F1 between true and predicted match lists
def f1_score(y_true, y_pred):
    """Return the per-row F1 score as a numpy array.

    Both arguments are pandas Series of whitespace-separated ids. Each entry
    is split into a set, and the score for a row is
    2 * |true & pred| / (|true| + |pred|).
    """
    true_sets = y_true.map(lambda ids: set(ids.split()))
    pred_sets = y_pred.map(lambda ids: set(ids.split()))
    n_common = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_true = true_sets.map(len).values
    n_pred = pred_sets.map(len).values
    return 2 * n_common / (n_pred + n_true)

# Mean F1 over the whole dataframe
def get_score(df, predictions):
    """Average the row-wise f1_score() of `predictions` against df['matches']."""
    per_row = f1_score(df['matches'], predictions)
    return np.mean(per_row)

# Function to assist in submissions
def submit(df, predictions):
    """Write submission.csv with columns 'posting_id' and 'matches'.

    Args:
        df: dataframe holding a 'posting_id' column.
        predictions: one space-separated string of matched posting_ids per
            row of `df`, in the same row order.

    Raises:
        ValueError: if `predictions` and `df` differ in length.

    The file is built from a copy, so `df` itself is left untouched. In TRAIN
    mode `df['matches']` holds the ground truth; overwriting it would make
    any later scoring compare the predictions with themselves.
    """
    submission = df[['posting_id']].copy()
    # Assign by position: predictions are produced in row order, so index
    # alignment could only introduce NaNs if df carries a non-default index.
    submission['matches'] = list(predictions)
    submission.to_csv('submission.csv', index=False)

## Construct model

In [None]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Large margin arc distance (ArcFace) head.

    Takes a pair [X, y] and returns scaled cosine logits between the
    L2-normalised rows of X and the L2-normalised columns of the weight
    matrix W, with the angular margin m applied where the one-hot encoding
    of y is set.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super().__init__(**kwargs)

        # Constructor arguments, also reported by get_config()
        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.easy_margin = easy_margin
        self.ls_eps = ls_eps

        # Constants derived from the margin, used in call()
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(np.pi - m)
        self.mm = tf.math.sin(np.pi - m) * m

    def get_config(self):
        # Base layer config extended with this layer's constructor arguments
        return {
            **super().get_config(),
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        }

    def build(self, input_shape):
        # input_shape is a pair; only the first entry (the features) sizes W
        features_shape = input_shape[0]
        super().build(features_shape)

        self.W = self.add_weight(
            name='W',
            shape=(int(features_shape[-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        features, labels = inputs
        labels = tf.cast(labels, dtype=tf.int32)

        unit_features = tf.math.l2_normalize(features, axis=1)
        unit_weights = tf.math.l2_normalize(self.W, axis=0)
        cosine = tf.matmul(unit_features, unit_weights)
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))

        # cos(theta + m) via the angle-addition identity
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            keep_margin, fallback = cosine > 0, cosine
        else:
            keep_margin, fallback = cosine > self.th, cosine - self.mm
        phi = tf.where(keep_margin, phi, fallback)

        target_mask = tf.cast(
            tf.one_hot(labels, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            # Label smoothing
            target_mask = (1 - self.ls_eps) * target_mask + self.ls_eps / self.n_classes

        # Margin logit where the mask is set, plain cosine elsewhere
        logits = (target_mask * phi) + ((1.0 - target_mask) * cosine)
        return logits * self.s
    
class GeMPoolingLayer(tf.keras.layers.Layer):
    '''
    Implements Generalized-Mean Pooling layer

    Args:
        p: exponent of the generalized mean.
        eps: lower bound applied to the inputs before the power is taken.
        **kwargs: forwarded to tf.keras.layers.Layer (e.g. name, dtype), so
            the layer can be rebuilt from the dict returned by get_config().

    Reference:
        https://arxiv.org/pdf/1711.02512.pdf
    '''
    def __init__(self, p=1., eps=1e-6, **kwargs):
        super().__init__(**kwargs)
        self.p = p
        self.eps = eps

    def call(self, inputs: tf.Tensor, **kwargs):
        # assumes inputs is (batch, height, width, channels) — TODO confirm
        # Lower-bound the activations at eps; the upper bound is the tensor's
        # own maximum, so nothing is clipped from above.
        inputs = tf.clip_by_value(inputs, clip_value_min=self.eps, clip_value_max=tf.reduce_max(inputs))
        inputs = tf.pow(inputs, self.p)
        # Mean over axes 1 and 2, which are dropped from the result
        inputs = tf.reduce_mean(inputs, axis=[1, 2], keepdims=False)
        inputs = tf.pow(inputs, 1. / self.p)
        return inputs

    def get_config(self):
        # Start from the base layer config, as ArcMarginProduct does, so the
        # returned dict is complete enough for from_config().
        config = super().get_config()
        config.update({
            'p': self.p,
            'eps': self.eps,
        })
        return config

In [None]:
# Function to construct the model
def get_model(params):
    """Build the two-input training-time network whose weights are loaded later.

    Args:
        params: mapping that provides 's' and 'm', forwarded to
            ArcMarginProduct.

    Returns:
        A Keras Model taking [image, label] and returning the softmax of the
        ArcMarginProduct output.

    NOTE(review): build_models() slices this network by position
    (model.layers[-4]), so adding, removing or reordering layers here can
    change which tensor is used as the embedding.
    """
    # Do not include weights in function call; internet required
    backbone = tf.keras.applications.EfficientNetB3(weights = None, include_top = False)
    margin = ArcMarginProduct(
        n_classes = N_CLASSES, 
        s = params['s'],
        m = params['m'],
        name='arc_margin_product', 
        dtype='float32'
        )
    # Two inputs: the image itself and its label, which only the margin layer uses
    inp = tf.keras.layers.Input(shape = IMAGE_SIZE + (3,), name = 'image')
    label = tf.keras.layers.Input(shape = (), name = 'label')
    x = tf.keras.applications.efficientnet.preprocess_input(inp)
    x = backbone(x)
    x = GeMPoolingLayer()(x)
    # 512-unit linear projection followed by batch normalisation
    x = tf.keras.layers.Dense(512, kernel_regularizer=tf.keras.regularizers.l2(), activation=None)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = margin([x, label])
    output = tf.keras.layers.Softmax(dtype='float32')(x)
    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])

    return model

In [None]:
def build_models():
    """Create the image embedding model and the text vectorizer.

    The full network from get_model() is built with the global `params`, the
    weights in MODEL_FILENAME are loaded into it, and it is then truncated at
    model.layers[-4]; the ArcFace head serves as the loss, not the embedding.

    Returns:
        (image_model, text_model): the truncated Keras model fed by the image
        input only, and an unfitted binary TfidfVectorizer with English stop
        words.
    """
    text_model = TfidfVectorizer(stop_words='english', binary=True)

    full_model = get_model(params)
    full_model.load_weights(MODEL_FILENAME)
    # NOTE(review): layers[-4] is presumably the BatchNormalization output —
    # confirm against the printed summary.
    embedding_output = full_model.layers[-4].output
    image_model = tf.keras.models.Model(
        inputs = full_model.input[0],
        outputs = embedding_output
    )
    image_model.summary()

    return image_model, text_model

In [None]:
def get_image_embeddings(model, dataset):
    """Run the image model over every image referenced by `dataset`.

    Args:
        model: object exposing predict(); here the truncated Keras model
            returned by build_models().
        dataset: dataframe with an 'image_paths' column. Despite the name it
            is not a tf.data.Dataset; one is built from it via
            build_dataset().

    Returns:
        Whatever model.predict() returns for the built dataset.
    """
    # Use the argument, not the global `df`, so the function embeds exactly
    # what the caller passed in.
    image_dataset = build_dataset(dataset['image_paths'])
    return model.predict(image_dataset)

def get_text_embeddings(model, df):
    """Fit `model` on df['title'] and return the transformed titles."""
    return model.fit_transform(df['title'])

def get_neighbors(df, image_embeddings, text_embeddings, params):
    """Predict matches for every row from image and text nearest neighbours.

    For each embedding set, the KNN_LIMIT nearest neighbours (cosine metric)
    of every row are looked up within that same set. A neighbour counts as a
    match when its distance is below params['image_threshold'] or
    params['text_threshold'] respectively; the two sets of matches are
    unioned.

    Returns:
        pandas Series with one space-separated string of posting_ids per row.
    """
    def _knn(embeddings):
        # Fit on the embeddings and query them against themselves
        index = NearestNeighbors(n_neighbors = KNN_LIMIT, metric='cosine').fit(embeddings)
        return index.kneighbors(embeddings)

    image_distances, image_indices = _knn(image_embeddings)
    text_distances, text_indices = _knn(text_embeddings)

    image_threshold = params['image_threshold']
    text_threshold = params['text_threshold']
    posting_ids = df['posting_id'].values

    predictions = []
    for row in range(len(df)):
        near_by_image = image_indices[row][image_distances[row] < image_threshold]
        near_by_text = text_indices[row][text_distances[row] < text_threshold]
        matched = np.union1d(near_by_image, near_by_text)
        predictions.append(' '.join(posting_ids[matched]))
    return pd.Series(predictions)

In [None]:
def predict(image_model, text_model, df, params):
    """Embed the images and titles of `df`, then return the per-row match
    predictions from get_neighbors() for the thresholds in `params`."""
    return get_neighbors(
        df,
        get_image_embeddings(image_model, df),
        get_text_embeddings(text_model, df),
        params,
    )

In [None]:
# 'm' and 's' configure the ArcMarginProduct layer in get_model(); the two
# thresholds are the cosine-distance cut-offs read by get_neighbors().
params = dict(
    m=0.5,
    s=30,
    image_threshold=0.25,
    text_threshold=0.37,
)

In [None]:
# Load the working dataframe and build the image/text models used by every
# later cell (df, image_model and text_model are read as globals below).
# NOTE(review): the strategy scope presumably matters only for the Keras model
# created in build_models(); construct_df() just runs inside it — confirm.
with strategy.scope():
    df = construct_df()
    image_model, text_model = build_models()

## Fine tuning threshold

In [None]:
def grid_search(image_thresholds, text_thresholds):
    """Score every (image_threshold, text_threshold) pair and plot the grid.

    Uses the globals `df`, `image_model`, `text_model` and `params`. For each
    pair, predictions come from get_neighbors() and are scored with
    get_score(). The best score and its thresholds are printed, and a heatmap
    of all scores is saved to 'grid_search.png' and shown.

    Args:
        image_thresholds: sequence of image distance thresholds (heatmap rows).
        text_thresholds: sequence of text distance thresholds (heatmap columns).

    Returns:
        None.
    """
    # Embeddings do not depend on the thresholds, so compute them once rather
    # than re-running CNN inference and the TF-IDF fit for every grid cell.
    image_embeddings = get_image_embeddings(image_model, df)
    text_embeddings = get_text_embeddings(text_model, df)

    grid = np.zeros((len(image_thresholds), len(text_thresholds)))
    with tqdm(total=grid.size) as pbar:
        for i, image_threshold in enumerate(image_thresholds):
            for j, text_threshold in enumerate(text_thresholds):
                # Work on a copy so the search does not leave the global
                # params sitting at whichever grid point was evaluated last.
                trial_params = {
                    **params,
                    'image_threshold': image_threshold,
                    'text_threshold': text_threshold,
                }
                predictions = get_neighbors(df, image_embeddings, text_embeddings, trial_params)
                grid[i, j] = get_score(df, predictions)
                pbar.update(1)

    # First maximum in row-major order
    best_i, best_j = np.unravel_index(np.argmax(grid), grid.shape)
    max_score = grid[best_i, best_j]
    print(f'max of f1={max_score} occurs at image_threshold={image_thresholds[best_i]} and at text_threshold={text_thresholds[best_j]}')

    ax = sns.heatmap(grid, annot=True, fmt=".4f", xticklabels=text_thresholds, yticklabels=image_thresholds)
    ax.set_xlabel('Text thresholds')
    ax.set_ylabel('Image thresolds')
    plt.savefig('grid_search.png')
    plt.show()

# grid_search() scores against df['matches'], which construct_df() only
# creates when TRAIN is True, so skip the search on submission runs instead of
# failing with a KeyError.
if TRAIN:
    with strategy.scope():
        # arange() is not end-inclusive. A float step also accumulates
        # representation error (e.g. 0.30000000000000004), which would show up
        # in the heatmap tick labels and the printed optimum, so round to the
        # step's precision.
        image_thresholds = np.round(np.arange(0.2, 0.45, 0.05), 2)
        text_thresholds = np.round(np.arange(0.2, 0.45, 0.05), 2)
        grid_search(image_thresholds, text_thresholds)

In [None]:
#Final model
# get_score() reads df['matches'], which construct_df() only creates when
# TRAIN is True. Skip scoring on submission runs so the notebook reaches the
# submission cell instead of stopping on a KeyError.
if TRAIN:
    with strategy.scope():
        # Set to optimal values
        params['image_threshold'] = 0.25
        params['text_threshold'] = 0.35 
        predictions = predict(image_model, text_model, df, params)
        score = get_score(df, predictions)
        print(f'Mean F1 Score: {score}')

In [None]:
# Predict with the chosen thresholds and write submission.csv
with strategy.scope():
    # Set to optimal values
    params.update(image_threshold=0.25, text_threshold=0.35)
    predictions = predict(image_model, text_model, df, params)
    submit(df, predictions)