In [None]:
!pip install -q efficientnet
!pip install tensorflow_addons
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import efficientnet.tfkeras as efn
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from tensorflow.keras import backend as K
import tensorflow_addons as tfa
from tqdm.notebook import tqdm
from kaggle_datasets import KaggleDatasets

In [None]:
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors

In [None]:
# Notebook-wide configuration shared by the cells below.

# Lets tf.data choose the parallelism / prefetch depth (used in get_dataset).
AUTO = tf.data.experimental.AUTOTUNE

# Number of images per batch fed to model.predict.
BATCH_SIZE = 8
# Target [height, width] every image is resized to; also the model input shape.
IMAGE_SIZE =[512,512]

# Number of output classes of the ArcMarginProduct head.
N_CLASSES = 11011
# Multiplied by 1024 to form the TensorFlow GPU `memory_limit` below
# (presumably GiB, since `memory_limit` is documented in MB — confirm).
LIMIT = 4.0

# Physical GPU devices visible to TensorFlow; the first one is configured in the next cell.
gpus = tf.config.experimental.list_physical_devices('GPU')


In [None]:
# Cap the amount of memory TensorFlow may claim on the first GPU
# (presumably to leave room for the cuML/RAPIDS steps later — confirm).
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    except RuntimeError as err:
        # Virtual devices can only be configured before the GPU is initialised,
        # so re-running this cell in a live kernel ends up here instead of crashing.
        print(f'GPU memory limit not applied: {err}')
else:
    # Without this guard `gpus[0]` raises IndexError on CPU-only machines.
    print('No GPU detected; skipping GPU memory limit')
logical_gpus = tf.config.experimental.list_logical_devices('GPU')

In [None]:
def read_dataset(data_dir = '../input/shopee-product-matching'):
    """Load the training table and build ground-truth matches and image paths.

    Args:
        data_dir: Directory containing `train.csv` and the `train_images/`
            folder. Defaults to the path this notebook originally hard-coded.

    Returns:
        A tuple `(df, image_paths)` where `df` is the contents of `train.csv`
        with an extra `matches` column (space-separated `posting_id`s that
        share the row's `label_group`) and `image_paths` is a Series of
        `<data_dir>/train_images/<image>` strings aligned with `df`.
    """
    df = pd.read_csv(f'{data_dir}/train.csv')
    # Join the posting ids once per label group, then broadcast to every row.
    postings_by_group = (
        df.groupby('label_group')['posting_id']
        .unique()
        .apply(' '.join)
    )
    df['matches'] = df['label_group'].map(postings_by_group)
    image_paths = f'{data_dir}/train_images/' + df['image']
    return df, image_paths

def read_image(image):
    """Read a JPEG from disk and return it as a float32 tensor scaled to [0, 1].

    Args:
        image: Path of the image file to load.

    Returns:
        A 3-channel float32 tensor resized to `IMAGE_SIZE`, with pixel values
        divided by 255.
    """
    raw_bytes = tf.io.read_file(image)
    pixels = tf.image.decode_jpeg(raw_bytes, channels = 3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

def get_dataset(image):
    """Build a batched, prefetched tf.data pipeline over a collection of image paths.

    Args:
        image: Image file paths to load, in the order they should be yielded.

    Returns:
        A `tf.data.Dataset` yielding batches of `BATCH_SIZE` decoded images
        (see `read_image`), in the same order as the input paths.
    """
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

def f1_score(y_true, y_pred):
    """Compute the per-row F1 score between two columns of space-separated ids.

    Args:
        y_true: Series of strings, each a whitespace-separated list of the
            true matching ids for that row.
        y_pred: Series of strings in the same format holding the predictions.
            Rows are paired by position, not by index label.

    Returns:
        A NumPy array with one F1 value per row:
        `2 * |true & pred| / (|true| + |pred|)`.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]

    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_true = np.array([len(t) for t in true_sets])
    n_pred = np.array([len(p) for p in pred_sets])

    return 2 * overlap / (n_pred + n_true)

In [None]:
class ArcMarginProduct(tf.keras.layers.Layer):
    """Additive angular margin (ArcFace-style) classification head.

    The layer takes `[features, labels]`, computes the cosine between the
    L2-normalised features and L2-normalised class weights, replaces the
    cosine of the true class with `cos(theta + m)`, and scales the result by
    `s`.

    Args:
        n_classes: Number of output classes (columns of the weight matrix).
        s: Scale applied to the output logits.
        m: Angular margin, in radians, added to the true-class angle.
        **kwargs: Forwarded to `tf.keras.layers.Layer`.
    """
    def __init__(self, n_classes, s = 30, m = 0.50, **kwargs):
        super(ArcMarginProduct, self).__init__(**kwargs)

        # Use the constructor argument rather than the notebook-level N_CLASSES
        # global, so the layer honours what the caller asked for.
        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Below this cosine, theta + m would exceed pi; `call` then falls back
        # to the linear penalty `cosine - mm`.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the constructor arguments so Keras can serialise the layer."""
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
        })
        # The original built the dict but never returned it, so get_config()
        # yielded None.
        return config

    def build(self, input_shape):
        """Create the class-weight matrix of shape `(feature_dim, n_classes)`.

        `input_shape` is a pair of shapes, one for the features and one for
        the labels; only the feature shape is used.
        """
        super(ArcMarginProduct, self).build(input_shape[0])
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Compute the scaled margin logits for `inputs = [features, labels]`."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)

        cosine = tf.matmul(tf.math.l2_normalize(X, axis=1),
                           tf.math.l2_normalize(self.W, axis=0))
        # Clamp at zero: rounding can push cosine**2 slightly above 1, which
        # would make the square root NaN.
        sine = tf.math.sqrt(tf.maximum(1.0 - tf.math.pow(cosine, 2), 0.0))

        # cos(theta + m) via the angle-addition identity.
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.one_hot(y, depth=self.n_classes)

        # Margin-adjusted value for the true class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output
    

In [None]:
def get_image_embeddings(image_paths, chunk_size = 2000):
    """Embed every image with an EfficientNetB5 backbone plus global average pooling.

    Args:
        image_paths: Sliceable collection of image file paths (the Series
            returned by `read_dataset` in this notebook).
        chunk_size: Number of images embedded per `model.predict` call.
            Processing in chunks keeps peak memory bounded.

    Returns:
        A 2-D NumPy array with one embedding row per input path, in input
        order. An empty input yields an array with zero rows.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    with tf.device('/GPU:0'):
        margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.7,
            name = 'arc_margin',
            dtype = 'float32')

        inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE,3), name = 'inp1')
        label = tf.keras.layers.Input(shape=(), name = 'inp2')
        x = efn.EfficientNetB5(weights = 'imagenet', include_top = False)(inp)
        pooled = tf.keras.layers.GlobalAveragePooling2D()(x)
        logits = margin([pooled, label])

        output = tf.keras.layers.Softmax(dtype='float32')(logits)
        # NOTE(review): the classification model is built but no trained weights
        # are loaded into it, so the margin head has no effect on the embeddings.
        # Presumably a `load_weights` call was intended here — confirm.
        classifier = tf.keras.models.Model(inputs = [inp, label], outputs = [output])

        # Take the pooled features directly instead of `layers[-4]`, which
        # depends on how Keras happens to order the layers.
        model = tf.keras.models.Model(inputs = inp, outputs = pooled)

        # Size the loop from the argument, not from the notebook-level `data`.
        n_images = len(image_paths)
        if n_images == 0:
            return np.empty((0, model.output_shape[-1]), dtype = np.float32)

        # Process the images in chunks to minimise memory errors.
        n_chunks = math.ceil(n_images / chunk_size)
        embeddings = []
        for chunk_idx in range(n_chunks):
            start = chunk_idx * chunk_size
            image_dataset = get_dataset(image_paths[start:start + chunk_size])
            embeddings.append(model.predict(image_dataset))
            print(f'finished {chunk_idx + 1}/{n_chunks} of the images')

        return np.concatenate(embeddings)

    
    

In [None]:
def _posting_ids_within(posting_ids, distances, indices, threshold):
    """Return, for each query row, the posting ids of neighbours closer than `threshold`."""
    return [
        posting_ids[indices[row][distances[row] < threshold]]
        for row in range(distances.shape[0])
    ]


def get_neighbors(df, embeddings, KNN = 50, image = True, threshold = 4.0):
    """Find each item's nearest neighbours and turn them into match predictions.

    Args:
        df: DataFrame with a `posting_id` column aligned row-for-row with
            `embeddings`. If it also has a `matches` column, a threshold sweep
            is run and its F1 scores are printed. The frame is modified in
            place: `pred_matches` (and `f1` when `matches` is present) are
            set to the values for the returned predictions.
        embeddings: 2-D array with one embedding row per row of `df`.
        KNN: Maximum number of neighbours to retrieve per item; capped at the
            number of rows so small inputs do not fail.
        image: Unused; kept for backward compatibility with existing callers.
        threshold: Distance below which a neighbour counts as a match in the
            returned predictions. The default of 4.0 is the value that was
            previously hard-coded.

    Returns:
        A tuple `(df, predictions)` where `predictions` is a list holding, for
        each row, an array of the matching `posting_id`s.
    """
    n_neighbors = min(KNN, embeddings.shape[0])
    model = NearestNeighbors(n_neighbors = n_neighbors)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    posting_ids = df['posting_id'].values

    # Sweep thresholds to see which maximises CV F1. This only reports the
    # result; the applied threshold is the `threshold` argument.
    if 'matches' in df.columns:
        thresholds = list(np.arange(3.0, 5.0, 0.1))
        scores = []
        for candidate in thresholds:
            candidate_matches = pd.Series(
                [' '.join(ids) for ids in _posting_ids_within(posting_ids, distances, indices, candidate)],
                index = df.index)
            score = np.mean(f1_score(df['matches'], candidate_matches))
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)
        best_idx = int(np.argmax(scores))
        best_threshold = thresholds[best_idx]
        best_score = scores[best_idx]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    # Because we are predicting the test set that have 70K images and different
    # label groups, the applied threshold is deliberately conservative.
    predictions = _posting_ids_within(posting_ids, distances, indices, threshold)

    # Keep the frame's columns in step with the returned predictions, instead of
    # leaving whatever the last sweep iteration wrote.
    df['pred_matches'] = [' '.join(ids) for ids in predictions]
    if 'matches' in df.columns:
        df['f1'] = f1_score(df['matches'], df['pred_matches'])

    return df, predictions

In [None]:
# Load the training table (with ground-truth `matches`) and the image file paths.
data, image_paths = read_dataset()

# Embed every training image; this is the expensive step of the notebook.
image_embeddings = get_image_embeddings(image_paths)


In [None]:
# Nearest-neighbour search over the image embeddings; returns the frame plus
# the per-row arrays of predicted matching posting ids.
df, image_predictions = get_neighbors(data, image_embeddings, KNN = 50, image = True)