In [None]:
!pip install ../input/shopee-external-models/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/shopee-external-models/efficientnet-1.1.0-py3-none-any.whl
import numpy as np 
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import gc
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.models import load_model
from efficientnet.tfkeras import EfficientNetB3
import torch
# Seed PyTorch's RNG. The cuDNN flags below switch determinism off and benchmark
# mode on (going by the flag names, speed is preferred over bit-exact repeatability).
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
from shutil import copyfile
# Copy the tokenizer module into ../working so the import on the next line can find it
# (assumes ../working is the kernel's working directory - TODO confirm).
copyfile(src = "../input/bert-arcface-trained-weights/tokenization.py", dst = "../working/tokenization.py")
from tokenization import FullTokenizer
import tensorflow_hub as hub
from sklearn.preprocessing import LabelEncoder
import re

In [None]:
# Cap the amount of memory TensorFlow may take on the first GPU.
LIMIT = 5.0  # in GB; multiplied by 1024 below before being passed as memory_limit
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    except RuntimeError as err:
        # The error is only reported; the notebook carries on without the cap.
        print(err)
# The second figure is simply what is left of an assumed 16 GB card.
print(f'TensorFlow Limit {LIMIT}GB')
print(f'RAPIDS Limit {16-LIMIT}GB')

In [None]:
# Decide which postings to run on. When test.csv holds at most 3 rows the training
# set is used instead (presumably the tiny public placeholder test file, so the whole
# pipeline can still be exercised - TODO confirm); otherwise the test set itself is used.
# Either way the working frame is called `train` and `image_paths` holds one path per row.
test=pd.read_csv(r'../input/shopee-product-matching/test.csv')
if len(test)<=3:
    train=pd.read_csv(r'../input/shopee-product-matching/train.csv')
    image_paths='../input/shopee-product-matching/train_images/'+train.image
else:
    # Reuse the frame that was just loaded instead of parsing test.csv a second time.
    train=test
    image_paths='../input/shopee-product-matching/test_images/'+train.image

train.shape

In [None]:
# From here on only `train` is used; drop the `test` name and run a GC pass.
del test
gc.collect()

In [None]:
# Load the saved image-embedding model (the file name suggests EfficientNet-B3, 50 epochs).
eff_b3_embedder=load_model('../input/shopee-arcface-trained-image-embedders/EFNetb3_embedder_50_epochs.h5')

In [None]:
def read_image(image):
    """Read one JPEG file and return it as a 512x512 RGB tensor scaled by 1/255.

    Args:
        image: path of the image file to read.

    Returns:
        A float32 tensor produced by decoding with 3 channels, resizing to
        (512, 512) and dividing by 255.0.
    """
    raw_bytes = tf.io.read_file(image)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, (512, 512))
    return tf.cast(resized, tf.float32) / 255.0

def get_dataset(image):
    """Turn a collection of image paths into a batched tf.data input pipeline.

    Args:
        image: the image paths, in the order the embeddings should come out.

    Returns:
        A tf.data.Dataset that decodes each path with `read_image`, groups the
        results into batches of 32 and prefetches them.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls=autotune)
        .batch(32)
        .prefetch(autotune)
    )

In [None]:
# Embed every image with the EfficientNet model, up to 4096 images per predict() call.
image_embeddings_effb3=[]
chunksize=4096
chunks=(len(train)+chunksize-1)//chunksize   # ceil(len(train)/chunksize)
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    images=get_dataset(image_paths[start:end])
    embeddings=eff_b3_embedder.predict(images)
    image_embeddings_effb3.append(embeddings)
# Concatenate the list of per-chunk arrays directly. The last chunk is usually shorter
# than the others, and wrapping such a ragged list in np.array() first raises a
# ValueError on recent NumPy releases.
image_embeddings_effb3=np.concatenate(image_embeddings_effb3)
image_embeddings_effb3.shape

In [None]:
# All image embeddings are computed; release the image model before the next stage.
del eff_b3_embedder
gc.collect()

In [None]:
# L2-normalise every embedding row so that a dot product between two rows is their
# cosine similarity. Done in one vectorised step; all-zero rows are left as zeros
# instead of turning into NaN (the same guard the BERT embeddings get).
norms=np.linalg.norm(image_embeddings_effb3,axis=1,keepdims=True)
norms[norms==0]=1
image_embeddings_effb3=image_embeddings_effb3/norms
image_embeddings_effb3.shape

In [None]:
# Wrap the normalised embeddings as a torch tensor and move it to the GPU
# for the similarity search below.
image_embeddings_effb3=torch.from_numpy(image_embeddings_effb3).cuda()

In [None]:
# For every posting, collect the posting_ids whose image embedding is similar enough.
image_sim_thres=0.6   # minimum cosine similarity that counts as an image match
max_matches=50        # above this many hits the threshold is tightened (ties can still exceed it)
posting_ids=train.posting_id.values   # hoisted so each row is a plain positional lookup

eff_matches=[]
chunksize=2048
chunks=(len(train)+chunksize-1)//chunksize   # ceil(len(train)/chunksize)
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    # After the transpose: one row per posting of this chunk, one column per posting overall.
    cossim=torch.matmul(image_embeddings_effb3,image_embeddings_effb3[start:end].T).T
    cossim=cossim.data.cpu().numpy()
    for per_posting in cossim:
        indices=np.where(per_posting>=image_sim_thres)[0]
        if len(indices)>max_matches:
            # Too many hits: keep only scores at or above the max_matches-th largest one.
            # np.partition yields that value without a full Python-level sort of the row.
            kth_largest=np.partition(per_posting,-max_matches)[-max_matches]
            indices=np.where(per_posting>=kth_largest)[0]
        eff_matches.append(posting_ids[indices].tolist())

In [None]:
# Image matching is finished; drop the reference to the GPU tensor and run a GC pass.
del image_embeddings_effb3
gc.collect()

In [None]:
def preprocess(description):
    """Normalise a product title into lowercase, space-separated alphanumeric words.

    Steps, in order: lowercase; turn runs of hyphens, newlines and tabs into a
    space; expand English contractions ("won't" -> "will not", "'s" -> " is", ...);
    replace every remaining run of characters outside a-z / 0-9 with a space;
    collapse whitespace and strip both ends.

    The original author notes the contraction handling is rarely needed for titles.
    NOTE(review): the output presumably has to match the preprocessing used when
    the text model was trained, so the transformation itself is left unchanged.

    Args:
        description: the raw title string.

    Returns:
        The cleaned title (possibly an empty string).
    """
    # Order matters: the specific forms come before the generic "n't" and "'t" rules.
    contractions = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    description = description.lower()
    description = re.sub(r'[-\n\t]+', ' ', description)
    for pattern, expansion in contractions:
        description = re.sub(pattern, expansion, description)
    description = re.sub(r'[^a-z0-9]+', ' ', description)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    description = re.sub(r'\s+', ' ', description)
    return description.strip()

In [None]:
# Clean every title with preprocess(), keeping the row order of `train`.
cleansed_train=list(map(preprocess,tqdm(train.title.values)))

In [None]:
# Load the BERT encoder from the local TF-Hub export (directory name: uncased, L-24 / H-1024 / A-16).
# NOTE(review): trainable=True is kept although this notebook only calls predict() -
# presumably so the layer's variables line up with the checkpoint loaded later; confirm before changing.
bert_layer = hub.KerasLayer("../input/shopee-external-models/bert_en_uncased_L-24_H-1024_A-16_1", trainable=True)
# The vocabulary file and the lower-casing flag are read from the hub module itself.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

# WordPiece tokenizer built from that vocabulary (FullTokenizer comes from the copied tokenization.py).
tokenizer=FullTokenizer(vocab_file, do_lower_case)

In [None]:
max_seq_length=84
body_budget=max_seq_length-1   # positions available before the closing [SEP]

# Encode each cleaned title as [CLS] ... [SEP], cut or [PAD]-filled to exactly max_seq_length ids.
title_tokens=[]
for title in tqdm(cleansed_train):
    pieces=['[CLS]']+tokenizer.tokenize(title)
    pieces=pieces[:body_budget]+['[SEP]']
    pieces=pieces+['[PAD]']*(max_seq_length-len(pieces))
    title_tokens.append(np.array(tokenizer.convert_tokens_to_ids(pieces)))
title_tokens=np.array(title_tokens)

# Attention mask: 1 wherever the token id is non-zero, 0 elsewhere
# (this treats id 0 as padding - assumes [PAD] maps to id 0 in the vocabulary).
title_masks=np.where(title_tokens!=0,1,0)

# Titles are fed as a single segment, so every segment id is 0.
title_segments=np.zeros(title_masks.shape)

title_tokens.shape, title_masks.shape, title_segments.shape

In [None]:
class ArcFace(tf.keras.layers.Layer):
    """Additive angular margin (ArcFace-style) classification head.

    Takes `[embeddings, labels]` and returns scaled cosine logits of shape
    (batch, n_classes) in which the logit of the labelled class has been
    replaced by cos(angle + margin) whenever its cosine is positive.

    Implementation reference from https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/blob/master/src/modeling/metric_learning.py
    """

    def __init__(self, n_classes, scale, margin, **kwargs):
        """Store the hyper-parameters.

        Args:
            n_classes: number of classes (columns of the weight matrix).
            scale: factor the final logits are multiplied by.
            margin: angular margin in radians.
            **kwargs: forwarded to tf.keras.layers.Layer.
        """
        super(ArcFace, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.scale = scale
        self.margin = margin
        # Pre-computed once; used by the angle-addition formula in call().
        self.cos_m = tf.math.cos(margin)
        self.sin_m = tf.math.sin(margin)

    def get_config(self):
        """Return the layer config including n_classes, scale and margin."""
        config = super().get_config().copy()
        config.update({'n_classes': self.n_classes,'scale': self.scale,'margin': self.margin})
        return config

    def build(self, input_shape):
        """Create the class-centre matrix W of shape (embedding_dim, n_classes).

        Args:
            input_shape: shapes of the two inputs; only the first (embeddings) is used.
        """
        super(ArcFace, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Compute the margin-adjusted, scaled cosine logits.

        Args:
            inputs: pair `(X, y)` - embeddings and their class labels.

        Returns:
            Tensor of shape (batch, n_classes).
        """
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)

        # Unit-normalise embeddings (rows) and class centres (columns) so the
        # product depends only on the angle between them.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )

        # sin^2 + cos^2 = 1. Rounding can push cosine**2 marginally above 1, which would
        # make the square-root argument negative and the result NaN, so clamp it first.
        sine = tf.math.sqrt(tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0))

        # cos(angle + margin) = cos(angle)*cos(margin) - sin(angle)*sin(margin)
        phi = cosine * self.cos_m - sine * self.sin_m

        # Apply the margin only where cosine > 0, i.e. the angle is below 90 degrees;
        # elsewhere the plain cosine is kept.
        phi = tf.where(cosine > 0, phi, cosine)

        one_hot = tf.cast(tf.one_hot(y, depth=self.n_classes),dtype=cosine.dtype)

        # Margin-adjusted value for the labelled class, plain cosine for all others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale
        return output

In [None]:
# Build the BERT + ArcFace + softmax graph that the saved weights are loaded into
# (presumably mirrors the architecture used at training time - TODO confirm).
tf.keras.backend.clear_session()

max_seq_length=84

# The three BERT inputs: token ids, attention mask and segment ids, each of length max_seq_length.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")

input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")

segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

# Scalar class label per example; it is consumed only by the ArcFace head.
label = tf.keras.layers.Input(shape = (), name = 'label')

# BERT encoder: returns a pooled embedding and the per-token sequence output.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# ArcFace head: 11014 classes, scale=30, margin=0.35 radians (about 20 degrees).
arc_face=ArcFace(11014,30,0.35, dtype='float32')([pooled_output, label])

# Softmax over the ArcFace logits gives the class probabilities.
out=tf.keras.layers.Softmax(dtype='float32')(arc_face)

Bert_ArcFace_model=tf.keras.models.Model(inputs=[input_word_ids,input_mask,segment_ids,label], outputs=out)
Bert_ArcFace_model.summary()

In [None]:
# Draw the model graph with tensor shapes (visual check only; nothing downstream uses it).
tf.keras.utils.plot_model(Bert_ArcFace_model, show_shapes=True)

In [None]:
# Restore the trained weights into the graph built above (the file name indicates epoch 26).
Bert_ArcFace_model.load_weights('../input/bert-arcface-trained-weights/BERT_ArcFace_epoch_26.hdf5')

In [None]:
tf.keras.backend.clear_session()

# Inference only needs BERT's pooled embedding, so wrap the encoder in a model
# that takes just the three text inputs (no label, no ArcFace head).
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

# Call the BERT layer object directly instead of fetching it by position from
# Bert_ArcFace_model.layers: it is the same layer, already holding the loaded
# weights, and this no longer depends on how Keras happens to order the layers.
pooled_embed,seq_embed=bert_layer([input_word_ids, input_mask, segment_ids])

bert_arcface_encoder=tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_embed])
tf.keras.utils.plot_model(bert_arcface_encoder, show_shapes=True)

In [None]:
# Only the encoder model is used from here on; drop the full BERT+ArcFace model name.
del Bert_ArcFace_model
gc.collect()

In [None]:
# Embed every tokenised title with the BERT encoder, 512 titles per predict() call.
bert_embeddings=[]
chunksize=512
chunks=(len(train)+chunksize-1)//chunksize   # ceil(len(train)/chunksize)
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    embeddings=bert_arcface_encoder.predict([title_tokens[start:end], title_masks[start:end], title_segments[start:end]], batch_size=8)
    bert_embeddings.append(embeddings)
# Concatenate the list of per-chunk arrays directly. The last chunk is usually shorter
# than the others, and wrapping such a ragged list in np.array() first raises a
# ValueError on recent NumPy releases.
bert_embeddings=np.concatenate(bert_embeddings)
bert_embeddings.shape

In [None]:
# All title embeddings are computed; release the encoder model.
del bert_arcface_encoder
gc.collect()

In [None]:
# L2-normalise every embedding row so that a dot product between two rows is their
# cosine similarity. One vectorised pass instead of computing each row's norm twice
# in a Python loop; all-zero rows are still left unchanged.
norms=np.linalg.norm(bert_embeddings,axis=1,keepdims=True)
norms[norms==0]=1
bert_embeddings=bert_embeddings/norms
bert_embeddings.shape

In [None]:
# Wrap the normalised title embeddings as a torch tensor and move it to the GPU
# for the similarity search below.
bert_embeddings=torch.from_numpy(bert_embeddings).cuda()

In [None]:
# For every posting, collect the posting_ids whose title embedding is similar enough.
CosSim_thres=0.65     # minimum cosine similarity that counts as a text match
max_matches=50        # above this many hits the threshold is tightened (ties can still exceed it)
posting_ids=train.posting_id.values   # hoisted so each row is a plain positional lookup

bert_matches=[]
chunksize=1024
chunks=(len(train)+chunksize-1)//chunksize   # ceil(len(train)/chunksize)
for chunk in tqdm(range(chunks)):
    start=chunk*chunksize
    end=min(len(train),start+chunksize)
    # After the transpose: one row per posting of this chunk, one column per posting overall.
    cossim=torch.matmul(bert_embeddings,bert_embeddings[start:end].T).T
    cossim=cossim.data.cpu().numpy()
    for per_posting in cossim:
        indices=np.where(per_posting>=CosSim_thres)[0]
        if len(indices)>max_matches:
            # Too many hits: keep only scores at or above the max_matches-th largest one.
            # np.partition yields that value without a full Python-level sort of the row.
            kth_largest=np.partition(per_posting,-max_matches)[-max_matches]
            indices=np.where(per_posting>=kth_largest)[0]
        bert_matches.append(posting_ids[indices].tolist())

In [None]:
# Text matching is finished; drop the reference to the GPU tensor and run a GC pass.
del bert_embeddings
gc.collect()

In [None]:
# Per posting: image matches followed by text matches, duplicates removed while
# keeping the first occurrence, joined into one space-separated string.
combined=[]
for row in tqdm(range(len(bert_matches))):
    # dict keys keep insertion order, so this is an order-preserving de-duplication.
    ordered_unique=dict.fromkeys(eff_matches[row]+bert_matches[row])
    combined.append(' '.join(ordered_unique))

In [None]:
# Both per-model match lists have been merged into `combined`; free them.
del eff_matches, bert_matches
gc.collect()

In [None]:
# Attach the match string to each posting; `combined` was built in the row order of `train`.
train['matches']=combined
train.head()

In [None]:
# The strings now live in train['matches']; drop the separate list.
del combined
gc.collect()

In [None]:
# Keep only the two columns that go into the submission file.
submission=train[['posting_id','matches']]
submission.head()

In [None]:
# Only `submission` is needed from here on; drop the `train` name.
del train
gc.collect()

In [None]:
# Write the submission to submission.csv without the index column.
submission.to_csv('submission.csv',index=False)