In [None]:
import os 
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf 
import re
import string
import shutil
shutil.copy(src="../input/tokenization/tokenization.py",dst = "./")
import tokenization
import tensorflow_hub as hub
import math
from tqdm import tqdm
import gc
from tensorflow.keras.utils import Sequence
import cv2 as cv

# Load data:

In [None]:
# Pick the split to run on: test.csv by default, train.csv (with its images)
# when test.csv holds at most 3 rows.
submission = False
test = pd.read_csv("../input/shopee-product-matching/test.csv")
images_path = "../input/shopee-product-matching/test_images"
if len(test) <=3 :
   test = pd.read_csv("../input/shopee-product-matching/train.csv")
   #test = pd.concat([test,test,test.iloc[:5000]])
   images_path = "../input/shopee-product-matching/train_images"
   # NOTE(review): despite its name, `submission` is True only on this
   # train-data path; later cells use it to gate target building and CV scores.
   submission = True

# Useful functions:

In [None]:
def max_length(text):
    """Return the largest whitespace-token count found in `text`, capped at 512.

    `text` is an iterable of strings; an empty iterable yields 0.
    """
    longest = max((len(entry.split()) for entry in text), default=0)
    return min(512, longest)

In [None]:
def clean_text (title) :
    """Clean a product title from useless characters and symbols.

    Steps, in order:
      1. ``&`` is replaced by the word ``and`` (no spaces are added around it).
      2. Every non-word character and every underscore becomes a space.
      3. Runs of whitespace are collapsed into a single space.

    The result is not stripped, so a single leading/trailing space may remain.

    @ params :
    title(str) : the title text that the function will clean up.

    @ returns :
    title(str) : cleaned title
    """
    title = re.sub(r"&", "and", title)
    # The previous per-punctuation loop used r"f{p}" (a raw string, not an
    # f-string), so it searched for the literal text "f{p}" and never removed
    # anything. `\W` already covers every punctuation mark except "_", which is
    # why the underscore is listed explicitly here.
    # NOTE(review): titles containing "_" are now cleaned differently than
    # before; confirm this matches the preprocessing used at training time.
    title = re.sub(r"[\W_]", " ", title)
    title = re.sub(r"\s+", " ", title)
    return title

In [None]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace and treated as a set; the score for a
    row is ``2 * |true & pred| / (|pred| + |true|)``. Returns a NumPy array
    with one value per row.
    """
    true_sets = y_true.apply(lambda cell: set(cell.split()))
    pred_sets = y_pred.apply(lambda cell: set(cell.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(len).values
    true_sizes = true_sets.apply(len).values
    return 2 * overlap / (pred_sizes + true_sizes)

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row.target`` with ``row[col]``.

    The returned function counts the unique values shared by both fields and
    divides twice that count by the sum of the two field lengths.
    """
    def f1score(row):
        truth = row.target
        guess = row[col]
        shared = np.intersect1d(truth, guess).size
        return (2 * shared) / (len(truth) + len(guess))
    return f1score

In [None]:
def combine(row):
    """Return the sorted unique union of ``row.pred_eff`` and ``row.pred_bert``."""
    merged = np.concatenate((row.pred_eff, row.pred_bert))
    return np.unique(merged)
def combine_matches(row):
    """Join the ids in ``row.pred`` into one space-separated string."""
    separator = " "
    return separator.join(row.pred)

In [None]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer is called on a pair ``[X, y]`` (features and integer labels).
    It computes the cosine between the L2-normalised features and the
    L2-normalised columns of a trainable weight matrix ``W``, applies the
    angular margin ``m`` to the entries selected by the one-hot encoding of
    ``y``, and multiplies the result by ``s``.

    Args:
        n_classes: number of columns of ``W`` and depth of the one-hot labels.
        s: factor the output is multiplied by.
        m: angular margin (used through its cosine and sine).
        easy_margin: if True the margin is only applied where ``cosine > 0``;
            otherwise where ``cosine > cos(pi - m)``, with ``cosine - mm``
            used elsewhere.
        ls_eps: when > 0, the one-hot mask is smoothed with this value.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants derived from the margin, computed once.
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Threshold and offset used by the non-easy-margin branch of call().
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create ``W`` with shape (last dim of the first input, n_classes)."""
        # input_shape is a pair of shapes; index 0 is the feature input.
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Return the scaled cosine scores with the margin applied to the labelled entries."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Rows of X and columns of W are unit-normalised, so the product is a cosine.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # cos(theta + m) expanded with the angle-addition formula.
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            # Label smoothing of the selection mask.
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted value where the mask is set, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [None]:
def bert_encode(texts,tokenizer,max_len = 512):
    """Convert raw texts into fixed-length BERT input arrays.

    Every text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, mapped to ids and right-padded with 0.

    Returns three arrays built from one row of length ``max_len`` per text:
    token ids, masks (1 for real tokens, 0 for padding) and segment ids
    (all 0).
    """
    id_rows, mask_rows, segment_rows = [], [], []
    for text in texts:
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        n_real = len(ids)
        n_pad = max_len - n_real
        id_rows.append(ids + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)
    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def BertToolsConfiguration():
    """Load the BERT hub layer and build its tokenizer.

    Returns a tuple ``(bert_layer, tokenizer, max_le)`` where ``max_le`` is
    ``max_length`` evaluated on the global ``test["cleaned_title"]`` values.
    """
    layer = hub.KerasLayer("../input/bert-en-uncased-l-12-h-768-a-12-1",trainable=True)
    resolved = layer.resolved_object
    # The vocabulary path and casing flag are stored inside the hub module.
    vocab_path = resolved.vocab_file.asset_path.numpy()
    lower_case = resolved.do_lower_case.numpy()
    full_tokenizer = tokenization.FullTokenizer(vocab_path, lower_case)
    longest_title = max_length(test["cleaned_title"].values)
    return layer, full_tokenizer, longest_title

In [None]:
def BertImplementation(bert_layer,tokenizer,max_len,N_CLASSES):
    """Rebuild the BERT + ArcMargin training graph, load its weights and return an embedding model.

    Args:
        bert_layer: layer called on ``[ids, masks, segments]``; its second
            output is used as the sequence output.
        tokenizer: not used inside this function.
        max_len: length of the three integer input sequences.
        N_CLASSES: number of classes of the ArcMargin head.

    Returns:
        A Keras model taking the three BERT inputs and returning the output of
        ``model.layers[-6]`` of the full graph.
    """
    margin = ArcMarginProduct(
            n_classes = N_CLASSES, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )
    inputs_ids = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32,name="inputs_ids")
    inputs_masks = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32,name="inputs_masks")
    inputs_segments = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32,name="inputs_segments")
    labels = tf.keras.layers.Input(shape=(None,),dtype=tf.int32,name="labels")
    _,sequence_output = bert_layer([inputs_ids,inputs_masks,inputs_segments])
    # Keep only the vector at position 0 of the sequence (the [CLS] slot added by bert_encode).
    clf_output = sequence_output[:,0,:]
    clf_output = tf.keras.layers.BatchNormalization()(clf_output)
    clf_output = tf.keras.layers.Dropout(0.4)(clf_output)
    x = margin([clf_output,labels])
    output = tf.keras.layers.Softmax()(x)
    # The full graph (with labels input and head) is needed so the saved weights can be loaded.
    model = tf.keras.models.Model(inputs=[inputs_ids,inputs_masks,inputs_segments,labels],\
                                  outputs=[output])
    model.load_weights("../input/bert-arc-face-training/bert_weight.h5")
    # Drop the labels input and the head for inference.
    # NOTE(review): layers[-6] is presumably the embedding fed to the ArcMargin head;
    # the index depends on Keras' layer ordering — verify with model.summary().
    model = tf.keras.models.Model(inputs=model.input[:3],outputs=model.layers[-6].output)
    return model

In [None]:
class DataGenerateur(Sequence):
    """Keras ``Sequence`` yielding batches of resized images for prediction.

    Args:
        df: DataFrame with an ``image`` column holding file names.
        img_size: images are resized to ``(img_size, img_size)``.
        batch_size: number of rows per batch (the last batch may be smaller).
        path: directory the file names are relative to.

    Each batch is a float64 array of shape
    ``(rows_in_batch, img_size, img_size, 3)`` holding the pixels as returned
    by ``cv.imread`` followed by ``cv.resize`` (no further scaling).
    """

    def __init__(self,df,img_size=256,batch_size = 4 ,path=images_path ):
        self.df = df
        self.path = path
        self.batch_size = batch_size
        self.img_size = img_size
        self.indices = np.arange(len(self.df))

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self,index):
        """Return batch number ``index``."""
        ind = self.indices[index * self.batch_size : (index+1) * self.batch_size]
        return self.__datagenerator(ind)

    def __datagenerator(self,ind):
        """Load and resize the images of the rows at positions ``ind``."""
        dff = self.df.iloc[ind]
        # np.float was only an alias of the builtin float (float64) and has been
        # removed from NumPy; np.float64 keeps the same dtype and keeps working.
        images = np.zeros((len(dff),self.img_size,self.img_size,3),dtype=np.float64)

        for i, file_name in enumerate(dff["image"].values):
            file_path = os.path.join(self.path,file_name)
            img = cv.imread(file_path)
            if img is None:
                # cv.imread signals an unreadable/missing file by returning None;
                # fail with the offending path instead of an opaque resize error.
                raise FileNotFoundError(f"Could not read image: {file_path}")
            images[i,] = cv.resize(img,(self.img_size,self.img_size))
        return images

In [None]:
def effnetImplementation(N_CLASSES) :
    """Rebuild the EfficientNetB0 + ArcMargin training graph, load its weights and return an embedding model.

    Args:
        N_CLASSES: number of classes of the ArcMargin head.

    Returns:
        A Keras model taking a (256, 256, 3) float image and returning the
        output of ``model.layers[-4]`` of the full graph.
    """
    margin = ArcMarginProduct(
            n_classes = N_CLASSES, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )
    # NOTE(review): WGT is assigned but never used below.
    WGT = "../input/effnetb0/efficientnetb0_notop.h5"
    inp = tf.keras.layers.Input(shape=(256,256,3),dtype=tf.float32)
    labels = tf.keras.layers.Input(shape=(None,),dtype=tf.int32,name="labels")
    # NOTE(review): pooling is given as upper-case "AVG"; since an explicit
    # GlobalAveragePooling2D follows, it presumably has no effect here — confirm
    # against the Keras applications documentation before changing it.
    effnet = tf.keras.applications.EfficientNetB0(weights=None,include_top= False,input_shape=None,\
                                                 pooling="AVG")
    out = effnet(inp)
    #out = tf.keras.layers.BatchNormalization()(out)
    #out = tf.keras.layers.Dropout(0.3)(out)
    out = tf.keras.layers.GlobalAveragePooling2D()(out)
    x = margin([out,labels])
    output = tf.keras.layers.Softmax()(x)
    # The full graph (with labels input and head) is needed so the saved weights can be loaded.
    model = tf.keras.models.Model(inputs=[inp,labels],outputs=[output])
    model.load_weights("../input/effnetb0-arcface-training/effnet_weights.h5")
    
    # Keep only the image input for inference.
    # NOTE(review): layers[-4] is presumably the pooled embedding fed to the head;
    # the index depends on Keras' layer ordering — verify with model.summary().
    model = tf.keras.models.Model(inputs=model.input[0],outputs=model.layers[-4].output)
    
    return model

# Data preparation:

In [None]:
# Ground truth for local scoring: every row receives the array of all
# posting_ids that share its label_group. Only built when `submission` is set.
if submission : 
   tmp = test.groupby("label_group").posting_id.unique()
   test["target"] = test["label_group"].map(tmp)

In [None]:
# Cleaned copy of every title; used by the BERT and TF-IDF / count-vector steps.
test["cleaned_title"] = test["title"].map(clean_text)

In [None]:
# Informational only: the models below are built with a hard-coded class count.
# The other label-dependent code is gated on `submission`, which suggests the
# label_group column is not always present, so guard the lookup instead of
# raising KeyError; N_CLASSES is None when the column is absent.
N_CLASSES = test["label_group"].nunique() if "label_group" in test.columns else None

In [None]:
# Show the number of distinct label groups in the loaded frame.
print(N_CLASSES)

# Duplicated products based on images:

In [None]:
# Cap TensorFlow's memory on the first GPU at LIMIT GB (memory_limit is
# given as 1024*LIMIT), leaving the rest for the RAPIDS steps below.
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus :
    try :
       tf.config.experimental.set_virtual_device_configuration(gpus[0],\
                                                           [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
       logical_gpus = tf.config.experimental.list_logical_devices("GPU")
    
    except RuntimeError as e :
       # presumably raised when the GPU has already been initialised — TODO confirm
       print(e)
# NOTE(review): both messages are printed even when no GPU was found or configured.
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# The 16 is presumably the total GPU memory in GB — verify for the target device.
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# 11014 is hard-coded rather than taken from N_CLASSES; presumably it is the class
# count the saved checkpoint was trained with — verify before changing.
effnet_model = effnetImplementation(11014)

In [None]:
# Embed every image with the EfficientNet model, 4096 rows at a time.
image_embedding = []
chunk = 4 * 1024
cls = len(test) // chunk
cls += int((len(test) % chunk) !=0)  # one extra chunk for the remainder, if any
for i in tqdm(range(cls)) :
    a = i * chunk 
    b = (i+1) * chunk 
    b = min(b,len(test))
    data = DataGenerateur(test.iloc[a:b])
    image_embedding.append(effnet_model.predict(data,use_multiprocessing=True,workers = 4))
    
# The model is no longer needed once all embeddings are computed.
del (effnet_model)
gc.collect()

In [None]:
# Stack the per-chunk prediction arrays into one array along axis 0.
image_embedding = np.concatenate(image_embedding,axis=0)

In [None]:
from cuml.neighbors import NearestNeighbors
import cupy 
# Convert the embeddings to a cupy array for the cuml neighbour search.
image_embedding = cupy.array(image_embedding)

In [None]:
# Neighbour index over the image embeddings; each query returns 50 neighbours.
mm = NearestNeighbors(n_neighbors=50)
mm.fit(image_embedding)

In [None]:
#from cupy.linalg import norm
#norme = norm(image_embedding,axis=1)

In [None]:
#norme = norme.reshape(-1,1)

In [None]:
#normed_embedding = image_embedding / norme

In [None]:
import cudf 
# cudf copy of the frame; read later by the cuml text vectorizers.
cudf_test = cudf.DataFrame(test)

In [None]:
# For every row, collect the posting_ids of the image neighbours whose
# distance is below 4.5 (chunked with the `chunk` / `cls` values set earlier).
pred = []
for i in tqdm(range(cls)) :
    a = i * chunk 
    b = (i+1) * chunk 
    b = min(b,len(test))
    distances,indices = mm.kneighbors(image_embedding[a:b,])
    for j in range(b-a) :
           distance = distances[j,]
           # positions (within the 50 neighbours) that pass the threshold
           ind = cupy.where(distance < 4.5) [0]
           ind = cupy.asnumpy(ind)
           # translate those positions into row positions of `test`
           ind = indices[j,ind]
           ind = cupy.asnumpy(ind)
           pred.append(test.iloc[ind].posting_id.values)
       
     

In [None]:
# Store the image-based matches and, when ground truth is available,
# report their mean row-wise F1.
test["pred_eff"] = pred
if submission :
    test["f2"] = test.apply(getMetric("pred_eff"),axis=1)
    # Message corrected: this score belongs to the EfficientNet image
    # embeddings, not to the BERT text embeddings.
    print('CV score for effnet arc face image prediction =',test.f2.mean())

# Duplicated products based on text :

In [None]:
# Load the BERT layer and tokenizer, and measure the longest cleaned title (in words).
bert_layer,tokenizer,max_len = BertToolsConfiguration()

In [None]:
# Informational: the cells below use a fixed length of 82 instead of this value.
print(max_len)

In [None]:
# 82 (sequence length) and 11014 (class count) are hard-coded; presumably they are
# the values the saved checkpoint was trained with — verify before changing.
bert_model = BertImplementation(bert_layer,tokenizer,82,11014)

In [None]:
# Encode every cleaned title with the same sequence length (82) the model was built with.
all_tokens,all_masks,all_segments = bert_encode(test["cleaned_title"].values,tokenizer,\
                                                max_len=82)

In [None]:
# Embed every title with the BERT model, 4096 rows at a time, then stack the chunks.
embedding = []
chunk = 1024 *4 
cls = len(test) // chunk 
cls += int((len(test) % chunk) !=0)  # one extra chunk for the remainder, if any
for i in tqdm(range(cls)) :
    a = i * chunk 
    b = (i+1) * chunk 
    b = min(b,len(test))
    embedding.append(bert_model.predict([all_tokens[a:b,],all_masks[a:b,],all_segments[a:b,]]))
embedding = np.concatenate(embedding,axis = 0 )

In [None]:
# The model is no longer needed once all embeddings are computed.
del(bert_model)
gc.collect()

In [None]:
from cuml.neighbors import NearestNeighbors
# Neighbour index over the BERT title embeddings; each query returns 50 neighbours.
nn = NearestNeighbors(n_neighbors=50)
nn.fit(embedding)

In [None]:
import cupy

In [None]:
# For every row, collect the posting_ids of the BERT-embedding neighbours
# whose distance is below 8.5.
pred = []
chunk = 4 * 1024
cls = len(test) // chunk
cls += int((len(test) % chunk)!=0)  # one extra chunk for the remainder, if any
for i in tqdm(range(cls)):
    a = i * chunk
    b = min((i+1) * chunk, len(test))
    distances,indices = nn.kneighbors(embedding[a:b,])
    for j in range(b-a) :
        # positions (within the 50 neighbours) that pass the threshold
        ind = np.where(distances[j,:] < 8.5)[0]
        # translate those positions into row positions of `test`
        ind = cupy.asnumpy(indices[j,ind])
        # `.values` stores a plain array, consistent with the image and TF-IDF
        # cells; previously a pandas Series (with its index) was stored instead.
        pred.append(test.iloc[ind].posting_id.values)

In [None]:
# Store the BERT-based matches and, when ground truth is available,
# report their mean row-wise F1.
test["pred_bert"] = pred 
if submission :
   test["f0"] = test.apply(getMetric("pred_bert"),axis=1)
    
   print('CV score for bert arc face prediction =',test.f0.mean())

### tf_idf

from cuml.feature_extraction.text import TfidfVectorizer
# Binary TF-IDF over the cleaned titles, limited to 25000 features.
tf_idf = TfidfVectorizer(stop_words="english",max_features=25000,binary=True)

# Dense matrix of the TF-IDF features; replaces the BERT `embedding` variable.
embedding = tf_idf.fit_transform(cudf_test["cleaned_title"]).toarray()

from cuml.neighbors import NearestNeighbors 
# Neighbour index over the TF-IDF vectors; replaces the BERT `nn` index.
nn = NearestNeighbors(n_neighbors=50)
nn.fit(embedding)

# For every row, collect the posting_ids of the TF-IDF neighbours whose
# distance is below 1.0 (reuses `chunk` / `cls` from the previous section).
pred = []
for i in tqdm(range(cls)) :
    a = i * chunk 
    b = (i+1) * chunk 
    b = min(b,len(test))
    distances , indices = nn.kneighbors(embedding[a:b,])
    for j in range(b-a) :
        distance = distances[j,:]
        # positions (within the 50 neighbours) that pass the threshold
        ind = np.where(distance < 1.0)[0]
        # translate those positions into row positions of `test`
        proches = indices[j,ind]
        proches = cupy.asnumpy(proches)
        pred.append(test.iloc[proches].posting_id.values)

# Store the TF-IDF matches and, when ground truth is available, report
# their mean row-wise F1.
test["pred_tf_idf"] = pred
if submission :
    # NOTE(review): column "f2" is also written by the image-score cell, so
    # this assignment replaces that earlier score column.
    test["f2"] = test.apply(getMetric("pred_tf_idf"),axis=1)
    # Message corrected: this is the TF-IDF score, not the combined one.
    print('CV score for tf-idf prediction =',test.f2.mean())

### tf 

from cuml.feature_extraction.text import CountVectorizer
# Binary term-presence vectors over the cleaned titles, limited to 25000 features.
# The vectorizer is deliberately NOT named `tf`: that name is the notebook's
# `tensorflow as tf` alias, and rebinding it would break every TensorFlow cell
# that is (re-)run afterwards.
count_vectorizer = CountVectorizer(stop_words="english",max_features=25000,binary=True)
embedding = count_vectorizer.fit_transform(cudf_test["cleaned_title"]).toarray()

# Neighbour index over the count vectors using cosine distance.
nb = NearestNeighbors(n_neighbors=50,metric="cosine")
nb.fit(embedding)

# For every row, collect the posting_ids of the count-vector neighbours whose
# cosine distance is below 0.4 (reuses `chunk` / `cls` set earlier).
pred = []
for i in tqdm(range(cls)) :
    a = i * chunk 
    b = (i+1) * chunk 
    b = min(b,len(test))
    distances , indices = nb.kneighbors(embedding[a:b,])
    for j in range(b-a) :
        distance = distances[j,:]
        # positions (within the 50 neighbours) that pass the threshold
        ind = np.where(distance < 0.4)[0]
        # translate those positions into row positions of `test`
        proches = indices[j,ind]
        proches = cupy.asnumpy(proches)
        pred.append(test.iloc[proches].posting_id.values)

# Store the count-vector matches and, when ground truth is available,
# report their mean row-wise F1.
test["pred_tf"] = pred
if submission :
    test["f4"] = test.apply(getMetric("pred_tf"),axis=1)
    # Message corrected: this is the term-count (cosine) score, not the combined one.
    print('CV score for term-count cosine prediction =',test.f4.mean())

def intersect_tx(row):
    """Return the sorted unique ids present in both ``pred_tf_idf`` and ``pred_tf``."""
    return np.intersect1d(row["pred_tf_idf"], row["pred_tf"])
def concat_tx(row):
    """Return the sorted unique ids present in ``pred_tf_idf`` or ``pred_tf``."""
    both = np.concatenate((row["pred_tf_idf"], row["pred_tf"]))
    return np.unique(both)

# Intersection of the TF-IDF and term-count candidate sets.
test["tfettf_idf"] = test.apply(intersect_tx,axis=1)
if submission :
    test["f5"] = test.apply(getMetric("tfettf_idf"),axis=1)
    # Message corrected so the two scores printed by this section can be told apart.
    print('CV score for tf-idf AND term-count (intersection) prediction =',test.f5.mean())

# Union of the TF-IDF and term-count candidate sets.
test["tfoutf_idf"] = test.apply(concat_tx,axis=1)
if submission :
    # NOTE(review): reuses column "f5", replacing the intersection score above.
    test["f5"] = test.apply(getMetric("tfoutf_idf"),axis=1)
    print('CV score for tf-idf OR term-count (union) prediction =',test.f5.mean())

# Combined prediction :

In [None]:
# Final candidates: unique union of the image (pred_eff) and BERT (pred_bert) matches.
# NOTE(review): the TF-IDF / term-count columns are not merged into `pred` here.
test["pred"] = test.apply(combine,axis=1)
#test["pred"] = pred

In [None]:
# When ground truth is available, report the mean row-wise F1 of the combined matches.
if submission :
   test["f3"] = test.apply(getMetric("pred"),axis=1)
    
   print('CV score for arc face combined prediction =',test.f3.mean())

In [None]:
# Join each row's matched posting_ids into one space-separated string.
test["matches"] = test.apply(combine_matches,axis=1)

In [None]:
# Write the submission file (posting_id, matches) and read it back as a quick check.
test[["posting_id","matches"]].to_csv("submission.csv",index = False)
sub = pd.read_csv('submission.csv')
sub.head()