In [None]:
# Copy the tokenizer helper module into the working directory so that the
# later `import tokenization` can find it.
! cp ../input/tokenization/tokenization.py  /kaggle/working

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import tensorflow as tf 
import cudf,cupy,cuml
import cv2 as cv 
from PIL import Image
import os
from tqdm import tqdm
import re 
import string
import tensorflow_hub as hub
import tokenization
from tensorflow.keras.optimizers import SGD 
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.utils import Sequence
import cv2 as cv
import gc

# 1. Load data:

In [None]:
# Start from the competition test split and its image folder.
images_path = "../input/shopee-product-matching/test_images"
test = pd.read_csv("../input/shopee-product-matching/test.csv")

# A test file with at most 3 rows is treated as a placeholder: in that case the
# training split (and its images) is loaded instead so the pipeline has data to run on.
is_placeholder_test = len(test) <= 3
if is_placeholder_test:
    images_path = "../input/shopee-product-matching/train_images"
    test = pd.read_csv("../input/shopee-product-matching/train.csv")

In [None]:
# Quick look at the first rows of the frame that the rest of the notebook works on.
test.head()

In [None]:
# Ground-truth matches: for every label group, the array of posting ids it contains.
# NOTE(review): `label_group` is presumably only present in the labelled training
# split; without this guard the groupby raises KeyError on an unlabelled frame.
targ = (
    test.groupby("label_group").posting_id.unique()
    if "label_group" in test.columns
    else None
)

In [None]:
# Attach the ground-truth match list to every row, but only when labels exist:
# accessing `test.label_group` on an unlabelled frame would raise AttributeError.
if "label_group" in test.columns:
    test["target"] = test.label_group.map(targ)

# 2. Useful functions:

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column `col`.

    @ params :
    col(str) : name of the row field holding the predicted posting ids.

    @ returns :
    f1score(callable) : function taking a row (with a `target` field and a `col`
    field) and returning 2 * |target ∩ prediction| / (|target| + |prediction|).
    """
    def f1score(row):
        truth, guess = row.target, row[col]
        # intersect1d returns the unique common ids as a 1-D array.
        overlap = np.intersect1d(truth, guess).size
        return (2 * overlap) / (len(truth) + len(guess))
    return f1score

In [None]:
def clean(title):
    """This function cleans a title from useless characters and symbols.

    "&" is spelled out as "and"; every other non-word character and every
    punctuation character (underscore included) becomes a space; runs of
    whitespace are then collapsed to a single space. The result is not stripped.

    @ params :
    title(str) : the title text that the function will clean up.

    @ returns :
    title(str) : cleaned title
    """
    # Must happen before the generic substitution, which would erase "&".
    title = title.replace("&", "and")
    # `\W` covers "-", "+", "|", "\\" and all other punctuation except "_",
    # which is a word character and therefore has to be listed explicitly.
    # (The previous per-character loop used r"f{p}" -- a raw string, not an
    # f-string -- so it never replaced any punctuation at all.)
    title = re.sub(r"[\W_]", " ", title)
    title = re.sub(r"\s+", " ", title)

    return title

In [None]:
def combine_matches(row):
    """Serialise the predicted ids in `row.pred` as one space-separated string."""
    separator = " "
    return separator.join(row.pred)

In [None]:
def combine(row):
    """Merge the image and text predictions of a row.

    @ returns :
    Sorted array of the distinct ids found in `row.pred_img` or `row.pred_text`.
    """
    merged = np.concatenate((row.pred_img, row.pred_text))
    # np.unique both de-duplicates and sorts.
    return np.unique(merged)

# 3. Modeling:

In [None]:
# Ground truth for the CV score: map every row to the posting ids sharing its label group.
# NOTE(review): `label_group` is presumably missing from an unlabelled test frame;
# the guard keeps the notebook running in that case instead of raising KeyError.
if "label_group" in test.columns:
    label_group = test.groupby("label_group").posting_id.unique()
    test["target"] = test.label_group.map(label_group)

# Text matching :

In [None]:
# Normalise every title with `clean` before it is tokenised.
test["cleaned_title"] = test["title"].map(clean)

In [None]:
class BertEmbedding:
    """Frozen BERT encoder that maps raw texts to their `[CLS]` token embeddings.

    Usage: `BertEmbedding().predict(texts)`; `texts` is any iterable of strings
    that can be iterated more than once.
    """

    # Upper bound for the padded sequence length.
    # NOTE(review): 512 is the usual position-embedding limit of BERT-base;
    # confirm it for the hub module loaded below.
    MAX_SEQUENCE_LENGTH = 512

    def __init__(self):
        # Padded sequence length; recomputed by `length` / `encode`.
        self.max_length = 0
        self.bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",\
                        trainable = False)
        # The tokenizer is built from the vocabulary shipped with the hub module.
        vocab_file = self.bert.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = self.bert.resolved_object.do_lower_case.numpy()
        self.tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    def _fit_max_length(self, token_lists):
        """Set `max_length` so the longest token list plus [CLS]/[SEP] fits."""
        longest = max((len(tokens) for tokens in token_lists), default=0)
        # +2 leaves room for the two special tokens, so max_length is always >= 2
        # and neither the truncation slice nor the pad length can go negative.
        self.max_length = min(longest + 2, self.MAX_SEQUENCE_LENGTH)

    def length(self,text):
        """Compute and store the padded sequence length needed for `text`.

        The length is measured in tokenizer tokens (not whitespace-separated
        words), because that is the unit the encoded sequences are made of.
        """
        self._fit_max_length([self.tokenizer.tokenize(tx) for tx in text])

    def encode(self,text):
        """Turn texts into the three fixed-length integer inputs of the model.

        @ params :
        text : iterable of strings.

        @ returns :
        (token ids, attention mask, segment ids) as three numpy arrays with one
        row per text and `self.max_length` columns. Padding uses id 0, the mask
        is 1 on real tokens and 0 on padding, segment ids are all 0.
        """
        # Tokenize once and reuse the result for both sizing and encoding.
        token_lists = [self.tokenizer.tokenize(tx) for tx in text]
        self._fit_max_length(token_lists)

        all_tokens = []
        all_mask = []
        all_segments = []
        for pieces in token_lists:
            # Truncation only happens when the MAX_SEQUENCE_LENGTH cap is hit.
            pieces = ['[CLS]'] + pieces[:self.max_length - 2] + ['[SEP]']
            ids = self.tokenizer.convert_tokens_to_ids(pieces)
            used = len(ids)
            pad_len = self.max_length - used
            all_tokens.append(ids + [0] * pad_len)
            all_mask.append([1] * used + [0] * pad_len)
            all_segments.append([0] * self.max_length)
        return np.asarray(all_tokens),np.asarray(all_mask),np.asarray(all_segments)

    def embedding_model(self):
        """Build a Keras model returning the first-position (`[CLS]`) vector.

        The inputs are sized with the current `self.max_length`, so `encode`
        (or `length`) has to run first.
        """
        input_words_ids = tf.keras.layers.Input(shape=(self.max_length,),dtype=tf.int32)
        input_mask = tf.keras.layers.Input(shape=(self.max_length,),dtype=tf.int32)
        segments_ids = tf.keras.layers.Input(shape=(self.max_length,),dtype=tf.int32)
        pooled_output,sequence_output = self.bert([input_words_ids,input_mask,segments_ids])
        # Keep only position 0 of the sequence output as the sentence vector.
        x = sequence_output[:,0,:]
        model = tf.keras.Model(inputs=[input_words_ids,input_mask,segments_ids],outputs= x)
        # The model is only used for inference here; the compile settings are
        # kept so callers receiving the model see the same object as before.
        model.compile(optimizer =SGD(1e-3,momentum=0.8),loss="binary_crossentropy",metrics=['accuracy'])
        return model

    def predict(self,text):
        """Encode `text` and return one embedding row per input string."""
        data = self.encode(text)
        model = self.embedding_model()
        return model.predict(data)

In [None]:
# Embed every cleaned title with BERT, then move the matrix to the GPU (CuPy)
# for the nearest-neighbour search.
bert_text_embedding = BertEmbedding()
cleaned_titles = test["cleaned_title"].values
tx_embedding = cupy.array(bert_text_embedding.predict(cleaned_titles))

In [None]:
from cuml.neighbors import NearestNeighbors

# Index the title embeddings; every query returns its 50 closest rows.
text_neighbor_count = 50
nb = NearestNeighbors(n_neighbors=text_neighbor_count)
nb.fit(tx_embedding)

In [None]:
# Query the text index in chunks and keep, for every row, the posting ids of the
# neighbours whose distance is below 3.
chunk = 1024 *4
cl = len(test) // chunk
cl += int((len(test)% chunk) != 0)
pr=[]
for i in range(cl) :
    a = i * chunk
    b = (i+1) * chunk
    b = min(b,len(test))
    distances,indices = nb.kneighbors(tx_embedding[a:b,])
    # The query is a CuPy array, so the results presumably come back on the GPU;
    # pandas' positional indexer needs host (NumPy) integers. `cupy.asnumpy`
    # also accepts NumPy input, so this is safe either way.
    distances = cupy.asnumpy(distances)
    indices = cupy.asnumpy(indices)
    for j in range(b-a):
        distance = distances[j,]
        ind = np.where(distance < 3)[0]
        ind = indices[j,ind]
        pr.append(test.iloc[ind].posting_id.values)
test["pred_text"] = pr

In [None]:
# The F1 score needs the ground-truth `target` column; skip the evaluation when
# the frame is unlabelled instead of failing with an AttributeError.
if "target" in test.columns:
    test["f1"] = test.apply(getMetric("pred_text"),axis=1)

    print('CV score for tf embedding text =',test.f1.mean())

### Image matching:

In [None]:
class DataGenerator(Sequence):
    """Keras `Sequence` that yields batches of resized images for inference.

    @ params :
    df : frame with an `image` column holding file names relative to `path`.
    batch_size(int) : number of images per batch (the last batch may be smaller).
    path(str) : directory containing the image files.
    img_size(int) : images are resized to `img_size` x `img_size`.

    Each item is a float32 array of shape (batch, img_size, img_size, 3) in RGB
    channel order with unscaled pixel values; no labels are returned.
    """
    def __init__(self,df,batch_size,path,img_size):
        # Let the base class initialise itself (a no-op on older Keras versions).
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.path = path
        self.img_size = img_size
        self.indexes = np.arange(len(df))

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        cl = (len(self.df) // self.batch_size)
        cl += int((len(self.df) % self.batch_size) !=0)
        return cl

    def __getitem__(self,ind):
        """Return the image batch number `ind`."""
        indices = self.indexes[ind * self.batch_size : (ind +1) * self.batch_size]
        X = self.__data_generation(indices)
        return X

    def __data_generation(self,indices):
        """Load, colour-convert and resize the images at positions `indices`."""
        ddf = self.df.iloc[indices]
        images = np.zeros((len(ddf),self.img_size,self.img_size,3),dtype="float32")
        for i, image_name in enumerate(ddf["image"]):
            image_path = os.path.join(self.path,image_name)
            img = cv.imread(image_path)
            # imread signals failure by returning None rather than raising.
            if img is None:
                raise FileNotFoundError(f"Could not read image: {image_path}")
            # OpenCV decodes to BGR; the pretrained network expects RGB input.
            img = cv.cvtColor(img,cv.COLOR_BGR2RGB)
            img = cv.resize(img,(self.img_size,self.img_size))
            images[i,] = img
        return images
                            

In [None]:
# EfficientNetB0 backbone without its classification head; global average
# pooling turns the last feature map into one embedding vector per image.
WGT = "../input/effnetb0/efficientnetb0_notop.h5"
model = EfficientNetB0(
    include_top=False,
    weights=WGT,
    pooling="avg",
    input_shape=None,
)

In [None]:
# Embed the images chunk by chunk so only one chunk's generator is alive at a time.
chunk = 1024 * 4
cls = (len(test) + chunk - 1) // chunk  # ceil(len(test) / chunk)
image_embedding = []
for i in tqdm(range(cls)):
    a = i * chunk
    b = min(a + chunk, len(test))
    # Batches of 32 images, resized to 256 x 256.
    data = DataGenerator(test.iloc[a:b], 32, images_path, 256)
    emb = model.predict(data, use_multiprocessing=True, workers=4)
    image_embedding.append(emb)

# Stack the per-chunk results into one matrix, then drop the network to free memory.
image_embedding = np.concatenate(image_embedding, axis=0)
del model
gc.collect()

In [None]:
from cuml.neighbors import NearestNeighbors

# Index the image embeddings with cosine distance; each query returns 50 neighbours.
neighbors = NearestNeighbors(metric="cosine", n_neighbors=50)
neighbors.fit(image_embedding)

In [None]:
# Query the image index in chunks; a neighbour counts as a match when its
# cosine distance is below 0.2.
chunk = 4 * 1024
cl = (len(test) + chunk - 1) // chunk  # ceil(len(test) / chunk)
prediction = []
for i in tqdm(range(cl)):
    a = i * chunk
    b = min(a + chunk, len(test))
    distances, indices = neighbors.kneighbors(image_embedding[a:b,])
    # One (distances, neighbour positions) pair per queried row.
    for distance, candidates in zip(distances, indices):
        IND = candidates[distance < 0.2]
        prediction.append(test.iloc[IND].posting_id.values)
test["pred_img"] = prediction

In [None]:
# Score the image-only predictions. Requires the ground-truth `target` column,
# so the evaluation is skipped on an unlabelled frame.
if "target" in test.columns:
    test["f1"] = test.apply(getMetric("pred_img"),axis=1)

    # The label previously said "text" although this is the image score.
    print('CV score for tf embedding image =',test.f1.mean())

In [None]:
# Union of image and text matches: always needed, it feeds the submission.
test["pred"] = test.apply(combine,axis=1)

# The score itself needs ground truth; skip it when `target` is unavailable so
# the submission cells below still run.
if "target" in test.columns:
    test["f"] = test.apply(getMetric("pred"),axis=1)

    print('CV score for baseline =',test.f.mean())

In [None]:
# Submission format: the matched posting ids of each row as one space-separated string.
test["matches"] = test["pred"].map(" ".join)

In [None]:
# Write the two submission columns, then read the file back as a sanity check.
submission_columns = ["posting_id", "matches"]
test[submission_columns].to_csv("submission.csv", index=False)
sub = pd.read_csv("submission.csv")
sub.head()

# NOTE(review): `products_descriptors` is not defined anywhere in the visible
# notebook -- on a fresh kernel this raises NameError. Confirm whether this
# fragment is still needed.
# Stack the second element of every tuple into one array. Collecting the pieces
# first and stacking once avoids re-copying the growing array on every iteration.
descriptor_blocks = [tup[1] for tup in tqdm(products_descriptors)]
# A single entry is kept as-is (no stacking), matching the previous behaviour.
descriptors = descriptor_blocks[0] if len(descriptor_blocks) == 1 else np.vstack(descriptor_blocks)