In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
import random
from PIL import Image
import cv2 as cv
import tensorflow as tf 
import tensorflow_hub as hub 
import shutil 
shutil.copy(src="../input/tokenization/tokenization.py",dst="./")
! cp  -r "../input/bert-en-uncased-l-12-h-768-a-12-1" "./"
import tokenization
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import math
from tensorflow.keras.utils import Sequence 
from tensorflow.keras.applications import EfficientNetB0

# Load data:

In [None]:
# Read the test metadata and point `images_path` at the matching image folder.
test = pd.read_csv("../input/shopee-product-matching/test.csv")
images_path = "../input/shopee-product-matching/test_images"
# When test.csv holds at most 3 rows, switch to the training CSV and images
# (presumably the public test file is only a small stub -- TODO confirm).
# Note: the frame keeps the name `test` even when it contains the training data.
if len(test) <= 3 :
    test = pd.read_csv("../input/shopee-product-matching/train.csv")
    images_path = "../input/shopee-product-matching/train_images"

# Useful functions :

In [None]:
# Random seed used by processing_data() for the train/validation split and by
# bert_model_trainAndsave() through seed_everything().
SEED = 42

In [None]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer owns a trainable matrix ``W`` of shape
    ``(embedding_dim, n_classes)``. It is called with a pair
    ``[embeddings, labels]`` and returns the scaled cosine similarities
    between the L2-normalised embeddings and the L2-normalised columns of
    ``W``, with the angular margin ``m`` applied to the column selected by
    each label.

    Arguments:
        n_classes: number of classes (columns of ``W``).
        s: scale factor applied to the output.
        m: angular margin, in radians.
        easy_margin: if True, the margin is applied only where cosine > 0;
            otherwise it is applied where cosine > cos(pi - m) and
            ``cosine - sin(pi - m) * m`` is used elsewhere.
        ls_eps: label-smoothing factor applied to the one-hot labels
            (only used when > 0).

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants derived from the margin, computed once.
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Threshold and fallback offset used when easy_margin is False.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the layer config, including the constructor arguments.'''
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        '''Create ``W``; ``input_shape[0]`` is the shape of the embeddings.'''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''
        Compute the margin-adjusted, scaled cosine logits.

        Arguments:
            inputs: pair ``(X, y)`` where ``X`` holds one embedding per row
                and ``y`` holds the class index of each row.

        Returns:
            Tensor with one row per embedding and ``n_classes`` columns.
        '''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Floating-point rounding can push |cosine| marginally above 1, which
        # makes 1 - cosine^2 negative and sqrt() return NaN. Because the NaN
        # is later multiplied by the one-hot mask (0 * NaN == NaN) it would
        # poison every logit of the row. Clamping to a small positive epsilon
        # also keeps the sqrt gradient finite.
        sine = tf.math.sqrt(
            tf.math.maximum(
                1.0 - tf.math.pow(cosine, 2),
                tf.keras.backend.epsilon()
            )
        )
        # cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted value for the labelled class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [None]:
def clean(title):
    """Clean a product title by replacing symbols and punctuation with spaces.

    Steps, in order:
      1. every "&" is replaced by the word "and" (no surrounding spaces are added);
      2. every non-word character and every underscore is replaced by a space;
      3. runs of whitespace are collapsed to a single space.

    Leading/trailing whitespace is not stripped.

    @ params :
    title(str) : the title text that the function will clean up.

    @ returns :
    title(str) : cleaned title
    """
    title = re.sub(r"&", "and", title)
    # `\W` covers "-", "+", "|", "\" and every other punctuation mark except
    # "_", which counts as a word character and must be listed explicitly.
    # (The previous per-character loop used the raw string r"f{p}" instead of
    # an f-string, so it never matched and underscores were left in place.)
    title = re.sub(r"[\W_]", " ", title)
    title = re.sub(r"\s+", " ", title)

    return title

In [None]:
def seed_everything(seed):
    """Seed the `random`, NumPy and TensorFlow generators with `seed`.

    Also stores `str(seed)` in the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)
def get_lr_callback():
    """Build the learning-rate schedule callback used for training.

    The schedule ramps linearly from 1e-6 up to 5e-6 * BATCH_SIZE over the
    first 5 epochs, holds the peak for 0 epochs, then decays geometrically
    (factor 0.8 per epoch) towards 1e-6.

    @ returns :
    tf.keras.callbacks.LearningRateScheduler wrapping that schedule.
    """
    base_lr = 0.000001
    peak_lr = 0.000005 * BATCH_SIZE
    floor_lr = 0.000001
    warmup_epochs = 5
    hold_epochs = 0
    decay_rate = 0.8

    def lrfn(epoch):
        # Linear warm-up phase.
        if epoch < warmup_epochs:
            return (peak_lr - base_lr) / warmup_epochs * epoch + base_lr
        # Optional plateau at the peak rate.
        if epoch < warmup_epochs + hold_epochs:
            return peak_lr
        # Exponential decay towards the floor.
        return (peak_lr - floor_lr) * decay_rate**(epoch - warmup_epochs - hold_epochs) + floor_lr

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)

def max_length(text):
    """Return the largest whitespace-separated word count in `text`, capped at 512.

    @ params :
    text : iterable of strings.

    @ returns :
    int : 0 for an empty iterable, otherwise min(512, longest word count).
    """
    longest = max((len(sentence.split()) for sentence in text), default=0)
    return min(512, longest)
def processing_data(df):
    """Encode the labels and split the titles into train / validation parts.

    Adds an "encoded_label_group" column to `df` (in place) holding the
    LabelEncoder-encoded "label_group", then performs a stratified 67/33
    split of "cleaned_title" against the encoded labels using SEED.

    @ returns :
    (n_classes, train_titles, valid_titles, train_labels, valid_labels)
    """
    encoder = LabelEncoder()
    df["encoded_label_group"] = encoder.fit_transform(df["label_group"])
    n_classes = df["encoded_label_group"].nunique()

    titles = df["cleaned_title"].values
    labels = df["encoded_label_group"].values
    train_titles, valid_titles, train_labels, valid_labels = train_test_split(
        titles,
        labels,
        stratify=labels,
        test_size=0.33,
        random_state=SEED,
    )
    return n_classes, train_titles, valid_titles, train_labels, valid_labels
def bert_encode(text,tokenizer,max_len=512):
    """Return token ids, attention masks and segment ids for a series or array of texts.

    Each text is tokenized, truncated to `max_len - 2` tokens, wrapped in
    '[CLS]' ... '[SEP]', converted to ids and right-padded with 0 up to
    `max_len`. The mask is 1 for real tokens and 0 for padding; the segment
    ids are all 0.

    @ params :
    text : iterable of strings.
    tokenizer : object exposing `tokenize(str)` and `convert_tokens_to_ids(list)`.
    max_len(int) : length of every encoded row; must be at least 2 so that
                   '[CLS]' and '[SEP]' fit.

    @ returns :
    (tokens, masks, segments) : three numpy arrays, one row per text.

    @ raises :
    ValueError : if `max_len` is smaller than 2.
    """
    # With max_len < 2 the slice bound below would go negative and the rows
    # would come out longer than max_len, so reject it up front.
    if max_len < 2:
        raise ValueError(f"max_len must be at least 2, got {max_len}")

    all_tokens = []
    all_mask = []
    all_segments = []

    for tx in text:
        pieces = tokenizer.tokenize(tx)[:max_len - 2]
        ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + pieces + ['[SEP]'])
        seq_len = len(ids)
        pad = max_len - seq_len
        all_tokens.append(ids + [0] * pad)
        all_mask.append([1] * seq_len + [0] * pad)
        all_segments.append([0] * max_len)

    return np.array(all_tokens),np.array(all_mask),np.array(all_segments)

def build_bert_model (bert_layer,max_len=512) :
    """Build and compile the BERT + ArcMarginProduct classifier.

    The model takes four inputs (token ids, mask, segment ids, label). The
    first-position vector of the BERT sequence output goes through batch
    normalisation and dropout, then into an ArcMarginProduct head with
    N_CLASSES classes (module-level global), followed by a softmax.

    @ params :
    bert_layer : callable layer that, given [ids, mask, segments], returns a
                 pair whose second element is the per-token sequence output.
    max_len(int) : length of the three token-level inputs.

    @ returns :
    tf.keras Model compiled with Adam(1e-5), sparse categorical
    cross-entropy and the accuracy metric.
    """
    margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )
    inputs_ids = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32,name="input_ids")
    inputs_mask = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32,name="inputs_mask")
    inputs_segment = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32,name="inputs_segment")
    # The label is a model input because the margin head needs it to know
    # which class column receives the angular margin.
    label = tf.keras.layers.Input(shape=(),dtype=tf.int32,name="label")

    _,sequence_output = bert_layer([inputs_ids,inputs_mask,inputs_segment])
    # Keep only the vector at position 0 (the '[CLS]' slot added by bert_encode).
    clf_output = sequence_output[:,0,:]
    clf_output = tf.keras.layers.BatchNormalization()(clf_output)
    clf_output = tf.keras.layers.Dropout(0.4)(clf_output)
    x = margin([clf_output,label])
    output = tf.keras.layers.Softmax()(x)
    model = tf.keras.models.Model(
        inputs=[inputs_ids,inputs_mask,inputs_segment,label],
        outputs=[output])

    # `metrics` is documented as a list; a bare string is rejected by newer
    # Keras releases, so pass ["accuracy"].
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                  loss=tf.keras.losses.sparse_categorical_crossentropy,
                  metrics=["accuracy"])
    return model

def bert_model_trainAndsave(xtr,xts,ytr,yts):
    """Fine-tune the BERT + ArcMargin model and checkpoint its best weights.

    Seeds the RNGs with SEED, loads the BERT hub layer and its tokenizer,
    encodes the train and validation titles, builds the model and fits it
    for EPOCHS epochs with batch size BATCH_SIZE. The weights with the
    lowest validation loss are written to "bert_weight.h5".

    @ params :
    xtr, xts : train / validation titles (iterables of strings).
    ytr, yts : train / validation integer class labels.

    @ returns :
    None
    """
    seed_everything(SEED)
    bert_layer = hub.KerasLayer("../input/bert-en-uncased-l-12-h-768-a-12-1",
                                trainable=True)
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    # Derive the sequence length from the titles actually passed in rather
    # than from the module-level `test` frame, so the function depends only
    # on its arguments.
    # NOTE(review): max_length() counts whitespace-separated words, while
    # bert_encode() truncates to max_len - 2 tokenizer tokens, so titles that
    # tokenize into more pieces than that are truncated -- confirm intended.
    max_le = max(max_length(xtr), max_length(xts))
    xtr_encoded = bert_encode(xtr,tokenizer,max_len=max_le)
    xts_encoded = bert_encode(xts,tokenizer,max_len=max_le)
    y_train = ytr
    y_val = yts
    # The labels are also fed as the fourth model input (needed by the margin head).
    x_train = (xtr_encoded[0],xtr_encoded[1],xtr_encoded[2],y_train)
    x_val = (xts_encoded[0],xts_encoded[1],xts_encoded[2],y_val)
    md = build_bert_model(bert_layer,max_len=max_le)
    checkpoints = tf.keras.callbacks.ModelCheckpoint("bert_weight.h5",
                                                monitor="val_loss",
                                                verbose=VERBOSE,
                                                save_best_only = True,
                                                save_weights_only = True,
                                                mode = "min")

    md.fit(x_train,y_train,validation_data=(x_val,y_val),epochs=EPOCHS,
           callbacks=[checkpoints,get_lr_callback()],
           batch_size = BATCH_SIZE,verbose=VERBOSE)

# Data preparation :

In [None]:
# For every label_group, collect the unique posting_ids that belong to it ...
label_group = test.groupby("label_group").posting_id.unique()
# ... and attach that array to each row of the group as its `target`.
test["target"] = test["label_group"].map(label_group)

In [None]:
# Apply clean() to every title and store the result in a new column.
test["cleaned_title"] = test["title"].map(clean)

In [None]:
# Encode the labels and split the cleaned titles into train / validation sets.
# N_CLASSES is read as a global by build_bert_model().
N_CLASSES,xtr,xts,ytr,yts = processing_data(test)

# Modeling :

In [None]:
# Training settings, read as globals by get_lr_callback() and
# bert_model_trainAndsave().
VERBOSE = 1
EPOCHS = 30
BATCH_SIZE = 32
# NOTE(review): SEED is already assigned the same value in an earlier cell.
SEED = 42

In [None]:
# Fine-tune the model; the best weights are written to "bert_weight.h5".
bert_model_trainAndsave(xtr,xts,ytr,yts)