# About this Notebook

In the quest to generate more relevant and better embeddings, here is another technique that I thought might be useful. This notebook describes <b>MASK-token-prediction (BERT-style) pretraining of transformer models on our dataset</b>. My intuition was that if we pretrain BERT or any other model using the masked-word prediction task, and then fine-tune that model using ArcFace loss or a simple classification task, it might do a better job of creating good embeddings because it would have a better idea of the words in the titles.

However, it didn't give any significant boost to the performance of xlm-roberta; the reason, I feel, is that the test set is very different from the train set. It was a good learning experience for me though. I am sharing it with the community because I feel many more can learn from it, and I would also like to know if I have made any mistake while implementing this.

Happy Learning

In [None]:
# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler

import transformers
from transformers import AutoModelForMaskedLM
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

# Configuration

In [None]:
# Training hyper-parameters.
NUM_WORKERS = 4          # DataLoader worker processes
TRAIN_BATCH_SIZE = 32
EPOCHS = 2
SEED = 2020
LR = 3e-5                # peak learning rate handed to the optimizer

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

################################################# MODEL ####################################################################

transformer_model = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)
CONFIG = transformers.AutoConfig.from_pretrained(transformer_model)

############################################################################################################################
# Ask the tokenizer for the id of its mask token instead of hard-coding one
# value per checkpoint name: this yields the same ids for the checkpoints
# that used to be listed here and keeps `mask_tok` defined for any other
# masked-LM checkpoint.
mask_tok = TOKENIZER.mask_token_id
if mask_tok is None:
    raise ValueError(
        f"Tokenizer for '{transformer_model}' defines no mask token; "
        "masked-language-model pretraining needs one."
    )

# Utils

In [None]:
class AverageMeter:
    """Keep a running, weighted mean of a scalar.

    Attributes:
        val: the value passed to the most recent ``update`` call.
        sum: accumulated ``val * n`` over all updates since the last reset.
        count: accumulated ``n`` over all updates since the last reset.
        avg: ``sum / count`` as of the most recent update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        weighted = val * n
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

# Dataset

In [None]:
class ShopeeDataset(Dataset):
    """Serve product titles as masked-language-modelling training examples.

    Every item is a pair ``(input_ids, labels)`` of ``torch.long`` tensors.
    ``input_ids`` is the tokenized title (padded/truncated to 64 tokens) with
    a random subset of tokens corrupted; ``labels`` holds the original id at
    the selected positions and ``-100`` everywhere else.

    Args:
        csv: a pandas DataFrame with a ``title`` column.
    """

    def __init__(self, csv):
        self.csv = csv.reset_index()

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        row = self.csv.iloc[index]
        text = row.title

        # Only the token ids are needed: the model is called with
        # `input_ids` and `labels` alone.
        text = TOKENIZER(text,
                         return_attention_mask=False,
                         return_token_type_ids=False,
                         padding='max_length',
                         truncation=True,
                         max_length=64)

        input_ids = text['input_ids']

        # A fresh random mask is drawn on every access, so the same title is
        # corrupted differently each time it is served.
        input_ids, labels = self.prepare_mlm_input_and_labels(np.array(input_ids))

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        labels = torch.tensor(labels, dtype=torch.long)

        return input_ids, labels

    def prepare_mlm_input_and_labels(self, X):
        """Apply BERT-style random masking to one sequence of token ids.

        Roughly 15% of the non-special positions are selected as prediction
        targets. Of those, about 80% are replaced by the mask token, about
        10% by a random token id, and about 10% are left unchanged.

        Args:
            X: integer numpy array of token ids.

        Returns:
            ``(X_mlm, labels)``: the corrupted copy of ``X`` and an array of
            the same shape holding the original id at selected positions and
            ``-100`` elsewhere.
        """
        # 15% BERT masking
        inp_mask = np.random.rand(*X.shape) < 0.15
        # Never select special tokens (padding, sequence delimiters, ...).
        # Ask the tokenizer which ids are special rather than assuming they
        # are the ids 0..2, which only holds for some vocabularies.
        inp_mask[np.isin(X, TOKENIZER.all_special_ids)] = False
        # Labels default to -100, the value the loss treats as "ignore".
        labels = np.full(X.shape, -100, dtype=int)
        # Selected positions must predict their original token.
        labels[inp_mask] = X[inp_mask]

        # Build the corrupted input.
        X_mlm = np.copy(X)
        # 90% of the selected positions become the mask token, which leaves
        # 10% of the selected positions unchanged.
        inp_mask_2mask = inp_mask & (np.random.rand(*X.shape) < 0.90)
        X_mlm[inp_mask_2mask] = mask_tok

        # 1/9 of those 90% (i.e. 10% of all selected positions) are then
        # overwritten with a random token id instead of the mask token.
        # NOTE(review): the lower bound 3 skips ids 0-2 only; for other
        # vocabularies the draw may still return a special-token id.
        inp_mask_2random = inp_mask_2mask & (np.random.rand(*X.shape) < 1/9)
        X_mlm[inp_mask_2random] = np.random.randint(3, CONFIG.vocab_size, inp_mask_2random.sum())

        return X_mlm, labels

# Loss

In [None]:
def masked_categorical_crossentropy(output,target):
    """Cross-entropy restricted to positions whose target is not -100.

    Args:
        output: logits tensor whose leading dimensions match ``target`` and
            whose last dimension indexes the classes.
        target: integer tensor of class ids, with -100 marking positions
            that must not contribute to the loss.

    Returns:
        The mean cross-entropy over the kept positions, as a scalar tensor.
    """
    keep = target != -100
    criterion = nn.CrossEntropyLoss()
    return criterion(output[keep], target[keep])

# Train Function

In [None]:
def train_fn(dataloader,model,optimizer,device,scheduler,epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch is indexed as ``batch[0]`` (input ids) and ``batch[1]``
    (labels). The model is called with ``input_ids`` and ``labels`` and the
    ``.loss`` of its output is back-propagated. The optimizer steps once per
    batch, and so does ``scheduler`` when it is not ``None``.

    Args:
        dataloader: iterable of batches with a defined ``len``.
        model: the network to optimise; put into training mode here.
        optimizer: optimizer stepped after every batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped after every batch, or
            ``None`` to skip scheduling.
        epoch: value shown in the progress bar; not otherwise used.

    Returns:
        The ``AverageMeter`` that accumulated the batch losses, weighted by
        batch size.
    """
    model.train()
    running_loss = AverageMeter()

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, batch in progress:
        token_ids = batch[0]
        n_samples = token_ids.shape[0]

        token_ids = token_ids.to(device, dtype=torch.long)
        mlm_labels = batch[1].to(device)

        optimizer.zero_grad()

        # Passing `labels` makes the model return its own loss.
        loss = model(input_ids=token_ids, labels=mlm_labels).loss
        loss.backward()
        optimizer.step()

        running_loss.update(loss.detach().item(), n_samples)
        progress.set_postfix(
            Train_Loss=running_loss.avg,
            Epoch=epoch,
            LR=optimizer.param_groups[0]['lr'],
        )

        # The schedule is defined per optimisation step, not per epoch.
        if scheduler is not None:
            scheduler.step()

    return running_loss

# Engine

In [None]:
# Load the training CSV into a DataFrame; run() wraps it in ShopeeDataset,
# which reads the `title` column of each row.
data = pd.read_csv('../input/shopee-product-matching/train.csv')

In [None]:
def _seed_worker(worker_id):
    """Seed NumPy and ``random`` inside a DataLoader worker process.

    The dataset draws its masks with ``np.random``. Depending on the PyTorch
    version, worker processes may all start from the same NumPy state and so
    produce identical "random" masks. Deriving the seed from the per-worker
    PyTorch seed avoids that.
    """
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


def run(warmup_ratio=0.1):
    """Pretrain the masked-language model on the titles and save it to './'.

    Builds the dataset and data loader from the module-level ``data`` frame,
    loads ``transformer_model`` with a masked-LM head, trains it for
    ``EPOCHS`` epochs with AdamW and a linear warm-up / linear decay
    schedule, then writes the model with ``save_pretrained('./')``.

    Args:
        warmup_ratio: fraction of the total optimisation steps spent
            linearly warming the learning rate up to ``LR``.
    """
    # Defining DataSet
    train_dataset = ShopeeDataset(
        csv=data
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        # Shuffle so batches differ between epochs and `drop_last` does not
        # discard the same trailing rows every time.
        shuffle=True,
        pin_memory=True,
        drop_last=True,
        num_workers=NUM_WORKERS,
        worker_init_fn=_seed_worker,
    )

    # Defining Model
    model = AutoModelForMaskedLM.from_pretrained(transformer_model)
    print(model)
    model.to(device)

    # Defining Optimizer with weight decay to params other than bias and layer norms
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_parameters, lr=LR)

    # Defining LR Scheduler.
    # The warm-up must be shorter than the whole run: with a warm-up longer
    # than `total_steps` the learning rate would never reach `LR` and never
    # decay. Express it as a fraction of the total number of steps.
    total_steps = len(train_loader) * EPOCHS
    warmup_steps = int(warmup_ratio * total_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # THE ENGINE LOOP
    for epoch in range(EPOCHS):
        train_fn(train_loader, model, optimizer, device, scheduler=scheduler, epoch=epoch)

    model.save_pretrained('./')

In [None]:
# Execute the pipeline: build the data loader, train for EPOCHS epochs and
# save the model to './'.
run()