In [None]:
# Prefix under which the tokenizer copy and the trained checkpoint are written.
SAVE_PATH = './'

# CSV that import_data() reads by default.
TRAIN_PATH = '../input/pseudolabels-deberta-base/pseudolabels.csv'

# Label columns pulled from the data frame and used as regression targets.
TARGET_COLS = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

In [None]:
%%capture
!pip install iterative-stratification
!pip install sentencepiece
!pip install transformers==4.21.2
#!pip install iterative-stratification --no-index --find-links=file:../input/iterstratification/iterstrat

import warnings
import sentencepiece
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000) 
from tqdm import tqdm
import transformers
import torch
import torch.nn as nn
from torch import autocast
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from transformers import AutoTokenizer, AutoModel, AutoConfig, BertModel, BertTokenizer

print('Transformer Version: ', transformers.__version__)

In [None]:
class cfg:
    """Static run configuration, used as a plain namespace (never instantiated).

    Every setting is declared exactly once. The earlier version assigned
    `epochs` twice (10, then 4) and `eps` twice; only the last assignment
    took effect, so the values below are the ones that were actually in use.
    A `tokenizer` attribute is attached to this class later in the notebook.
    """
    # --- backbone ---
    model= 'microsoft/deberta-v3-base'
    gradient_checkpointing=True

    # --- data loading ---
    num_workers=4
    batch_size=2
    target_cols=TARGET_COLS
    #max_len=512

    # --- run control ---
    seed=42
    train=True
    epochs=4
    print_freq = 100

    # --- optimiser ---
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    weight_decay=0.01
    eps=1e-6
    betas=(0.9, 0.999)

    # --- scheduler (options kept for reference; currently disabled) ---
    #scheduler='cosine' # ['linear', 'cosine']
    #batch_scheduler=True
    #num_cycles=0.5
    num_warmup_steps=0





def import_data(tr=TRAIN_PATH):
  """Read the CSV at `tr` (defaults to TRAIN_PATH) and return it as a DataFrame."""
  return pd.read_csv(tr)

def replace_nl(df_train, df_test):
  """Delete newlines, carriage returns, tabs and backslashes from `full_text`.

  Both frames are modified in place (the `full_text` column is overwritten)
  and the same two objects are returned as a tuple. The matched characters
  are removed outright, not replaced by a space.
  """
  for frame in (df_train, df_test):
    stripped = frame['full_text'].str.replace(pat=r'[\n\r\t\\]', repl=r'', regex=True)
    frame['full_text'] = stripped
  return df_train, df_test

def train_test_split(df, frac=0.2, random_state=None):
    """Split `df` row-wise into a (train, test) pair.

    Args:
        df: DataFrame to split. Rows are removed from the train part by index
            label, so the index is expected to be unique.
        frac: Fraction of rows sampled (without replacement) into the test part.
        random_state: Forwarded to `DataFrame.sample`. Pass an int to get the
            same split on every call; the default `None` keeps the previous
            unseeded behaviour.

    Returns:
        (train, test): `test` is the sampled rows, `train` is everything else.
    """
    # Draw the held-out rows first ...
    test = df.sample(frac=frac, axis=0, random_state=random_state)

    # ... then keep whatever was not drawn.
    train = df.drop(index=test.index)

    return train, test

def load_prepare():
  """Load the training CSV and split it 80/20 into train and validation frames.

  The split is seeded with cfg.seed so the validation set is the same on
  every run of the notebook.
  """
  df_train = import_data()
  df_train, df_val = train_test_split(df_train, frac=0.2, random_state=cfg.seed)
  return df_train, df_val



In [None]:
# Read the training CSV and split it into the training and validation frames
# used by the rest of the notebook.
df_train, df_val = load_prepare()

In [None]:
# Display the training frame as the cell output (visual sanity check only).
df_train

In [None]:
# Load the tokenizer that matches cfg.model and attach it to cfg so the
# dataset can reach it; also write a copy under SAVE_PATH.
tokenizer = AutoTokenizer.from_pretrained(cfg.model)
cfg.tokenizer = tokenizer
tokenizer.save_pretrained(SAVE_PATH+'tokenizer/')



#Dataset Deberta Base
class Dataset_Db(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item and pairs it with its labels.

    Args:
        cfg: Configuration object; must expose `target_cols` (label column
            names) and `tokenizer` (a callable tokenizer).
        df: DataFrame with a `full_text` column and every column in
            `cfg.target_cols`.
        max_len: Padded/truncated sequence length. When omitted, `cfg.max_len`
            is used if it exists, otherwise 1450 (the previous fixed value).
    """

    DEFAULT_MAX_LEN = 1450

    def __init__(self, cfg, df, max_len=None):
        self.cfg = cfg
        self.labels = df[cfg.target_cols].values
        self.texts = df[["full_text"]].values
        if max_len is None:
            max_len = getattr(cfg, 'max_len', self.DEFAULT_MAX_LEN)
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return `(encoded, labels)` for row `idx`.

        `encoded` is the tokenizer output with every field converted to a
        long tensor padded to `self.max_len`; `labels` is a float tensor with
        one value per target column.
        """
        # Use the tokenizer from the cfg this dataset was built with, not a
        # module-level global. `padding='max_length'` alone is sufficient; the
        # deprecated `pad_to_max_length` flag is no longer passed.
        batch_texts = self.cfg.tokenizer(self.texts[idx][0],
                                         padding='max_length',
                                         max_length=self.max_len,
                                         truncation=True,
                                         return_tensors=None,
                                         add_special_tokens=True)

        # Convert in place so the tokenizer's own container type is returned.
        for k, v in batch_texts.items():
            batch_texts[k] = torch.tensor(v, dtype=torch.long)

        batch_y = torch.tensor(self.labels[idx], dtype=torch.float)
        return batch_texts, batch_y


#Model Deberta Base

class MeanPooling(nn.Module):
    """Average hidden states along dim 1, counting only positions the mask keeps."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of `last_hidden_state` over dim 1.

        `attention_mask` gets a trailing axis and is expanded to the shape of
        `last_hidden_state` (presumably (batch, seq, hidden) - confirm with
        the caller). The divisor is clamped to 1e-9, so a fully masked row
        yields zeros instead of dividing by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_total = (last_hidden_state * weights).sum(dim=1)
        kept_count = weights.sum(dim=1).clamp(min=1e-9)
        return weighted_total / kept_count


class DBB(nn.Module):
    """Pretrained transformer backbone + masked mean pooling + linear regression head.

    Args:
        cfg: Configuration object. Reads `cfg.model` (checkpoint name), and
            optionally `cfg.gradient_checkpointing` and `cfg.target_cols`.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        # Zero every dropout knob on the config before the backbone is built.
        self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.
        self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        # Honour the config flag: it used to be declared but never applied to
        # the backbone, so long sequences ran without checkpointing.
        if getattr(cfg, 'gradient_checkpointing', False):
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        # One output per target column; falls back to 6 when the cfg does not
        # list any targets (the previous fixed width).
        n_targets = len(getattr(cfg, 'target_cols', ())) or 6
        self.out = nn.Linear(self.config.hidden_size, n_targets)
        self._init_weights(self.out)

    def _init_weights(self, module):
        """Initialise `module` in place using `config.initializer_range` as std.

        Linear: normal weights, zero bias. Embedding: normal weights, zeroed
        padding row. LayerNorm: zero bias, unit weight. Other types are left
        untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Run the backbone on `inputs` and return one row of predictions per sample.

        `inputs` is a mapping of keyword arguments for the backbone and must
        contain `attention_mask`, which is also used for the pooling step.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        pooled_output = self.pool(last_hidden_states, inputs['attention_mask'])
        final_out = self.out(pooled_output)
        return final_out
    
# ====================================================
#####Loss
#====================================================
class RMSELoss(nn.Module):
    """Element-wise sqrt(squared error + eps), followed by an optional reduction.

    NOTE(review): the square root is taken per element *before* reducing, so
    with reduction='mean' the result is close to a mean absolute error rather
    than the root of the mean squared error that the name suggests.

    Args:
        reduction: 'mean', 'sum' or 'none'. Any other value behaves like 'none'.
        eps: Added under the square root.
    """

    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        per_element = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'sum':
            return per_element.sum()
        if self.reduction == 'mean':
            return per_element.mean()
        # 'none' and unrecognised values: hand back the unreduced tensor.
        return per_element


def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build the three optimiser parameter groups for a backbone + head model.

    Args:
        model: Module whose backbone is stored in the `model` attribute;
            every other parameter is treated as part of the head.
        encoder_lr: Learning rate for the backbone groups.
        decoder_lr: Learning rate for the head group.
        weight_decay: Decay applied to backbone weights that are not biases
            or LayerNorm parameters.

    Returns:
        A list of three dicts (`params`, `lr`, `weight_decay`), in order:
        decayed backbone parameters, non-decayed backbone parameters, head
        parameters (never decayed).
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def _skips_decay(name):
        # True when the parameter name matches one of the no-decay markers.
        return any(marker in name for marker in no_decay)

    backbone = list(model.model.named_parameters())
    # Select head parameters by prefix. A plain substring test on "model"
    # would also exclude a head parameter that merely has "model" somewhere in
    # its name, leaving it out of the optimiser altogether.
    head = [p for n, p in model.named_parameters() if not n.startswith("model.")]

    return [
        {'params': [p for n, p in backbone if not _skips_decay(n)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in backbone if _skips_decay(n)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head,
         'lr': decoder_lr, 'weight_decay': 0.0},
    ]

def collate(inputs):
    """Trim every tensor in `inputs` to the longest unpadded length in the batch.

    The length is the largest row sum of `inputs["attention_mask"]`. Each
    entry is sliced along its second axis, the mapping is updated in place,
    and that same mapping is returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs



# Model Training

In [None]:
# Fine-tune DBB on df_train, score it on df_val after every epoch, then plot
# the validation curve and save the weights plus the last epoch's predictions.
import matplotlib.pyplot as plt

# Apply the configured seed so head initialisation and batch shuffling repeat
# from run to run.
torch.manual_seed(cfg.seed)

# One scaler for the whole run (it used to be rebuilt every epoch, which reset
# its scale factor). It is only active when training on CUDA.
scaler = torch.cuda.amp.GradScaler(enabled=(device.type == 'cuda'))

oof_df = pd.DataFrame()

valid_labels = df_val[cfg.target_cols].values

train_dataset = Dataset_Db(cfg, df_train)
valid_dataset = Dataset_Db(cfg, df_val)

train_loader = DataLoader(train_dataset,
                          batch_size=cfg.batch_size,
                          shuffle=True,
                          num_workers=cfg.num_workers,
                          pin_memory=True)
valid_loader = DataLoader(valid_dataset,
                          batch_size=cfg.batch_size * 2,
                          shuffle=False,
                          num_workers=cfg.num_workers,
                          pin_memory=True,
                          drop_last=False)

model = DBB(cfg)
model.to(device)

optimizer_parameters = get_optimizer_params(model,
                                            encoder_lr=cfg.encoder_lr,
                                            decoder_lr=cfg.decoder_lr,
                                            weight_decay=cfg.weight_decay)

optimizer = AdamW(optimizer_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas)

criterion = RMSELoss()

# Validation history must live outside the epoch loop; resetting it per epoch
# left a single point to plot.
ep = []
ep_losses = []

for epoch in range(cfg.epochs):

    # ---- train ----
    model.train()

    losses = []          # per-batch loss * batch size, stored as Python floats
    running_loss = 0.0   # running sum of `losses`, avoids re-summing each step
    counter = 0

    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)

        labels = labels.to(device)
        batch_size = labels.size(0)

        # Forward pass runs in full precision (autocast is not enabled).
        y_preds = model(inputs)
        loss = criterion(y_preds, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # .item() detaches the value; appending the tensor itself would keep
        # every step's autograd graph alive for the whole epoch.
        weighted = loss.item() * batch_size
        losses.append(weighted)
        running_loss += weighted
        counter += batch_size
        total = running_loss / counter

        if step % cfg.print_freq == 0 or step == (len(train_loader)-1):
            print(f'Epoch: [{epoch}][{step}/{len(train_loader)}]  \n',
                f'Loss: {total}')

    # ---- validation ----
    val_losses = []
    val_counter = 0
    preds = []
    model.eval()

    with torch.no_grad():
        for step, (inputs, labels) in enumerate(valid_loader):
            inputs = collate(inputs)

            for k, v in inputs.items():
                inputs[k] = v.to(device)

            labels = labels.to(device)
            batch_size = labels.size(0)

            val_y_preds = model(inputs)
            val_loss = criterion(val_y_preds, labels)

            val_losses.append(val_loss.item() * batch_size)
            val_counter += batch_size

            preds.append(val_y_preds.to('cpu').numpy())

    # Plain float, so it can be plotted even when training ran on the GPU.
    total_val_loss = sum(val_losses)/val_counter
    ep.append(epoch)
    ep_losses.append(total_val_loss)


# Predictions and loss reported below come from the final epoch.
predictions = np.concatenate(preds)

print(f'***************EVAL: Loss: {total_val_loss}')

plt.plot(ep, ep_losses)
plt.xlabel('epoch')
plt.ylabel('validation loss')

torch.save({'model': model.state_dict(),
                  'predictions': predictions},
                  SAVE_PATH+f"{cfg.model.replace('/', '-')}_pretrain_PL.pth")
