<a href="https://colab.research.google.com/github/pyagoubi/kaggle-Feedback-Prize/blob/main/Db_base_2_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Training Notebook of a Deberte-v3-base model with variable number of mean pooling layers and layerwise lr decay finetuning on FB3 targets.

In [1]:
# SAVE_PATH = './'
# TRAIN_PATH = '../input/feedback-prize-english-language-learning/train.csv'
# TEST_PATH = '../input/feedback-prize-english-language-learning/test.csv'
# SAMPLE_SUB_PATH = '../input/feedback-prize-english-language-learning/sample_submission.csv' 
# MODEL_NAME = 'microsoft/deberta-v3-base'

# TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [2]:
#Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/kaggle Feedback/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
SAVE_PATH = './'
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
SAMPLE_SUB_PATH = 'sample_submission.csv' 
MODEL_NAME = 'microsoft/deberta-v3-base'

TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [4]:
%%capture
!pip install iterative-stratification
!pip install sentencepiece
!pip install transformers==4.21.2
#!pip install iterative-stratification --no-index --find-links=file:../input/iterstratification/iterstrat

import warnings
import sentencepiece
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000) 
from tqdm import tqdm
import transformers
import torch
import torch.nn as nn
from torch import autocast
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from transformers import AutoTokenizer, AutoModel, AutoConfig, BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Transformer Version: ', transformers.__version__)

In [5]:
#
class cfg:
    model= MODEL_NAME
    gradient_checkpointing=True
    epochs=30
    eps=1e-6
    num_workers=4
    batch_size=3
    weight_decay=0.9
    target_cols=TARGET_COLS
    seed=42
    n_fold=4
    train=True
    mp_depth = 4 #number of mean poolings
    num_warmup_steps=0
    lr=2e-4
    layer_decay = 0.9
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    print_freq = 100
    accumulation_steps = 50
    max_norm =1000
    batch_scheduler=True
    num_cycles=0.5


def import_data(tr =TRAIN_PATH, te =TEST_PATH, sample =SAMPLE_SUB_PATH ):
  df_train = pd.read_csv(tr)
  df_test = pd.read_csv(te)
  submission = pd.read_csv(sample)
  return df_train, df_test, submission

def replace_nl(df_train, df_test):
  df_train['full_text'] = df_train['full_text'].str.replace(pat=r'[\n\r\t\\]', repl= r'', regex=True)
  df_test['full_text'] = df_test['full_text'].str.replace(pat=r'[\n\r\t\\]', repl=r'', regex=True)
  return df_train, df_test

def set_folds(df_train):
  Fold = MultilabelStratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
  for n, (train_index, val_index) in enumerate(Fold.split(df_train, df_train[cfg.target_cols])):
      df_train.loc[val_index, 'fold'] = int(n)
  df_train['fold'] = df_train['fold'].astype(int)
  display(df_train.groupby('fold').size())
  return df_train

def load_prepare():
  df_train, df_test, submission = import_data()
  df_train, df_test = replace_nl(df_train, df_test)
  df_train=  set_folds(df_train)
  return df_train, df_test, submission

In [6]:
%%capture
!pip install wandb
!wandb login
import wandb


def class2dict(f):
  return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))




In [7]:
#Dataset 
class Dataset_Db(torch.utils.data.Dataset):

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.labels = df[cfg.target_cols].values
        self.texts = df[["full_text"]].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        batch_texts = cfg.tokenizer(self.texts[idx][0], 
                                padding='max_length', 
                                max_length = 1450, 
                                truncation=True, 
                                return_tensors=None, 
                                add_special_tokens=True,
                                pad_to_max_length=True                     
                                )
        
        for k, v in batch_texts.items():
          batch_texts[k] = torch.tensor(v, dtype=torch.long)

        batch_y = torch.tensor(self.labels[idx], dtype=torch.float)
        return batch_texts, batch_y


#Model
class MeanPooling(nn.Module):
    def __init__(self, mpd):
        super(MeanPooling, self).__init__()
        self.mp_depth = mpd
        
    def forward(self, last_hidden_state,hidden_states, attention_mask):
        mp_embeddings = []

        for i in range(self.mp_depth):
            if i ==0:
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
                sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1) 
            else:
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states[i-1].size()).float()
                sum_embeddings = torch.sum(hidden_states[i-1] * input_mask_expanded, 1)   
            sum_mask = input_mask_expanded.sum(1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            
            mp_embeddings.append(mean_embeddings)
    
        results = torch.cat(mp_embeddings , dim=1)
        results = results.reshape(results.size(0), self.mp_depth, int(results.size(1)/self.mp_depth))
        return results



    
    
class DBB(nn.Module):
    def __init__(self, cfg, mp_depth):
        super().__init__()
        self.cfg = cfg
        self.mp_depth = mp_depth
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.config.output_hidden_states=True
        self.config.hidden_dropout_prob = 0.
        self.config.attention_probs_dropout_prob = 0.
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        self.pool = MeanPooling(self.mp_depth)
        self.mpd = nn.Linear(mp_depth, 1)
        self.out = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.mpd)
        self._init_weights(self.out)
        self._init_weights(self.pool)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)        

    def forward(self, inputs):
        outputs = self.model(**inputs)
#         last_hidden_state = outputs[0]
#         hidden_states = outputs[1]
        pooled_outputs = self.pool(outputs.last_hidden_state,outputs.hidden_states,  inputs['attention_mask'])
        pooled_outputs = pooled_outputs.permute(0,2,1)
        mean_pooled = self.mpd(pooled_outputs)
        mean_pooled =mean_pooled.squeeze(-1)
        final_out = self.out(mean_pooled)
        return final_out

# ====================================================
#####Loss
#====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss


    
def get_lr_groups(model, learning_rate=cfg.lr, layer_decay=cfg.layer_decay):
   
    n_layers = len(model.model.encoder.layer) + 6 # + 1 (embedding) +2 layernorm.. +2 lin

    embedding_decayed_lr = learning_rate * (layer_decay ** (n_layers+6))
    grouped_parameters = [{"params": model.model.embeddings.parameters(), 'lr': embedding_decayed_lr}]
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    for depth in range(1, n_layers-5):
        decayed_lr = learning_rate * (layer_decay ** (n_layers + 6 - depth))
        grouped_parameters.append(
            {"params": [p for n, p in model.model.encoder.layer[depth-1].named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': decayed_lr, 'weight_decay': cfg.weight_decay}
        )
        grouped_parameters.append(
            {"params": [p for n, p in model.model.encoder.layer[depth-1].named_parameters() if any(nd in n for nd in no_decay)],
             'lr': decayed_lr, 'weight_decay': 0.0})
        
    #rel embeddings layer
    grouped_parameters.append(
            {"params": [p for n, p in model.model.encoder.rel_embeddings.named_parameters() if not any(nd in n for nd in no_decay)], 
             'lr': learning_rate * (layer_decay ** 4), 'weight_decay': cfg.weight_decay})
    grouped_parameters.append(
        {"params": [p for n, p in model.model.encoder.rel_embeddings.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': learning_rate * (layer_decay ** 4), 'weight_decay': 0.0})
    
    #layer norm layer
    grouped_parameters.append(
            {"params": [p for n, p in model.model.encoder.LayerNorm.named_parameters() if not any(nd in n for nd in no_decay)], 
             'lr': learning_rate * (layer_decay ** 3), 'weight_decay': cfg.weight_decay})
    grouped_parameters.append(
        {"params": [p for n, p in model.model.encoder.LayerNorm.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': learning_rate * (layer_decay ** 3), 'weight_decay': 0.0})    
    
    #Pooling layer
    grouped_parameters.append(
            {"params": [p for n, p in model.pool.named_parameters() if not any(nd in n for nd in no_decay)], 
             'lr': learning_rate * (layer_decay ** 2), 'weight_decay': cfg.weight_decay})
    grouped_parameters.append(
            {"params": [p for n, p in model.pool.named_parameters() if any(nd in n for nd in no_decay)], 
             'lr': learning_rate * (layer_decay ** 2), 'weight_decay': 0.0})  
    

    #mpd layer
    grouped_parameters.append(
            {"params": [p for n, p in model.mpd.named_parameters() if not any(nd in n for nd in no_decay)], 
             'lr': learning_rate * (layer_decay ** 1), 'weight_decay': cfg.weight_decay})
    grouped_parameters.append(
            {"params": [p for n, p in model.mpd.named_parameters() if any(nd in n for nd in no_decay)], 
             'lr': learning_rate * (layer_decay ** 1), 'weight_decay': 0.0})    

    #out layer
    grouped_parameters.append(
            {"params": [p for n, p in model.out.named_parameters() if not any(nd in n for nd in no_decay)], 
             'lr': learning_rate, 'weight_decay': cfg.weight_decay }
            )     
    grouped_parameters.append(
            {"params": [p for n, p in model.out.named_parameters() if any(nd in n for nd in no_decay)], 
             'lr': learning_rate, 'weight_decay': 0.0 }
            )    
       
    return grouped_parameters




def get_parms(model, lr):

    # save layer names
    layer_names = []
    for idx, (name, param) in enumerate(model.named_parameters()):
      layer_names.append(name)
    print(f'{idx}: {name}')
    layer_names.reverse()


    lr      = lr
    lr_mult = 0.9

    # placeholder
    parameters = []

    # store params & learning rates
    for idx, name in enumerate(layer_names):
    
      # display info
      print(f'{idx}: lr = {lr:.6f}, {name}')
      
      # append layer parameters
      parameters += [{'params': [p for n, p in model.named_parameters() if n == name and p.requires_grad],
                      'lr':     lr}]
      
      # update learning rate
      lr *= lr_mult
    return parameters


    
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
          'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
          'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
          'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs



In [8]:
def train():

    os.environ['WANDB_NOTEBOOK_NAME'] = '/content/drive/MyDrive/Colab Notebooks/Db-base-2-train.ipynb'

    run = wandb.init(project='FB3-Public', 
                     name=cfg.model,
                     config=class2dict(cfg),
                     group=cfg.model,
                     job_type="train")





    scaler = torch.cuda.amp.GradScaler()


    for val_fold in range(cfg.n_fold):
        oof_df = pd.DataFrame()


        train_folds = df_train[df_train['fold'] != val_fold].reset_index(drop=True)
        valid_folds = df_train[df_train['fold'] == val_fold].reset_index(drop=True)
        valid_labels = valid_folds[cfg.target_cols].values

        train_dataset = Dataset_Db(cfg, train_folds)
        valid_dataset = Dataset_Db(cfg, valid_folds)

        train_loader = DataLoader(train_dataset,
                                  batch_size=cfg.batch_size,
                                  shuffle=True,
                                  num_workers=cfg.num_workers, 
                                  pin_memory=True#, 
                                  #drop_last=True
                                  )
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=cfg.batch_size * 2,
                                  shuffle=False,
                                  num_workers=cfg.num_workers, pin_memory=True, drop_last=False)

        model = DBB(cfg,cfg.mp_depth)
        #torch.save(model.config, OUTPUT_DIR+'config.pth')
        model.to(device)

        lr_groups = get_lr_groups(model, learning_rate=1e-2)
        #parms = get_parms(model, cfg.lr)
        optimizer = AdamW(lr_groups, lr=cfg.lr, eps=cfg.eps, betas=cfg.betas)


        # optimizer_parameters = get_optimizer_params(model,
        #                                         encoder_lr=cfg.lr, 
        #                                         decoder_lr=cfg.lr,
        #                                         weight_decay=cfg.weight_decay)
        # optimizer = AdamW(optimizer_parameters, lr=cfg.lr, eps=cfg.eps, betas=cfg.betas)


        scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps=cfg.num_warmup_steps,
                                                    num_training_steps=int(len(train_folds) /cfg.batch_size * cfg.epochs), 
                                                    num_cycles=cfg.num_cycles
                                                    )



        criterion = RMSELoss() #RMSELoss(reduction="mean")

        best_score = np.inf


        for epoch in range(cfg.epochs):

            model.train()
            scaler = torch.cuda.amp.GradScaler(enabled=True)

            losses = []
            counter = 0

            #train
            for step, (inputs, labels) in enumerate(train_loader):
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(device)

                labels = labels.to(device)
                batch_size = labels.size(0)

                #with torch.cuda.amp.autocast(enabled=True):
                y_preds = model(inputs)
                loss = criterion(y_preds, labels)

                if cfg.accumulation_steps > 1:
                    loss = loss / cfg.accumulation_steps

                losses.append(loss*batch_size)
                counter += batch_size

                scaler.scale(loss).backward()
                
#                 # before gradient clipping the optimizer parameters must be unscaled.
#                 scaler.unscale_(optimizer)
    
#                 # perform optimization step
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_norm)

                if (step + 1) % cfg.accumulation_steps == 0 or step == len(train_loader):
                    #Gradient Value Clipping
                    #nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)
                    nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    scheduler.step()

                total = sum(losses)/counter

                if step % cfg.print_freq == 0 or step == (len(train_loader)-1):
                    print(f'Epoch: [{epoch}][{step}/{len(train_loader)}]  \n',
                        f'Loss: {total}')
                
                wandb.log({f"[fold{val_fold}] loss": loss*cfg.accumulation_steps,
                       f"[fold{val_fold}] lr": scheduler.get_lr()[0]})

                # Optional
                wandb.watch(model)



          #validation

            val_losses = []
            val_counter = 0
            preds = []
            model.eval()

            for step, (inputs, labels) in enumerate(valid_loader):
                inputs = collate(inputs)

                for k, v in inputs.items():
                    inputs[k] = v.to(device)

                labels = labels.to(device)
                batch_size = labels.size(0)

                with torch.no_grad():
                    val_y_preds = model(inputs)
                    val_loss = criterion(val_y_preds, labels)
                    
                if cfg.accumulation_steps > 1:
                    val_loss = val_loss / cfg.accumulation_steps

                val_losses.append(val_loss*batch_size)
                val_counter += batch_size

                total_val = sum(val_losses)/val_counter
                preds.append(val_y_preds.to('cpu').numpy())



            predictions = np.concatenate(preds)
            total_val_loss = sum(val_losses)/val_counter
            print(f'***************EVAL: Loss: {total_val_loss}')

            wandb.log({f"[fold{val_fold}] loss": val_loss*cfg.accumulation_steps})


            if best_score > total_val_loss:
                best_score = total_val_loss
                torch.save({'model': model.state_dict(),
                              'predictions': predictions},
                              SAVE_PATH+f"{cfg.model.replace('/', '-')}_fold{val_fold}_m1.pth")

        del model


In [9]:
df_train, df_test, submission = load_prepare()
tokenizer = AutoTokenizer.from_pretrained(cfg.model)
#tokenizer.save_pretrained(SAVE_PATH+'tokenizer/')
cfg.tokenizer = tokenizer
train()

fold
0    978
1    977
2    978
3    978
dtype: int64

[34m[1mwandb[0m: Currently logged in as: [33martv[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch: [0][0/978]  
 Loss: 0.07746513187885284
Epoch: [0][100/978]  
 Loss: 0.07628502696752548
Epoch: [0][200/978]  
 Loss: 0.07382301986217499
Epoch: [0][300/978]  
 Loss: 0.0701470822095871
Epoch: [0][400/978]  
 Loss: 0.06563375890254974
Epoch: [0][500/978]  
 Loss: 0.0603409968316555
Epoch: [0][600/978]  
 Loss: 0.054493486881256104
Epoch: [0][700/978]  
 Loss: 0.04876348376274109
Epoch: [0][800/978]  
 Loss: 0.044660650193691254
Epoch: [0][900/978]  
 Loss: 0.04165166616439819
Epoch: [0][977/978]  
 Loss: 0.03947523608803749
***************EVAL: Loss: 0.012957543134689331
Epoch: [1][0/978]  
 Loss: 0.01554647646844387
Epoch: [1][100/978]  
 Loss: 0.013739284127950668
Epoch: [1][200/978]  
 Loss: 0.014567920938134193
Epoch: [1][300/978]  
 Loss: 0.014154591597616673
Epoch: [1][400/978]  
 Loss: 0.01357951108366251
Epoch: [1][500/978]  
 Loss: 0.013420423492789268
Epoch: [1][600/978]  
 Loss: 0.013410741463303566
Epoch: [1][700/978]  
 Loss: 0.013442550785839558
Epoch: [1][800/978]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-eeb6e8f91bf7>", line 5, in <module>
    train()
  File "<ipython-input-8-744c9aa93c0e>", line 97, in train
    scaler.scale(loss).backward()
  File "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 175, in backward
    allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
  File "/usr/local/lib/python3.7/dist-packages/wandb/wandb_torch.py", line 282, in <lambda>
    handle = var.register_hook(lambda grad: _callback(grad, log_track))
  File "/usr/local/lib/python3.7/dist-packages/wandb/wandb_torch.py", line 280, in _callback
    se

BrokenPipeError: ignored

Error in callback <function _WandbInit._pause_backend at 0x7f972a174d40> (for post_run_cell):


BrokenPipeError: ignored

In [None]:
import gc
gc.collect()