<a href="https://colab.research.google.com/github/pyagoubi/kaggle-Feedback-Prize/blob/main/Feedback_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive 
drive.mount('/content/gdrive')
import os
os.chdir("/content/gdrive/MyDrive/kaggle Feedback")

Mounted at /content/gdrive


In [2]:
%%capture
!pip install transformers==4.21.2
!pip install tokenizers==0.12.1
!pip install iterative-stratification
!pip install sentencepiece

In [3]:
import os
import gc
import re
import pickle
import random
import itertools
import warnings
import sentencepiece
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000) 
from tqdm.auto import tqdm
import transformers
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from transformers import AutoTokenizer, AutoModel, AutoConfig, BertModel, BertTokenizer

In [4]:
print('Transformer Version: ', transformers.__version__)

Transformer Version:  4.21.2


# Import Data

In [5]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [6]:
targets = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Config File

In [7]:
class cfg:
    model="bert-large-cased"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    num_workers=4
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True

# Dataset

In [8]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]].values  #.reset_index()
        self.texts = df[["full_text"]].values #.reset_index()
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model)


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):

        batch_texts = self.tokenizer(self.texts[idx][0], 
                                padding='max_length', 
                                max_length = 512, truncation=True#, 
                                #return_tensors="pt"
                                )
        
        for k, v in batch_texts.items():
          batch_texts[k] = torch.tensor(v, dtype=torch.long)

        batch_y = torch.tensor(self.labels[idx], dtype=torch.float)

        return batch_texts, batch_y

In [22]:
class FBM(nn.Module):
  def __init__(self, cfg):
    super().__init__()
    self.cfg = cfg
    self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
    self.model = AutoModel.from_pretrained(cfg.model)
    self.linear = nn.Linear(self.config.hidden_size, 512)
    self.dropout = nn.Dropout(p=0.2)
    self.relu = nn.ReLU()
    self.fc = nn.Linear(512, 6)

  def forward(self, inputs):
    _, out = self.model(**inputs, return_dict=False)
    out = self.linear(out)
    out = self.dropout(out)
    out = self.relu(out)
    final_out = self.fc(out)
        
    return final_out

In [23]:
criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
model = FBM(cfg)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
#parameter freeze
for para in model.parameters():
    para.requires_grad = False

for name, param in model.named_parameters():
    if name in ['linear.weight', 'linear.bias', 'fc.weight', 'fc.bias']:
        param.requires_grad = True

In [25]:
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

In [None]:
mskf = MultilabelStratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
for fold, (train_index, val_index) in enumerate(mskf.split(df_train['full_text'], df_train[cfg.target_cols])):

  ds_train = Dataset(df_train.loc[train_index,:])

  ds_val = Dataset(df_train.loc[val_index, :])
  
  train_loader = DataLoader(ds_train,
                              batch_size=cfg.batch_size,
                              shuffle=True,
                              num_workers=cfg.num_workers, 
                              pin_memory=True 
                              #drop_last=True
                            )
  valid_loader = DataLoader(ds_val,
                              batch_size=cfg.batch_size,
                              shuffle=False,
                              num_workers=cfg.num_workers, 
                            pin_memory=True, 
                            drop_last=False)
  total_loss_train = 0
  total_loss_val = 0

  for epoch in range(cfg.epochs):
  
    for train_input, train_label in tqdm(train_loader):
      
      
      model.train()
      for k, v in train_input.items():
        train_input[k] = v.to(device)




      train_label = train_label.to(device).float()
      #input = train_input['input_ids'].squeeze(1).to(device)
      #mask = train_input['attention_mask'].to(device)
        
      output = model(train_input)
      optimizer = AdamW(model.parameters(), lr=1e-5)
      batch_loss = criterion(output, train_label)
      total_loss_train += batch_loss.item()
      model.zero_grad()
      batch_loss.backward()
      optimizer.step() 

    with torch.no_grad():
      for val_input, val_label in valid_loader:
            
        model.eval()

        for k, v in val_input.items():
          val_input[k] = v.to(device)

        val_label = val_label.to(device)
        #mask = val_input['attention_mask'].to(device)
        #input = val_input['input_ids'].squeeze(1).to(device)

        output = model(val_input)

        batch_loss = criterion(output, val_label)
        total_loss_val += batch_loss.item()
        
            
        
    print(f'Epoch: {epoch + 1} | Train Loss: {total_loss_train / len(df_train) * cfg.batch_size} |Val Loss: {total_loss_val / len(val_input) * cfg.batch_size} ')
  
  






  0%|          | 0/367 [00:00<?, ?it/s]

Epoch: 1 | Train Loss: 0.53152684152233 |Val Loss: 69.30112391710281 


  0%|          | 0/367 [00:00<?, ?it/s]

Epoch: 2 | Train Loss: 0.7139528913133378 |Val Loss: 136.95612859725952 


  0%|          | 0/367 [00:00<?, ?it/s]

Epoch: 3 | Train Loss: 0.896641447168578 |Val Loss: 204.616459608078 


  0%|          | 0/367 [00:00<?, ?it/s]

Exception ignored in: self._shutdown_workers()<function _MultiProcessingDataLoaderIter.__del__ at 0x7fa713ef90e0>
Traceback (most recent call last):
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa713ef90e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1510, in __del__
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1510, in __del__
    
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1493, in _shutdown_workers
Exception ignored in:         self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1456, in _shutdown_workers
    self._pin_memory_thread.join()
  File "/usr/lib/python3.7/threading.py", line 1041, in join
    raise RuntimeError("cannot join current thread")
RuntimeError: cannot join current thread
<function _MultiProcessingDataLoaderIter.__del__

Epoch: 4 | Train Loss: 1.0779001885471182 |Val Loss: 271.0993689695994 


  0%|          | 0/367 [00:00<?, ?it/s]

Epoch: 1 | Train Loss: 0.17484651517758312 |Val Loss: 66.59147954980533 


  0%|          | 0/367 [00:00<?, ?it/s]