<a href="https://colab.research.google.com/github/pyagoubi/kaggle-Feedback-Prize/blob/main/Feedback_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive 
drive.mount('/content/gdrive')
import os
os.chdir("/content/gdrive/MyDrive/kaggle Feedback")

Mounted at /content/gdrive


In [2]:
%%capture
!pip install transformers==4.21.2
!pip install tokenizers==0.12.1
!pip install iterative-stratification
!pip install sentencepiece

In [3]:
SAVE_PATH = '/content/gdrive/MyDrive/kaggle Feedback'

In [4]:
import os
import gc
import re
import pickle
import random
import itertools
import warnings
import sentencepiece
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000) 
from tqdm import tqdm
import transformers
import torch
import torch.nn as nn
from torch import autocast
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold



from transformers import AutoTokenizer, AutoModel, AutoConfig, BertModel, BertTokenizer

In [5]:
print('Transformer Version: ', transformers.__version__)

Transformer Version:  4.21.2


# Import Data

In [6]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [7]:
df_train['full_text'] = df_train['full_text'].str.replace(pat=r'[\n\r\t\\]', repl= r'', regex=True)
df_test['full_text'] = df_test['full_text'].str.replace(pat=r'[\n\r\t\\]', repl=r'', regex=True)

# Config File

In [8]:
class cfg:
    model="bert-large-cased"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=1
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    num_workers=4
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True

# Dataset

In [9]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df, train = True):

        self.train = train
        if self.train: self.labels = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]].values  #.reset_index()
        self.texts = df[["full_text"]].values #.reset_index()
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model)


    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):

        batch_texts = self.tokenizer(self.texts[idx][0], 
                                padding='max_length', 
                                max_length = 512, truncation=True#, 
                                #return_tensors="pt"
                                )
        
        for k, v in batch_texts.items():
          batch_texts[k] = torch.tensor(v, dtype=torch.long)

        if self.train: batch_y = torch.tensor(self.labels[idx], dtype=torch.float)

        if self.train: return batch_texts, batch_y
        else: return batch_texts

In [10]:
class FBM(nn.Module):
  def __init__(self, cfg):
    super().__init__()
    self.cfg = cfg
    self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
    self.model = AutoModel.from_pretrained(cfg.model)
    self.linear = nn.Linear(self.config.hidden_size, 512)
    self.dropout = nn.Dropout(p=0.2)
    self.relu = nn.ReLU()
    self.fc = nn.Linear(512, 6)

  def forward(self, inputs):
    _, out = self.model(**inputs, return_dict=False)
    out = self.linear(out)
    out = self.dropout(out)
    out = self.relu(out)
    final_out = self.fc(out)
        
    return final_out

In [11]:
class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()
        self.eps = 1e-6

    def forward(self, y_pred, y_true):
        criterion = nn.MSELoss(reduction='mean')
        loss = torch.sqrt(criterion(y_pred, y_true) + self.eps) 
        return loss

In [12]:
#parameter freeze
def freeze_pn(model):
  for para in model.parameters():
    para.requires_grad = False

  for name, param in model.named_parameters():
    if name in ['linear.weight', 'linear.bias', 'fc.weight', 'fc.bias']:
        param.requires_grad = True
  return model

In [13]:
scaler = torch.cuda.amp.GradScaler()

criterion = RMSELoss() 

In [14]:
mskf = MultilabelStratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
for fold, (train_index, val_index) in enumerate(mskf.split(df_train['full_text'], df_train[cfg.target_cols])):

  ds_train = Dataset(df_train.loc[train_index,:])

  ds_val = Dataset(df_train.loc[val_index, :])
  
  train_loader = DataLoader(ds_train,
                              batch_size=cfg.batch_size,
                              shuffle=True,
                              num_workers=cfg.num_workers, 
                              pin_memory=True 
                              #drop_last=True
                            )
  valid_loader = DataLoader(ds_val,
                              batch_size=cfg.batch_size,
                              shuffle=False,
                              num_workers=cfg.num_workers, 
                            pin_memory=True, 
                            drop_last=False)
  
  
  
  model = FBM(cfg)
  optimizer = AdamW(model.parameters(), lr=1e-5)
  
  if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()
  
  model = freeze_pn(model)

  total_loss_train = 0
  total_loss_val = 0

  for epoch in range(cfg.epochs):
    total_loss_train = 0
    total_loss_val = 0

    model.train()
    for train_input, train_label in tqdm(train_loader):

      for k, v in train_input.items():
        train_input[k] = v.to(device)

      train_label = train_label.to(device).float()

      with torch.cuda.amp.autocast(enabled=True):
        output = model(train_input)
        batch_loss = criterion(output, train_label)
          
      total_loss_train += batch_loss.item()
      

      scaler.scale(batch_loss).backward()
      model.zero_grad()

    with torch.no_grad():
      model.eval()
      for val_input, val_label in valid_loader:
            
        

        for k, v in val_input.items():
          val_input[k] = v.to(device)

        val_label = val_label.to(device)

        with torch.no_grad():
          y_preds = model(val_input)
          batch_val_loss = criterion(y_preds, val_label)


        total_loss_val += batch_val_loss.item()

    torch.save(model.state_dict(), f'{SAVE_PATH}BERT_large_cased_fold{fold}.pth')
        
            
        
    print(f'Epoch: {epoch + 1} | Train Loss: {total_loss_train / len(ds_train)} |Val Loss: {total_loss_val / len(ds_val)} ')
  
  






Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 367/367 [03:06<00:00,  1.97it/s]


Epoch: 1 | Train Loss: 0.40418310723043 |Val Loss: 0.4062377872642564 


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 367/367 [03:02<00:00,  2.01it/s]


Epoch: 1 | Train Loss: 0.38415942772278266 |Val Loss: 0.3855471435320414 


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 367/367 [03:02<00:00,  2.01it/s]


Epoch: 1 | Train Loss: 0.40223157588751945 |Val Loss: 0.4079658719415801 


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 367/367 [03:02<00:00,  2.01it/s]


Epoch: 1 | Train Loss: 0.39397155746761703 |Val Loss: 0.3933001707911735 


# Inference

In [15]:
ds_test = Dataset(df_test, train = False)
test_loader = DataLoader(ds_test, batch_size=1)

In [23]:
#del model
predictions = []
temp = []

model = FBM(cfg)
model.load_state_dict(torch.load(f'{SAVE_PATH}BERT_large_cased_fold0.pth'))
model.eval()
if torch.cuda.is_available():
  model = model.cuda()

with torch.no_grad():
    for test_input in test_loader: 
      
      for k, v in test_input.items():
        test_input[k] = v.to(device)
      
      preds = model(test_input)
      print(preds)
      for pred in preds.cpu():
        predictions.append(np.array([min(max(1.0, i), 5.0) for i in np.array(pred)]))
      print(predictions)


for f in range(1, cfg.n_fold):
  del model
  model = FBM(cfg)
  model.load_state_dict(torch.load(f'{SAVE_PATH}BERT_large_cased_fold{fold}.pth'))
  model.eval()

  if torch.cuda.is_available():
    model = model.cuda()
  
  with torch.no_grad():
    for test_input in test_loader: 
      
      for k, v in test_input.items():
        test_input[k] = v.to(device)
      
      preds = model(test_input)
      for pred in preds.cpu():
        temp.append(np.array([min(max(1.0, i), 5.0) for i in np.array(pred)]))
        print(temp)

  print(predictions)
  predictions += temp
  print(predictions)

predictions = [i/cfg.n_fold for i in predictions]


predictions




Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 0.3102,  0.0379, -0.2355, -0.1678, -0.0267, -0.4300]],
       device='cuda:0')
[array([1., 1., 1., 1., 1., 1.])]
tensor([[ 0.2973,  0.0367, -0.2207, -0.1670, -0.0282, -0.4162]],
       device='cuda:0')
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
tensor([[ 0.2695,  0.0294, -0.1861, -0.1613, -0.0271, -0.3907]],
       device='cuda:0')
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.])]
[array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1.]), array

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: ignored