In [7]:
# Directory the fold checkpoints are written to / read from. The trailing slash
# matters: checkpoint paths are built as f'{SAVE_PATH}BERT_large_cased_fold{n}.pth'.
SAVE_PATH = './'

# Competition files all live in the same input directory.
_COMPETITION_DIR = '../input/feedback-prize-english-language-learning'
TRAIN_PATH = _COMPETITION_DIR + '/train.csv'
TEST_PATH = _COMPETITION_DIR + '/test.csv'
SAMPLE_SUB_PATH = _COMPETITION_DIR + '/sample_submission.csv'

# Directory handed to the `from_pretrained` calls for tokenizer, config and weights.
MODEL_PATH = '../input/bertlarge'


In [8]:
# Offline install of iterative-stratification: `--no-index` keeps pip away from the
# package index and `--find-links` points it at the local
# ../input/iterstratification/iterstrat directory instead.
!pip install iterative-stratification --no-index --find-links=file:../input/iterstratification/iterstrat

Looking in links: file:///../input/iterstratification/iterstrat
[0m

In [9]:
import warnings
import sentencepiece
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000) 
from tqdm import tqdm
import transformers
import torch
import torch.nn as nn
from torch import autocast
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
# NOTE: the `Dataset` name imported on the next line is rebound later by the
# notebook's own `class Dataset(torch.utils.data.Dataset)`.
from torch.utils.data import DataLoader, Dataset
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold



from transformers import AutoTokenizer, AutoModel, AutoConfig, BertModel, BertTokenizer

In [10]:
# Record the installed `transformers` version next to the notebook outputs.
print('Transformer Version: ', transformers.__version__)

Transformer Version:  4.20.1


# Import Data

In [11]:
# Read the training essays, the test essays and the sample submission, in that order.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)

In [12]:
# Delete newlines, carriage returns, tabs and backslashes from every essay, using
# the same rule for the train and test frames.
# NOTE(review): matches are removed, not replaced by a space, so two words that
# were separated only by a line break end up joined - confirm this is intended.
_STRIP_PATTERN = r'[\n\r\t\\]'
for _frame in (df_train, df_test):
    _frame['full_text'] = _frame['full_text'].str.replace(pat=_STRIP_PATTERN, repl=r'', regex=True)

# Config File

In [13]:
class cfg:
    # Plain namespace of run settings; attributes are read directly as `cfg.<name>`.

    # --- model -------------------------------------------------------------
    model = MODEL_PATH                 # path passed to the `from_pretrained` calls
    gradient_checkpointing = True      # not referenced by the visible cells

    # --- optimisation ------------------------------------------------------
    epochs = 10                        # epochs per fold in `train_fn`
    batch_size = 10                    # batch size of the train/validation loaders
    num_workers = 4                    # DataLoader worker processes
    weight_decay = 0.01                # not referenced by the visible cells
    eps = 1e-6                         # not referenced by the visible cells

    # --- data / cross-validation -------------------------------------------
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    n_fold = 4                         # number of stratified folds (one checkpoint each)
    seed = 42                          # random_state of the fold splitter

    # --- run mode ------------------------------------------------------------
    train = True                       # not referenced by the visible cells

# Dataset

In [14]:
class Dataset(torch.utils.data.Dataset):
    """Essay dataset that tokenizes one text per item.

    Note: this class deliberately keeps the name `Dataset` (callers in this
    notebook use it), which rebinds the `Dataset` imported from
    `torch.utils.data`; the base class is therefore spelled out in full.

    Args:
        df: DataFrame with a ``full_text`` column and, when ``train`` is true,
            every column listed in ``cfg.target_cols``.
        train: When true, items are ``(features, labels)`` pairs; otherwise
            only ``features`` is returned.
        max_length: Length every text is padded/truncated to by the tokenizer.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, df, train=True, max_length=512):
        self.train = train
        self.max_length = max_length
        # Use the shared target list instead of repeating the column names here;
        # `labels` is always defined so the attribute exists in test mode too.
        self.labels = df[cfg.target_cols].values if self.train else None
        self.texts = df["full_text"].tolist()
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
        )

        # Plain dict of LongTensors: the default DataLoader collate function
        # batches it key by key and the model consumes it via `**inputs`.
        features = {
            key: torch.tensor(values, dtype=torch.long)
            for key, values in encoded.items()
        }

        if not self.train:
            return features

        targets = torch.tensor(self.labels[idx], dtype=torch.float)
        return features, targets

# Model

In [15]:
class FBM(nn.Module):
    """Pretrained encoder followed by a small regression head with 6 outputs.

    The head is Linear(hidden_size -> 512) -> Dropout(0.1) -> ReLU -> Linear(512 -> 6).
    The attribute names `model`, `linear` and `fc` are part of the contract:
    `freeze_pn` selects trainable parameters by those names and the saved
    state dicts use them as key prefixes.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        # The config is only consulted for `hidden_size`; it is not handed to the
        # backbone, so `output_hidden_states=True` does not affect `self.model`.
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(cfg.model)

        # Regression head.
        self.linear = nn.Linear(self.config.hidden_size, 512)
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(512, 6)

    def forward(self, inputs):
        """Map a dict of tokenizer tensors to one row of 6 scores per example."""
        # With return_dict=False the backbone must return exactly two values; the
        # second one feeds the head (presumably the pooled output for BERT-style
        # models - TODO confirm for other backbones).
        _sequence_output, pooled = self.model(**inputs, return_dict=False)

        hidden = self.relu(self.dropout(self.linear(pooled)))
        return self.fc(hidden)

# Loss Function

In [16]:
class RMSELoss(nn.Module):
    """Root-mean-squared error over all elements of the prediction tensor.

    A small constant (`eps`, 1e-6) is added under the square root, so a
    perfect prediction yields sqrt(eps) rather than exactly zero.
    """

    def __init__(self):
        super().__init__()
        self.eps = 1e-6

    def forward(self, y_pred, y_true):
        """Return sqrt(mean((y_pred - y_true) ** 2) + eps) as a scalar tensor."""
        mean_squared_error = F.mse_loss(y_pred, y_true, reduction='mean')
        return (mean_squared_error + self.eps).sqrt()

# Model Training

In [17]:
# Parameter freezing
def freeze_pn(model):
    """Freeze every parameter of `model` except the regression-head layers.

    Only the parameters named `linear.weight`, `linear.bias`, `fc.weight` and
    `fc.bias` keep `requires_grad=True`; all others are switched off.

    Returns:
        The same `model` object, modified in place.
    """
    trainable_names = {'linear.weight', 'linear.bias', 'fc.weight', 'fc.bias'}

    for param_name, tensor in model.named_parameters():
        tensor.requires_grad = param_name in trainable_names

    return model

In [18]:
# Gradient scaler for mixed-precision training; `train_fn` reads this module-level
# instance, so it is shared by every fold.
scaler = torch.cuda.amp.GradScaler()

# Module-level loss instance read by `train_fn`: RMSE taken over all targets jointly.
criterion = RMSELoss() 

In [None]:
def train_fn(df):
    """Train one head-only `FBM` model per cross-validation fold.

    For each of the `cfg.n_fold` multilabel-stratified folds a fresh model is
    built, everything except the regression head is frozen (`freeze_pn`), the
    head is trained for `cfg.epochs` epochs with mixed precision, and the
    weights are written to f'{SAVE_PATH}BERT_large_cased_fold{fold}.pth' after
    every epoch (so the file holds the last epoch's weights).

    Args:
        df: Training frame with a ``full_text`` column and all ``cfg.target_cols``.

    Returns:
        None. Progress is printed once per epoch as the mean per-batch loss.
    """
    mskf = MultilabelStratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)

    # Bind the module-level loss to a *new* local name. Assigning to `criterion`
    # inside this function would turn it into a local variable and make every
    # read of it raise UnboundLocalError.
    loss_fn = criterion.to(device)

    # Autocast is only meaningful on CUDA; disable it cleanly on CPU.
    use_amp = torch.cuda.is_available()

    for fold, (train_index, val_index) in enumerate(mskf.split(df['full_text'], df[cfg.target_cols])):

        # `split` yields positional indices, so select rows by position; `.loc`
        # would only work when `df` carries a default 0..n-1 index.
        ds_train = Dataset(df.iloc[train_index])
        ds_val = Dataset(df.iloc[val_index])

        train_loader = DataLoader(ds_train,
                                  batch_size=cfg.batch_size,
                                  shuffle=True,
                                  num_workers=cfg.num_workers,
                                  pin_memory=True)
        valid_loader = DataLoader(ds_val,
                                  batch_size=cfg.batch_size,
                                  shuffle=False,
                                  num_workers=cfg.num_workers,
                                  pin_memory=True,
                                  drop_last=False)

        model = freeze_pn(FBM(cfg)).to(device)

        # Only the unfrozen head parameters ever receive gradients, so only they
        # need to be handed to the optimizer.
        trainable_params = [p for p in model.parameters() if p.requires_grad]
        optimizer = AdamW(trainable_params, lr=1e-5)

        for epoch in range(cfg.epochs):
            total_loss_train = 0.0
            total_loss_val = 0.0

            # Re-enter training mode every epoch: the validation pass below
            # switches the model to eval mode.
            model.train()

            for train_input, train_label in tqdm(train_loader):
                train_input = {k: v.to(device) for k, v in train_input.items()}
                train_label = train_label.to(device).float()

                with torch.cuda.amp.autocast(enabled=use_amp):
                    output = model(train_input)
                    batch_loss = loss_fn(output, train_label)

                total_loss_train += batch_loss.item()

                optimizer.zero_grad()
                scaler.scale(batch_loss).backward()
                # Step through the scaler so gradients are unscaled (and
                # inf/nan steps skipped) before the update, then refresh the
                # scale factor for the next iteration.
                scaler.step(optimizer)
                scaler.update()

            model.eval()
            with torch.no_grad():
                for val_input, val_label in valid_loader:
                    val_input = {k: v.to(device) for k, v in val_input.items()}
                    val_label = val_label.to(device)

                    y_preds = model(val_input)
                    batch_val_loss = loss_fn(y_preds, val_label)

                    total_loss_val += batch_val_loss.item()

            torch.save(model.state_dict(), f'{SAVE_PATH}BERT_large_cased_fold{fold}.pth')

            # The totals are sums of per-batch mean losses, so average over the
            # number of batches (not the number of samples).
            mean_train_loss = total_loss_train / max(len(train_loader), 1)
            mean_val_loss = total_loss_val / max(len(valid_loader), 1)
            print(f'Epoch: {epoch + 1} | Train Loss: {mean_train_loss} |Val Loss: {mean_val_loss} ')

        # Release this fold's model before the next one is built.
        del model, optimizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Inference

In [None]:
def _predict_fold(fold, test_loader):
    """Return the fold-`fold` checkpoint's predictions, clipped to [1.0, 5.0], as a DataFrame."""
    model = FBM(cfg)
    # map_location lets a checkpoint saved from the GPU load on a CPU-only machine.
    state_dict = torch.load(f'{SAVE_PATH}BERT_large_cased_fold{fold}.pth', map_location=device)
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()

    batches = []
    with torch.no_grad():
        for test_input in test_loader:
            test_input = {k: v.to(device) for k, v in test_input.items()}
            preds = model(test_input)
            # Clip every score into [1.0, 5.0]; float64 keeps the later
            # fold averaging in double precision.
            batches.append(np.clip(preds.float().cpu().numpy().astype(np.float64), 1.0, 5.0))

    # Free the model before the next fold's checkpoint is loaded.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if batches:
        values = np.concatenate(batches, axis=0)
    else:
        values = np.empty((0, len(cfg.target_cols)))
    return pd.DataFrame(values, columns=cfg.target_cols)


def inference(df_test):
    """Average the predictions of all fold checkpoints and write `submission.csv`.

    Each of the `cfg.n_fold` checkpoints saved under `SAVE_PATH` is loaded in
    turn, its predictions are clipped to [1.0, 5.0], and the per-fold results
    are averaged. The running sum after each fold and the final average are
    shown with `display`.

    Args:
        df_test: Frame with a ``full_text`` column.

    Returns:
        None. The result is written to "submission.csv" with the target
        columns followed by ``text_id``.
    """
    ds_test = Dataset(df_test, train=False)
    # Predictions are independent per example, so batching only speeds this up.
    test_loader = DataLoader(ds_test, batch_size=cfg.batch_size, shuffle=False)

    pred_df = None
    for fold in range(cfg.n_fold):
        fold_df = _predict_fold(fold, test_loader)
        pred_df = fold_df if pred_df is None else pred_df.add(fold_df, fill_value=0)
        display(pred_df)

    pred_df = pred_df / cfg.n_fold

    display(pred_df)

    # `text_id` comes from the module-level sample-submission frame and is
    # aligned with the predictions by row index.
    pred_df['text_id'] = submission['text_id']

    pred_df.to_csv("submission.csv", index=False)


In [None]:
# Run ensemble inference on the test essays and write submission.csv.
# NOTE(review): this loads the fold checkpoints from SAVE_PATH, but no visible cell
# calls `train_fn` - confirm the checkpoint files exist before running this cell.
inference(df_test)