In [None]:
import warnings
warnings.filterwarnings('ignore')

import tqdm
import torch
import joblib
import transformers
import numpy as np
import pandas as pd
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AdamW, get_cosine_schedule_with_warmup
from transformers import AutoModel, AutoConfig

from sklearn import model_selection

### Config

In [None]:
class Config:
    """Hyper-parameters and paths shared by every cell of the notebook."""

    def __init__(self):
        self.TARGET_COLS = ["target"]
        # Maximum token length each excerpt is padded / truncated to.
        self.MAX_LEN = 256
        # Weights loaded by the model; the tokenizer comes from a separate checkpoint.
        self.CHECKPOINT = "../input/clrp-itpt-roberta-base/clrp-itpt-model-roberta-base"
        self.TOKENIZER_CHECKPOINT = "roberta-base"
        self.EPOCHS = 3
        self.TRAIN_BATCH_SIZE = 16
        self.EVAL_BATCH_SIZE = 16
        self.LR = 5e-5
        # Use the GPU when one is present, otherwise fall back to the CPU so the
        # notebook still runs (slowly) instead of failing on the first `.to()`.
        self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        # Both intervals are counted in training batches.
        self.EVAL_INTERVAL = 20
        self.LOG_INTERVAL = 20
        # Number of cross-validation folds.
        self.FOLDS = 5
        # Weight decay handed to the optimizer.
        self.WD = 0.01

config = Config()

### Create Folds

In [None]:
# Load the competition CSVs. The raw frames are left untouched; all
# preprocessing happens on a deep copy of the training frame.
raw_train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
raw_test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

train_df = raw_train.copy(deep=True)
# Remove newline characters from every excerpt.
train_df.loc[:, "excerpt"] = train_df.excerpt.map(lambda x: x.replace("\n", ""))
# sum(train_df.excerpt.str.contains("\n"))

# Discretise the continuous target into floor(log2(n_rows)) equal-width bins so
# that StratifiedKFold has a categorical label to stratify on.
num_bins = int(np.floor(np.log2(len(train_df))))
print(f"Num bins : {num_bins}")

train_df.loc[:, "bins"] = pd.cut(
    train_df["target"], bins=num_bins, labels=False
)

kf = model_selection.StratifiedKFold(n_splits=config.FOLDS)

# -1 marks "not assigned yet"; every row receives a fold id in the loop below.
train_df["fold"] = -1
fold_col = train_df.columns.get_loc("fold")
for fold, (train_idx, valid_idx) in enumerate(kf.split(X=train_df, y=train_df.bins.values)):
    # split() yields *positional* row indices, so write them with .iloc; a
    # label-based .loc is only equivalent while the index is 0..n-1.
    train_df.iloc[valid_idx, fold_col] = fold

train_df.head()

In [None]:
# Histogram of the `bins` column used for stratification.
train_df["bins"].hist()

In [None]:
# Independent copy of the excerpt column (Series.copy is deep by default).
excerpts = train_df["excerpt"].copy()

### Dataset

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        data: frame with an ``excerpt`` column and, unless ``is_test`` is set,
            a ``target`` column.
        checkpoint: name or path handed to ``AutoTokenizer.from_pretrained``.
        max_length: every excerpt is padded / truncated to this many tokens.
        is_test: when True no targets are read and items carry no ``"label"``.

    Each item is a dict with ``"input_ids"`` and ``"attention_mask"`` (1-D long
    tensors of length ``max_length``) and, for labelled data, ``"label"``
    (a float scalar tensor).
    """

    def __init__(self, data, checkpoint, max_length: int = 256, is_test: bool = False):
        self.excerpts = data.excerpt.values.tolist()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.max_len = max_length
        self.is_test = is_test
        # Only touch the target column when it is needed, so a frame without
        # one can be wrapped with is_test=True.
        self.targets = None if is_test else data.target.values.tolist()

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.excerpts[idx],
            max_length=self.max_len,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
        )
        # The tokenizer already returns tensors with a leading batch dimension
        # of 1; drop that dimension instead of re-wrapping them in
        # torch.tensor(), which copies and emits a warning.
        sample = {
            "input_ids": encoded["input_ids"].squeeze(0).to(torch.long),
            "attention_mask": encoded["attention_mask"].squeeze(0).to(torch.long),
        }
        if not self.is_test:
            sample["label"] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample

    def __len__(self):
        # Excerpts exist for both labelled and unlabelled data; targets do not.
        return len(self.excerpts)

### Model


In [None]:
class AttentionHead(nn.Module):
    """Learned attention pooling over dimension 1 of the input.

    Every position is scored by ``V(tanh(W(x)))``; the scores are normalised
    with a softmax along dimension 1 and used to take a weighted sum of the
    input features along that dimension.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        # Scoring network: in_features -> hidden_dim -> 1.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # Size bookkeeping kept as plain attributes.
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dimension 1."""
        hidden = self.W(features).tanh()
        weights = self.V(hidden).softmax(dim=1)
        return (weights * features).sum(dim=1)
    
class CLRPModel(nn.Module):
    """Pretrained encoder followed by attention pooling and a 1-unit linear head.

    ``forward`` returns the predictions alone when no labels are given, and a
    ``(loss, predictions)`` tuple when they are.
    """

    def __init__(self,path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(path)
        self.config = AutoConfig.from_pretrained(path)
        hidden_size = self.config.hidden_size
        self.head = AttentionHead(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Only the first element of the encoder output is used.
        pooled = self.head(encoder_out[0])
        preds = self.linear(self.dropout(pooled))
        if labels is None:
            return preds
        loss = loss_fn(preds, labels)
        return preds if loss is None else (loss, preds)

### Metrics

In [None]:
class AverageMeter(object):
    """Tracks the latest value, weighted running mean and extremes of a metric.

    Attributes:
        val: most recent value passed to ``update``.
        sum / count: weighted sum of values and total weight seen so far.
        avg: ``sum / count`` (0 while no weight has been recorded).
        max / min: largest / smallest value seen since the last ``reset``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = 0
        self.count = 0
        self.avg = 0
        self.sum = 0
        # Start the extremes at -inf / +inf so the first update always
        # replaces them; starting at 0 would pin `min` at 0 for positive
        # metrics and `max` at 0 for negative ones.
        self.max = float("-inf")
        self.min = float("inf")

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` (e.g. the batch size)."""
        self.val = val
        self.count += n
        self.sum += val*n
        # Guard against a zero total weight (first call made with n=0).
        self.avg = self.sum / self.count if self.count else 0
        if val > self.max: self.max = val
        if val < self.min: self.min = val


def loss_fn(outputs, targets):
    """Root-mean-squared error between the flattened outputs and targets."""
    flat_outputs, flat_targets = outputs.view(-1), targets.view(-1)
    mse = nn.functional.mse_loss(flat_outputs, flat_targets)
    return mse.sqrt()


### Trainer

In [None]:
class Trainer:
    """Fine-tuning loop with periodic logging, validation and checkpointing.

    Every ``log_interval`` batches the running training loss is printed and
    appended to ``result_dict["train_loss"]``. Every ``eval_interval`` batches
    the model is validated and, when the validation loss is not worse than the
    best seen so far, its ``state_dict`` is saved under ``model_dir``.
    """

    def __init__(self, model, log_interval, eval_interval, epochs,
                 optimizer, lr_scheduler, model_dir):
        self.model = model
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.epochs = epochs
        self.log_interval = log_interval
        self.eval_interval = eval_interval
        self.model_dir = model_dir
        self.evaluator = Evaluator(self.model)

    def train(self, train_loader, valid_loader, result_dict, fold):
        """Train for ``self.epochs`` epochs.

        Args:
            train_loader: iterable of batches with ``input_ids``,
                ``attention_mask`` and ``label`` entries.
            valid_loader: loader handed to the evaluator.
            result_dict: dict that collects the loss history; updated in place.
            fold: fold identifier used in the checkpoint file name.

        Returns:
            The updated ``result_dict``.
        """
        result_dict["best_valid_loss"] = 9999
        for epoch in range(self.epochs):
            result_dict["epoch"] = epoch
            result_dict = self._train_loop_for_one_epoch(
                epoch=epoch,
                train_loader=train_loader,
                valid_loader=valid_loader,
                result_dict=result_dict,
                fold=fold,
            )

        return result_dict

    def _train_loop_for_one_epoch(self, epoch, train_loader, valid_loader, result_dict, fold=None):
        """Run one pass over ``train_loader``.

        ``fold`` is passed in explicitly (rather than read from a module-level
        variable) so the checkpoint name always matches the fold being trained.
        """
        losses = AverageMeter()
        # Moving the model is loop-invariant: do it once, not for every batch.
        self.model = self.model.to(config.DEVICE)
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(config.DEVICE)
            attention_mask = batch["attention_mask"].to(config.DEVICE)
            label = batch["label"].to(config.DEVICE)

            loss = self._train_loop_for_one_step(
                input_ids,
                attention_mask,
                label
            )
            # Weight by batch size so a short final batch does not skew the mean.
            losses.update(loss.item(), n=input_ids.size(0))

            if batch_idx % self.log_interval == 0:
                print(f"Epoch={epoch}, Avg Loss={losses.avg}, Batch Idx={batch_idx}")
                result_dict.setdefault("train_loss", []).append(losses.avg)
                print("--------Training Results Summary------")
                print(f"Epoch: {epoch}, train_loss: {losses.avg}")

            if batch_idx % self.eval_interval == 0:
                result_dict = self.evaluator.evaluate(
                    valid_loader=valid_loader,
                    result_dict=result_dict,
                    epoch=epoch
                )
                latest_valid_loss = result_dict["valid_loss"][-1]
                if latest_valid_loss <= result_dict["best_valid_loss"]:
                    print(f"Train loss: {result_dict['train_loss'][-1]}, Valid loss: {result_dict['valid_loss'][-1]}")
                    print(f"Valid loss decreased from {result_dict['best_valid_loss']} to {result_dict['valid_loss'][-1]}")
                    result_dict["best_valid_loss"] = latest_valid_loss

                    print(f"Saving model state dict in {self.model_dir}....")
                    torch.save(self.model.state_dict(), f'{self.model_dir}/model-fold-{fold}_dict')

        return result_dict

    def _train_loop_for_one_step(self, input_ids, attention_mask, label):
        """Run one optimisation step and return the loss tensor."""
        # Validation switches the model to eval mode, so re-enable train mode
        # on every step.
        self.model.train()
        self.optimizer.zero_grad()
        loss, _ = self.model(input_ids, attention_mask, label)
        loss.backward()
        self.optimizer.step()
        if self.lr_scheduler:
            self.lr_scheduler.step()
        return loss

### Evaluator

In [None]:
class Evaluator:
    """Computes a model's average loss over a validation loader."""

    def __init__(self, model):
        self.model = model

    def evaluate(self, epoch, valid_loader, result_dict):
        """Run the model over ``valid_loader`` without gradients.

        Args:
            epoch: current epoch, used only in the printed summary.
            valid_loader: iterable of batches with ``input_ids``,
                ``attention_mask`` and ``label`` entries.
            result_dict: dict whose ``"valid_loss"`` list receives the result.

        Returns:
            ``result_dict`` with the new average loss appended.
        """
        losses = AverageMeter()
        # Device placement does not change between batches: do it once up front.
        self.model = self.model.to(config.DEVICE)
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"].to(config.DEVICE)
                attention_mask = batch["attention_mask"].to(config.DEVICE)
                label = batch["label"].to(config.DEVICE)

                loss = self._eval_loop_for_one_step(
                    input_ids,
                    attention_mask,
                    label
                )
                # Weight each batch by its size so a short final batch does not
                # count as much as a full one.
                losses.update(loss.item(), n=input_ids.size(0))
        print("----------Validation Results Summary---------")
        print(f"Epoch: {epoch}, valid_loss: {losses.avg}")
        result_dict.setdefault("valid_loss", []).append(losses.avg)

        return result_dict

    def _eval_loop_for_one_step(self, input_ids, attention_mask, label):
        """Return the loss of one batch with the model in eval mode."""
        self.model.eval()
        loss, _ = self.model(input_ids, attention_mask, label)
        return loss

### Main App

In [None]:
def run(df, fold, model_dir):
    """Train and validate one cross-validation fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set; all
    other rows are used for training. The loss history is written to
    ``./result_dict-fold-{fold}`` with joblib.

    Args:
        df: frame with ``excerpt``, ``target`` and ``fold`` columns.
        fold: id of the fold held out for validation.
        model_dir: directory handed to the trainer for checkpoints.

    Returns:
        The ``result_dict`` produced by the trainer.
    """
    xtrain = df[df["fold"] != fold]
    xvalid = df[df["fold"] == fold]
    dtrain = CLRPDataset(xtrain, config.TOKENIZER_CHECKPOINT, config.MAX_LEN)
    dvalid = CLRPDataset(xvalid, config.TOKENIZER_CHECKPOINT, config.MAX_LEN)

    train_loader = DataLoader(
        dtrain,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True
    )
    # Keep validation batches in a fixed order: the validation loss is an
    # average of per-batch losses, so shuffling would make it vary from one
    # evaluation to the next for reasons unrelated to the model.
    valid_loader = DataLoader(
        dvalid,
        batch_size=config.EVAL_BATCH_SIZE,
        shuffle=False
    )

    model = CLRPModel(path=config.CHECKPOINT)

    # NOTE(review): AdamW is imported from `transformers`; newer releases may no
    # longer ship it - confirm against the installed version.
    optimizer = AdamW(model.parameters(),
                      lr=config.LR,
                      weight_decay=config.WD
                )

    # The scheduler is stepped once per batch, so it spans every batch of every epoch.
    num_training_steps = config.EPOCHS*len(train_loader)
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    print(f"Fold: {fold}")
    print(f"Total Epochs: {config.EPOCHS}, Train Dataset length: {len(dtrain)}, Train Data Loader length: {len(train_loader)}")
    print(f"Total Epochs: {config.EPOCHS}, Valid Dataset length: {len(dvalid)}, Valid Data Loader length: {len(valid_loader)}")

    print(f"Num training steps: {num_training_steps}")

    result_dict = {}
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        epochs=config.EPOCHS,
        log_interval=config.LOG_INTERVAL,
        eval_interval=config.EVAL_INTERVAL,
        model_dir=model_dir
    )
    result_dict = trainer.train(
        train_loader=train_loader,
        valid_loader=valid_loader,
        result_dict=result_dict,
        fold=fold
    )
    joblib.dump(result_dict, f"./result_dict-fold-{fold}")
    # Also hand the history back so callers need not reload it from disk.
    return result_dict

In [None]:
import torch
import gc

# Train one model per fold, writing checkpoints to the current directory.
# NOTE: `fold` stays bound at module level after the loop finishes and the
# following cell reads it, so do not rename this loop variable.
for fold in range(config.FOLDS):
    # Release cached CUDA memory and collect Python garbage before each fold.
    torch.cuda.empty_cache()
    gc.collect()
    run(train_df, fold, ".")

In [None]:
# Scratch split built from whatever value the module-level `fold` currently holds.
# NOTE(review): here the selected fold becomes the *training* set and all other
# folds the validation set - the opposite of run(); confirm this is intended.
df = train_df.copy()
xtrain = df.loc[df["fold"].eq(fold)]
xvalid = df.loc[df["fold"].ne(fold)]
dtrain = CLRPDataset(xtrain, config.TOKENIZER_CHECKPOINT, config.MAX_LEN)
dvalid = CLRPDataset(xvalid, config.TOKENIZER_CHECKPOINT, config.MAX_LEN)

In [None]:
# Show the shape of the `xvalid` frame built in the previous cell.
xvalid.shape