In [1]:
# Fetch the PyTorch/XLA environment setup script and run it for the pinned
# build (--version 20210331), installing the listed apt packages as well.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version 20210331 --apt-packages libomp5 libopenblas-dev
# Remove any .whl and .py files from /kaggle/working
# (presumably the artifacts left behind by the setup step above).
!rm -rf /kaggle/working/*.whl
!rm -rf /kaggle/working/*.py

In [2]:
import os

# XLA runtime switches. They are exported here, one cell before torch_xla is
# imported, so the library sees them when it loads.
os.environ.update({
    'XLA_USE_BF16': "1",
    'XLA_TENSOR_ALLOCATOR_MAXSIZE': '100000000',
})

In [3]:
import numpy as np
import pandas as pd

import os, random, gc
import torch

import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import re, time, json, pickle

from torch import nn, optim
from  torch.utils.data import Dataset, DataLoader
from pathlib import Path
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [4]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification, AutoModelForSequenceClassification

In [5]:
def seed_(seed=42):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy and PyTorch
    (CPU and CUDA seeding calls), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_()

In [6]:
# Token length every excerpt is padded/truncated to by the tokenizer.
MAX_LENGTH = 300
# Number of regression outputs produced by the model head.
NUM_TARGETS = 1

# Seed used for the KFold split and as the default seed of train().
SEED = 321

# Hugging Face checkpoint name passed to the Auto* loaders.
MODEL_NAME = "roberta-base"

# Directory under which tokenizer/config pickles and checkpoints are written.
MODEL_ROOT = Path(".")

In [7]:
# DataLoader settings for the training split.
TRAIN_BATCH_SIZE = 16
TRAIN_NUM_WORKERS = 2

# DataLoader settings for the validation split.
VAL_BATCH_SIZE = 20
VAL_NUM_WORKERS = 2

## Cross-validation with KFold

In [8]:
N_SPLITS = 5
seed_(SEED)

# Load the training table and start with every row unassigned (-1).
train_data = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
train_data["fold"] = -1

# Shuffled K-fold split; each row is labelled with the fold in which it
# serves as validation data.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
row_positions = np.arange(len(train_data))

for fold, (_, val_set) in enumerate(kf.split(row_positions)):
    train_data.loc[val_set, "fold"] = fold

print(train_data.shape)
train_data.head()

In [9]:
# Number of rows assigned to each validation fold.
train_data.fold.value_counts()

## Generating Tensor Dataset

In [10]:
def gen_data(model_name=MODEL_NAME):
    """Tokenize every training excerpt and build the model input tensors.

    The tokenizer for ``model_name`` is loaded, pickled to
    ``MODEL_ROOT/"<model_name>-tokenizer.pkl"`` and applied to each row of
    ``train_data.excerpt`` with padding/truncation to ``MAX_LENGTH`` tokens.

    Args:
        model_name: Hugging Face checkpoint name used to load the tokenizer.

    Returns:
        Tuple ``(X_input_ids, X_masks, Y)``: the concatenated token ids and
        attention masks (one row per excerpt, ``MAX_LENGTH`` columns) and the
        float32 tensor of ``train_data.target`` values.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Name the pickle after the tokenizer that was actually loaded; the global
    # MODEL_NAME would mislabel it whenever a different model_name is passed.
    with open(MODEL_ROOT/f"{model_name}-tokenizer.pkl", "wb") as f:
        pickle.dump(tokenizer, f)

    X_input_ids = []
    X_masks = []

    for excerpt in tqdm(train_data.excerpt):
        inp = tokenizer(excerpt, add_special_tokens=True, return_tensors="pt",
                        max_length=MAX_LENGTH, padding="max_length", truncation=True)
        X_input_ids.append(inp["input_ids"])
        X_masks.append(inp["attention_mask"])

    # Each tokenizer call yields a single-row tensor; stack them along dim 0.
    X_input_ids = torch.cat(X_input_ids)
    X_masks = torch.cat(X_masks)
    Y = torch.tensor(train_data.target.values, dtype=torch.float32)

    print(X_input_ids.shape, X_masks.shape, Y.shape)

    return X_input_ids, X_masks, Y

In [11]:
# Tokenize the whole training set once; the fold loaders slice these tensors.
X_input_ids, X_masks, Y = gen_data()

In [12]:
class CRPDataset(Dataset):
    """Map-style dataset pairing tokenized excerpts with their targets.

    Args:
        X_input_ids: Tensor of token ids, one row per sample.
        X_masks: Attention-mask tensor with the same shape as ``X_input_ids``.
        Y: Tensor of targets with one entry per sample.
        is_train: Flag recorded on the instance as ``is_train``; it does not
            change how items are produced.

    Raises:
        AssertionError: If the id and mask shapes differ, or if ``Y`` does not
            have one entry per input row.
    """

    def __init__(self, X_input_ids, X_masks, Y, is_train=True):
        assert X_input_ids.shape == X_masks.shape
        # A shorter Y would otherwise only surface as an IndexError deep inside
        # a DataLoader worker; fail at construction time instead.
        assert len(Y) == len(X_input_ids), "Y must have one target per input row"

        self.X_input_ids = X_input_ids
        self.X_masks = X_masks
        self.Y = Y
        self.is_train = is_train

    def __len__(self):
        """Return the number of samples."""
        return len(self.X_input_ids)

    def __getitem__(self, idx):
        """Return ``((input_ids, attention_mask), target)`` for sample ``idx``.

        The target is indexed with a list so that it keeps a leading
        dimension of size 1 instead of collapsing to a scalar.
        """
        return (self.X_input_ids[idx], self.X_masks[idx]), self.Y[[idx]]

In [13]:
# Sanity check: wrap the full tensors and show the number of samples.
ds = CRPDataset(X_input_ids, X_masks, Y)
len(ds)

In [14]:
# Sanity check: inspect the shapes and target of the first sample.
(x, x_mask), y = ds[0]
x.shape, x_mask.shape, y

## Training the model

In [15]:
def get_model(model_name, task="token_classification", num_targets=NUM_TARGETS):
    """Load a pretrained transformer with a head that emits ``num_targets`` values.

    Args:
        model_name: Hugging Face checkpoint name.
        task: Case-insensitive task selector. A value containing ``"token"``
            loads ``AutoModelForTokenClassification``; one containing
            ``"sequence"`` loads ``AutoModelForSequenceClassification``.
        num_targets: Output width of the replacement classification layer.

    Returns:
        Tuple ``(config, tokenizer, model)``.

    Raises:
        ValueError: If ``task`` names neither a token nor a sequence task.
    """
    task = task.lower()
    print(task)

    if "token" in task:
        model_instance = AutoModelForTokenClassification
    elif "sequence" in task:
        model_instance = AutoModelForSequenceClassification
    else:
        # Without this branch model_instance stays unbound and the failure
        # shows up as an opaque UnboundLocalError further down.
        raise ValueError(
            "Unsupported task {!r}: expected a name containing 'token' or 'sequence'".format(task)
        )

    model = model_instance.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)

    head = getattr(model, "classifier", None)
    if head is not None and hasattr(head, "in_features"):
        # Plain linear head: swap it for one sized by the num_targets argument
        # (not the NUM_TARGETS global, which ignored the caller's value).
        model.classifier = nn.Linear(head.in_features, num_targets)
    elif isinstance(getattr(head, "out_proj", None), nn.Linear):
        # Composite heads (e.g. the RoBERTa sequence-classification head) have
        # no in_features themselves; resize their final projection instead.
        head.out_proj = nn.Linear(head.out_proj.in_features, num_targets)

    return config, tokenizer, model

In [16]:
class AttentionBlock(nn.Module):
    """Attention pooling that collapses dimension 1 of its input.

    A score is computed for every position as ``V(tanh(W(features)))``, the
    scores are softmax-normalised along dimension 1, and the input is summed
    along that dimension using the resulting weights.

    Args:
        in_features: Size of the last dimension of the input.
        middle_features: Width of the hidden projection ``W``.
        out_features: Width of the score projection ``V``.
    """

    def __init__(self, in_features, middle_features, out_features):
        super().__init__()
        self.in_features, self.middle_features, self.out_features = (
            in_features,
            middle_features,
            out_features,
        )
        self.W = nn.Linear(in_features, middle_features)
        self.V = nn.Linear(middle_features, out_features)

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dimension 1.

        NOTE(review): presumably ``features`` is (batch, sequence, in_features);
        confirm against the caller.
        """
        scores = self.V(self.W(features).tanh())
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)

In [17]:
class CRPTokenModel(nn.Module):
    """Token-classification backbone with attention pooling and a linear head.

    The backbone's classifier is replaced by ``nn.Identity`` so that its
    ``"logits"`` output carries the per-token features; these are pooled by an
    ``AttentionBlock`` and projected to ``num_targets`` values.

    Args:
        model_name: Hugging Face checkpoint name for the backbone.
        num_targets: Number of regression outputs.
        alpha: Stored on the instance; not used elsewhere in this class.
        p: Stored on the instance; not used elsewhere in this class.
    """

    def __init__(self, model_name=MODEL_NAME, num_targets=NUM_TARGETS, alpha=0.5, p=0.5):
        super().__init__()
        self.model_name, self.num_targets = model_name, num_targets
        self.alpha, self.p = alpha, p

        loaded_config, loaded_tokenizer, backbone = get_model(
            model_name, task="token_classification", num_targets=1
        )

        # Remember the feature width before dropping the backbone's own head.
        self.in_features = backbone.classifier.in_features
        backbone.classifier = nn.Identity()

        self.config = loaded_config
        self.tokenizer = loaded_tokenizer
        self.model = backbone

        self.att = AttentionBlock(self.in_features, self.in_features, 1)
        self.fc = nn.Linear(self.in_features, self.num_targets)

    def forward(self, *args, **kwargs):
        """Forward all arguments to the backbone and return the pooled prediction."""
        token_features = self.model(*args, **kwargs)["logits"]
        pooled = self.att(token_features)
        return self.fc(pooled)

In [18]:
def one_step(xb, yb, net, criterion, optimizer, device, scheduler=None):
    """Run one optimisation step on a single batch and report its metrics.

    Args:
        xb: Pair ``(input_ids, attention_mask)`` for the batch.
        yb: Target tensor for the batch.
        net: Model called with ``input_ids`` and ``attention_mask`` keywords.
        criterion: Loss function applied to ``(output, yb)``.
        optimizer: Optimizer stepped through ``xm.optimizer_step``.
        device: Device the batch is moved to.
        scheduler: Optional scheduler stepped once after the update.

    Returns:
        Tuple ``(loss, rmse, mad, r2)`` of Python/NumPy scalars for this batch.
    """
    input_ids, attention_mask = xb[0].to(device), xb[1].to(device)
    yb = yb.to(device)

    net.zero_grad()
    o = net(input_ids=input_ids, attention_mask=attention_mask)
    loss = criterion(o, yb)
    loss.backward()
    # NOTE(review): barrier=True is forwarded as-is to torch_xla; see its docs
    # for the exact synchronisation it performs.
    xm.optimizer_step(optimizer, barrier=True)

    with torch.no_grad():
        # Detach explicitly: `o` still requires grad, and `.cpu()` is a no-op
        # when the tensor already lives on the CPU, in which case `.numpy()`
        # would raise. Detaching makes the metric code device-independent.
        preds = o.detach()

        l = loss.item()
        r2 = r2_score(yb.cpu().numpy(), preds.cpu().numpy())

        residual = preds - yb
        rmse = torch.sqrt(torch.mean(torch.square(residual))).item()
        mad = torch.mean(torch.abs(residual)).item()

    if scheduler is not None:
        scheduler.step()

    return l, rmse, mad, r2

In [19]:
@torch.no_grad()
def evaluate(net, criterion, val_laoder, device):
    """Score ``net`` on a validation loader without tracking gradients.

    The model is switched to eval mode (and left in it), predictions and
    targets of all batches are concatenated, and the metrics are computed
    once over the whole set.

    Args:
        net: Model called with ``input_ids`` and ``attention_mask`` keywords.
        criterion: Loss function applied to the concatenated outputs/targets.
        val_laoder: Sized iterable yielding ``((input_ids, mask), target)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(loss, rmse, mad, r2)`` over the full validation set.
    """
    net.eval()

    # Named `outputs` rather than `os` so the imported os module is not shadowed.
    outputs, targets = [], []
    val_laoder = tqdm(val_laoder, leave = False, total=len(val_laoder))

    for xb, yb in val_laoder:
        targets.append(yb.to(device))

        input_ids, attention_mask = xb[0].to(device), xb[1].to(device)
        outputs.append(net(input_ids=input_ids, attention_mask=attention_mask))

    y = torch.cat(targets)
    o = torch.cat(outputs)

    l = criterion(o, y).item()

    r2 = r2_score(y.cpu().numpy(), o.cpu().numpy())

    rmse = torch.sqrt(torch.mean(torch.square(o - y))).item()
    mad = torch.mean(torch.abs(o - y)).item()

    return l, rmse, mad, r2

In [20]:
def one_epoch(net, criterion, optimizer, scheduler, train_laoder, val_laoder, device):
    """Train for one pass over the training loader, then validate.

    Per-batch metrics from ``one_step`` are averaged over the epoch, the
    scheduler is stepped once at the end of the pass, and ``evaluate`` is run
    on the validation loader.

    Returns:
        Four ``(train, val)`` pairs, in order: loss, rmse, mad, r2.
    """
    net.train()
    running = [0., 0., 0., 0.]  # summed loss, rmse, mad, r2
    icount = 0
    epoch_bar = tqdm(train_laoder, leave = False)

    for xb, yb in epoch_bar:
        step_metrics = one_step(xb, yb, net, criterion, optimizer, device)
        running = [total + value for total, value in zip(running, step_metrics)]
        icount += 1

        # Refresh the progress-bar summary every tenth batch.
        if hasattr(epoch_bar, "set_postfix") and icount % 10 == 0:
            avg_l, avg_rmse, avg_mad, avg_r2 = (total / icount for total in running)
            epoch_bar.set_postfix(
                loss="{:.3f}".format(avg_l),
                rmse="{:.3f}".format(avg_rmse),
                mad="{:.3f}".format(avg_mad),
                r2="{:.3f}".format(avg_r2),
            )

    scheduler.step()

    l, rmse, mad, r2 = (total / icount for total in running)

    l_val, rmse_val, mad_val, r2_val = evaluate(net, criterion, val_laoder, device)

    return (l, l_val), (rmse, rmse_val), (mad, mad_val), (r2, r2_val)

In [21]:
class AutoSave:
    """Keep the ``top_k`` best checkpoints on disk together with a JSON log.

    Larger metric values are always treated as better: ``top_metrics`` and
    ``top_models`` are kept in ascending order and the first (smallest) entry
    is evicted once more than ``top_k`` are held.

    Args:
        top_k: Number of checkpoints to keep.
        metric: Key looked up in the metrics dict passed to ``log``.
        mode: Stored on the instance; the ranking logic does not consult it.
        root: Existing directory for checkpoints and the log; falls back to
            ``MODEL_ROOT`` when falsy.
        name: Prefix of every file written.
    """

    def __init__(self, top_k=2, metric="f1", mode="min", root=None, name="ckpt"):
        self.top_k = top_k
        self.logs = []
        self.metric = metric
        self.mode = mode
        self.root = Path(root or MODEL_ROOT)
        assert self.root.exists()
        self.name = name

        self.top_models = []
        self.top_metrics = []

    def log(self, model, metrics):
        """Record ``metrics`` (must contain ``self.metric`` and ``"epoch"``) and
        checkpoint ``model`` if it ranks among the current top-k."""
        metric = metrics[self.metric]
        rank = self.rank(metric)

        self.top_metrics.insert(rank + 1, metric)
        if len(self.top_metrics) > self.top_k:
            self.top_metrics.pop(0)

        self.logs.append(metrics)
        self.save(model, metric, rank, metrics["epoch"])

    def save(self, model, metric, rank, epoch):
        """Insert the checkpoint at ``rank + 1``, evict the worst one if the
        list overflows, and rewrite the JSON log."""
        t = time.strftime("%Y%m%d%H%M%S")
        name = "{}_epoch_{:02d}_{}_{:.04f}_{}".format(self.name, epoch, self.metric, metric, t)
        name = re.sub(r"[^\w_\-\.]", "", name) + ".pth"
        path = self.root.joinpath(name)

        insert_at = rank + 1
        self.top_models.insert(insert_at, name)

        evicted = None
        if len(self.top_models) > self.top_k:
            evicted = self.top_models.pop(0)

        # When the new entry landed at index 0 and the list overflowed, the new
        # checkpoint is itself the one evicted: writing it only to delete it
        # again would be wasted I/O, so skip both steps.
        new_entry_evicted = evicted is not None and insert_at == 0

        if not new_entry_evicted:
            self._write(model, path)

            if evicted is not None:
                old_model = self.root.joinpath(evicted)
                # The file may already be gone (e.g. removed by hand); that
                # must not abort training.
                if old_model.exists():
                    old_model.unlink()

        self.to_json()

    def _write(self, model, path):
        """Serialise ``model``'s state dict to ``path`` via ``xm.save``."""
        xm.save(model.state_dict(), path.as_posix())

    def rank(self, val):
        """Return the number of held metrics strictly smaller than ``val``,
        minus one (so ``-1`` means ``val`` is not larger than any of them)."""
        r = -1
        for top_val in self.top_metrics:
            if val <= top_val:
                return r
            r += 1
        return r

    def to_json(self):
        """Dump every logged metrics dict to ``<name>_logs.json`` under ``root``."""
        name = "{}_logs".format(self.name)
        name = re.sub(r"[^\w_\-\.]", "", name) + ".json"
        path = self.root.joinpath(name)

        with path.open("w") as f:
            json.dump(self.logs, f, indent=2)

In [22]:
def one_fold(model_name, fold, train_set, val_set, epochs=20, save=True, save_root=None):
    """Train and validate one cross-validation fold on its own XLA device.

    Args:
        model_name: Hugging Face checkpoint name for ``CRPTokenModel``.
        fold: Fold number; the model is placed on ``xm.xla_device(fold + 1)``.
        train_set: Row positions (into the global tensors) used for training.
        val_set: Row positions used for validation.
        epochs: Number of epochs; also the cosine schedule's ``T_max``.
        save: When true, every epoch's metrics are handed to ``AutoSave``.
        save_root: Checkpoint directory; falls back to ``MODEL_ROOT`` when
            falsy.
    """
    def _pair(train_value, val_value):
        # Render a (train, val) metric pair for the progress bar and the log line.
        return "({:.3f}, {:.3f})".format(train_value, val_value)

    device = xm.xla_device(fold + 1)
    # Resolve the fallback before building the Path: Path(None) raises a
    # TypeError and a Path instance is always truthy, so the previous
    # `Path(save_root) or MODEL_ROOT` could never fall back.
    save_root = Path(save_root or MODEL_ROOT)

    saver = AutoSave(root=save_root, name=f"crp_{model_name}_fold{fold}", metric="rmse_val")

    net = CRPTokenModel(model_name)
    net = net.to(device)

    with open(MODEL_ROOT/f"{model_name}-config.pkl", "wb") as f:
        pickle.dump(net.config, f)

    with open(MODEL_ROOT/f"{model_name}-tokenizer.pkl", "wb") as f:
        pickle.dump(net.tokenizer, f)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=5e-5)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=1e-5, T_max=epochs)

    # Local names deliberately differ from the module-level `train_data` frame.
    train_ds = CRPDataset(X_input_ids=X_input_ids[train_set] , X_masks=X_masks[train_set], Y=Y[train_set], is_train=True)
    train_laoder = DataLoader(train_ds, batch_size=TRAIN_BATCH_SIZE, num_workers=TRAIN_NUM_WORKERS, shuffle=True, pin_memory=True)

    val_ds = CRPDataset(X_input_ids=X_input_ids[val_set] , X_masks=X_masks[val_set], Y=Y[val_set], is_train=False)
    val_laoder = DataLoader(val_ds, batch_size=VAL_BATCH_SIZE, num_workers=VAL_NUM_WORKERS, shuffle=False)

    epochs_bar = tqdm(list(range(epochs)), leave=False)

    for epoch in epochs_bar:
        epochs_bar.set_description(f"--> [EPOCH {epoch:02d}]")
        net.train()

        (l, l_val), (rmse, rmse_val), (mad, mad_val), (r2, r2_val) = one_epoch(
            net=net,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            train_laoder=train_laoder,
            val_laoder=val_laoder,
            device=device
        )

        summary = dict(
            loss=_pair(l, l_val),
            rmse=_pair(rmse, rmse_val),
            mad=_pair(mad, mad_val),
            r2=_pair(r2, r2_val),
        )

        epochs_bar.set_postfix(**summary)

        print(
            "[{epoch:02d}] loss: {loss} rmse: {rmse} mad: {mad} r2: {r2}".format(
                epoch=epoch, **summary
            )
        )

        if save:
            # RMSE is negated because AutoSave keeps the entries with the
            # largest metric values.
            metrics = {
                "epoch": epoch,
                "loss": l, "rmse": -rmse, "mad": mad, "r2": r2,
                "loss_val": l_val, "rmse_val": -rmse_val, "mad_val": mad_val, "r2_val": r2_val,
            }

            saver.log(net, metrics)


In [23]:
def train(model_name, epochs=20, save=True, n_splits=5, seed=SEED, save_root=None, suffix="", folds=None):
    """Train the selected cross-validation folds in parallel threads.

    Args:
        model_name: Hugging Face checkpoint name passed to ``one_fold``.
        epochs: Number of epochs per fold.
        save: Whether ``one_fold`` should checkpoint.
        n_splits: Number of worker threads handed to joblib (one per fold by
            default).
        seed: Seed applied through ``seed_`` before the folds start.
        save_root: Checkpoint directory (str or Path); defaults to
            ``MODEL_ROOT/"<model_name><suffix>"``. Created if missing.
        suffix: Appended to the default directory name.
        folds: Optional collection of fold numbers to train; ``None`` trains
            every fold present in ``train_data.fold``.
    """
    gc.collect()
    torch.cuda.empty_cache()
    # Accept plain strings too: .mkdir below needs a Path.
    save_root = Path(save_root or MODEL_ROOT/f"{model_name}{suffix}")
    save_root.mkdir(exist_ok=True, parents=True)
    seed_(seed)

    # Positional row numbers of the validation rows of each fold.
    positions_by_fold = train_data.reset_index(drop=True).reset_index().groupby("fold").index.apply(list)
    fold_items = [
        (fold, val_set)
        for fold, val_set in positions_by_fold.items()
        if folds is None or fold in folds
    ]
    fold_bar = tqdm(fold_items, total=len(fold_items))

    # The tensors are sliced by position, so derive the training rows from
    # positions as well rather than from the frame's index labels.
    all_positions = np.arange(len(train_data))

    def run(fold, val_set, model_name, epochs, save, save_root):
        print(f"\n############################### [FOLD {fold}  SEED {seed}]")
        fold_bar.set_description(f"[FOLD {fold}  SEED {seed}]")
        train_set = np.setdiff1d(all_positions, val_set)
        one_fold(model_name, fold=fold, train_set=train_set , val_set=val_set , epochs=epochs, save=save, save_root=save_root)
        gc.collect()
        torch.cuda.empty_cache()

    Parallel(n_jobs=n_splits, backend="threading")(delayed(run)(fold, val_set, model_name, epochs, save, save_root) for fold, val_set in fold_bar)


In [24]:
# Launch training once per seed; each run writes to its own directory named
# after the model, MAX_LENGTH and the seed.
for seed in [666]:
    train(MODEL_NAME, epochs=2, suffix=f"_maxlen{MAX_LENGTH}_seed{seed}", folds=None, seed=seed)