<h1><center>CommonLit Readability Prize</center></h1>

Refer to my previous [notebook](https://www.kaggle.com/harshsharma511/one-stop-understanding-eda-bert) for an overview of the competition, EDA, and a baseline BERT model.

In [None]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import joblib
import random
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from warnings import simplefilter
simplefilter('ignore')


In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW)

In [None]:
# ---- Run identity -----------------------------------------------------
model_name = 'roberta_v7'

# ---- Input locations --------------------------------------------------
data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')
# Directory handed to the `from_pretrained` calls for tokenizer, config and encoder.
pretrained_path = '../input/roberta-base/'

# ---- Output locations (everything lives under ./build/<model_name>) ---
build_dir = Path('./build')
output_dir = build_dir.joinpath(model_name)
trn_encoded_file = output_dir.joinpath('trn.enc.joblib')
tokenizer_file = output_dir.joinpath('tokenizer.joblib')
val_predict_file = output_dir.joinpath(f'{model_name}.val.txt')
submission_file = 'submission.csv'

# ---- Config -----------------------------------------------------------
# Column names in the competition CSV files.
id_col, target_col, text_col = 'id', 'target', 'excerpt'
max_len = 256     # tokenizer max_length (sequences are truncated/padded to this)
n_fold = 5        # number of KFold splits
n_est = 20        # max_epochs passed to the Trainer
n_stop = 2        # EarlyStopping patience
batch_size = 16   # training batch size (validation/test loaders use 8x this)
seed = 42         # used for seed_everything and the KFold shuffle

In [None]:
# Make sure the run's output directory exists (missing parents are created,
# and an already-existing directory is not an error).
output_dir.mkdir(exist_ok=True, parents=True)
# Seed the random number generators through pytorch_lightning's helper.
seed_everything(seed)

In [None]:
# Select CUDA when torch reports an available GPU, otherwise use the CPU,
# and report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

In [None]:
# Read the train and test tables (in that order), using the id column as the index.
trn, tst = (pd.read_csv(csv_path, index_col=id_col) for csv_path in (train_file, test_file))
# Regression targets as an array, in the same row order as `trn`.
y = trn[target_col].values
print(trn.shape, y.shape)
# Last expression of the cell: shows the first rows of the training table.
trn.head()

In [None]:
# Tokenization using RoBERTa.
# The tokenizer and the model configuration both come from the local
# pretrained directory.
tokenizer = RobertaTokenizerFast.from_pretrained(pretrained_path, do_lower_case=True)
# Request the hidden states of every layer: ReadabilityModel.forward reads
# `out['hidden_states']`, which is only populated when this flag is set.
model_config = RobertaConfig.from_pretrained(pretrained_path, output_hidden_states=True)

In [None]:
class Data(Dataset):
    """Dataset that tokenizes one row of a DataFrame per item.

    The frame must contain the ``text_col`` column. If it also contains the
    ``target_col`` column the dataset is "labeled" and each item carries the
    target value; otherwise only the model inputs are returned.

    Args:
        df: DataFrame holding the excerpts (and optionally the targets).
    """

    def __init__(self, df):
        super().__init__()
        self.df = df
        # `in` on a DataFrame checks column labels.
        self.labeled = target_col in df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Args:
            idx: Zero-based row position (as produced by a DataLoader sampler).

        Returns:
            ``(ids, mask, target)`` for a labeled dataset, ``(ids, mask)``
            otherwise. ``ids`` and ``mask`` are long tensors with the batch
            axis removed; ``target`` is a float scalar tensor.
        """
        # The frame is indexed by id labels, while `idx` is a row position.
        # Use .iloc explicitly instead of relying on pandas' deprecated
        # "integer key falls back to position" behaviour of Series[...].
        text = self.df[text_col].iloc[idx]
        token = tokenizer(text, max_length=max_len, truncation=True, padding='max_length',
                          return_tensors='pt', add_special_tokens=True)
        # return_tensors='pt' already yields tensors with a leading batch axis
        # of size 1; drop just that axis rather than copy-constructing a new
        # tensor from an existing one (which triggers a torch UserWarning).
        ids = token['input_ids'].squeeze(0).to(torch.long)
        mask = token['attention_mask'].squeeze(0).to(torch.long)
        if not self.labeled:
            return ids, mask

        target = torch.tensor(self.df[target_col].iloc[idx], dtype=torch.float)
        return ids, mask, target

In [None]:
#Model Training with Cross-Validation
class ReadabilityModel(LightningModule):
    """RoBERTa encoder followed by dropout, mean pooling and a linear head.

    The module predicts a single value per input sequence and is trained with
    an RMSE loss.

    Args:
        conf: Configuration passed to ``RobertaModel.from_pretrained``. It
            must have ``output_hidden_states`` enabled, because ``forward``
            reads ``hidden_states`` from the encoder output.
    """

    def __init__(self, conf):
        super().__init__()
        self.config = conf
        self.model = RobertaModel.from_pretrained(pretrained_path, config=self.config)
        self.dropout = nn.Dropout(0.1)
        self.num_targets = 1
        # Size the head from the encoder config rather than hard-coding 768,
        # so a checkpoint with a different hidden width still works.
        self.clf = nn.Linear(self.config.hidden_size, self.num_targets)
        torch.nn.init.normal_(self.clf.weight, std=0.02)

    def forward(self, inputs):
        """Return one prediction per sequence.

        Args:
            inputs: A pair ``(ids, mask)`` of token ids and attention mask.

        Returns:
            Predictions with the two trailing singleton axes squeezed away.
        """
        ids, mask = inputs
        out = self.model(ids, attention_mask=mask)
        out = out['hidden_states']
        # Last entry of the hidden-state tuple, i.e. the final layer's output.
        x = out[-1]
        x = self.dropout(x)
        # Average over axis 1 (keepdim=True).
        # NOTE(review): the attention mask is not applied here, so padded
        # positions contribute to the mean — confirm this is intended.
        x = torch.mean(x, 1, True)
        preds = self.clf(x)
        preds = preds.squeeze(-1).squeeze(-1)

        return preds

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        ids, mask, y = batch
        p = self([ids, mask])
        loss = self.loss_fn(p, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log ``val_loss`` (the metric early stopping monitors)."""
        ids, mask, y = batch
        p = self([ids, mask])
        loss = self.loss_fn(p, y)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        """Return AdamW plus a constant learning-rate schedule with 100 warm-up steps."""
        # Optimise this module's own parameters. The previous code used the
        # global name `model`, which only worked when a module-level variable
        # of that name happened to refer to this very instance.
        optimizer = AdamW(self.parameters(), lr=1e-5, weight_decay=0.01)
        lr_scheduler = get_constant_schedule_with_warmup(optimizer, 100)
        return [optimizer], [lr_scheduler]

    def loss_fn(self, p, y):
        """Root mean squared error between predictions ``p`` and targets ``y``."""
        return torch.sqrt(nn.MSELoss()(p, y))

In [None]:
# K-fold cross-validation: train one model per fold, collect out-of-fold
# predictions for the training rows and average the folds' test predictions.
cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

p = np.zeros_like(y, dtype=float)                 # out-of-fold predictions
p_tst = np.zeros((tst.shape[0],), dtype=float)    # fold-averaged test predictions
for i_cv, (i_trn, i_val) in enumerate(cv.split(trn), 1):
    model = ReadabilityModel(model_config)
    trn_loader = DataLoader(Data(trn.iloc[i_trn]), shuffle=True, batch_size=batch_size)
    val_loader = DataLoader(Data(trn.iloc[i_val]), shuffle=False, batch_size=batch_size * 8)

    # `gpus=-1` means "use every GPU" and raises when none is present, so
    # fall back to CPU training (gpus=None) on a machine without CUDA.
    trainer = Trainer(max_epochs=n_est,
                      gpus=-1 if torch.cuda.is_available() else None,
                      logger=False, checkpoint_callback=False,
                      callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=n_stop)])
    trainer.fit(model, trn_loader, val_loader)

    # Rebuild the validation loader without the target column so that Data
    # yields (ids, mask) pairs, which is what the model's forward unpacks.
    val_loader = DataLoader(Data(trn.iloc[i_val].drop(target_col, axis=1)), shuffle=False,
                            batch_size=batch_size * 8)
    tst_loader = DataLoader(Data(tst), shuffle=False, batch_size=batch_size * 8)
    p[i_val] = np.concatenate(trainer.predict(model, val_loader))
    # Each fold contributes 1/n_fold of the final test prediction.
    p_tst += np.concatenate(trainer.predict(model, tst_loader)) / n_fold

    trainer.save_checkpoint(f'{model_name}_cv{i_cv}.ckpt')

In [None]:
# Print CV RMSE and save CV predictions.
# The `squared=False` argument of mean_squared_error is deprecated and has
# been removed in recent scikit-learn releases; taking the square root of the
# MSE explicitly gives the same value on every version.
print(f'CV RMSE: {np.sqrt(mean_squared_error(y, p)):.6f}')
# One out-of-fold prediction per line, six decimal places.
np.savetxt(val_predict_file, p, fmt='%.6f')

In [None]:
# Submission: take the sample submission as a template, overwrite its target
# column with the fold-averaged test predictions, and write it out.
sub = pd.read_csv(sample_file, index_col=id_col).assign(**{target_col: p_tst})
sub.to_csv(submission_file)
# Last expression of the cell: shows the first rows of the submission.
sub.head()