# Version 15
* Sub-version 4
* Train with ../input/clrcross-validation-strategies/train_folds_shuffle2.csv
* Seed 1234
* No QWK

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import random

import nltk
import string
import re
import math

from sklearn.utils import shuffle

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Autocast
from torch.cuda.amp import autocast, GradScaler

# Stochastic Weight Average
from torch.optim.swa_utils import AveragedModel, SWALR, update_bn

from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, AdamW

import warnings
warnings.filterwarnings('ignore')

In [None]:
import transformers
transformers.__version__

In [None]:
# `!pip ...` is an IPython shell escape, so this cell only runs inside a notebook kernel.
!pip install spacy-readability
import spacy
from spacy_readability import Readability

# Build the spaCy pipeline used by `readability_feat` and append the readability
# component as the last pipe so its scores are exposed on `doc._`.
# NOTE(review): the 'en' shortcut name and passing a component instance to `add_pipe`
# look like the spaCy v2 API - confirm the pinned spaCy version before upgrading.
nlp = spacy.load('en')
nlp.add_pipe(Readability(), last = True)

In [None]:
def seed_everything(seed = 0):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and current CUDA device),
    switches cuDNN to deterministic mode with auto-tuning disabled, and exports
    `PYTHONHASHSEED` into the process environment.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All four seeders take the seed as their single positional argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed = 1234
seed_everything(seed)

# Import data

In [None]:
# Load the training excerpts together with their pre-assigned fold ids.
base_dir = '../input/clrcross-validation-strategies'
data = pd.read_csv(f'{base_dir}/train_folds_shuffle2.csv')

# Rows whose standard error is exactly zero are kept aside as the benchmark
# (the first such row is tokenized later as the reference excerpt).
benchmark = data.loc[data['standard_error'].eq(0.)]

# Sign of the target: -1 / 0 / +1.
data['compare_to_benchmark'] = np.sign(data['target'])
data.head()

In [None]:
def clean_text(text):
    """Normalize raw text by lower-casing it and stripping noise.

    The steps run in this order: lower-case and trim, drop `[...]` spans,
    drop URLs, drop `<...>` tags, drop ASCII punctuation, drop newlines
    (without inserting a space), and drop any token that contains a digit.

    Args:
        text: Input string.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is kept.
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing (non-raw '\[' / '\S' / '\w' / '\d' are invalid escape sequences).
    text = text.lower().strip()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def text_preprocessing(text):
    """Clean `text` with `clean_text` and re-join its word tokens with single spaces.

    Args:
        text: Input string.

    Returns:
        The cleaned text with every run of `\\w+` characters separated by one space.
    """
    words = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(clean_text(text))
    return ' '.join(words)

def readability_feat(text):
    """Compute six readability scores for `text` using the module-level spaCy pipeline.

    Args:
        text: Raw excerpt string.

    Returns:
        A float64 NumPy array of length 6, in this order: Flesch-Kincaid grade
        level, Flesch-Kincaid reading ease, Dale-Chall, Coleman-Liau index,
        automated readability index, FORCAST.
    """
    doc = nlp(text)

    # `np.float` was only an alias of the builtin `float` and was removed in
    # NumPy 1.24; `np.float64` is the same dtype and works on every version.
    return np.array([doc._.flesch_kincaid_grade_level,
                     doc._.flesch_kincaid_reading_ease,
                     doc._.dale_chall,
                     doc._.coleman_liau_index,
                     doc._.automated_readability_index,
                     doc._.forcast], dtype = np.float64)

def sample_text(targets, num_output = 5):
    """Draw `num_output` target samples from a normal distribution.

    Args:
        targets: Indexable pair; `targets[0]` is used as the mean and
            `targets[1]` is passed to `torch.normal` as the standard deviation.
        num_output: Number of samples to draw.

    Returns:
        A 1-D tensor of length `num_output`. When `targets[1]` equals zero no
        sampling happens and a float tensor of zeros is returned instead.
    """
    center, spread = targets[0], targets[1]
    if spread == 0.:
        # Degenerate distribution: the original code returns zeros, not the mean.
        return torch.zeros(num_output, dtype = torch.float)
    return torch.normal(center, spread, size = (num_output,))

def convert_examples_to_features(text, tokenizer, max_len, is_test = False, return_tensor = False):
    """Tokenize one excerpt, padded/truncated to exactly `max_len` tokens.

    Args:
        text: Raw excerpt; newline characters are removed before tokenizing.
        tokenizer: Object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoding is padded or truncated to.
        is_test: Unused; kept so existing callers keep working.
        return_tensor: When true, ask the tokenizer for PyTorch tensors
            (`return_tensors = 'pt'`); otherwise its default output is returned.

    Returns:
        Whatever `tokenizer.encode_plus` returns, with attention mask and
        token type ids requested.
    """
    # Taken from https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-fit
    encode_kwargs = dict(
        max_length = max_len,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        return_token_type_ids = True,
    )
    if return_tensor:
        encode_kwargs['return_tensors'] = 'pt'

    return tokenizer.encode_plus(text.replace('\n', ''), **encode_kwargs)

def form_dataset(token, external_features = None, target = None, bins = None):
    """Pack one tokenized example into the dict consumed by the data loader.

    Args:
        token: Mapping with 'input_ids', 'token_type_ids' and 'attention_mask'.
        external_features: Optional extra features; stored as a float tensor.
        target: Optional regression target; stored as given.
        bins: Optional category label; stored as given, and only when
            `target` is also provided.

    Returns:
        Dict with the three token tensors (dtype long) and, when supplied,
        'external_features', 'target' and 'bins' - in that key order.
    """
    item = {
        field: torch.tensor(token[field], dtype = torch.long)
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

    if external_features is not None:
        item['external_features'] = torch.tensor(external_features, dtype = torch.float)

    # Without a target the example is unlabeled, and `bins` is ignored as well.
    if target is None:
        return item

    item['target'] = target
    if bins is not None:
        item['bins'] = bins
    return item

# Dataset

In [None]:
class Readability_Dataset(Dataset):
    """Dataset that tokenizes one excerpt per item.

    Args:
        documents: Table of examples; items are read with `.iloc[idx]` and must
            provide the 'excerpt' and 'target' fields.
        tokenizer: Tokenizer forwarded to `convert_examples_to_features`.
        sample: Stored on the instance; not used by `__getitem__`.
        max_len: Token length each excerpt is padded/truncated to.
        num_output: Stored on the instance; not used by `__getitem__`.
        binning: Stored as `self.binning` only when `mode == 'train'`.
        mode: Dataset mode string.
    """
    def __init__(self, documents, tokenizer, sample = False, max_len = 300, num_output = 5, binning = True, mode = 'train'):
        self.documents, self.tokenizer = documents, tokenizer
        self.sample, self.max_len = sample, max_len
        self.mode, self.num_output = mode, num_output

        # The attribute intentionally does not exist outside of train mode.
        if mode == 'train':
            self.binning = binning

    def __len__(self):
        """Return the number of examples."""
        return len(self.documents)

    def __getitem__(self, idx):
        """Return the token tensors and the target tensor for row `idx`."""
        row = self.documents.iloc[idx]

        # Tokenize the excerpt (readability features and bins are currently disabled).
        encoded = convert_examples_to_features(row['excerpt'], self.tokenizer, self.max_len)
        label = torch.tensor(row['target'])

        return form_dataset(encoded, external_features = None, target = label, bins = None)

# Model

In [None]:
class Readability_Model(nn.Module):
    """Transformer backbone with layer pooling, attention pooling and two linear heads.

    The hidden states of all transformer layers are combined by a learned
    softmax-weighted sum and by an element-wise max, concatenated, layer-normed,
    pooled over the sequence with a small attention module and fed to a
    regression head (`output`) and a category head (`output_cat`).

    Args:
        backbone: Model name or path given to `AutoModel.from_pretrained`.
        model_config: Backbone config; must make the backbone return
            `hidden_states` and provide `hidden_size`, `num_hidden_layers`
            and `initializer_range`.
        is_sampled: Selects the return format of `forward`.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If true, both heads average 5 dropout samples
            (p = 0.5); otherwise a single dropout (p = 0.3) is used.
        benchmark_token: When not None, the last example of every batch is
            treated as the benchmark and subtracted from the other rows.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2, 
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        self.layer_norm = nn.LayerNorm(self.model_config.hidden_size * 2)    # Concat of mean and max pooling
        self.output = nn.Linear(self.model_config.hidden_size * 2, num_output)   #  + num_external_features
        self.output_cat = nn.Linear(self.model_config.hidden_size * 2, num_cat)
        
        # Attention pooler
        self.word_weight = nn.Linear(self.model_config.hidden_size * 2, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)
        
        # One learnable scalar per transformer layer (softmax-normalized in forward)
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))
        
        # Dropout layers
        self._build_dropouts(multisample_dropout)
        
        # Initialize weights
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)
        
    def _build_dropouts(self, multisample_dropout):
        """Create the per-head dropout stacks that `forward` averages over."""
        if multisample_dropout:
            rate, num_samples = 0.5, 5
        else:
            rate, num_samples = 0.3, 1
            # Original attribute name of the single-dropout setup, kept for compatibility.
            self.dropouts = nn.ModuleList([nn.Dropout(rate)])
        
        # Always define both stacks: `forward` reads them in either mode
        # (previously `multisample_dropout = False` crashed with AttributeError).
        self.dropouts_output = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        self.dropouts_cat = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        
    def _init_weights(self, module):
        """Initialize `module` in place, using `model_config.initializer_range` as std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the model on one batch.

        Args:
            input_ids, token_type_ids, attention_mask: Token tensors passed to
                the backbone unchanged.
            external_features: Unused.

        Returns:
            If `is_sampled`: `(logits, None, predicted_category)`.
            Otherwise: `(logits[:, 0], exp(0.5 * logits[:, 1]), predicted_category)`
            - presumably mean and standard deviation, with the second column
            acting as a log-variance (TODO confirm against the training code).
            With a benchmark token, the last batch row is removed from all outputs.
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)
        
        # Extract output
        hidden_states = output_backbone.hidden_states
        
        # Stack every hidden state except the first one (last layer first);
        # assumes each is (batch, seq, hidden) - TODO confirm for new backbones.
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        
        # Weighted-sum / max pooling over layers, concatenated on the feature axis
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = torch.cat((out_mean, out_max), dim = -1)
        output_backbone = self.layer_norm(output_backbone)
        
        # Attention pooling (over time). The scores keep their trailing size-1 axis
        # so they broadcast against the features for every sequence length; the
        # former `.squeeze(1)` was a no-op for seq_len > 1 and broke seq_len == 1.
        # NOTE: `attention_mask` is not applied here, so padded positions take part.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        val = u_w.max()    # subtract the global max for a numerically stable softmax
        att = torch.exp(u_w - val)
        att = att / torch.sum(att, dim = 1, keepdim = True)
        
        output = (output_backbone * att).sum(dim = 1)
        
        # Multi-sample dropout: average both heads over the dropout stacks
        logits, cats = None, None
        for dropout, dropout_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(dropout(output))
            sample_cats = self.output_cat(dropout_cat(output))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats
        
        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)
        
        if self.benchmark_token is not None:
            # The benchmark excerpt is the last row: score the others relative to it.
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]
        
        pred_cat = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, pred_cat
        else:
            return logits[:,0], torch.exp(0.5 * logits[:,1]), pred_cat

# Loss function and metrics

In [None]:
class KLLoss(nn.Module):
    """Mean KL divergence KL(prediction || target) between two normal distributions."""
    def __init__(self):
        super().__init__()
        
    def forward(self, pred_mean, pred_std, target_mean, target_std):
        """Return the batch mean of KL(N(pred_mean, pred_std) || N(target_mean, target_std))."""
        normal = torch.distributions.Normal
        predicted = normal(pred_mean, pred_std)
        reference = normal(target_mean, target_std)
        return torch.distributions.kl_divergence(predicted, reference).mean()
    
class RMSELoss(nn.Module):
    """Mean squared error between predictions and targets.

    Despite the name no square root is taken here; callers in this file apply
    `torch.sqrt` themselves where an actual RMSE is wanted.
    """
    def __init__(self):
        super().__init__()
        
    def forward(self, pred_mean, target_mean):
        """Return the mean of the squared differences."""
        residual = pred_mean - target_mean
        return torch.mean(residual ** 2)
    
class RankingLoss(nn.Module):
    """Margin ranking loss of predictions against the predicted benchmark score."""
    def __init__(self):
        super().__init__()
        
    def forward(self, pred_mean, pred_benchmark_mean, target_mean, margin = 0.5):
        """Return `MarginRankingLoss` with the sign of `target_mean` as the desired ordering."""
        desired_order = torch.sign(target_mean)
        criterion = nn.MarginRankingLoss(margin = margin)
        return criterion(pred_mean, pred_benchmark_mean, desired_order)
    
class QuadraticWeightedKappaLoss(nn.Module):
    """Ratio of observed to expected quadratically-weighted disagreement.

    The value is 0 for perfect agreement and grows with disagreement.

    Args:
        num_cat: Number of categories; labels must lie in `[0, num_cat)`.
        device: Device on which the weight matrix is created.
    """
    def __init__(self, num_cat = 7, device = 'cpu'):
        super().__init__()
        self.num_cat = num_cat
        cats = torch.arange(num_cat).to(device)
        # weights[i, j] = (i - j)^2 / (num_cat - 1)^2
        self.weights = (cats.view(-1,1) - cats.view(1,-1)).pow(2) / (num_cat - 1)**2
        
    def _confusion_matrix(self, pred_cat, true_cat):
        """Count (true, predicted) pairs into a `num_cat x num_cat` float matrix."""
        confusion_matrix = torch.zeros((self.num_cat, self.num_cat)).to(pred_cat.device)
        rows = true_cat.reshape(-1).long().to(pred_cat.device)
        cols = pred_cat.reshape(-1).long()
        # Pair elements up to the shorter input, as the former zip-based loop did.
        num_pairs = min(rows.numel(), cols.numel())
        rows, cols = rows[:num_pairs], cols[:num_pairs]
        # A single accumulate scatter replaces one Python-level update per element.
        ones = torch.ones(num_pairs, dtype = confusion_matrix.dtype, device = confusion_matrix.device)
        confusion_matrix.index_put_((rows, cols), ones, accumulate = True)
        return confusion_matrix
        
    def forward(self, pred_cat, true_cat):
        """Return sum(weights * observed) / sum(weights * expected).

        Args:
            pred_cat: 1-D integer tensor of predicted categories.
            true_cat: 1-D integer tensor of true categories.

        Returns:
            Scalar tensor. It is NaN when the expected disagreement is zero
            (e.g. all labels fall into a single category).
        """
        # Confusion matrix
        observed = self._confusion_matrix(pred_cat, true_cat)
        
        # Count elements in each category
        true_hist = torch.bincount(true_cat, minlength = self.num_cat)
        pred_hist = torch.bincount(pred_cat, minlength = self.num_cat)
        
        # Expected values under independence of the two label distributions
        expected = torch.outer(true_hist, pred_hist)
        
        # Normalization
        observed = observed / torch.sum(observed)
        expected = expected / torch.sum(expected)
        
        # Weighted Kappa
        numerator = torch.sum(self.weights * observed)
        denominator = torch.sum(self.weights * expected)
        
        return numerator / denominator
    
class BradleyTerryLoss(nn.Module):
    """Pairwise logistic (Bradley-Terry) ranking loss over all pairs in a batch.

    For every unordered pair (i, j) with i > j the loss is
    `log(1 + exp(-s_ij * (pred_i - pred_j)))`, where `s_ij` is the sign of
    `true_i - true_j`; the result is the mean over the `n * (n - 1) / 2` pairs.
    """
    def __init__(self):
        super().__init__()
        
    def forward(self, pred_mean, true_mean):
        """Return the mean pairwise loss.

        Args:
            pred_mean: 1-D tensor of predicted scores.
            true_mean: 1-D tensor of target scores, same length.

        Returns:
            Scalar tensor; zero (still attached to the graph) when the batch
            holds fewer than two items and therefore no pair exists.
        """
        batch_size = len(pred_mean)
        if batch_size < 2:
            # No pairs to compare: avoid the division by zero below.
            return pred_mean.sum() * 0.
        
        true_comparison = torch.sign(true_mean.view(-1,1) - true_mean.view(1,-1))
        pred_comparison = pred_mean.view(-1,1) - pred_mean.view(1,-1)
        
        # softplus(x) = log(1 + exp(x)) without overflowing for large x.
        pair_losses = F.softplus(-true_comparison * pred_comparison)
        
        # Keep the strict lower triangle only: the diagonal compares an item with
        # itself and used to add a constant log(2) per item to the loss, although
        # the normalizer counts just the n * (n - 1) / 2 real pairs.
        num_pairs = batch_size * (batch_size - 1) / 2
        return torch.tril(pair_losses, diagonal = -1).sum() / num_pairs
    
def loss_fn(pred_mean, pred_std, target_mean, target_std, pred_cat = None, target_cat = None, loss_type = 'rmse', num_bins = None,
            pred_benchmark_mean = None):
    """Compute the loss selected by `loss_type`.

    `loss_type` is one of the supported names below; composite names join
    their components with '_' and the component losses are summed.

    Args:
        pred_mean, pred_std: Predicted mean and standard deviation.
        target_mean, target_std: Target mean and standard deviation
            (`*_std` is only read by the 'kl' component).
        pred_cat, target_cat: Category tensors, required by 'qwk'.
        loss_type: Name of the loss to compute.
        num_bins: Number of categories, required by 'qwk'.
        pred_benchmark_mean: Predicted benchmark score, required by 'rank'.
            (Previously an undefined name, so every 'rank' loss failed.)

    Returns:
        Scalar loss tensor. Note that plain 'rmse' returns the mean squared
        error (callers take the square root after aggregating), whereas the
        'rmse' component of a composite loss is the square-rooted value.

    Raises:
        ValueError: Unknown `loss_type`, or a required argument is missing.
    """
    supported = ('rmse', 'kl', 'rank', 'qwk', 'rmse_rank', 'kl_rank', 'rmse_qwk', 'kl_qwk', 'rank_qwk', 
                 'bradley-terry', 'rmse_bradley-terry', 'qwk_bradley-terry', 'rmse_qwk_bradley-terry')
    if loss_type not in supported:
        raise ValueError(f'Unsupported loss_type {loss_type!r}; expected one of {supported}')
    
    requested = set(loss_type.split('_'))
    if 'qwk' in requested and (pred_cat is None or target_cat is None or num_bins is None):
        raise ValueError("'qwk' losses require pred_cat, target_cat and num_bins")
    if 'rank' in requested and pred_benchmark_mean is None:
        raise ValueError("'rank' losses require pred_benchmark_mean")
    
    if loss_type == 'rmse':
        return RMSELoss()(pred_mean, target_mean)
    
    device = pred_mean.device
    
    # Components in a fixed summation order; each one is evaluated lazily so
    # that only the requested losses are computed.
    components = (
        ('rmse', lambda: torch.sqrt(RMSELoss()(pred_mean, target_mean))),
        ('kl', lambda: KLLoss()(pred_mean, pred_std, target_mean, target_std)),
        ('rank', lambda: RankingLoss()(pred_mean, pred_benchmark_mean, target_mean, margin = 0.5)),
        ('bradley-terry', lambda: BradleyTerryLoss()(pred_mean, target_mean)),
        ('qwk', lambda: QuadraticWeightedKappaLoss(num_cat = num_bins, device = device)(pred_cat, target_cat)),
    )
    
    total = None
    for name, compute in components:
        if name in requested:
            value = compute()
            total = value if total is None else total + value
    return total

def metric_fn(pred_mean, target_mean):
    """Return the root mean squared error between two NumPy arrays."""
    squared_error = (pred_mean - target_mean) ** 2
    return np.sqrt(squared_error.mean())

# Inference function

In [None]:
def infer(model, dataloader, device = 'cpu', use_tqdm = True, benchmark_token = None):
    """Run `model` over `dataloader` in eval mode and score it against the targets.

    Args:
        model: Model returning `(pred_mean, pred_std, pred_bins)`.
        dataloader: Yields dicts with 'input_ids', 'token_type_ids',
            'attention_mask' and 'target'.
        device: Device the batch tensors are moved to.
        use_tqdm: Wrap the loader in a progress bar.
        benchmark_token: Optional `(input_ids, token_type_ids, attention_mask)`
            triple that is appended as an extra row to every batch.

    Returns:
        `(pred, loss)`: the predicted means as a NumPy array and the square
        root of the sample-weighted mean of the per-batch 'rmse' losses.
    """
    model.eval()
    batches = tqdm(dataloader) if use_tqdm else dataloader
    
    weighted_loss_sum = 0
    num_sample = 0
    collected = []
    
    for batch in batches:
        ids = batch['input_ids'].to(device)
        type_ids = batch['token_type_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        true_mean = batch['target'].to(device)
        
        # Measured before the benchmark row is appended.
        batch_size = ids.shape[0]
        
        if benchmark_token is not None:
            bench_ids, bench_type_ids, bench_mask = benchmark_token
            ids = torch.cat((ids, bench_ids), dim = 0)
            type_ids = torch.cat((type_ids, bench_type_ids), dim = 0)
            mask = torch.cat((mask, bench_mask), dim = 0)
        
        with torch.no_grad(), autocast():
            pred_mean, pred_std, pred_bins = model(input_ids = ids, 
                                                   attention_mask = mask, 
                                                   token_type_ids = type_ids)
            
            # No target standard deviation is available at inference time.
            batch_loss = loss_fn(pred_mean, pred_std, true_mean, None, loss_type = 'rmse')
        
        weighted_loss_sum += batch_loss * batch_size
        num_sample += batch_size
        collected.extend(pred_mean.cpu().detach().numpy())
    
    # Stack the predictions and turn the accumulated squared error into an RMSE
    pred = np.array(collected)
    loss = torch.sqrt(weighted_loss_sum / num_sample)
    
    return pred, loss

# Configuration

In [None]:
class config():
    """Static settings for the out-of-fold inference run."""
    # --- Inference ---
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    use_tqdm = False

    # --- Dataloader ---
    max_len = 250
    batch_size = 8
    num_workers = 4

    # --- Model ---
    num_output = 2
    num_bins = 1
    model_dir = '../input/clrroberta-largev15/model_best_roberta_large_v15_4'
    backbone = 'roberta-large'
    # Backbone name with dashes turned into underscores; used in file names.
    model_name = backbone.replace('-', '_')

cfg = config()

# Main

In [None]:
tokenizer = AutoTokenizer.from_pretrained(cfg.backbone)

# Tokenize the benchmark text once: it is identical for every fold.
benchmark_token = convert_examples_to_features(benchmark['excerpt'].iloc[0], tokenizer, cfg.max_len, return_tensor = True)
benchmark_token = (benchmark_token['input_ids'].to(cfg.device), benchmark_token['token_type_ids'].to(cfg.device), benchmark_token['attention_mask'].to(cfg.device))

# Per-fold validation frames, each extended with a 'pred' column
oof = []

for fold in range(5):
    # Copy the slice so that adding the 'pred' column below writes to an
    # independent frame instead of a view/slice of `data`.
    val = data[data['kfold'] == fold].copy()
    
    valid_dataset = Readability_Dataset(val, tokenizer, max_len = cfg.max_len)
    valid_dataloader = DataLoader(valid_dataset, batch_size = cfg.batch_size, num_workers = cfg.num_workers, shuffle = False)
    
    model_config = AutoConfig.from_pretrained(cfg.backbone, output_hidden_states = True)
    model = Readability_Model(cfg.backbone, model_config, num_output = cfg.num_output, 
                              num_cat = cfg.num_bins, benchmark_token = benchmark_token).to(cfg.device)
    
    print('*' * 50)
    print(f'Fold: {fold}')
    
    # Load the fine-tuned weights of this fold
    path = f'{cfg.model_dir}/model_best_fold_{fold}_{cfg.model_name}.bin'
    ckp = torch.load(path, map_location = cfg.device)
    model.load_state_dict(ckp['model_state_dict'])
    
    # Predict the held-out fold; the loader is not shuffled, so predictions
    # line up with the rows of `val`.
    prediction, loss = infer(model, valid_dataloader, device = cfg.device, use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token)
    val['pred'] = prediction
    
    print(f'Validation loss of fold {fold} is:', loss)
    
    oof.append(val)
    
oof = pd.concat(oof)

In [None]:
# Overall out-of-fold RMSE over the concatenated folds, then persist the
# per-row predictions (the DataFrame index is written as the first CSV column).
print('OOF CV:', metric_fn(oof['pred'].values, oof['target'].values))
oof.to_csv(f'oof_{cfg.model_name}.csv')

# Analysis of the comparison task
* We want to know whether our models also perform well on the comparison task, i.e. deciding whether each text scores above or below the benchmark text

In [None]:
# The benchmark excerpt is the row with zero standard error; every text is
# compared against its true and its predicted score.
benchmark_text = oof[oof['standard_error'] == 0]
true_compare = np.sign(oof['target'] - benchmark_text['target'].values)
pred_compare = np.sign(oof['pred'] - benchmark_text['pred'].values)

# Rows whose predicted ordering relative to the benchmark disagrees with the truth
is_wrong = true_compare != pred_compare

# The reported value is a ratio in [0, 1], not a count.
print('The fraction of correct comparisons based on predicted score is', (true_compare == pred_compare).mean())
print('*' * 50)

import seaborn as sns
import matplotlib.pyplot as plt

print('Histogram of difference score from the predicted value and the benchmark text')
sns.histplot(oof[is_wrong]['pred'] - benchmark_text['pred'].values)
plt.show()

print('Example of the wrong prediction in terms of comparison task (The first row is the benchmark text):')
# `head(1)` keeps the column dtypes and also works when nothing was mis-ordered.
pd.concat((benchmark_text, oof[is_wrong].head(1)))