# Version 5
* Ensemble Hiro's 5 models and Tri's 13 models.

## Model list

#### Hiro's models
* RoBERTa-large version 31;
* RoBERTa-large version 31-b;
* kfold: Ver.2, RoBERTa-large Ver.26a;
* ELECTRA-large Ver.4;
* kfold: Ver.3, DeBERTa-large Ver.14

#### Tri's models
* RoBERTa-large version 15-0, 15-3;
* RoBERTa-large version 16-1;
* XLNet-large-cased version 2-0, 3-0, 3-1;
* GPT2-medium version 1-0;
* ELECTRA-large-discriminator version 1-0, 1-1;
* DeBERTa-large version 1-0, 1-1;
* Funnel-large version 1-0;
* BART-large version 1-0

# Tri's part
##################################################################################################################

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import random
import gc

import sys
sys.path.append('../input/readability-package')
import readability
import spacy

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents
import string
import re
import math
import pickle

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Autocast
from torch.cuda.amp import autocast, GradScaler
from torch.optim.swa_utils import AveragedModel

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, AdamW

import warnings
warnings.filterwarnings('ignore')

In [None]:
!cp -r ../input/spacy-readability/spacy_readability-master/* ./
!cp -r ../input/syllapy/syllapy-master/* ./
import spacy
from spacy_readability import Readability

# Build the shared spaCy pipeline used by `readability_feat` and append the
# spacy_readability component as the last pipe.
# NOTE(review): the 'en' shortcut name and passing a component instance to
# `add_pipe` look like the spaCy v2 API — confirm the installed spaCy version.
nlp = spacy.load('en')
nlp.add_pipe(Readability(), last = True)

In [None]:
def seed_everything(seed = 0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and current CUDA device),
    exports the seed as PYTHONHASHSEED, and switches cuDNN to deterministic,
    non-benchmarking mode.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All four seeders take the seed as their single positional argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

seed = 0
seed_everything(seed)

# Import data

In [None]:
# Load the competition training set from the Kaggle input directory.
base_dir = '../input/commonlitreadabilityprize'
train_data = pd.read_csv(f'{base_dir}/train.csv')
# Benchmark text: the training rows whose `standard_error` is exactly 0.
benchmark = train_data[train_data['standard_error'] == 0.]

In [None]:
# Load the test excerpts (`data`) and the sample submission (`ss`) from the
# same Kaggle input directory.
base_dir = '../input/commonlitreadabilityprize'
data = pd.read_csv(f'{base_dir}/test.csv')
ss = pd.read_csv(f'{base_dir}/sample_submission.csv')
# Notebook display of the first rows; has no effect outside an interactive cell.
data.head()

# Utilities

In [None]:
def clean_text(text):
    """Lower-case `text` and strip markup, URLs, punctuation and digit-bearing words.

    The steps run in this order (the order matters, e.g. URLs are removed
    before punctuation so that they are still recognisable):
      1. lower-case and trim surrounding whitespace;
      2. drop anything inside square brackets (single line only);
      3. drop http(s):// and www. URLs;
      4. drop <...> tags;
      5. drop every character in `string.punctuation`;
      6. drop newlines (without inserting a space);
      7. drop every word that contains a digit.

    Args:
        text: input string.

    Returns:
        The cleaned string. Removed spans are deleted, not replaced, so runs
        of spaces may remain.
    """
    # Raw strings: the previous non-raw literals relied on invalid escape
    # sequences (`\[`, `\S`, `\w`, `\d`), which newer Python versions warn about.
    text = text.lower().strip()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def text_preprocessing(text):
    """Clean `text` with `clean_text` and re-join its word tokens with single spaces.

    Args:
        text: raw input string.

    Returns:
        The `\\w+` tokens of the cleaned text, separated by one space each.
    """
    word_splitter = nltk.tokenize.RegexpTokenizer(r'\w+')
    words = word_splitter.tokenize(clean_text(text))
    return ' '.join(words)

def readability_feat(text):
    """Compute six readability scores for `text` via the module-level spaCy pipeline.

    Args:
        text: raw string; it is passed through the global `nlp` pipeline, whose
            extension attributes (`doc._.<score>`) supply the values.

    Returns:
        A float64 NumPy array of length 6, in this order: Flesch-Kincaid grade
        level, Flesch-Kincaid reading ease, Dale-Chall, Coleman-Liau index,
        automated readability index, FORCAST.
    """
    doc = nlp(text)

    # `np.float` was only an alias of the builtin float (i.e. float64) and no
    # longer exists in recent NumPy releases; name the dtype explicitly.
    return np.array([doc._.flesch_kincaid_grade_level,
                     doc._.flesch_kincaid_reading_ease,
                     doc._.dale_chall,
                     doc._.coleman_liau_index,
                     doc._.automated_readability_index,
                     doc._.forcast], dtype = np.float64)

def sample_text(targets, num_output = 5):
    """Draw `num_output` samples around a target, or zeros when its spread is zero.

    Args:
        targets: indexable pair; `targets[0]` is used as the mean and
            `targets[1]` as the second argument of `torch.normal`.
            NOTE(review): the local is called `var`, but `torch.normal` takes a
            standard deviation in that position — confirm what callers pass.
        num_output: number of values to return.

    Returns:
        1-D float tensor of length `num_output`: normal samples when
        `targets[1]` is non-zero, otherwise all zeros.
    """
    mean, var = targets[0], targets[1]

    # Zero spread: nothing to sample, return a constant zero vector.
    if targets[1] == 0.:
        return torch.zeros(num_output, dtype = torch.float)

    return torch.normal(mean, var, size = (num_output,))

def convert_examples_to_features(text, tokenizer, max_len, is_test = False, return_tensor = False):
    """Tokenize one document with `tokenizer.encode_plus`, padded/truncated to `max_len`.

    Taken from https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-fit

    Args:
        text: document string; newlines are removed before tokenizing.
        tokenizer: object exposing `encode_plus`.
        max_len: value passed as `max_length` (with `padding='max_length'`
            and `truncation=True`).
        is_test: unused.
        return_tensor: when true, additionally pass `return_tensors='pt'`.

    Returns:
        Whatever `tokenizer.encode_plus` returns.
    """
    encode_kwargs = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_token_type_ids': True,
    }
    if return_tensor:
        encode_kwargs['return_tensors'] = 'pt'

    return tokenizer.encode_plus(text.replace('\n', ''), **encode_kwargs)

def form_dataset(token, external_features = None, target = None, bins = None):
    """Pack one tokenized example (plus optional extras) into a dict of tensors.

    Args:
        token: mapping with 'input_ids', 'token_type_ids' and 'attention_mask'.
        external_features: optional values converted to a float tensor.
        target: optional target, stored as given.
        bins: optional bin label, stored as given — but only when `target`
            is also provided.

    Returns:
        Dict with the three token fields as long tensors, followed (when
        present) by 'external_features', 'target' and 'bins', in that order.
    """
    sample = {
        field: torch.tensor(token[field], dtype = torch.long)
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

    if external_features is not None:
        sample['external_features'] = torch.tensor(external_features, dtype = torch.float)

    if target is not None:
        sample['target'] = target
        # `bins` is deliberately nested: without a target it is ignored.
        if bins is not None:
            sample['bins'] = bins

    return sample

# Dataset

In [None]:
class Readability_Dataset(Dataset):
    """Dataset that tokenizes the 'excerpt' field of each document on access.

    Args:
        documents: table of documents; rows are read with `.iloc[idx]` and
            must provide an 'excerpt' entry.
        tokenizer: tokenizer handed to `convert_examples_to_features`.
        max_len: padded/truncated token length.
        mode: stored on the instance; not read by this class.
    """
    def __init__(self, documents, tokenizer, max_len = 300, mode = 'infer'):
        self.documents = documents
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode

    def __len__(self):
        """Number of documents."""
        return len(self.documents)

    def __getitem__(self, idx):
        """Return the tensor dict (token fields only) for document `idx`."""
        excerpt = self.documents.iloc[idx]['excerpt']
        encoded = convert_examples_to_features(excerpt, self.tokenizer, self.max_len)
        return form_dataset(encoded)

# Models

### Utils class

In [None]:
class AttentivePooling(nn.Module):
    """Attention pooling over the sequence axis.

    Each position is scored with `context_weight(tanh(word_weight(x)))`; the
    scores are normalised along the sequence axis and used as weights for a
    weighted sum of `x`.

    Args:
        input_dim: size of the last axis of `x`.
        attention_dim: hidden size of the scoring layer.
    """
    def __init__(self, input_dim = 768, attention_dim = 1024):
        super(AttentivePooling, self).__init__()
        # Attention pooler
        self.word_weight = nn.Linear(input_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

    def forward(self, x, mask = None):
        '''
        x : Batch_size x Seq_len x input_dim
        mask: optional Batch_size x Seq_len tensor; positions where it is 1
              get zero attention weight (the weights are multiplied by 1 - mask).

        Returns a Batch_size x input_dim tensor.
        '''
        u_i = torch.tanh(self.word_weight(x))
        # Keep the trailing singleton axis (Batch x Seq x 1) so the weights
        # broadcast over the feature axis. Squeezing axis 1 here would drop the
        # sequence axis when Seq_len == 1 and break the broadcast below.
        u_w = self.context_weight(u_i)
        # Subtract the global max before exponentiating for numerical stability;
        # the constant cancels in the normalisation.
        att = torch.exp(u_w - u_w.max())

        if mask is not None:
            att = att * (1 - mask.unsqueeze(-1))

        att = att / torch.sum(att, dim = 1, keepdim = True)

        return (x * att).sum(dim = 1)

### RoBERTa base

* Version 11

In [None]:
class Readability_Model_RoBERTa_base_v11(nn.Module):
    """Two-headed model on top of the backbone's `pooler_output`.

    The pooled backbone output is layer-normalised and fed to two linear heads:
    `output` (`num_output` values) and `output_cat` (`num_cat` class scores).
    Each head is evaluated once per dropout module and the results are averaged.

    Args:
        backbone: name or path given to `AutoModel.from_pretrained`.
        model_config: config object; `hidden_size` and `initializer_range` are read.
        benchmark_token: when not None, the last row of every batch is treated
            as a reference whose `output` values are subtracted from all other
            rows; that row is then dropped from the results. Only its
            None-ness is checked in `forward`.
        is_sampled: selects the return format of `forward`.
        num_external_features: unused.
        num_output: width of the `output` head.
        num_cat: width of the `output_cat` head.
        attention_dim: unused.
        multisample_dropout: True -> five Dropout(0.5) per head,
            False -> one Dropout(0.3) per head.
    """
    def __init__(self, backbone, model_config, benchmark_token = None, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True):
        super(Readability_Model_RoBERTa_base_v11, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        self.layer_norm = nn.LayerNorm(self.model_config.hidden_size)    # Applied to pooler_output
        self.output = nn.Linear(self.model_config.hidden_size, num_output)
        self.output_cat = nn.Linear(self.model_config.hidden_size, num_cat)

        # Dropout layers. Both lists are always created because `forward`
        # walks them in lockstep, also in the single-dropout configuration.
        self.dropouts = self._make_dropouts(multisample_dropout)
        self.dropouts_cat = self._make_dropouts(multisample_dropout)

        # Initialize weights
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.output_cat)

    @staticmethod
    def _make_dropouts(multisample_dropout):
        """Return the dropout stack for one head (5 x p=0.5, or 1 x p=0.3)."""
        if multisample_dropout:
            return nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        return nn.ModuleList([nn.Dropout(0.3)])

    def _init_weights(self, module):
        """Re-initialise `module` using `model_config.initializer_range` as the std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        `external_features` is accepted but unused.

        Returns:
            A 3-tuple. If `is_sampled`: (all `output` values, None, argmax class).
            Otherwise: (`output[:, 0]`, `exp(0.5 * output[:, 1])`, argmax class);
            the second item is presumably a standard deviation derived from a
            predicted log-variance — confirm against the training loss.
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Extract output
        pooled = self.layer_norm(output_backbone.pooler_output)

        # Multiple dropout: average each head over its dropout modules.
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts, self.dropouts_cat):
            logits = logits + self.output(drop_out(pooled))
            cats = cats + self.output_cat(drop_cat(pooled))

        logits = logits / len(self.dropouts)
        cats = cats / len(self.dropouts)

        if self.benchmark_token is not None:
            # Last row is the reference: subtract it and drop it.
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        predicted_cat = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, predicted_cat
        return logits[:,0], torch.exp(0.5 * logits[:,1]), predicted_cat

### RoBERTa large

* Version 15

In [None]:
class Readability_Model_RoBERTa_large_v15(nn.Module):
    """Two-headed model over layer-pooled, attention-pooled backbone hidden states.

    Pipeline in `forward`:
      1. stack the last `len(hidden_states) - 1` hidden states of the backbone;
      2. pool over layers twice — softmax-weighted mean and element-wise max —
         and concatenate both (feature size `2 * hidden_size`);
      3. layer-normalise, then attention-pool over the sequence axis;
      4. apply the `output` and `output_cat` heads once per dropout module and
         average.

    The backbone output must expose `hidden_states` (presumably via
    `output_hidden_states=True` in `model_config` — confirm), and the number of
    stacked layers must equal `model_config.num_hidden_layers`, the size of
    `hidden_layer_weights`.

    Args:
        backbone: name or path given to `AutoModel.from_pretrained`.
        model_config: config object; `hidden_size`, `num_hidden_layers` and
            `initializer_range` are read.
        is_sampled: selects the return format of `forward`.
        num_external_features: unused.
        num_output: width of the `output` head.
        num_cat: width of the `output_cat` head.
        attention_dim: hidden size of the attention scorer.
        multisample_dropout: True -> five Dropout(0.5) per head,
            False -> one Dropout(0.3) per head.
        benchmark_token: when not None, the last row of every batch is treated
            as a reference that is subtracted from the other rows' `output`
            values and then dropped. Only its None-ness is checked.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_RoBERTa_large_v15, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        feature_dim = self.model_config.hidden_size * 2    # Concat of mean and max pooling
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable logit per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature).
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers. `forward` reads `dropouts_output` and `dropouts_cat`,
        # so both must exist in the single-dropout configuration as well.
        self.dropouts_output = self._make_dropouts(multisample_dropout)
        self.dropouts_cat = self._make_dropouts(multisample_dropout)
        if not multisample_dropout:
            # Legacy attribute name kept for code that still reads `.dropouts`.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE(review): `output_cat` keeps PyTorch's default initialisation here.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    @staticmethod
    def _make_dropouts(multisample_dropout):
        """Return the dropout stack for one head (5 x p=0.5, or 1 x p=0.3)."""
        if multisample_dropout:
            return nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        return nn.ModuleList([nn.Dropout(0.3)])

    def _init_weights(self, module):
        """Re-initialise `module` using `model_config.initializer_range` as the std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool, and evaluate both heads.

        `external_features` is accepted but unused; `attention_mask` is only
        forwarded to the backbone (the pooling itself does not mask positions).

        Returns:
            A 3-tuple. If `is_sampled`: (all `output` values, None, argmax class).
            Otherwise: (`output[:, 0]`, `exp(0.5 * output[:, 1])`, argmax class);
            the second item is presumably a standard deviation derived from a
            predicted log-variance — confirm against the training loss.
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Mean/max pooling over hidden layers (last layer first, first entry skipped)
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        features = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention Pooling (over time). The score keeps its trailing singleton
        # axis (batch, seq, 1): squeezing axis 1 would only act when seq == 1
        # and would then break the broadcast against `features`.
        u_i = torch.tanh(self.word_weight(features))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # global max subtracted for stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        pooled = (features * att).sum(dim = 1)

        # Multiple dropout: average each head over its dropout modules.
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(pooled))
            cats = cats + self.output_cat(drop_cat(pooled))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last row is the reference: subtract it and drop it.
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        predicted_cat = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, predicted_cat
        return logits[:,0], torch.exp(0.5 * logits[:,1]), predicted_cat

* Version 16

In [None]:
class Readability_Model_RoBERTa_large_v16(nn.Module):
    """Two-headed model over a weighted layer mean of backbone hidden states.

    Pipeline in `forward`:
      1. stack the last `len(hidden_states) - 1` hidden states of the backbone;
      2. combine them with a softmax-weighted mean over layers
         (feature size `hidden_size`);
      3. layer-normalise, then attention-pool over the sequence axis;
      4. apply the `output` and `output_cat` heads once per dropout module and
         average.

    The backbone output must expose `hidden_states` (presumably via
    `output_hidden_states=True` in `model_config` — confirm), and the number of
    stacked layers must equal `model_config.num_hidden_layers`, the size of
    `hidden_layer_weights`.

    Args:
        backbone: name or path given to `AutoModel.from_pretrained`.
        model_config: config object; `hidden_size`, `num_hidden_layers` and
            `initializer_range` are read.
        is_sampled: selects the return format of `forward`.
        num_external_features: unused.
        num_output: width of the `output` head.
        num_cat: width of the `output_cat` head.
        attention_dim: hidden size of the attention scorer.
        multisample_dropout: True -> five Dropout(0.5) per head,
            False -> one Dropout(0.3) per head.
        benchmark_token: when not None, the last row of every batch is treated
            as a reference that is subtracted from the other rows' `output`
            values and then dropped. Only its None-ness is checked.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_RoBERTa_large_v16, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        feature_dim = self.model_config.hidden_size    # Weighted mean over layers only
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable logit per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature).
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers. `forward` reads `dropouts_output` and `dropouts_cat`,
        # so both must exist in the single-dropout configuration as well.
        self.dropouts_output = self._make_dropouts(multisample_dropout)
        self.dropouts_cat = self._make_dropouts(multisample_dropout)
        if not multisample_dropout:
            # Legacy attribute name kept for code that still reads `.dropouts`.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE(review): `output_cat` keeps PyTorch's default initialisation here.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    @staticmethod
    def _make_dropouts(multisample_dropout):
        """Return the dropout stack for one head (5 x p=0.5, or 1 x p=0.3)."""
        if multisample_dropout:
            return nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        return nn.ModuleList([nn.Dropout(0.3)])

    def _init_weights(self, module):
        """Re-initialise `module` using `model_config.initializer_range` as the std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool, and evaluate both heads.

        `external_features` is accepted but unused; `attention_mask` is only
        forwarded to the backbone (the pooling itself does not mask positions).

        Returns:
            A 3-tuple. If `is_sampled`: (all `output` values, None, argmax class).
            Otherwise: (`output[:, 0]`, `exp(0.5 * output[:, 1])`, argmax class);
            the second item is presumably a standard deviation derived from a
            predicted log-variance — confirm against the training loss.
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Weighted mean pooling over hidden layers (last layer first, first entry skipped)
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        features = self.layer_norm(torch.sum(hidden_states * layer_weight, dim = 0))

        # Attention Pooling (over time). The score keeps its trailing singleton
        # axis (batch, seq, 1): squeezing axis 1 would only act when seq == 1
        # and would then break the broadcast against `features`.
        u_i = torch.tanh(self.word_weight(features))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # global max subtracted for stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        pooled = (features * att).sum(dim = 1)

        # Multiple dropout: average each head over its dropout modules.
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(pooled))
            cats = cats + self.output_cat(drop_cat(pooled))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last row is the reference: subtract it and drop it.
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        predicted_cat = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, predicted_cat
        return logits[:,0], torch.exp(0.5 * logits[:,1]), predicted_cat

### XLNet large cased

* Version 2

In [None]:
class Readability_Model_XLNet_large_cased_v2(nn.Module):
    """Two-headed model over layer-pooled, attention-pooled backbone hidden states.

    Pipeline in `forward`:
      1. drop a singleton middle axis from the inputs, if present;
      2. stack the last `len(hidden_states) - 1` hidden states of the backbone;
      3. pool over layers twice — softmax-weighted mean and element-wise max —
         and concatenate both (feature size `2 * hidden_size`);
      4. layer-normalise, then attention-pool over the sequence axis;
      5. apply the `output` and `output_cat` heads once per dropout module and
         average.

    The backbone output must expose `hidden_states` (presumably via
    `output_hidden_states=True` in `model_config` — confirm), and the number of
    stacked layers must equal `model_config.num_hidden_layers`, the size of
    `hidden_layer_weights`.

    Args:
        backbone: name or path given to `AutoModel.from_pretrained`.
        model_config: config object; `hidden_size`, `num_hidden_layers` and
            `initializer_range` are read.
        is_sampled: selects the return format of `forward`.
        num_external_features: unused.
        num_output: width of the `output` head.
        num_cat: width of the `output_cat` head.
        attention_dim: hidden size of the attention scorer.
        multisample_dropout: True -> five Dropout(0.5) per head,
            False -> one Dropout(0.3) per head.
        benchmark_token: when not None, the last row of every batch is treated
            as a reference that is subtracted from the other rows' `output`
            values and then dropped. Only its None-ness is checked.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_XLNet_large_cased_v2, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        feature_dim = self.model_config.hidden_size * 2    # Concat of mean and max pooling
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable logit per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature).
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers. `forward` reads `dropouts_output` and `dropouts_cat`,
        # so both must exist in the single-dropout configuration as well.
        self.dropouts_output = self._make_dropouts(multisample_dropout)
        self.dropouts_cat = self._make_dropouts(multisample_dropout)
        if not multisample_dropout:
            # Legacy attribute name kept for code that still reads `.dropouts`.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE(review): `output_cat` keeps PyTorch's default initialisation here.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    @staticmethod
    def _make_dropouts(multisample_dropout):
        """Return the dropout stack for one head (5 x p=0.5, or 1 x p=0.3)."""
        if multisample_dropout:
            return nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        return nn.ModuleList([nn.Dropout(0.3)])

    @staticmethod
    def _drop_middle_axis(tensor):
        """Turn (batch, 1, seq) into (batch, seq); leave 2-D input untouched."""
        # A bare `.squeeze()` would also remove the batch axis when batch == 1.
        return tensor.squeeze(1) if tensor.dim() == 3 else tensor

    def _init_weights(self, module):
        """Re-initialise `module` using `model_config.initializer_range` as the std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool, and evaluate both heads.

        Inputs may be (batch, seq) or (batch, 1, seq). `external_features` is
        accepted but unused; `attention_mask` is only forwarded to the backbone
        (the pooling itself does not mask positions).

        Returns:
            A 3-tuple. If `is_sampled`: (all `output` values, None, argmax class).
            Otherwise: (`output[:, 0]`, `exp(0.5 * output[:, 1])`, argmax class);
            the second item is presumably a standard deviation derived from a
            predicted log-variance — confirm against the training loss.
        """
        output_backbone = self.backbone(input_ids = self._drop_middle_axis(input_ids),
                                        token_type_ids = self._drop_middle_axis(token_type_ids),
                                        attention_mask = self._drop_middle_axis(attention_mask))

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Mean/max pooling over hidden layers (last layer first, first entry skipped)
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        features = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention Pooling (over time). The score keeps its trailing singleton
        # axis (batch, seq, 1): squeezing axis 1 would only act when seq == 1
        # and would then break the broadcast against `features`.
        u_i = torch.tanh(self.word_weight(features))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # global max subtracted for stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        pooled = (features * att).sum(dim = 1)

        # Multiple dropout: average each head over its dropout modules.
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(pooled))
            cats = cats + self.output_cat(drop_cat(pooled))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last row is the reference: subtract it and drop it.
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        predicted_cat = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, predicted_cat
        return logits[:,0], torch.exp(0.5 * logits[:,1]), predicted_cat

* Version 3

In [None]:
class Readability_Model_XLNet_large_cased_v3(nn.Module):
    """Two-headed model over layer-pooled, attention-pooled backbone hidden states.

    Pipeline in `forward`:
      1. drop a singleton middle axis from the inputs, if present;
      2. stack the last `len(hidden_states) - 1` hidden states of the backbone;
      3. pool over layers twice — softmax-weighted mean and element-wise max —
         and concatenate both (feature size `2 * hidden_size`);
      4. layer-normalise, then attention-pool over the sequence axis;
      5. apply the `output` and `output_cat` heads once per dropout module and
         average.

    The backbone output must expose `hidden_states` (presumably via
    `output_hidden_states=True` in `model_config` — confirm), and the number of
    stacked layers must equal `model_config.num_hidden_layers`, the size of
    `hidden_layer_weights`.

    Args:
        backbone: name or path given to `AutoModel.from_pretrained`.
        model_config: config object; `hidden_size`, `num_hidden_layers` and
            `initializer_range` are read.
        is_sampled: selects the return format of `forward`.
        num_external_features: unused.
        num_output: width of the `output` head.
        num_cat: width of the `output_cat` head.
        attention_dim: hidden size of the attention scorer.
        multisample_dropout: True -> five Dropout(0.5) per head,
            False -> one Dropout(0.3) per head.
        benchmark_token: when not None, the last row of every batch is treated
            as a reference that is subtracted from the other rows' `output`
            values and then dropped. Only its None-ness is checked.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_XLNet_large_cased_v3, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        feature_dim = self.model_config.hidden_size * 2    # Concat of mean and max pooling
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable logit per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature).
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers. `forward` reads `dropouts_output` and `dropouts_cat`,
        # so both must exist in the single-dropout configuration as well.
        self.dropouts_output = self._make_dropouts(multisample_dropout)
        self.dropouts_cat = self._make_dropouts(multisample_dropout)
        if not multisample_dropout:
            # Legacy attribute name kept for code that still reads `.dropouts`.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE(review): `output_cat` keeps PyTorch's default initialisation here.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    @staticmethod
    def _make_dropouts(multisample_dropout):
        """Return the dropout stack for one head (5 x p=0.5, or 1 x p=0.3)."""
        if multisample_dropout:
            return nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        return nn.ModuleList([nn.Dropout(0.3)])

    @staticmethod
    def _drop_middle_axis(tensor):
        """Turn (batch, 1, seq) into (batch, seq); leave 2-D input untouched."""
        # A bare `.squeeze()` would also remove the batch axis when batch == 1.
        return tensor.squeeze(1) if tensor.dim() == 3 else tensor

    def _init_weights(self, module):
        """Re-initialise `module` using `model_config.initializer_range` as the std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool, and evaluate both heads.

        Inputs may be (batch, seq) or (batch, 1, seq). `external_features` is
        accepted but unused; `attention_mask` is only forwarded to the backbone
        (the pooling itself does not mask positions).

        Returns:
            A 3-tuple. If `is_sampled`: (all `output` values, None, argmax class).
            Otherwise: (`output[:, 0]`, `exp(0.5 * output[:, 1])`, argmax class);
            the second item is presumably a standard deviation derived from a
            predicted log-variance — confirm against the training loss.
        """
        output_backbone = self.backbone(input_ids = self._drop_middle_axis(input_ids),
                                        token_type_ids = self._drop_middle_axis(token_type_ids),
                                        attention_mask = self._drop_middle_axis(attention_mask))

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Mean/max pooling over hidden layers (last layer first, first entry skipped)
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        features = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention Pooling (over time). The score keeps its trailing singleton
        # axis (batch, seq, 1): squeezing axis 1 would only act when seq == 1
        # and would then break the broadcast against `features`.
        u_i = torch.tanh(self.word_weight(features))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # global max subtracted for stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        pooled = (features * att).sum(dim = 1)

        # Multiple dropout: average each head over its dropout modules.
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(pooled))
            cats = cats + self.output_cat(drop_cat(pooled))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last row is the reference: subtract it and drop it.
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        predicted_cat = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, predicted_cat
        return logits[:,0], torch.exp(0.5 * logits[:,1]), predicted_cat

### GPT2 medium

* Version 1

In [None]:
class Readability_Model_GPT2_medium_v1(nn.Module):
    """Readability head on top of a pretrained backbone (GPT2-medium variant).

    The backbone's per-layer hidden states are combined into a learned
    softmax-weighted sum and an element-wise max, concatenated, layer-normalised,
    pooled over dim 1 with additive attention, and passed through two linear
    heads (``output`` and ``output_cat``) averaged over several dropout masks.

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone configuration. ``hidden_size``,
            ``num_hidden_layers`` and ``initializer_range`` are read from it.
            ``forward`` reads ``hidden_states`` from the backbone output, so the
            config presumably needs ``output_hidden_states = True`` (confirm
            where the config is built).
        is_sampled: Selects the return format of ``forward``.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Output width of the ``output`` head.
        num_cat: Output width of the ``output_cat`` head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If true, average the heads over five
            ``Dropout(0.5)`` masks; otherwise use a single ``Dropout(0.3)``.
        benchmark_token: When not ``None``, the last row of every batch is
            treated as a benchmark sample (see ``forward``).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # Features are the concatenation of the weighted mean and the max over layers
        pooled_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(pooled_dim)
        self.output = nn.Linear(pooled_dim, num_output)
        self.output_cat = nn.Linear(pooled_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(pooled_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per stacked hidden layer (softmax-normalised in forward)
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() iterates dropouts_output / dropouts_cat, so both must
            # exist in the single-sample configuration as well.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts = self.dropouts_output    # Former attribute name, kept for compatibility

        # Initialize weights (output_cat keeps PyTorch's default initialisation)
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place using ``model_config.initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        ``external_features`` is accepted but not used.

        Returns:
            ``(logits, None, bins)`` when ``is_sampled`` is true, otherwise
            ``(logits[:, 0], exp(0.5 * logits[:, 1]), bins)``, where ``bins`` is
            the argmax of the ``output_cat`` head. The second element is
            presumably a standard deviation derived from a log-variance logit
            (confirm against the training loss).
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Every hidden state except index 0, ordered from the last layer backwards
        hidden_states = output_backbone.hidden_states
        hidden_states = torch.stack(tuple(hidden_states[:0:-1]), dim = 0)

        # Weighted-mean / max pooling over layers, then concatenate
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention pooling over dim 1. The scores keep their trailing singleton
        # axis so they broadcast against the features; the former `.squeeze(1)`
        # did nothing unless dim 1 had size 1, where it broke that broadcast.
        # NOTE(review): attention_mask is not applied here, so padded positions
        # take part in the pooling; left unchanged to keep existing outputs.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # Shift by the global max for numerical stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        output = (output_backbone * att).sum(dim = 1)

        # Multi-sample dropout: average both heads over all dropout masks
        logits, cats = None, None
        for drop_output, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_output(output))
            sample_cats = self.output_cat(drop_cat(output))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats

        num_samples = len(self.dropouts_output)
        logits = logits / num_samples
        cats = cats / num_samples

        if self.benchmark_token is not None:
            # Last row is the benchmark sample: report the others relative to it
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        bins = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### ALBERT xlarge v2

* Version 1

In [None]:
class Readability_Model_ALBERT_xlarge_v2_v1(nn.Module):
    """Readability head on top of a pretrained backbone (ALBERT-xlarge-v2 variant).

    The backbone's per-layer hidden states are combined into a learned
    softmax-weighted sum and an element-wise max, concatenated, layer-normalised,
    pooled over dim 1 with additive attention, and passed through two linear
    heads (``output`` and ``output_cat``) averaged over several dropout masks.

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone configuration. ``hidden_size``,
            ``num_hidden_layers`` and ``initializer_range`` are read from it.
            ``forward`` reads ``hidden_states`` from the backbone output, so the
            config presumably needs ``output_hidden_states = True`` (confirm
            where the config is built).
        is_sampled: Selects the return format of ``forward``.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Output width of the ``output`` head.
        num_cat: Output width of the ``output_cat`` head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If true, average the heads over five
            ``Dropout(0.5)`` masks; otherwise use a single ``Dropout(0.3)``.
        benchmark_token: When not ``None``, the last row of every batch is
            treated as a benchmark sample (see ``forward``).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # Features are the concatenation of the weighted mean and the max over layers
        pooled_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(pooled_dim)
        self.output = nn.Linear(pooled_dim, num_output)
        self.output_cat = nn.Linear(pooled_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(pooled_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per stacked hidden layer (softmax-normalised in forward)
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() iterates dropouts_output / dropouts_cat, so both must
            # exist in the single-sample configuration as well.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts = self.dropouts_output    # Former attribute name, kept for compatibility

        # Initialize weights (output_cat keeps PyTorch's default initialisation)
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place using ``model_config.initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        ``external_features`` is accepted but not used.

        Returns:
            ``(logits, None, bins)`` when ``is_sampled`` is true, otherwise
            ``(logits[:, 0], exp(0.5 * logits[:, 1]), bins)``, where ``bins`` is
            the argmax of the ``output_cat`` head. The second element is
            presumably a standard deviation derived from a log-variance logit
            (confirm against the training loss).
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Every hidden state except index 0, ordered from the last layer backwards
        hidden_states = output_backbone.hidden_states
        hidden_states = torch.stack(tuple(hidden_states[:0:-1]), dim = 0)

        # Weighted-mean / max pooling over layers, then concatenate
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention pooling over dim 1. The scores keep their trailing singleton
        # axis so they broadcast against the features; the former `.squeeze(1)`
        # did nothing unless dim 1 had size 1, where it broke that broadcast.
        # NOTE(review): attention_mask is not applied here, so padded positions
        # take part in the pooling; left unchanged to keep existing outputs.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # Shift by the global max for numerical stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        output = (output_backbone * att).sum(dim = 1)

        # Multi-sample dropout: average both heads over all dropout masks
        logits, cats = None, None
        for drop_output, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_output(output))
            sample_cats = self.output_cat(drop_cat(output))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats

        num_samples = len(self.dropouts_output)
        logits = logits / num_samples
        cats = cats / num_samples

        if self.benchmark_token is not None:
            # Last row is the benchmark sample: report the others relative to it
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        bins = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### ELECTRA large discriminator

* Version 1

In [None]:
class Readability_Model_ELECTRA_large_discriminator_v1(nn.Module):
    """Readability head on top of a pretrained backbone (ELECTRA-large-discriminator variant).

    The backbone's per-layer hidden states are combined into a learned
    softmax-weighted sum and an element-wise max, concatenated, layer-normalised,
    pooled over dim 1 with additive attention, and passed through two linear
    heads (``output`` and ``output_cat``) averaged over several dropout masks.

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone configuration. ``hidden_size``,
            ``num_hidden_layers`` and ``initializer_range`` are read from it.
            ``forward`` reads ``hidden_states`` from the backbone output, so the
            config presumably needs ``output_hidden_states = True`` (confirm
            where the config is built).
        is_sampled: Selects the return format of ``forward``.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Output width of the ``output`` head.
        num_cat: Output width of the ``output_cat`` head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If true, average the heads over five
            ``Dropout(0.5)`` masks; otherwise use a single ``Dropout(0.3)``.
        benchmark_token: When not ``None``, the last row of every batch is
            treated as a benchmark sample (see ``forward``).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # Features are the concatenation of the weighted mean and the max over layers
        pooled_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(pooled_dim)
        self.output = nn.Linear(pooled_dim, num_output)
        self.output_cat = nn.Linear(pooled_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(pooled_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per stacked hidden layer (softmax-normalised in forward)
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() iterates dropouts_output / dropouts_cat, so both must
            # exist in the single-sample configuration as well.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts = self.dropouts_output    # Former attribute name, kept for compatibility

        # Initialize weights (output_cat keeps PyTorch's default initialisation)
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place using ``model_config.initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        ``external_features`` is accepted but not used.

        Returns:
            ``(logits, None, bins)`` when ``is_sampled`` is true, otherwise
            ``(logits[:, 0], exp(0.5 * logits[:, 1]), bins)``, where ``bins`` is
            the argmax of the ``output_cat`` head. The second element is
            presumably a standard deviation derived from a log-variance logit
            (confirm against the training loss).
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Every hidden state except index 0, ordered from the last layer backwards
        hidden_states = output_backbone.hidden_states
        hidden_states = torch.stack(tuple(hidden_states[:0:-1]), dim = 0)

        # Weighted-mean / max pooling over layers, then concatenate
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention pooling over dim 1. The scores keep their trailing singleton
        # axis so they broadcast against the features; the former `.squeeze(1)`
        # did nothing unless dim 1 had size 1, where it broke that broadcast.
        # NOTE(review): attention_mask is not applied here, so padded positions
        # take part in the pooling; left unchanged to keep existing outputs.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # Shift by the global max for numerical stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        output = (output_backbone * att).sum(dim = 1)

        # Multi-sample dropout: average both heads over all dropout masks
        logits, cats = None, None
        for drop_output, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_output(output))
            sample_cats = self.output_cat(drop_cat(output))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats

        num_samples = len(self.dropouts_output)
        logits = logits / num_samples
        cats = cats / num_samples

        if self.benchmark_token is not None:
            # Last row is the benchmark sample: report the others relative to it
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        bins = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### DeBERTa large

* Version 1

In [None]:
class Readability_Model_DeBERTa_large_v1(nn.Module):
    """Readability head on top of a pretrained backbone (DeBERTa-large variant).

    The backbone's per-layer hidden states are combined into a learned
    softmax-weighted sum and an element-wise max, concatenated, layer-normalised,
    pooled over dim 1 with additive attention, and passed through two linear
    heads (``output`` and ``output_cat``) averaged over several dropout masks.

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone configuration. ``hidden_size``,
            ``num_hidden_layers`` and ``initializer_range`` are read from it.
            ``forward`` reads ``hidden_states`` from the backbone output, so the
            config presumably needs ``output_hidden_states = True`` (confirm
            where the config is built).
        is_sampled: Selects the return format of ``forward``.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Output width of the ``output`` head.
        num_cat: Output width of the ``output_cat`` head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If true, average the heads over five
            ``Dropout(0.5)`` masks; otherwise use a single ``Dropout(0.3)``.
        benchmark_token: When not ``None``, the last row of every batch is
            treated as a benchmark sample (see ``forward``).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # Features are the concatenation of the weighted mean and the max over layers
        pooled_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(pooled_dim)
        self.output = nn.Linear(pooled_dim, num_output)
        self.output_cat = nn.Linear(pooled_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(pooled_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per stacked hidden layer (softmax-normalised in forward)
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() iterates dropouts_output / dropouts_cat, so both must
            # exist in the single-sample configuration as well.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts = self.dropouts_output    # Former attribute name, kept for compatibility

        # Initialize weights (output_cat keeps PyTorch's default initialisation)
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place using ``model_config.initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        ``external_features`` is accepted but not used.

        Returns:
            ``(logits, None, bins)`` when ``is_sampled`` is true, otherwise
            ``(logits[:, 0], exp(0.5 * logits[:, 1]), bins)``, where ``bins`` is
            the argmax of the ``output_cat`` head. The second element is
            presumably a standard deviation derived from a log-variance logit
            (confirm against the training loss).
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Every hidden state except index 0, ordered from the last layer backwards
        hidden_states = output_backbone.hidden_states
        hidden_states = torch.stack(tuple(hidden_states[:0:-1]), dim = 0)

        # Weighted-mean / max pooling over layers, then concatenate
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention pooling over dim 1. The scores keep their trailing singleton
        # axis so they broadcast against the features; the former `.squeeze(1)`
        # did nothing unless dim 1 had size 1, where it broke that broadcast.
        # NOTE(review): attention_mask is not applied here, so padded positions
        # take part in the pooling; left unchanged to keep existing outputs.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # Shift by the global max for numerical stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        output = (output_backbone * att).sum(dim = 1)

        # Multi-sample dropout: average both heads over all dropout masks
        logits, cats = None, None
        for drop_output, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_output(output))
            sample_cats = self.output_cat(drop_cat(output))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats

        num_samples = len(self.dropouts_output)
        logits = logits / num_samples
        cats = cats / num_samples

        if self.benchmark_token is not None:
            # Last row is the benchmark sample: report the others relative to it
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        bins = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### Funnel-transformer/large

* Version 1

In [None]:
class Readability_Model_Funnel_large_v1(nn.Module):
    """Readability head on top of a pretrained backbone (Funnel-large variant).

    Unlike the sibling models, this one uses only the backbone's
    ``last_hidden_state``: it is layer-normalised, pooled over dim 1 with
    additive attention, and passed through two linear heads (``output`` and
    ``output_cat``) averaged over several dropout masks.

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone configuration. ``hidden_size``,
            ``num_hidden_layers`` and ``initializer_range`` are read from it.
        is_sampled: Selects the return format of ``forward``.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Output width of the ``output`` head.
        num_cat: Output width of the ``output_cat`` head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If true, average the heads over five
            ``Dropout(0.5)`` masks; otherwise use a single ``Dropout(0.3)``.
        benchmark_token: When not ``None``, the last row of every batch is
            treated as a benchmark sample (see ``forward``).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # Features are the last hidden state only (no mean/max concatenation here)
        feature_dim = self.model_config.hidden_size
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # Not used by forward(); kept so the module's parameter set is unchanged
        # (presumably saved checkpoints contain this key; confirm before removing).
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() iterates dropouts_output / dropouts_cat, so both must
            # exist in the single-sample configuration as well.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts = self.dropouts_output    # Former attribute name, kept for compatibility

        # Initialize weights (output_cat keeps PyTorch's default initialisation)
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place using ``model_config.initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        The three input tensors are flattened to two dimensions (keeping the
        last axis) before being handed to the backbone. ``external_features``
        is accepted but not used.

        Returns:
            ``(logits, None, bins)`` when ``is_sampled`` is true, otherwise
            ``(logits[:, 0], exp(0.5 * logits[:, 1]), bins)``, where ``bins`` is
            the argmax of the ``output_cat`` head. The second element is
            presumably a standard deviation derived from a log-variance logit
            (confirm against the training loss).
        """
        # reshape(-1, last) removes extra singleton axes like the former bare
        # .squeeze(), but never drops the batch axis when the batch has one row.
        output_backbone = self.backbone(input_ids = input_ids.reshape(-1, input_ids.size(-1)),
                                        token_type_ids = token_type_ids.reshape(-1, token_type_ids.size(-1)),
                                        attention_mask = attention_mask.reshape(-1, attention_mask.size(-1)))

        # Extract output
        output_backbone = self.layer_norm(output_backbone.last_hidden_state)

        # Attention pooling over dim 1. The scores keep their trailing singleton
        # axis so they broadcast against the features; the former `.squeeze(1)`
        # did nothing unless dim 1 had size 1, where it broke that broadcast.
        # NOTE(review): attention_mask is not applied here, so padded positions
        # take part in the pooling; left unchanged to keep existing outputs.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # Shift by the global max for numerical stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        output = (output_backbone * att).sum(dim = 1)

        # Multi-sample dropout: average both heads over all dropout masks
        logits, cats = None, None
        for drop_output, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_output(output))
            sample_cats = self.output_cat(drop_cat(output))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats

        num_samples = len(self.dropouts_output)
        logits = logits / num_samples
        cats = cats / num_samples

        if self.benchmark_token is not None:
            # Last row is the benchmark sample: report the others relative to it
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        bins = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### BART large

* Version 1

In [None]:
class Readability_Model_BART_large_v1(nn.Module):
    """Readability head on top of a pretrained backbone (BART-large variant).

    The last ``num_hidden_layers`` decoder and encoder hidden states are
    combined into a learned softmax-weighted sum and an element-wise max,
    concatenated, layer-normalised, pooled over dim 1 with additive attention,
    and passed through two linear heads (``output`` and ``output_cat``)
    averaged over several dropout masks.

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone configuration. ``hidden_size``,
            ``num_hidden_layers`` and ``init_std`` are read from it.
            ``forward`` reads ``decoder_hidden_states`` and
            ``encoder_hidden_states`` from the backbone output, so the config
            presumably needs ``output_hidden_states = True`` (confirm where the
            config is built).
        is_sampled: Selects the return format of ``forward``.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Output width of the ``output`` head.
        num_cat: Output width of the ``output_cat`` head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If true, average the heads over five
            ``Dropout(0.5)`` masks; otherwise use a single ``Dropout(0.3)``.
        benchmark_token: When not ``None``, the last row of every batch is
            treated as a benchmark sample (see ``forward``).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # Features are the concatenation of the weighted mean and the max over layers
        pooled_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(pooled_dim)
        self.output = nn.Linear(pooled_dim, num_output)
        self.output_cat = nn.Linear(pooled_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(pooled_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per stacked layer: decoder layers first, then encoder layers
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers * 2).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() iterates dropouts_output / dropouts_cat, so both must
            # exist in the single-sample configuration as well.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts = self.dropouts_output    # Former attribute name, kept for compatibility

        # Initialize weights (output_cat keeps PyTorch's default initialisation)
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place using ``model_config.init_std``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.init_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        ``input_ids`` and ``attention_mask`` are flattened to two dimensions
        (keeping the last axis) before being handed to the backbone.
        ``token_type_ids`` and ``external_features`` are accepted but not used.

        Returns:
            ``(logits, None, bins)`` when ``is_sampled`` is true, otherwise
            ``(logits[:, 0], exp(0.5 * logits[:, 1]), bins)``, where ``bins`` is
            the argmax of the ``output_cat`` head. The second element is
            presumably a standard deviation derived from a log-variance logit
            (confirm against the training loss).
        """
        # reshape(-1, last) removes extra singleton axes like the former bare
        # .squeeze(), but never drops the batch axis when the batch has one row.
        output_backbone = self.backbone(input_ids = input_ids.reshape(-1, input_ids.size(-1)),
                                        attention_mask = attention_mask.reshape(-1, attention_mask.size(-1)))

        decoder_hidden_states = output_backbone.decoder_hidden_states
        encoder_hidden_states = output_backbone.encoder_hidden_states

        # Last `num_hidden_layers` states of each stack, ordered from the last layer backwards
        num_layers = self.model_config.num_hidden_layers
        decoder_hidden_states = tuple(decoder_hidden_states[-i-1] for i in range(num_layers))
        encoder_hidden_states = tuple(encoder_hidden_states[-i-1] for i in range(num_layers))
        hidden_states = torch.stack(decoder_hidden_states + encoder_hidden_states, dim = 0)

        # Weighted-mean / max pooling over layers, then concatenate
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = self.layer_norm(torch.cat((out_mean, out_max), dim = -1))

        # Attention pooling over dim 1. The scores keep their trailing singleton
        # axis so they broadcast against the features; the former `.squeeze(1)`
        # did nothing unless dim 1 had size 1, where it broke that broadcast.
        # NOTE(review): attention_mask is not applied here, so padded positions
        # take part in the pooling; left unchanged to keep existing outputs.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())    # Shift by the global max for numerical stability
        att = att / torch.sum(att, dim = 1, keepdim = True)
        output = (output_backbone * att).sum(dim = 1)

        # Multi-sample dropout: average both heads over all dropout masks
        logits, cats = None, None
        for drop_output, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_output(output))
            sample_cats = self.output_cat(drop_cat(output))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats

        num_samples = len(self.dropouts_output)
        logits = logits / num_samples
        cats = cats / num_samples

        if self.benchmark_token is not None:
            # Last row is the benchmark sample: report the others relative to it
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        bins = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

# Inference function

In [None]:
def infer(model, dataloader, device = 'cpu', use_tqdm = True, benchmark_token = None):
    """Run ``model`` over ``dataloader`` and collect its first output.

    Args:
        model: Callable model; it is put in eval mode and called with
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` keyword
            arguments. Only the first element of its 3-tuple result is kept.
        dataloader: Iterable of dict batches holding ``'input_ids'``,
            ``'token_type_ids'`` and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to.
        use_tqdm: Wrap the dataloader in a ``tqdm`` progress bar.
        benchmark_token: Optional ``(input_ids, token_type_ids, attention_mask)``
            triple appended as extra rows to every batch along dim 0.

    Returns:
        ``np.ndarray`` with the predictions of all batches, in dataloader order.
    """
    model.eval()

    batches = tqdm(dataloader) if use_tqdm else dataloader

    # Unpack the benchmark tensors once and put them on the same device as the
    # batches; torch.cat below fails if its inputs live on different devices.
    if benchmark_token is not None:
        benchmark_input_ids, benchmark_token_type_ids, benchmark_attention_mask = (
            tensor.to(device) for tensor in benchmark_token
        )

    pred = []

    for item in batches:
        input_ids = item['input_ids'].to(device)
        token_type_ids = item['token_type_ids'].to(device)
        attention_mask = item['attention_mask'].to(device)

        if benchmark_token is not None:
            input_ids = torch.cat((input_ids, benchmark_input_ids), dim = 0)
            token_type_ids = torch.cat((token_type_ids, benchmark_token_type_ids), dim = 0)
            attention_mask = torch.cat((attention_mask, benchmark_attention_mask), dim = 0)

        # Inference only: no autograd bookkeeping, mixed precision where available
        with torch.no_grad(), autocast():
            pred_mean, _, _ = model(input_ids = input_ids,
                                    attention_mask = attention_mask,
                                    token_type_ids = token_type_ids)

        pred.extend(pred_mean.cpu().detach().numpy())

    # Stack
    return np.array(pred)

# Configuration

In [None]:
class config():
    """Static settings for this inference pipeline.

    ``max_len``, ``batch_size`` and ``num_bins`` hold 15 slots each although
    ``model_names`` lists 13 models. The script below addresses them by
    hard-coded slot number (0, 2, 4, 5, ..., 14 for ``max_len`` and
    ``batch_size``); slots 1 and 3 are not read by the code in this section.
    """
    # --- Inference ---
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    use_tqdm = True
    model_names = [
        'roberta_large_v15_0', 'roberta_large_v15_3',
        'roberta_large_v16_1',
        'gpt2_medium_v1_0',
        'xlnet_large_cased_v2_0', 'xlnet_large_cased_v3_0', 'xlnet_large_cased_v3_1',
        'electra_large_discriminator_v1_0', 'electra_large_discriminator_v1_1',
        'deberta_large_v1_0', 'deberta_large_v1_1',
        'funnel_large_v1_0',
        'bart_large_v1_0',
    ]

    # --- Dataloader (one entry per slot) ---
    max_len = [250] * 15
    # slots:     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14
    batch_size = (8, 8, 8, 8, 8, 6, 6, 6, 6, 8, 8, 4, 4, 8, 8)
    num_workers = 4

    # --- Models (one entry per slot) ---
    # slots:    0  1   2   3   4   5   6  7  8  9 10  11  12 13 14
    num_bins = (29, 1, 29, 29, 29, 29, 29, 1, 1, 1, 1, 29, 29, 1, 1)

cfg = config()

# Main

In [None]:
# Tokenizers and backbone configurations, all read from local directories
def _load_tokenizer(path):
    """Load the tokenizer stored under ``path`` (local files only)."""
    return AutoTokenizer.from_pretrained(path, local_files_only = True, checkpoint_file = 'pytorch_model.bin')


def _load_hidden_state_config(path):
    """Load the model config under ``path`` with hidden-state outputs enabled."""
    return AutoConfig.from_pretrained(path, output_hidden_states = True)


# RoBERTa-large
tokenizer_roberta_large = _load_tokenizer('../input/robertalarge')
model_config_roberta_large = _load_hidden_state_config('../input/robertalarge')

# GPT2-medium: a '[PAD]' token is registered on the tokenizer
tokenizer_gpt2_medium = _load_tokenizer('../input/gpt2-medium')
tokenizer_gpt2_medium.add_special_tokens({'pad_token': '[PAD]'})

# XLNet-large-cased: a '[PAD]' token is registered on the tokenizer
tokenizer_xlnet_large_cased = _load_tokenizer('../input/xlnet-large-cased')
tokenizer_xlnet_large_cased.add_special_tokens({'pad_token': '[PAD]'})

# ELECTRA-large discriminator
tokenizer_electra_large = _load_tokenizer('../input/electra-large-discriminator')
model_config_electra_large = _load_hidden_state_config('../input/electra-large-discriminator')

# DeBERTa-large (the config variable name is spelled 'debert'; later code refers to it)
tokenizer_deberta_large = _load_tokenizer('../input/deberta-large')
model_config_debert_large = _load_hidden_state_config('../input/deberta-large')

# Funnel-transformer-large
tokenizer_funnel_large = _load_tokenizer('../input/funnel-transformer-large')
model_config_funnel_large = _load_hidden_state_config('../input/funnel-transformer-large')

# BART-large
tokenizer_bart_large = _load_tokenizer('../input/bart-large')
model_config_bart_large = _load_hidden_state_config('../input/bart-large')

# Inference datasets and dataloaders, one pair per model
def _build_infer_loader(tokenizer, slot):
    """Return the (dataset, dataloader) pair configured by ``cfg`` slot ``slot``."""
    dataset = Readability_Dataset(data, tokenizer, max_len = cfg.max_len[slot], mode = 'infer')
    loader = DataLoader(dataset, batch_size = cfg.batch_size[slot], num_workers = cfg.num_workers, shuffle = False)
    return dataset, loader


# RoBERTa-large (slots 0, 2, 4)
infer_dataset_roberta_large_v15_0, infer_dataloader_roberta_large_v15_0 = _build_infer_loader(tokenizer_roberta_large, 0)
infer_dataset_roberta_large_v15_3, infer_dataloader_roberta_large_v15_3 = _build_infer_loader(tokenizer_roberta_large, 2)
infer_dataset_roberta_large_v16_1, infer_dataloader_roberta_large_v16_1 = _build_infer_loader(tokenizer_roberta_large, 4)

# GPT2-medium (slot 5)
infer_dataset_gpt2_medium_v1_0, infer_dataloader_gpt2_medium_v1_0 = _build_infer_loader(tokenizer_gpt2_medium, 5)

# XLNet-large-cased (slots 6-8)
infer_dataset_xlnet_large_cased_v2_0, infer_dataloader_xlnet_large_cased_v2_0 = _build_infer_loader(tokenizer_xlnet_large_cased, 6)
infer_dataset_xlnet_large_cased_v3_0, infer_dataloader_xlnet_large_cased_v3_0 = _build_infer_loader(tokenizer_xlnet_large_cased, 7)
infer_dataset_xlnet_large_cased_v3_1, infer_dataloader_xlnet_large_cased_v3_1 = _build_infer_loader(tokenizer_xlnet_large_cased, 8)

# ELECTRA-large (slots 9-10)
infer_dataset_electra_large_v1_0, infer_dataloader_electra_large_v1_0 = _build_infer_loader(tokenizer_electra_large, 9)
infer_dataset_electra_large_v1_1, infer_dataloader_electra_large_v1_1 = _build_infer_loader(tokenizer_electra_large, 10)

# DeBERTa-large (slots 11-12)
infer_dataset_deberta_large_v1_0, infer_dataloader_deberta_large_v1_0 = _build_infer_loader(tokenizer_deberta_large, 11)
infer_dataset_deberta_large_v1_1, infer_dataloader_deberta_large_v1_1 = _build_infer_loader(tokenizer_deberta_large, 12)

# Funnel-large (slot 13)
infer_dataset_funnel_large_v1_0, infer_dataloader_funnel_large_v1_0 = _build_infer_loader(tokenizer_funnel_large, 13)

# BART-large (slot 14)
infer_dataset_bart_large_v1_0, infer_dataloader_bart_large_v1_0 = _build_infer_loader(tokenizer_bart_large, 14)

# Prediction accumulators: one independent zero vector (length = number of rows in `data`) per name.
# The v15_2 and v16_0 buffers are allocated here but are not filled or averaged by the code in this section.
(prediction_roberta_large_v15_0,
 prediction_roberta_large_v15_2,
 prediction_roberta_large_v15_3,
 prediction_roberta_large_v16_0,
 prediction_roberta_large_v16_1,
 prediction_gpt2_medium_v1_0,
 prediction_xlnet_large_cased_v2_0,
 prediction_xlnet_large_cased_v3_0,
 prediction_xlnet_large_cased_v3_1,
 prediction_electra_large_v1_0,
 prediction_electra_large_v1_1,
 prediction_deberta_large_v1_0,
 prediction_deberta_large_v1_1,
 prediction_funnel_large_v1_0,
 prediction_bart_large_v1_0) = (np.zeros(data.shape[0]) for _ in range(15))

# Tokenize the benchmark excerpt once per tokenizer family
def _benchmark_features(tokenizer, max_len):
    """Tokenize the first benchmark excerpt and move its tensors to ``cfg.device``.

    Returns an ``(input_ids, token_type_ids, attention_mask)`` tuple.
    """
    features = convert_examples_to_features(benchmark['excerpt'].iloc[0], tokenizer, max_len, return_tensor = True)
    return tuple(features[key].to(cfg.device)
                 for key in ('input_ids', 'token_type_ids', 'attention_mask'))


benchmark_token_roberta_large = _benchmark_features(tokenizer_roberta_large, cfg.max_len[0])
benchmark_token_gpt2_medium = _benchmark_features(tokenizer_gpt2_medium, cfg.max_len[5])
benchmark_token_xlnet_large_cased = _benchmark_features(tokenizer_xlnet_large_cased, cfg.max_len[6])
benchmark_token_electra_large = _benchmark_features(tokenizer_electra_large, cfg.max_len[9])
benchmark_token_deberta_large = _benchmark_features(tokenizer_deberta_large, cfg.max_len[11])
benchmark_token_funnel_large = _benchmark_features(tokenizer_funnel_large, cfg.max_len[13])
benchmark_token_bart_large = _benchmark_features(tokenizer_bart_large, cfg.max_len[14])

# 5-fold inference. For every fold each model is built, its fold checkpoint is
# loaded, and one fifth of its prediction is added to that model's accumulator,
# so the accumulators end up holding the fold-averaged prediction.
# cfg.num_bins is indexed with the same slot numbers as cfg.max_len / cfg.batch_size.
for fold in range(5):
    print('*' * 50)
    print(f'Fold: {fold}')
    
    # ---------------- RoBERTa-large v15-0 (slot 0) ----------------
    model_name = 'roberta_large'
    print(f'Inference model, {model_name} version 15-0...')
    model_roberta_large_v15_0 = Readability_Model_RoBERTa_large_v15('../input/robertalarge', model_config_roberta_large, num_cat = cfg.num_bins[0], 
                                                                    benchmark_token = benchmark_token_roberta_large).to(cfg.device)
    model_root_path = '../input/clrroberta-largepretrained-modelsv15/model_best_roberta_large_v15_0'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_roberta_large_v15_0.load_state_dict(ckp['model_state_dict'])    
    prediction_roberta_large_v15_0 += infer(model_roberta_large_v15_0, infer_dataloader_roberta_large_v15_0, device = cfg.device, 
                                            use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_roberta_large) / 5
    # Drop the model before the next one is built to keep memory usage down
    del model_roberta_large_v15_0; gc.collect()
    
    # ---------------- RoBERTa-large v15-3 (slot 2) ----------------
    model_name = 'roberta_large'
    print(f'Inference model, {model_name} version 15-3...')
    model_roberta_large_v15_3 = Readability_Model_RoBERTa_large_v15('../input/robertalarge', model_config_roberta_large, num_cat = cfg.num_bins[2], 
                                                                    benchmark_token = benchmark_token_roberta_large).to(cfg.device)
    model_root_path = '../input/clrroberta-largev15/model_best_roberta_large_v15_3'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_roberta_large_v15_3.load_state_dict(ckp['model_state_dict'])    
    prediction_roberta_large_v15_3 += infer(model_roberta_large_v15_3, infer_dataloader_roberta_large_v15_3, device = cfg.device, 
                                            use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_roberta_large) / 5
    del model_roberta_large_v15_3; gc.collect()
    
    # ---------------- RoBERTa-large v16-1 (slot 4) ----------------
    model_name = 'roberta_large'
    print(f'Inference model, {model_name} version 16-1...')
    model_roberta_large_v16_1 = Readability_Model_RoBERTa_large_v16('../input/robertalarge', model_config_roberta_large, num_cat = cfg.num_bins[4], 
                                                                    benchmark_token = benchmark_token_roberta_large).to(cfg.device)
    model_root_path = '../input/clrroberta-largepretrained-modelsv16/model_best_roberta_large_v16_1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_roberta_large_v16_1.load_state_dict(ckp['model_state_dict'])
    prediction_roberta_large_v16_1 += infer(model_roberta_large_v16_1, infer_dataloader_roberta_large_v16_1, device = cfg.device, 
                                            use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_roberta_large) / 5
    del model_roberta_large_v16_1; gc.collect()
    
    # ---------------- GPT2-medium v1-0 (slot 5) ----------------
    # NOTE(review): the config is re-read for every model instance, presumably because
    # resize_token_embeddings() changes the vocabulary size recorded in it - confirm
    # before hoisting this out of the loop.
    model_name = 'gpt2_medium'
    print(f'Inference model, {model_name} version 1-0...')
    model_config_gpt2_medium = AutoConfig.from_pretrained('../input/gpt2-medium', output_hidden_states = True)
    model_gpt2_medium_v1_0 = Readability_Model_GPT2_medium_v1('../input/gpt2-medium', model_config_gpt2_medium, num_cat = cfg.num_bins[5], 
                                                              benchmark_token = benchmark_token_gpt2_medium).to(cfg.device)
    # The tokenizer gained a '[PAD]' token, so the embedding matrix is resized to match
    model_gpt2_medium_v1_0.backbone.resize_token_embeddings(len(tokenizer_gpt2_medium))
    model_root_path = '../input/clrgpt2-mediumpretrained-modelsv1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_gpt2_medium_v1_0.load_state_dict(ckp['model_state_dict'])    
    prediction_gpt2_medium_v1_0 += infer(model_gpt2_medium_v1_0, infer_dataloader_gpt2_medium_v1_0, device = cfg.device, 
                                         use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_gpt2_medium) / 5
    del model_gpt2_medium_v1_0; gc.collect()
    
    # ---------------- XLNet-large-cased v2-0 (slot 6) ----------------
    model_name = 'xlnet_large_cased'
    print(f'Inference model, {model_name} version 2-0...')
    model_config_xlnet_large_cased = AutoConfig.from_pretrained('../input/xlnet-large-cased', output_hidden_states = True)
    model_xlnet_large_cased_v2_0 = Readability_Model_XLNet_large_cased_v2('../input/xlnet-large-cased', model_config_xlnet_large_cased, 
                                                                          num_cat = cfg.num_bins[6], benchmark_token = benchmark_token_xlnet_large_cased).to(cfg.device)
    model_xlnet_large_cased_v2_0.backbone.resize_token_embeddings(len(tokenizer_xlnet_large_cased))
    model_root_path = '../input/clrxlnet-largepretrained-models/v02'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_xlnet_large_cased_v2_0.load_state_dict(ckp['model_state_dict'])
    prediction_xlnet_large_cased_v2_0 += infer(model_xlnet_large_cased_v2_0, infer_dataloader_xlnet_large_cased_v2_0, device = cfg.device, 
                                               use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_xlnet_large_cased) / 5
    del model_xlnet_large_cased_v2_0; gc.collect()
    
    # ---------------- XLNet-large-cased v3-0 (slot 7) ----------------
    model_name = 'xlnet_large_cased'
    print(f'Inference model, {model_name} version 3-0...')
    model_config_xlnet_large_cased = AutoConfig.from_pretrained('../input/xlnet-large-cased', output_hidden_states = True)
    model_xlnet_large_cased_v3_0 = Readability_Model_XLNet_large_cased_v3('../input/xlnet-large-cased', model_config_xlnet_large_cased, 
                                                                          num_cat = cfg.num_bins[7], benchmark_token = benchmark_token_xlnet_large_cased).to(cfg.device)
    model_xlnet_large_cased_v3_0.backbone.resize_token_embeddings(len(tokenizer_xlnet_large_cased))
    model_root_path = '../input/clrxlnet-largepretrained-models/v03'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_xlnet_large_cased_v3_0.load_state_dict(ckp['model_state_dict'])
    prediction_xlnet_large_cased_v3_0 += infer(model_xlnet_large_cased_v3_0, infer_dataloader_xlnet_large_cased_v3_0, device = cfg.device, 
                                               use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_xlnet_large_cased) / 5
    del model_xlnet_large_cased_v3_0; gc.collect()
    
    # ---------------- XLNet-large-cased v3-1 (slot 8) ----------------
    model_name = 'xlnet_large_cased'
    print(f'Inference model, {model_name} version 3-1...')
    model_config_xlnet_large_cased = AutoConfig.from_pretrained('../input/xlnet-large-cased', output_hidden_states = True)
    model_xlnet_large_cased_v3_1 = Readability_Model_XLNet_large_cased_v3('../input/xlnet-large-cased', model_config_xlnet_large_cased, 
                                                                          num_cat = cfg.num_bins[8], benchmark_token = benchmark_token_xlnet_large_cased).to(cfg.device)
    model_xlnet_large_cased_v3_1.backbone.resize_token_embeddings(len(tokenizer_xlnet_large_cased))
    model_root_path = '../input/clrxlnet-large-casedv1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_xlnet_large_cased_v3_1.load_state_dict(ckp['model_state_dict'])
    prediction_xlnet_large_cased_v3_1 += infer(model_xlnet_large_cased_v3_1, infer_dataloader_xlnet_large_cased_v3_1, device = cfg.device, 
                                               use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_xlnet_large_cased) / 5
    del model_xlnet_large_cased_v3_1; gc.collect()
    
    # ---------------- ELECTRA-large v1-0 (slot 9) ----------------
    model_name = 'electra_large_discriminator'
    print(f'Inference model, {model_name} version 1-0...')
    model_electra_large_v1_0 = Readability_Model_ELECTRA_large_discriminator_v1('../input/electra-large-discriminator', model_config_electra_large, 
                                                                                num_cat = cfg.num_bins[9], benchmark_token = benchmark_token_electra_large).to(cfg.device)
    model_root_path = '../input/clrelectra-largepretrained-modelsv1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_electra_large_v1_0.load_state_dict(ckp['model_state_dict'])    
    prediction_electra_large_v1_0 += infer(model_electra_large_v1_0, infer_dataloader_electra_large_v1_0, device = cfg.device, 
                                           use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_electra_large) / 5
    del model_electra_large_v1_0; gc.collect()
    
    # ---------------- ELECTRA-large v1-1 (slot 10) ----------------
    model_name = 'electra_large_discriminator'
    print(f'Inference model, {model_name} version 1-1...')
    model_electra_large_v1_1 = Readability_Model_ELECTRA_large_discriminator_v1('../input/electra-large-discriminator', model_config_electra_large, 
                                                                                num_cat = cfg.num_bins[10], benchmark_token = benchmark_token_electra_large).to(cfg.device)
    model_root_path = '../input/clrelectra-largev1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_electra_large_v1_1.load_state_dict(ckp['model_state_dict'])    
    prediction_electra_large_v1_1 += infer(model_electra_large_v1_1, infer_dataloader_electra_large_v1_1, device = cfg.device, 
                                           use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_electra_large) / 5
    del model_electra_large_v1_1; gc.collect()
    
    # ---------------- DeBERTa-large v1-0 (slot 11) ----------------
    model_name = 'deberta_large'
    print(f'Inference model, {model_name} version 1-0...')
    model_deberta_large_v1_0 = Readability_Model_DeBERTa_large_v1('../input/deberta-large', model_config_debert_large, 
                                                                  num_cat = cfg.num_bins[11], benchmark_token = benchmark_token_deberta_large).to(cfg.device)
    model_root_path = '../input/clrdeberta-largepretrained-modelsv1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_deberta_large_v1_0.load_state_dict(ckp['model_state_dict'])    
    prediction_deberta_large_v1_0 += infer(model_deberta_large_v1_0, infer_dataloader_deberta_large_v1_0, device = cfg.device, 
                                           use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_deberta_large) / 5
    del model_deberta_large_v1_0; gc.collect()
    
    # ---------------- DeBERTa-large v1-1 (slot 12) ----------------
    model_name = 'deberta_large'
    print(f'Inference model, {model_name} version 1-1...')
    model_deberta_large_v1_1 = Readability_Model_DeBERTa_large_v1('../input/deberta-large', model_config_debert_large, 
                                                                  num_cat = cfg.num_bins[12], benchmark_token = benchmark_token_deberta_large).to(cfg.device)
    # NOTE(review): only the fold-3 checkpoint is loaded through an AveragedModel wrapper,
    # presumably because that checkpoint was saved from an SWA-averaged model - confirm.
    if fold == 3:
        model_deberta_large_v1_1 = AveragedModel(model_deberta_large_v1_1)
        
    model_root_path = '../input/clrdeberta-largev1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_deberta_large_v1_1.load_state_dict(ckp['model_state_dict'])    
    prediction_deberta_large_v1_1 += infer(model_deberta_large_v1_1, infer_dataloader_deberta_large_v1_1, device = cfg.device, 
                                           use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_deberta_large) / 5
    del model_deberta_large_v1_1; gc.collect()
    
    # ---------------- Funnel-large v1-0 (slot 13) ----------------
    # num_bins is read from slot 13, the slot that also configures this model's dataloader
    model_name = 'funnel_large'
    print(f'Inference model, {model_name} version 1-0...')
    model_funnel_large_v1_0 = Readability_Model_Funnel_large_v1('../input/funnel-transformer-large', model_config_funnel_large, num_cat = cfg.num_bins[13], 
                                                                benchmark_token = benchmark_token_funnel_large).to(cfg.device)
    model_root_path = '../input/clrfunnel-largepretrained-modelsv1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_funnel_large_v1_0.load_state_dict(ckp['model_state_dict'])    
    prediction_funnel_large_v1_0 += infer(model_funnel_large_v1_0, infer_dataloader_funnel_large_v1_0, device = cfg.device, 
                                          use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_funnel_large) / 5
    del model_funnel_large_v1_0; gc.collect()
    
    # ---------------- BART-large v1-0 (slot 14) ----------------
    # num_bins is read from slot 14, the slot that also configures this model's dataloader
    model_name = 'bart_large'
    print(f'Inference model, {model_name} version 1-0...')
    model_bart_large_v1_0 = Readability_Model_BART_large_v1('../input/bart-large', model_config_bart_large, num_cat = cfg.num_bins[14], 
                                                            benchmark_token = benchmark_token_bart_large).to(cfg.device)
    model_root_path = '../input/clrbart-largepretrained-modelsv1'
    ckp = torch.load(f'{model_root_path}/model_best_fold_{fold}_{model_name}.bin', map_location = cfg.device)
    model_bart_large_v1_0.load_state_dict(ckp['model_state_dict'])    
    prediction_bart_large_v1_0 += infer(model_bart_large_v1_0, infer_dataloader_bart_large_v1_0, device = cfg.device, 
                                        use_tqdm = cfg.use_tqdm, benchmark_token = benchmark_token_bart_large) / 5
    del model_bart_large_v1_0; gc.collect()

# Averaging

In [None]:
# One row per sample, one column per model (13 fold-averaged prediction vectors)
pred_zoo = np.transpose(np.vstack((
    prediction_roberta_large_v15_0,
    prediction_roberta_large_v15_3,
    prediction_roberta_large_v16_1,
    prediction_gpt2_medium_v1_0,
    prediction_xlnet_large_cased_v2_0,
    prediction_xlnet_large_cased_v3_0,
    prediction_xlnet_large_cased_v3_1,
    prediction_electra_large_v1_0,
    prediction_electra_large_v1_1,
    prediction_deberta_large_v1_0,
    prediction_deberta_large_v1_1,
    prediction_funnel_large_v1_0,
    prediction_bart_large_v1_0,
)))

# Blend: unweighted mean across the model columns
ss['target'] = pred_zoo.mean(axis = 1)

# Hiro's part
##################################################################################################################

In [None]:
# Runtime environment: one of 'colab' / 'kaggle'
ENV = 'kaggle'
# Pipeline phase: one of 'eval_oof' / 'inference'
PHASE = 'inference'

assert ENV in ('colab', 'kaggle')
assert PHASE in ('eval_oof', 'inference')

In [None]:
# Mount Google Drive at /content/drive when running on Colab; nothing is mounted otherwise.
if ENV=='colab':
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
import os
import math
import random
import time
 
import numpy as np
import pandas as pd
 
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
 
import transformers
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup
 
from sklearn import datasets
from sklearn import model_selection

import gc, json, pickle, shutil
gc.enable()

from tqdm.auto import tqdm
from matplotlib import pyplot as plt

In [None]:
def create_folds(data, num_splits, shuffle=False, random_state=None):
    """Assign stratified k-fold indices to a dataframe with a continuous target.

    The rows are shuffled, the ``target`` column is cut into equal-width bins
    (bin count from Sturges' rule) and ``StratifiedKFold`` is run on those bins
    so that every fold covers the target range.

    Args:
        data: DataFrame containing a numeric ``target`` column.
        num_splits: Number of folds.
        shuffle: Forwarded to ``StratifiedKFold``.
        random_state: Seed for the row shuffle. It is forwarded to
            ``StratifiedKFold`` only when ``shuffle`` is true, because
            scikit-learn rejects a seed combined with ``shuffle=False``.

    Returns:
        A row-shuffled copy of ``data`` (index reset) with an integer ``kfold``
        column holding the validation fold of each row. The input dataframe is
        left unmodified.
    """
    # Randomize the row order; this also yields a fresh frame, so the caller's
    # dataframe is not mutated by the column assignments below.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # -1 marks "not yet assigned"; every row is overwritten in the loop below
    data["kfold"] = -1

    # Number of bins by Sturges' rule (floored)
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Discretize the continuous target so it can be stratified on
    data.loc[:, "bins"] = pd.cut(data["target"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(
        n_splits=num_splits,
        shuffle=shuffle,
        # A seed is only valid (and only meaningful) when the splitter shuffles
        random_state=random_state if shuffle else None,
    )

    # Stratify on the bins rather than on the raw targets
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[valid_idx, "kfold"] = fold_id

    # The helper column is not part of the result
    return data.drop("bins", axis=1)

In [None]:
def set_random_seed(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        random_seed: Integer seed applied to every generator.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Hash randomization is fixed at interpreter start-up, so this only takes
    # effect in processes launched after this call.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN autotuner as well, matching seed_everything() in this file;
    # benchmarking may select different algorithms from run to run.
    torch.backends.cudnn.benchmark = False

# Dataset

In [None]:
class LitDataset(Dataset):
    """Torch dataset over the ``excerpt`` column of a dataframe.

    All excerpts are tokenized once at construction time with the module-level
    ``tokenizer`` (padded / truncated to ``MAX_LEN``). The module-level flags
    ``NO_TOKEN_TYPE`` and ``sa_complex`` decide which optional entries each
    sample carries.
    """

    def __init__(self, df, inference_only=False):
        """Tokenize ``df.excerpt``; also keep ``target`` / ``bins`` unless ``inference_only``."""
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not self.inference_only:
            # Labels exist only outside of inference
            self.target = torch.tensor(df.target.values, dtype=torch.float32)
            self.bins = torch.tensor(df.bins.values, dtype=torch.long)

        encode_options = dict(
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=not NO_TOKEN_TYPE,
        )
        self.encoded = tokenizer.batch_encode_plus(self.text, **encode_options)

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict of tensors."""
        encoded = self.encoded
        sample = {
            'input_ids': torch.tensor(encoded['input_ids'][index]),
            'attention_mask': torch.tensor(encoded['attention_mask'][index]),
        }

        if not NO_TOKEN_TYPE:
            sample['token_type_ids'] = torch.tensor(encoded['token_type_ids'][index])

        if sa_complex is not None:
            if sa_complex == 'hdd':
                # Per-sample features were written to disk, one pickle per index
                with open(f'SelfAttComplex/{str(index).zfill(4)}.pkl','rb') as f:
                    sample['sa_complex'] = pickle.load(f)
            else:
                # Features are held in memory and indexed directly
                sample['sa_complex'] = sa_complex[index]

        if not self.inference_only:
            sample['target'] = self.target[index]
            sample['bins'] = self.bins[index]

        return sample

# Self Attention Complexity in Pretrained Model

In [None]:
def SelfAttention_Complexity(df: pd.DataFrame, output_device):
    """Compute per-token "self-attention complexity" features with the backbone.

    For every layer/head, the attention each query token pays is weighted by
    the normalised distance between query and key position, separately for keys
    after the query and keys before it, and summed over the keys.

    Args:
        df: DataFrame with an ``excerpt`` column (consumed through ``LitDataset``).
        output_device: ``'hdd'`` to write one pickle per sample into the
            ``SelfAttComplex`` directory, otherwise the device the stacked
            result is returned on.

    Returns:
        The string ``'hdd'`` in disk mode, otherwise a tensor of shape
        [num_samples, seq, layer*head*2] on ``output_device``.

    Raises:
        ValueError: If ``PHASE`` is not 'train', 'eval_oof' or 'inference'.
    """
    pre_dataset = LitDataset(df, inference_only=True)
    pre_loader = DataLoader(pre_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False)
    
    if output_device == 'hdd':
        os.makedirs('SelfAttComplex', exist_ok=True)

    # Attentions must be returned; dropout is disabled for the feature extraction
    cfg_update = {"output_attentions":True, "hidden_dropout_prob": 0.0,
                  "layer_norm_eps": 1e-7}
    if PHASE=='train':
        config = AutoConfig.from_pretrained(MODEL_NAME)
        config.update(cfg_update)
        backbone = AutoModel.from_pretrained(MODEL_NAME, config=config).to(DEVICE)
    elif PHASE=='eval_oof' or PHASE=='inference':
        config = AutoConfig.from_pretrained(LOAD_BACKBONE_DIR)
        config.update(cfg_update)
        backbone = AutoModel.from_pretrained(LOAD_BACKBONE_DIR, config=config).to(DEVICE)
    else:
        # Without this branch `backbone` would be unbound and fail later with a NameError
        raise ValueError(f'Unsupported PHASE: {PHASE!r}')

    backbone.resize_token_embeddings(len(tokenizer))

    output_sa_complex = []
    backbone.eval()
    idx = 0  # running sample index, used as the file name in 'hdd' mode
    with torch.no_grad():
        for batch_num, dsargs in enumerate(tqdm(pre_loader)):

            kwargs = {}
            kwargs['input_ids'] = dsargs['input_ids'].to(DEVICE)
            if not NO_TOKEN_TYPE:
                kwargs['token_type_ids'] = dsargs['token_type_ids'].to(DEVICE)
            kwargs['attention_mask'] = dsargs['attention_mask'].to(DEVICE)

            if 't5' in MODEL_NAME.lower() and HAS_DECODER:
                # Decoder input = encoder input shifted right, with a leading pad token
                kwargs['decoder_input_ids'] = torch.cat([tokenizer.pad_token_id * torch.ones(kwargs['input_ids'].size(0), 1).long().to(DEVICE),
                                                        kwargs['input_ids'][:,:-1]], dim=1)
            
            # self attention
            output_backbone = backbone(**kwargs)
            self_att = torch.stack(output_backbone.attentions, dim=1) #[batch, layer, head, seq, seq]
            seq_len = self_att.size(-1)
            self_att = self_att.view(self_att.size(0), -1, seq_len, seq_len) #[batch, layer*head, seq, seq]
            # Zero the attention rows that belong to padded query positions
            self_att *= kwargs['attention_mask'].unsqueeze(1).unsqueeze(-1)

            # self attention complexity
            # distance_from_diag[i, j] = (j - i) / (seq_len - 1): positive for keys after the query
            distance_from_diag = (torch.arange(seq_len).view(1, -1) - torch.arange(seq_len).view(-1, 1)) / (seq_len - 1)
            distance_from_diag = distance_from_diag.to(DEVICE)
            sa_complex = []
            # Distance-weighted attention to later positions
            temp = self_att * distance_from_diag.unsqueeze(0).unsqueeze(1).clip(min=0)
            temp = temp.sum(dim=-1) #[batch, layer*head, seq]
            sa_complex.append(temp)
            # Distance-weighted attention to earlier positions
            temp = self_att * distance_from_diag.unsqueeze(0).unsqueeze(1).clip(max=0).abs()
            temp = temp.sum(dim=-1) #[batch, layer*head, seq]
            sa_complex.append(temp)
            sa_complex = torch.cat(sa_complex, dim=1).transpose(-2,-1) #[batch, seq, layer*head*2]

            if output_device == 'hdd':
                for batch_item in sa_complex:
                    with open(f'SelfAttComplex/{str(idx).zfill(4)}.pkl','wb') as f:
                        # clone(): pickling a view would serialise the storage of the whole batch
                        pickle.dump(batch_item.clone(), f)
                    idx += 1
            else:
                # Move each batch right away so the full result never accumulates on DEVICE
                output_sa_complex.append(sa_complex.to(output_device))
    
    if output_device == 'hdd':
        return 'hdd'
    else:
        return torch.cat(output_sa_complex, dim=0)

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [None]:
class LitModel(nn.Module):
    """Transformer backbone with attention pooling, a regression head and a bin-classification head.

    The backbone's per-layer hidden states are combined with learned softmax
    weights (optionally concatenated with an element-wise max over layers and
    with externally supplied self-attention-complexity features), pooled over
    the sequence axis by a small attention network, and fed to two linear
    heads through multi-sample dropout.

    Args:
        benchmark_token: optional ``(input_ids, token_type_ids, attention_mask)``
            tuple for a benchmark text. When given, it is appended to every
            batch and the regression output is reported relative to the
            benchmark's output.
        use_max_pooling: also concatenate the max over hidden layers.
        sa_complex_dim: width of the self-attention-complexity features that
            are concatenated to every token representation (0 disables them).

    Raises:
        ValueError: if the module-level ``PHASE`` is not one of ``'train'``,
            ``'eval_oof'`` or ``'inference'``.
    """

    def __init__(self, benchmark_token=None, use_max_pooling=False, sa_complex_dim=0):
        super().__init__()

        self.benchmark_token = benchmark_token
        self.use_max_pooling = use_max_pooling
        self.sa_complex_dim = sa_complex_dim

        cfg_update = {"output_hidden_states": True, "hidden_dropout_prob": 0.0,
                      "layer_norm_eps": 1e-7}
        if PHASE == 'train':
            config = AutoConfig.from_pretrained(MODEL_NAME)
            # The config is saved before the overrides below are applied.
            config.save_pretrained(f'{SAVE_DIR}/backbone')
            config.update(cfg_update)
            self.backbone = AutoModel.from_pretrained(MODEL_NAME, config=config)
            self.backbone.save_pretrained(f'{SAVE_DIR}/backbone')
        elif PHASE in ('eval_oof', 'inference'):
            config = AutoConfig.from_pretrained(LOAD_BACKBONE_DIR)
            config.update(cfg_update)
            self.backbone = AutoModel.from_pretrained(LOAD_BACKBONE_DIR, config=config)
        else:
            # Fail here instead of leaving `self.backbone` undefined.
            raise ValueError(
                f"Unsupported PHASE {PHASE!r}; expected 'train', 'eval_oof' or 'inference'.")

        # One learnable logit per hidden layer; softmax-ed in forward().
        self.hidden_layer_weights = nn.Parameter(torch.zeros(NUM_HIDDEN_LAYERS).view(-1, 1, 1, 1))

        # Multi-sample dropout: five independent masks per head.
        self.dropouts_regr = nn.ModuleList([
            nn.Dropout(0.5) for _ in range(5)
        ])
        self.dropouts_clsi = nn.ModuleList([
            nn.Dropout(0.5) for _ in range(5)
        ])

        num_pool = 2 if self.use_max_pooling else 1
        feature_dim = HIDDEN_SIZE * num_pool + sa_complex_dim

        self.attention_layer_norm = nn.LayerNorm(feature_dim)
        self.attention = nn.Sequential(
            nn.Linear(feature_dim, 512 * num_pool),
            nn.Tanh(),
            nn.Linear(512 * num_pool, 1),
            nn.Softmax(dim=1)
            )
        self.head_regressor = nn.Linear(feature_dim, 1)
        self.head_classifier = nn.Linear(feature_dim, NUM_BINS)

    def forward(self, input_ids, token_type_ids, attention_mask, self_att_complex):
        """Return ``(regression_output, bin_probabilities)`` for a batch.

        ``token_type_ids`` is ignored when the module-level ``NO_TOKEN_TYPE``
        is true; ``self_att_complex`` is only read when ``sa_complex_dim != 0``.
        """
        use_benchmark = self.benchmark_token is not None

        kwargs = {}
        if use_benchmark:
            # Append the benchmark text as the last row of the batch.
            bench_input_ids, bench_token_type_ids, bench_attention_mask = self.benchmark_token
            kwargs['input_ids'] = torch.cat((input_ids, bench_input_ids), dim=0)
            if not NO_TOKEN_TYPE:
                kwargs['token_type_ids'] = torch.cat((token_type_ids, bench_token_type_ids), dim=0)
            kwargs['attention_mask'] = torch.cat((attention_mask, bench_attention_mask), dim=0)
        else:
            kwargs['input_ids'] = input_ids
            if not NO_TOKEN_TYPE:
                kwargs['token_type_ids'] = token_type_ids
            kwargs['attention_mask'] = attention_mask

        if 't5' in MODEL_NAME.lower() and HAS_DECODER:
            # Decoder input = encoder input shifted right, starting with a pad token.
            encoder_ids = kwargs['input_ids']
            pad_column = encoder_ids.new_full((encoder_ids.size(0), 1), tokenizer.pad_token_id)
            kwargs['decoder_input_ids'] = torch.cat([pad_column, encoder_ids[:, :-1]], dim=1)
        output_backbone = self.backbone(**kwargs)

        # Extract the per-layer outputs.
        if HAS_DECODER:
            hidden_states = output_backbone.encoder_hidden_states + output_backbone.decoder_hidden_states[1:]
        else:
            hidden_states = output_backbone.hidden_states

        # Stack every entry except index 0, ordered from the last layer backwards,
        # then take the learned weighted sum (and optionally the max) over layers.
        hidden_states = torch.stack(
            tuple(hidden_states[-i - 1] for i in range(len(hidden_states) - 1)), dim=0)
        layer_weight = nn.functional.softmax(self.hidden_layer_weights, dim=0)
        output_backbone = torch.sum(hidden_states * layer_weight, dim=0)
        if self.use_max_pooling:
            out_max, _ = torch.max(hidden_states, dim=0)
            output_backbone = torch.cat((output_backbone, out_max), dim=-1)
        if self.sa_complex_dim != 0:
            # Only add the benchmark's features when the benchmark row was
            # actually appended to the batch; otherwise the batch sizes differ.
            if use_benchmark:
                self_att_complex = torch.cat((self_att_complex, benchmark_sa_complex), dim=0)
            output_backbone = torch.cat((output_backbone, self_att_complex), dim=-1)

        output_backbone = self.attention_layer_norm(output_backbone)

        # Attention pooling over the sequence axis.
        weights = self.attention(output_backbone)
        context_vector = torch.sum(weights * output_backbone, dim=1)

        # Multi-sample dropout: average the heads over several dropout masks.
        output_regr = 0
        output_clsi = 0
        for drop_regr, drop_clsi in zip(self.dropouts_regr, self.dropouts_clsi):
            output_regr = output_regr + self.head_regressor(drop_regr(context_vector))
            output_clsi = output_clsi + self.head_classifier(drop_clsi(context_vector))
        output_regr = output_regr / len(self.dropouts_regr)
        output_clsi = output_clsi / len(self.dropouts_clsi)

        if use_benchmark:
            # Report the score relative to the benchmark row and drop that row.
            output_regr = output_regr[:-1] - output_regr[-1]
            output_clsi = output_clsi[:-1]

        return output_regr, nn.functional.softmax(output_clsi, dim=-1)

# Loss function

In [None]:
class QuadraticWeightedKappaLoss(nn.Module):
    """Differentiable quadratic-weighted-kappa style loss on soft predictions.

    Returns ``coef * sum(W * O) / sum(W * E)`` where ``W`` holds the squared,
    normalised category distances, ``O`` is the soft confusion matrix and
    ``E`` the outer product of the true and predicted histograms.

    Args:
        num_cat: number of categories.
        device: device on which the weight matrix is initially created.
        coef: optional multiplier for the loss. When ``None`` (default) the
            module-level ``COEF_QWK`` is read at call time, as before.
    """

    def __init__(self, num_cat, device = 'cpu', coef = None):
        super(QuadraticWeightedKappaLoss, self).__init__()
        self.num_cat = num_cat
        self.coef = coef
        cats = torch.arange(num_cat, device=device)
        # max(..., 1) avoids a 0/0 weight matrix for the degenerate num_cat == 1.
        weights = (cats.view(-1, 1) - cats.view(1, -1)).pow(2) / max(num_cat - 1, 1) ** 2
        # Non-persistent buffer: follows .to()/.cuda() without entering state_dict.
        self.register_buffer('weights', weights, persistent=False)

    def _confusion_matrix(self, pred_smax, true_cat):
        """Soft confusion matrix: row ``t`` is the sum of predictions whose true class is ``t``."""
        one_hot = nn.functional.one_hot(true_cat.view(-1).long(), num_classes=self.num_cat)
        return one_hot.to(pred_smax.dtype).t() @ pred_smax

    def forward(self, pred_smax, true_cat):
        """Compute the loss for ``pred_smax`` ([N, num_cat] probabilities) and ``true_cat`` (N class ids)."""
        coef = COEF_QWK if self.coef is None else self.coef
        true_cat = true_cat.view(-1).long()
        eps = torch.finfo(pred_smax.dtype).eps

        # Confusion matrix
        O = self._confusion_matrix(pred_smax, true_cat)

        # Count elements in each category (cast so both histograms share a dtype)
        true_hist = torch.bincount(true_cat, minlength = self.num_cat).to(pred_smax.dtype)
        pred_hist = pred_smax.sum(dim = 0)

        # Expected values
        E = torch.outer(true_hist, pred_hist)

        # Normalization (clamped so an empty batch cannot produce NaN)
        O = O / torch.sum(O).clamp_min(eps)
        E = E / torch.sum(E).clamp_min(eps)

        # Weighted Kappa
        numerator = torch.sum(self.weights * O)
        denominator = torch.sum(self.weights * E).clamp_min(eps)

        return coef * numerator / denominator
    
class BradleyTerryLoss(nn.Module):
    """Pairwise ranking loss over all unordered pairs in a batch.

    For every pair ``(i, j)`` with ``i > j`` the term is
    ``log(1 + exp(-(true_i - true_j) * (pred_i - pred_j)))``; the terms are
    averaged over the ``n * (n - 1) / 2`` pairs and multiplied by a coefficient.

    Args:
        coef: optional multiplier for the loss. When ``None`` (default) the
            module-level ``COEF_BT`` is read at call time, as before.
    """

    def __init__(self, coef = None):
        super(BradleyTerryLoss, self).__init__()
        self.coef = coef

    def forward(self, pred_mean, true_mean):
        """Return the mean pairwise loss; zero when fewer than two samples are given."""
        coef = COEF_BT if self.coef is None else self.coef
        pred = pred_mean.view(-1)
        true = true_mean.view(-1)
        batch_size = pred.numel()
        if batch_size < 2:
            # No pairs to compare: return a zero that stays attached to the graph
            # instead of dividing by zero.
            return coef * pred.sum() * 0.0

        true_comparison = true.view(-1, 1) - true.view(1, -1)
        pred_comparison = pred.view(-1, 1) - pred.view(1, -1)

        # softplus(x) == log(1 + exp(x)) but does not overflow for large x.
        pair_losses = nn.functional.softplus(-true_comparison * pred_comparison)
        # diagonal=-1 keeps only i > j; the diagonal (self-comparisons) would
        # otherwise contribute a constant log(2) per sample.
        pair_sum = torch.tril(pair_losses, diagonal=-1).sum()

        return coef * (pair_sum / (batch_size * (batch_size - 1) / 2))

In [None]:
def eval_mse(model, data_loader):
    """Evaluate the mean squared error of |model| on |data_loader|.

    Returns:
        A ``(mse, predictions)`` tuple: the squared error summed over all
        batches divided by ``len(data_loader.dataset)``, and a 1-D tensor with
        the regression output for every sample in loader order.
    """
    model.eval()
    mse_sum = 0.0
    sum_squared_error = nn.MSELoss(reduction="sum")

    all_pred_r = []
    with torch.no_grad():
        for dsargs in data_loader:
            input_ids = dsargs['input_ids'].to(DEVICE)
            attention_mask = dsargs['attention_mask'].to(DEVICE)
            target = dsargs['target'].to(DEVICE)

            token_type_ids = None
            if not NO_TOKEN_TYPE:
                token_type_ids = dsargs['token_type_ids'].to(DEVICE)

            self_att_complex = None
            if USE_SELF_ATT:
                self_att_complex = dsargs['sa_complex'].to(DEVICE)

            pred_r, _ = model(input_ids, token_type_ids, attention_mask, self_att_complex)

            mse_sum += sum_squared_error(pred_r.flatten(), target).item()
            all_pred_r.append(pred_r)

    # flatten() (not squeeze()) so a single-sample loader still yields a 1-D tensor.
    return mse_sum / len(data_loader.dataset), torch.cat(all_pred_r, dim=0).flatten()

# Training, Validation

In [None]:
def train(model, model_path, train_loader, val_loader,
          optimizer, num_epochs, fold, scheduler=None):
    """Train |model| and checkpoint the weights with the best validation RMSE.

    The loss is MSE plus the module-level ``QWKloss`` and ``BTloss`` terms.
    Validation runs every ``eval_period`` steps; the period is re-chosen after
    each evaluation from ``EVAL_SCHEDULE`` and capped at 8 while between 50%
    and 80% of the total steps. After every evaluation the running history is
    rewritten to ``{SAVE_DIR}/{MODEL_VER}_fold{fold+1}_history.json``.

    Args:
        model: the network to optimise.
        model_path: file that receives ``model.state_dict()`` on improvement.
        train_loader / val_loader: loaders yielding dict batches.
        optimizer: optimiser stepped once per batch.
        num_epochs: number of passes over ``train_loader``.
        fold: zero-based fold index (used in the history file name).
        scheduler: optional LR scheduler stepped once per batch.

    Returns:
        The best validation RMSE seen, or ``None`` if no evaluation ran.
    """
    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]
    total_steps = num_epochs * len(train_loader)
    mse_mean = nn.MSELoss(reduction="mean")

    start = time.time()

    history = {'step':[], 'epoch':[], 'batch_num':[], 'val_rmse':[],
               'trn_rmse':[], 'trn_qwk':[], 'trn_bt':[]}

    for epoch in range(num_epochs):
        # Per-epoch running record of targets/predictions for the train metrics.
        # Lists are concatenated only when metrics are needed.
        epoch_target, epoch_bins, epoch_pred_r, epoch_pred_c = [], [], [], []

        for batch_num, dsargs in enumerate(train_loader):
            input_ids = dsargs['input_ids'].to(DEVICE)
            attention_mask = dsargs['attention_mask'].to(DEVICE)
            target = dsargs['target'].to(DEVICE)
            bins = dsargs['bins'].to(DEVICE)

            token_type_ids = None
            if not NO_TOKEN_TYPE:
                token_type_ids = dsargs['token_type_ids'].to(DEVICE)

            self_att_complex = None
            if USE_SELF_ATT:
                self_att_complex = dsargs['sa_complex'].to(DEVICE)

            optimizer.zero_grad()

            # eval_mse() switches to eval mode, so re-enable train mode every step.
            model.train()

            pred_r, pred_c = model(input_ids, token_type_ids, attention_mask, self_att_complex)

            loss = (mse_mean(pred_r.flatten(), target)
                    + QWKloss(pred_c, bins) + BTloss(pred_r.flatten(), target))

            loss.backward()

            epoch_target.append(target.detach())
            epoch_bins.append(bins.detach())
            epoch_pred_r.append(pred_r.detach())
            epoch_pred_c.append(pred_c.detach())

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                mse, _ = eval_mse(model, val_loader)
                val_rmse = math.sqrt(mse)

                seen_target = torch.cat(epoch_target, dim=0)
                seen_bins = torch.cat(epoch_bins, dim=0)
                seen_pred_r = torch.cat(epoch_pred_r, dim=0)
                seen_pred_c = torch.cat(epoch_pred_c, dim=0)
                # Square root so the value really is an RMSE, comparable to val_rmse.
                trn_rmse = math.sqrt(mse_mean(seen_pred_r.flatten(), seen_target).item())
                trn_qwk  = QWKloss(seen_pred_c, seen_bins).item()
                trn_bt  = BTloss(seen_pred_r.flatten(), seen_target).item()

                print(f"Epoch: {epoch} batch_num: {batch_num}",
                      f"val_rmse: {val_rmse:0.4}", f"train_rmse: {trn_rmse:0.4}",
                      f"train_qwk: {trn_qwk:0.4}", f"train_bt: {trn_bt:0.4}")

                # Use the period of the first schedule entry whose threshold is reached.
                for rmse, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse:
                        eval_period = period
                        break
                percent = step / total_steps
                if 0.5 <= percent <= 0.8:
                    eval_period = min(eval_period, 8)

                # `is None` so that a legitimate 0.0 is not treated as "unset".
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

                # Rewrite the whole history file after every evaluation.
                history['step'].append(step)
                history['epoch'].append(epoch)
                history['batch_num'].append(batch_num)
                history['val_rmse'].append(val_rmse)
                history['trn_rmse'].append(trn_rmse)
                history['trn_qwk'].append(trn_qwk)
                history['trn_bt'].append(trn_bt)
                with open(f'{SAVE_DIR}/{MODEL_VER}_fold{fold+1}_history.json', 'w') as f:
                    json.dump(history, f, indent=4)

                start = time.time()

            step += 1

        print('\nHidden Layer Weights:')
        print(model.hidden_layer_weights.squeeze())
        print(nn.functional.softmax(model.hidden_layer_weights.squeeze(),dim=0))

    return best_val_rmse

In [None]:
def predict(model, data_loader):
    """Run |model| over |data_loader| and return its regression outputs as an np.array."""
    model.eval()

    predictions = np.zeros(len(data_loader.dataset))
    offset = 0

    with torch.no_grad():
        for dsargs in data_loader:
            ids = dsargs['input_ids'].to(DEVICE)
            mask = dsargs['attention_mask'].to(DEVICE)
            # Optional inputs are only fetched when the configuration uses them.
            type_ids = None if NO_TOKEN_TYPE else dsargs['token_type_ids'].to(DEVICE)
            sa_features = dsargs['sa_complex'].to(DEVICE) if USE_SELF_ATT else None

            pred_r, _ = model(ids, type_ids, mask, sa_features)

            # Write this batch's outputs into the next free slice of the result.
            n_rows = pred_r.shape[0]
            predictions[offset : offset + n_rows] = pred_r.flatten().to("cpu")
            offset += n_rows

    return predictions

In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with per-group settings for |model|.

    Parameter groups, in order: the 'attention*' parameters, the
    'hidden_layer_weights' (no weight decay, lr HIDDEN_WTS_LR), the 'head*'
    parameters, and then two groups per backbone layer (with and without
    weight decay) whose learning rate shrinks geometrically from the top
    layer down to the embeddings.
    """
    named_parameters = list(model.named_parameters())

    def _with_prefix(prefix):
        # Parameters whose qualified name starts with `prefix`, in model order.
        return [p for n, p in named_parameters if n.startswith(prefix)]

    parameters = [
        {"params": _with_prefix('attention')},
        {"params": _with_prefix('hidden_layer_weights'), 'weight_decay': 0.0, 'lr': HIDDEN_WTS_LR},
        {"params": _with_prefix('head')},
    ]

    no_decay = ('bias', 'LayerNorm.weight', 'layer_norm')

    # Ordered list of backbone sub-modules, bottom (embeddings) first.
    arch = MODEL_NAME.lower()
    if 'roberta' in arch or 'electra' in arch:
        bb = model.backbone
        layers = [bb.embeddings, *bb.encoder.layer]
    elif 'gpt2' in arch:
        bb = model.backbone
        layers = [bb.wte, *bb.h]
    elif 'xlnet' in arch:
        bb = model.backbone
        layers = [bb.word_embedding, *bb.layer]
    elif 'bart' in arch:
        bb = model.backbone
        enc_layers = [bb.encoder.embed_positions, *bb.encoder.layers,
                      bb.encoder.layernorm_embedding]
        dec_layers = [bb.decoder.embed_positions, *bb.decoder.layers,
                      bb.decoder.layernorm_embedding]
        assert len(enc_layers)==len(dec_layers)
        # Interleave encoder and decoder modules after the shared embedding.
        layers = [bb.shared]
        for pair in zip(enc_layers, dec_layers):
            layers.extend(pair)
    elif 't5' in arch:
        bb = model.backbone
        enc_layers = [*bb.encoder.block, bb.encoder.final_layer_norm]
        dec_layers = [*bb.decoder.block, bb.decoder.final_layer_norm]
        assert len(enc_layers)==len(dec_layers)
        layers = [bb.shared]
        for pair in zip(enc_layers, dec_layers):
            layers.extend(pair)
    else:
        raise RuntimeError('specify the parameters for backbone.')

    # Walk from the top layer down, multiplying the lr by a constant factor each
    # time so the bottom layer ends at BACKBONE_LR * LAYERWISE_LR_DECAY.
    layerwise_learning_rate_decay = LAYERWISE_LR_DECAY**(1.0/len(layers))
    lr = BACKBONE_LR
    for layer in reversed(layers):
        lr *= layerwise_learning_rate_decay
        decayed, undecayed = [], []
        for n, p in layer.named_parameters():
            if any(nd in n for nd in no_decay):
                undecayed.append(p)
            else:
                decayed.append(p)
        parameters.append({'params': decayed, 'weight_decay': 0.01, 'lr': lr})
        parameters.append({'params': undecayed, 'weight_decay': 0.0, 'lr': lr})

    return AdamW(parameters)

In [None]:
def convert_examples_to_features(text, tokenizer, max_len, is_test = False, return_tensor = False):
    """Tokenize |text| to a fixed length of |max_len| with |tokenizer|.

    Newlines are removed first. The encoding is padded/truncated to |max_len|
    and includes the attention mask; token type ids are requested unless
    NO_TOKEN_TYPE is set. With |return_tensor| the values are PyTorch tensors.
    |is_test| is accepted but not used.

    Adapted from https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-fit
    """
    encode_kwargs = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_token_type_ids': not NO_TOKEN_TYPE,
    }
    if return_tensor:
        encode_kwargs['return_tensors'] = 'pt'
    return tokenizer.encode_plus(text.replace('\n', ''), **encode_kwargs)

In [None]:
def Train_or_Validation():
    """Train (PHASE 'train') or score out-of-fold (PHASE 'eval_oof') every fold.

    Returns the out-of-fold DataFrame indexed by 'id' in 'eval_oof' phase;
    otherwise the (empty) list of collected frames.
    """
    fold_rmses = []
    oof_frames = []

    for fold in range(NUM_FOLDS):
        print(f"\nFold {fold + 1}/{NUM_FOLDS}")

        set_random_seed(SEED + fold)

        in_val_fold = train_df['kfold'] == fold
        train_dataset = LitDataset(train_df[train_df['kfold'] != fold])
        val_dataset = LitDataset(train_df[in_val_fold])
        val_df = train_df[in_val_fold].copy()

        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  drop_last=True, shuffle=True, num_workers=0)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                drop_last=False, shuffle=False, num_workers=0)

        # Width of the extra self-attention features, if they are enabled.
        sa_complex_dim = benchmark_sa_complex.size(-1) if USE_SELF_ATT else 0

        model = LitModel(benchmark_token = benchmark_token, use_max_pooling = USE_MAX_POOLING,
                         sa_complex_dim = sa_complex_dim).to(DEVICE)

        # Keep the embedding matrix in sync with the tokenizer's vocabulary size.
        model.backbone.resize_token_embeddings(len(tokenizer))

        if PHASE=='train':
            model_path = f"{SAVE_DIR}/model_{fold + 1}.bin"
            set_random_seed(SEED + fold)

            optimizer = create_optimizer(model)
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_training_steps = NUM_EPOCHS * len(train_loader) * 11//10,
                num_warmup_steps = 50)

            best_rmse = train(model, model_path, train_loader, val_loader, optimizer,
                              num_epochs=NUM_EPOCHS, fold=fold, scheduler=scheduler)
            fold_rmses.append(best_rmse)

        elif PHASE=='eval_oof':
            model_path = f"{MODEL_DIR}/model_{fold + 1}.bin"
            model.load_state_dict(torch.load(model_path))
            model.to(DEVICE)

            mse, pred_r = eval_mse(model, val_loader)
            val_df['pred'] = pred_r.to('cpu').detach().numpy().copy()
            oof_frames.append(val_df)
            fold_rmses.append(math.sqrt(mse))

        del model
        gc.collect()

        # Running summary, printed after every fold.
        print("\nPerformance estimates:")
        print(fold_rmses)
        print("Mean:", np.array(fold_rmses).mean())

    if PHASE=='eval_oof':
        return pd.concat(oof_frames).set_index('id').sort_index()

    return oof_frames

In [None]:
def Inference():
    """Predict test_df with every fold's checkpoint and average the results.

    Returns a copy of submission_df whose 'target' holds the fold-mean prediction.
    """
    test_loader = DataLoader(LitDataset(test_df, inference_only=True),
                             batch_size=BATCH_SIZE,
                             drop_last=False, shuffle=False, num_workers=0)

    # One row of predictions per fold.
    all_predictions = np.zeros((NUM_FOLDS, len(test_df)))

    for fold in range(NUM_FOLDS):
        sa_complex_dim = benchmark_sa_complex.size(-1) if USE_SELF_ATT else 0

        model = LitModel(benchmark_token = benchmark_token, use_max_pooling = USE_MAX_POOLING,
                         sa_complex_dim = sa_complex_dim).to(DEVICE)

        # Keep the embedding matrix in sync with the tokenizer's vocabulary size.
        model.backbone.resize_token_embeddings(len(tokenizer))

        model_path = f"{MODEL_DIR}/model_{fold + 1}.bin"
        print(f"\nUsing {model_path}")

        model.load_state_dict(torch.load(model_path))

        all_predictions[fold] = predict(model, test_loader)

        del model
        gc.collect()

    output_df = submission_df.copy()
    output_df.target = all_predictions.mean(axis=0)
    print(output_df)

    return output_df

# Models

### RoBERTa-large Ver.31

In [None]:
# Data loading for this model version. ENV is defined elsewhere in the file;
# BASE_DIR / TRAIN_DATA_DIR are only set for 'colab' and 'kaggle'.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

if ENV=='colab':
    BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/CLR/input'
    TRAIN_DATA_DIR = BASE_DIR
elif ENV=='kaggle':
    BASE_DIR = '../input/commonlitreadabilityprize'
    TRAIN_DATA_DIR = '../input/step-1-create-folds'

train_df = pd.read_csv(f'{TRAIN_DATA_DIR}/train_folds.csv')
# Rows with target == 0 and standard_error == 0 are kept aside as the benchmark
# and removed from the training frame.
benchmark = train_df[(train_df.target == 0) & (train_df.standard_error == 0)].copy()
train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
              inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv(f"{BASE_DIR}/test.csv")
submission_df = pd.read_csv(f"{BASE_DIR}/sample_submission.csv")

In [None]:
# Configuration and driver for RoBERTa-large Ver.31. The helper functions in
# this file read these module-level names when they are called.
SEED = 1000
NUM_FOLDS = 5
NUM_EPOCHS = 4
BATCH_SIZE = 8
MAX_LEN = 248
# (val_rmse threshold, steps between evaluations) pairs; train() takes the
# period of the first pair whose threshold the current val_rmse reaches.
EVAL_SCHEDULE = [(0.52, 32), (0.49, 16), (0.48, 8), (0.47, 4), (-1., 2)]
MODEL_NAME = 'roberta-large'
MODEL_VER = 'CLRP_LightBase_031_RoBERTaL'
 
NUM_HIDDEN_LAYERS = 24
HIDDEN_SIZE = 1024
NUM_BINS = 29
COEF_QWK = 0.0 # coefficient of QWK loss
COEF_BT = 1.0 # coefficient of Bradley-Terry loss

USE_MAX_POOLING = True
USE_SELF_ATT = True
NO_TOKEN_TYPE = False
HAS_DECODER = False

BACKBONE_LR = 2e-5
HIDDEN_WTS_LR = 1e-2
LAYERWISE_LR_DECAY = 0.1

if ENV=='colab':
    MODEL_DIR = f'/content/drive/MyDrive/Colab Notebooks/CLR/{MODEL_VER}'
    SAVE_DIR = MODEL_DIR
    LOAD_BACKBONE_DIR = f'{MODEL_DIR}/backbone'
elif ENV=='kaggle':
    MODEL_DIR = '../input/clrp-lightbase-031-robertal-dat'
    SAVE_DIR = '.'
    LOAD_BACKBONE_DIR = '../input/robertalarge'

QWKloss = QuadraticWeightedKappaLoss(num_cat=NUM_BINS, device=DEVICE)
BTloss = BradleyTerryLoss()
# Discretise the target into NUM_BINS equal-width bins (integer labels).
train_df['bins'] = pd.cut(train_df['target'], bins=NUM_BINS, labels=False)

# Setup Tokenizer
if PHASE=='train':
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(f'{SAVE_DIR}/backbone')
elif PHASE=='eval_oof' or PHASE=='inference':
    tokenizer = AutoTokenizer.from_pretrained(LOAD_BACKBONE_DIR)
if 'gpt2' in MODEL_NAME.lower():
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the benchmark text
# Only the first benchmark row is used; the result is stored as an
# (input_ids, token_type_ids, attention_mask) tuple on DEVICE.
benchmark_token = convert_examples_to_features(benchmark['excerpt'].iloc[0], tokenizer, MAX_LEN, return_tensor = True)
if NO_TOKEN_TYPE:
    benchmark_token = (benchmark_token['input_ids'].to(DEVICE), None, benchmark_token['attention_mask'].to(DEVICE))
else:
    benchmark_token = (benchmark_token['input_ids'].to(DEVICE), benchmark_token['token_type_ids'].to(DEVICE), benchmark_token['attention_mask'].to(DEVICE))

# Main
if PHASE=='train' or PHASE=='eval_oof':
    sa_complex = None # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(train_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    oof_df_RoBERTaL031 = Train_or_Validation()

if PHASE=='eval_oof':
    oof_df_RoBERTaL031.to_csv(f'oof_{MODEL_VER}.csv')

if PHASE=='inference':
    sa_complex = None # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(test_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    submission_df_RoBERTaL031 = Inference()
    submission_df_RoBERTaL031.to_csv(f"submission_{MODEL_VER}.csv", index=False)

# Remove the on-disk cache directory for self-attention features, if one exists.
if os.path.isdir('SelfAttComplex'):
    shutil.rmtree('SelfAttComplex')

### RoBERTa-large Ver.31b

In [None]:
# Data loading for this model version. ENV is defined elsewhere in the file;
# BASE_DIR / TRAIN_DATA_DIR are only set for 'colab' and 'kaggle'.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

if ENV=='colab':
    BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/CLR/input'
    TRAIN_DATA_DIR = BASE_DIR
elif ENV=='kaggle':
    BASE_DIR = '../input/commonlitreadabilityprize'
    TRAIN_DATA_DIR = '../input/step-1-create-folds'

train_df = pd.read_csv(f'{TRAIN_DATA_DIR}/train_folds.csv')
# Rows with target == 0 and standard_error == 0 are kept aside as the benchmark
# and removed from the training frame.
benchmark = train_df[(train_df.target == 0) & (train_df.standard_error == 0)].copy()
train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
              inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv(f"{BASE_DIR}/test.csv")
submission_df = pd.read_csv(f"{BASE_DIR}/sample_submission.csv")

In [None]:
# Configuration and driver for RoBERTa-large Ver.31b. The helper functions in
# this file read these module-level names when they are called.
SEED = 2319
NUM_FOLDS = 5
NUM_EPOCHS = 4
BATCH_SIZE = 8
MAX_LEN = 248
# (val_rmse threshold, steps between evaluations) pairs; train() takes the
# period of the first pair whose threshold the current val_rmse reaches.
EVAL_SCHEDULE = [(0.52, 32), (0.49, 16), (0.48, 8), (0.47, 4), (-1., 2)]
MODEL_NAME = 'roberta-large'
MODEL_VER = 'CLRP_LightBase_031b_RoBERTaL'
 
NUM_HIDDEN_LAYERS = 24
HIDDEN_SIZE = 1024
NUM_BINS = 29
COEF_QWK = 0.0 # coefficient of QWK loss
COEF_BT = 1.0 # coefficient of Bradley-Terry loss

USE_MAX_POOLING = True
USE_SELF_ATT = True
NO_TOKEN_TYPE = False
HAS_DECODER = False

BACKBONE_LR = 2e-5
HIDDEN_WTS_LR = 1e-2
LAYERWISE_LR_DECAY = 0.1

if ENV=='colab':
    MODEL_DIR = f'/content/drive/MyDrive/Colab Notebooks/CLR/{MODEL_VER}'
    SAVE_DIR = MODEL_DIR
    LOAD_BACKBONE_DIR = f'{MODEL_DIR}/backbone'
elif ENV=='kaggle':
    MODEL_DIR = '../input/clrp-lightbase-031b-robertal-dat'
    SAVE_DIR = '.'
    LOAD_BACKBONE_DIR = '../input/robertalarge'

QWKloss = QuadraticWeightedKappaLoss(num_cat=NUM_BINS, device=DEVICE)
BTloss = BradleyTerryLoss()
# Discretise the target into NUM_BINS equal-width bins (integer labels).
train_df['bins'] = pd.cut(train_df['target'], bins=NUM_BINS, labels=False)

# Setup Tokenizer
if PHASE=='train':
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(f'{SAVE_DIR}/backbone')
elif PHASE=='eval_oof' or PHASE=='inference':
    tokenizer = AutoTokenizer.from_pretrained(LOAD_BACKBONE_DIR)
if 'gpt2' in MODEL_NAME.lower():
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the benchmark text
# Only the first benchmark row is used; the result is stored as an
# (input_ids, token_type_ids, attention_mask) tuple on DEVICE.
benchmark_token = convert_examples_to_features(benchmark['excerpt'].iloc[0], tokenizer, MAX_LEN, return_tensor = True)
if NO_TOKEN_TYPE:
    benchmark_token = (benchmark_token['input_ids'].to(DEVICE), None, benchmark_token['attention_mask'].to(DEVICE))
else:
    benchmark_token = (benchmark_token['input_ids'].to(DEVICE), benchmark_token['token_type_ids'].to(DEVICE), benchmark_token['attention_mask'].to(DEVICE))

# Main
if PHASE=='train' or PHASE=='eval_oof':
    sa_complex = None # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(train_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    oof_df_RoBERTaL031b = Train_or_Validation()

if PHASE=='eval_oof':
    oof_df_RoBERTaL031b.to_csv(f'oof_{MODEL_VER}.csv')

if PHASE=='inference':
    sa_complex = None # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(test_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    submission_df_RoBERTaL031b = Inference()
    submission_df_RoBERTaL031b.to_csv(f"submission_{MODEL_VER}.csv", index=False)

# Remove the on-disk cache directory for self-attention features, if one exists.
if os.path.isdir('SelfAttComplex'):
    shutil.rmtree('SelfAttComplex')

### kfold: Ver.2, RoBERTa-large Ver.26a

In [None]:
# Data loading for this model version. Unlike the cells that read
# train_folds.csv, this one reads train.csv and builds the folds here.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

if ENV=='colab':
    BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/CLR/input'
    TRAIN_DATA_DIR = BASE_DIR
elif ENV=='kaggle':
    BASE_DIR = '../input/commonlitreadabilityprize'
    TRAIN_DATA_DIR = BASE_DIR

# train_df = pd.read_csv(f'{TRAIN_DATA_DIR}/train_folds.csv')
train_df = pd.read_csv(f'{TRAIN_DATA_DIR}/train.csv')
# create_folds is defined elsewhere in the file; presumably it adds the 'kfold'
# column that Train_or_Validation() filters on — TODO confirm.
train_df = create_folds(train_df, num_splits=5, shuffle=True, random_state=1605)

# Rows with target == 0 and standard_error == 0 are kept aside as the benchmark
# and removed from the training frame.
benchmark = train_df[(train_df.target == 0) & (train_df.standard_error == 0)].copy()
train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
              inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv(f"{BASE_DIR}/test.csv")
submission_df = pd.read_csv(f"{BASE_DIR}/sample_submission.csv")

In [None]:
# Configuration and driver for kfold Ver.2 / RoBERTa-large Ver.26a. The helper
# functions in this file read these module-level names when they are called.
SEED = 1605
NUM_FOLDS = 5
NUM_EPOCHS = 4
BATCH_SIZE = 8
MAX_LEN = 248
# (val_rmse threshold, steps between evaluations) pairs; train() takes the
# period of the first pair whose threshold the current val_rmse reaches.
EVAL_SCHEDULE = [(0.52, 32), (0.49, 16), (0.48, 8), (0.47, 4), (-1., 2)]
MODEL_NAME = 'roberta-large'
MODEL_VER = 'CLRP_LightBase_kfoldv2_026a_RoBERTaL'
 
NUM_HIDDEN_LAYERS = 24
HIDDEN_SIZE = 1024
NUM_BINS = 29
COEF_QWK = 0.0 # coefficient of QWK loss
COEF_BT = 1.0 # coefficient of Bradley-Terry loss

# This version does not use the self-attention-complexity features.
USE_MAX_POOLING = True
USE_SELF_ATT = False
NO_TOKEN_TYPE = False
HAS_DECODER = False

BACKBONE_LR = 2e-5
HIDDEN_WTS_LR = 1e-2
LAYERWISE_LR_DECAY = 0.1

if ENV=='colab':
    MODEL_DIR = f'/content/drive/MyDrive/Colab Notebooks/CLR/{MODEL_VER}'
    SAVE_DIR = MODEL_DIR
    LOAD_BACKBONE_DIR = f'{MODEL_DIR}/backbone'
elif ENV=='kaggle':
    MODEL_DIR = '../input/clrp-lightbase-kfoldv2-026a-robertal-dat'
    SAVE_DIR = '.'
    LOAD_BACKBONE_DIR = '../input/robertalarge'

QWKloss = QuadraticWeightedKappaLoss(num_cat=NUM_BINS, device=DEVICE)
BTloss = BradleyTerryLoss()
# Discretise the target into NUM_BINS equal-width bins (integer labels).
train_df['bins'] = pd.cut(train_df['target'], bins=NUM_BINS, labels=False)

# Setup Tokenizer
if PHASE=='train':
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(f'{SAVE_DIR}/backbone')
elif PHASE=='eval_oof' or PHASE=='inference':
    tokenizer = AutoTokenizer.from_pretrained(LOAD_BACKBONE_DIR)
if 'gpt2' in MODEL_NAME.lower():
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the benchmark text
# Only the first benchmark row is used; the result is stored as an
# (input_ids, token_type_ids, attention_mask) tuple on DEVICE.
benchmark_token = convert_examples_to_features(benchmark['excerpt'].iloc[0], tokenizer, MAX_LEN, return_tensor = True)
if NO_TOKEN_TYPE:
    benchmark_token = (benchmark_token['input_ids'].to(DEVICE), None, benchmark_token['attention_mask'].to(DEVICE))
else:
    benchmark_token = (benchmark_token['input_ids'].to(DEVICE), benchmark_token['token_type_ids'].to(DEVICE), benchmark_token['attention_mask'].to(DEVICE))

# Main
if PHASE=='train' or PHASE=='eval_oof':
    sa_complex = None # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(train_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    oof_df_kfv2_RoBERTaL026a = Train_or_Validation()

if PHASE=='eval_oof':
    oof_df_kfv2_RoBERTaL026a.to_csv(f'oof_{MODEL_VER}.csv')

if PHASE=='inference':
    sa_complex = None # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(test_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    submission_df_kfv2_RoBERTaL026a = Inference()
    submission_df_kfv2_RoBERTaL026a.to_csv(f"submission_{MODEL_VER}.csv", index=False)

# Remove the on-disk cache directory for self-attention features, if one exists.
if os.path.isdir('SelfAttComplex'):
    shutil.rmtree('SelfAttComplex')

### ELECTRA-large Ver.4

In [None]:
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Input locations for the current runtime environment.
if ENV == 'kaggle':
    BASE_DIR = '../input/commonlitreadabilityprize'
    TRAIN_DATA_DIR = '../input/step-1-create-folds'
elif ENV == 'colab':
    BASE_DIR = TRAIN_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/CLR/input'

# Load the fold-annotated training data, then move the row(s) with
# target == 0 and standard_error == 0 into `benchmark` and re-index the rest.
train_df = pd.read_csv(f'{TRAIN_DATA_DIR}/train_folds.csv')
benchmark = train_df.loc[
    (train_df['target'] == 0) & (train_df['standard_error'] == 0)
].copy()
train_df = train_df.drop(index=benchmark.index).reset_index(drop=True)

# Test excerpts and the sample submission that serves as the output template.
test_df = pd.read_csv(f"{BASE_DIR}/test.csv")
submission_df = pd.read_csv(f"{BASE_DIR}/sample_submission.csv")

In [None]:
# Configuration for the ELECTRA-large Ver.4 run.
# These are module-level globals. The helpers called later in this cell
# (Train_or_Validation / Inference) take no arguments, so they presumably read
# these values by name -- keep the names unchanged.
SEED = 1605
NUM_FOLDS = 5
NUM_EPOCHS = 4
BATCH_SIZE = 8
MAX_LEN = 248  # passed to convert_examples_to_features when tokenizing the benchmark excerpt
# NOTE(review): looks like (score threshold, evaluation interval) pairs --
# confirm against the training loop, which is outside this cell.
EVAL_SCHEDULE = [(0.52, 32), (0.49, 16), (0.48, 8), (0.47, 4), (-1., 2)]
MODEL_NAME = 'google/electra-large-discriminator'  # tokenizer source in the 'train' phase
MODEL_VER = 'CLRP_LightBase_004_ElectraL'  # tag used in the oof_/submission_ CSV names and the Colab model dir
 
NUM_HIDDEN_LAYERS = 24
HIDDEN_SIZE = 1024
NUM_BINS = 29  # bin count for pd.cut on the target and num_cat of the QWK loss
COEF_QWK = 0.0 # coefficient of QWK loss
COEF_BT = 1.0 # coefficient of Bradley-Terry loss

USE_MAX_POOLING = False
USE_SELF_ATT = False  # when True, SelfAttention_Complexity is computed before training/inference
NO_TOKEN_TYPE = False  # when True, None is passed in place of token_type_ids for the benchmark tokens
HAS_DECODER = False

BACKBONE_LR = 2e-5
HIDDEN_WTS_LR = 1e-2
LAYERWISE_LR_DECAY = 0.1

# Model/backbone locations per environment.
if ENV=='colab':
    # Everything lives under a per-version folder; the tokenizer is read from
    # its 'backbone' subfolder.
    MODEL_DIR = f'/content/drive/MyDrive/Colab Notebooks/CLR/{MODEL_VER}'
    SAVE_DIR = MODEL_DIR
    LOAD_BACKBONE_DIR = f'{MODEL_DIR}/backbone'
elif ENV=='kaggle':
    # Model and backbone paths sit under ../input; SAVE_DIR is the current directory.
    MODEL_DIR = '../input/clrp-lightbase-004-electral-dat'
    SAVE_DIR = '.'
    LOAD_BACKBONE_DIR = '../input/electra-large-discriminator'

# Loss objects plus an integer bin label per training row
# (pd.cut with an int splits the target range into NUM_BINS equal-width bins).
QWKloss = QuadraticWeightedKappaLoss(device=DEVICE, num_cat=NUM_BINS)
BTloss = BradleyTerryLoss()
train_df['bins'] = pd.cut(train_df['target'], NUM_BINS, labels=False)

# Tokenizer: in the 'train' phase it is fetched by model name and a copy is
# stored under SAVE_DIR/backbone; the other phases load it from
# LOAD_BACKBONE_DIR.
if PHASE == 'train':
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(f'{SAVE_DIR}/backbone')
elif PHASE in ('eval_oof', 'inference'):
    tokenizer = AutoTokenizer.from_pretrained(LOAD_BACKBONE_DIR)
if 'gpt2' in MODEL_NAME.lower():
    # Register a pad token for GPT-2 style tokenizers.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the first benchmark excerpt and keep it on DEVICE as an
# (input_ids, token_type_ids-or-None, attention_mask) tuple.
benchmark_token = convert_examples_to_features(
    benchmark['excerpt'].iloc[0], tokenizer, MAX_LEN, return_tensor=True
)
benchmark_token = (
    benchmark_token['input_ids'].to(DEVICE),
    None if NO_TOKEN_TYPE else benchmark_token['token_type_ids'].to(DEVICE),
    benchmark_token['attention_mask'].to(DEVICE),
)

# Phase dispatch: train / out-of-fold evaluation, or test-set inference.
if PHASE in ('train', 'eval_oof'):
    sa_complex = None  # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(train_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    oof_df_ElectraL004 = Train_or_Validation()
    if PHASE == 'eval_oof':
        # Persist the out-of-fold predictions for this model version.
        oof_df_ElectraL004.to_csv(f'oof_{MODEL_VER}.csv')
elif PHASE == 'inference':
    sa_complex = None  # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(test_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    submission_df_ElectraL004 = Inference()
    submission_df_ElectraL004.to_csv(f"submission_{MODEL_VER}.csv", index=False)

# Remove the 'SelfAttComplex' directory if one exists in the working directory.
if os.path.isdir('SelfAttComplex'):
    shutil.rmtree('SelfAttComplex')

### kfold: Ver.3, DeBERTa-large Ver.14

In [None]:
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# In both environments the training data is read from the base input folder.
if ENV == 'kaggle':
    BASE_DIR = TRAIN_DATA_DIR = '../input/commonlitreadabilityprize'
elif ENV == 'colab':
    BASE_DIR = TRAIN_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/CLR/input'

# Folds are built here from the raw train.csv with create_folds
# (5 shuffled splits, random_state=321) instead of being read from a
# precomputed train_folds.csv.
train_df = create_folds(
    pd.read_csv(f'{TRAIN_DATA_DIR}/train.csv'),
    num_splits=5,
    shuffle=True,
    random_state=321,
)

# Move the row(s) with target == 0 and standard_error == 0 into `benchmark`
# and re-index what remains.
benchmark = train_df.loc[
    (train_df['target'] == 0) & (train_df['standard_error'] == 0)
].copy()
train_df = train_df.drop(index=benchmark.index).reset_index(drop=True)

# Test excerpts and the sample submission that serves as the output template.
test_df = pd.read_csv(f"{BASE_DIR}/test.csv")
submission_df = pd.read_csv(f"{BASE_DIR}/sample_submission.csv")

In [None]:
# Configuration for the kfold Ver.3 / DeBERTa-large Ver.14 run.
# These are module-level globals. The helpers called later in this cell
# (Train_or_Validation / Inference) take no arguments, so they presumably read
# these values by name -- keep the names unchanged.
SEED = 1000
NUM_FOLDS = 5
NUM_EPOCHS = 4
BATCH_SIZE = 5
MAX_LEN = 248  # passed to convert_examples_to_features when tokenizing the benchmark excerpt
# NOTE(review): looks like (score threshold, evaluation interval) pairs --
# confirm against the training loop, which is outside this cell.
EVAL_SCHEDULE = [(0.52, 32), (0.49, 16), (0.48, 8), (0.47, 4), (-1., 2)]
MODEL_NAME = 'microsoft/deberta-large'  # tokenizer source in the 'train' phase
MODEL_VER = 'CLRP_LightBase_kfoldv3_014_DeBERTaL'  # tag used in the oof_/submission_ CSV names and the Colab model dir
 
NUM_HIDDEN_LAYERS = 24
HIDDEN_SIZE = 1024
NUM_BINS = 29  # bin count for pd.cut on the target and num_cat of the QWK loss
COEF_QWK = 0.0 # coefficient of QWK loss
COEF_BT = 1.0 # coefficient of Bradley-Terry loss

USE_MAX_POOLING = True
USE_SELF_ATT = False  # when True, SelfAttention_Complexity is computed before training/inference
NO_TOKEN_TYPE = False  # when True, None is passed in place of token_type_ids for the benchmark tokens
HAS_DECODER = False

# NOTE(review): the trailing values in the comments below look like earlier
# settings kept for reference -- confirm before relying on them.
BACKBONE_LR = 1e-5 # 2e-5
HIDDEN_WTS_LR = 5e-3 # 1e-2
LAYERWISE_LR_DECAY = 0.1

# Model/backbone locations per environment.
if ENV=='colab':
    # Everything lives under a per-version folder; the tokenizer is read from
    # its 'backbone' subfolder.
    MODEL_DIR = f'/content/drive/MyDrive/Colab Notebooks/CLR/{MODEL_VER}'
    SAVE_DIR = MODEL_DIR
    LOAD_BACKBONE_DIR = f'{MODEL_DIR}/backbone'
elif ENV=='kaggle':
    # Model and backbone paths sit under ../input; SAVE_DIR is the current directory.
    MODEL_DIR = '../input/clrp-lightbase-kfoldv3-014-debertal-dat'
    SAVE_DIR = '.'
    LOAD_BACKBONE_DIR = '../input/deberta-large'

# Loss objects plus an integer bin label per training row
# (pd.cut with an int splits the target range into NUM_BINS equal-width bins).
QWKloss = QuadraticWeightedKappaLoss(device=DEVICE, num_cat=NUM_BINS)
BTloss = BradleyTerryLoss()
train_df['bins'] = pd.cut(train_df['target'], NUM_BINS, labels=False)

# Tokenizer: in the 'train' phase it is fetched by model name and a copy is
# stored under SAVE_DIR/backbone; the other phases load it from
# LOAD_BACKBONE_DIR.
if PHASE == 'train':
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(f'{SAVE_DIR}/backbone')
elif PHASE in ('eval_oof', 'inference'):
    tokenizer = AutoTokenizer.from_pretrained(LOAD_BACKBONE_DIR)
if 'gpt2' in MODEL_NAME.lower():
    # Register a pad token for GPT-2 style tokenizers.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the first benchmark excerpt and keep it on DEVICE as an
# (input_ids, token_type_ids-or-None, attention_mask) tuple.
benchmark_token = convert_examples_to_features(
    benchmark['excerpt'].iloc[0], tokenizer, MAX_LEN, return_tensor=True
)
benchmark_token = (
    benchmark_token['input_ids'].to(DEVICE),
    None if NO_TOKEN_TYPE else benchmark_token['token_type_ids'].to(DEVICE),
    benchmark_token['attention_mask'].to(DEVICE),
)

# Phase dispatch: train / out-of-fold evaluation, or test-set inference.
if PHASE in ('train', 'eval_oof'):
    sa_complex = None  # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(train_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    oof_df_kfv3_DeBERTaL014 = Train_or_Validation()
    if PHASE == 'eval_oof':
        # Persist the out-of-fold predictions for this model version.
        oof_df_kfv3_DeBERTaL014.to_csv(f'oof_{MODEL_VER}.csv')
elif PHASE == 'inference':
    sa_complex = None  # Self-Attention Complexity in Pretrained Model
    if USE_SELF_ATT:
        sa_complex = SelfAttention_Complexity(test_df, 'cpu')
        benchmark_sa_complex = SelfAttention_Complexity(benchmark, DEVICE)
    submission_df_kfv3_DeBERTaL014 = Inference()
    submission_df_kfv3_DeBERTaL014.to_csv(f"submission_{MODEL_VER}.csv", index=False)

# Remove the 'SelfAttComplex' directory if one exists in the working directory.
if os.path.isdir('SelfAttComplex'):
    shutil.rmtree('SelfAttComplex')

# Ensemble

In [None]:
# Blend weights, in model order: the three RoBERTa-large runs share
# 0.4057247639 equally, followed by ELECTRA-large and DeBERTa-large.
weight = [0.4057247639 / 3] * 3 + [0.2704413533, 0.3238339126]

if PHASE == 'eval_oof':
    # Weighted sum of the five out-of-fold prediction columns. Rows are
    # combined positionally through .values, and the terms are accumulated
    # left to right in model order.
    oof_df = oof_df_RoBERTaL031[['pred']].copy()
    oof_df['pred'] = sum(
        w * member['pred'].values
        for w, member in zip(weight, (oof_df_RoBERTaL031,
                                      oof_df_RoBERTaL031b,
                                      oof_df_kfv2_RoBERTaL026a,
                                      oof_df_ElectraL004,
                                      oof_df_kfv3_DeBERTaL014))
    )
    oof_df.to_csv('oof_ensemble.csv')

if PHASE == 'inference':
    # Same weighted sum over the five per-model test predictions, written
    # into the 'target' column of submission_df.
    submission_df['target'] = sum(
        w * member['target'].values
        for w, member in zip(weight, (submission_df_RoBERTaL031,
                                      submission_df_RoBERTaL031b,
                                      submission_df_kfv2_RoBERTaL026a,
                                      submission_df_ElectraL004,
                                      submission_df_kfv3_DeBERTaL014))
    )

# Meta ensemble
##################################################################################################################

In [None]:
# Blend two sets of test predictions: 0.42 * ss + 0.58 * submission_df.
# `ss` is defined earlier in the notebook (outside this cell) -- presumably the
# other author's ensemble; verify. Rows are combined positionally via .values,
# so both frames must list the test ids in the same order.
meta_weights = [0.42, 0.58]
ss['target'] = (
    meta_weights[0] * ss['target'].values
    + meta_weights[1] * submission_df['target'].values
)

# Final submission
##################################################################################################################

In [None]:
# Write the blended predictions to submission.csv.
# `index` is documented as a boolean; the explicit False (instead of relying
# on None being falsy) keeps the row index out of the file.
ss.to_csv('submission.csv', index=False)
ss  # last expression of the cell: shows the final frame in the notebook