# Version 12
#### Model list
* RoBERTa-large version 15-0, 15-3;
* RoBERTa-large version 16-1;
* XLNet-large-cased version 2-0, 3-0, 3-1;
* GPT2-medium version 1-0;
* ELECTRA-large-discriminator version 1-0, 1-1;
* DeBERTa-large version 1-0, 1-1;
* Funnel-large version 1-0;
* BART-large version 1-0

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import random
import gc

import sys
sys.path.append('../input/readability-package')
import readability
import spacy

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents
import string
import re
import math
import pickle

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Autocast
from torch.cuda.amp import autocast, GradScaler
from torch.optim.swa_utils import AveragedModel

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, AdamW

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Copy the vendored spacy-readability and syllapy sources into the working
# directory (presumably so that `spacy_readability` is importable below).
!cp -r ../input/spacy-readability/spacy_readability-master/* ./
!cp -r ../input/syllapy/syllapy-master/* ./
import spacy
from spacy_readability import Readability

# Load the spaCy pipeline and append the Readability component as its last step.
# NOTE(review): the 'en' shortcut and passing a component instance to add_pipe
# look like spaCy v2-style usage — confirm against the installed spaCy version.
nlp = spacy.load('en')
nlp.add_pipe(Readability(), last = True)

In [None]:
def seed_everything(seed = 0):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random``, the torch
            CPU generator and the current CUDA device; it is also exported as
            the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding function takes the same integer, so apply them in turn.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Prefer reproducible cuDNN kernels over auto-tuned ones.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every RNG with one fixed value for this notebook run.
seed = 0
seed_everything(seed)

# Import data

In [None]:
# Load the training set from the competition input directory.
base_dir = '../input/commonlitreadabilityprize'
train_data = pd.read_csv(f'{base_dir}/train.csv')
# Benchmark text: the training rows whose standard_error is exactly 0
benchmark = train_data[train_data['standard_error'] == 0.]

In [None]:
# Load the test set and the sample submission file from the same directory.
base_dir = '../input/commonlitreadabilityprize'
data = pd.read_csv(f'{base_dir}/test.csv')
ss = pd.read_csv(f'{base_dir}/sample_submission.csv')
# Preview the first test rows (displayed as the notebook cell output).
data.head()

# Utilities

In [None]:
def clean_text(text):
    """Lower-case ``text`` and strip markup, URLs, punctuation and digit-bearing words.

    The removals are applied in a fixed order, each replacing its matches with
    the empty string:

    1. square-bracketed spans (``[...]``),
    2. URLs starting with ``http://``, ``https://`` or ``www.``,
    3. angle-bracketed tags (``<...>``),
    4. every character in ``string.punctuation``,
    5. newline characters,
    6. any word that contains at least one digit.

    Whitespace left behind by a removal is not collapsed.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    text = text.lower().strip()

    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # (invalid) Python string escapes.
    patterns = (
        r'\[.*?\]',
        r'https?://\S+|www\.\S+',
        r'<.*?>+',
        '[%s]' % re.escape(string.punctuation),
        r'\n',
        r'\w*\d\w*',
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

def text_preprocessing(text):
    """Clean ``text`` and return its word tokens joined by single spaces.

    The text is first passed through ``clean_text``; the result is split into
    ``\\w+`` tokens with an NLTK ``RegexpTokenizer`` and re-joined with ' '.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    words = word_tokenizer.tokenize(clean_text(text))
    return ' '.join(words)

def readability_feat(text):
    """Compute six readability scores for ``text`` with the module-level spaCy pipeline.

    Args:
        text: Raw text handed to ``nlp``.

    Returns:
        A 1-D float64 NumPy array holding, in order: Flesch-Kincaid grade
        level, Flesch-Kincaid reading ease, Dale-Chall, Coleman-Liau index,
        automated readability index and FORCAST, read from the document's
        ``._`` extension attributes.
    """
    doc = nlp(text)
    scores = doc._

    # np.float was only an alias of the builtin float (i.e. float64) and has
    # been removed from NumPy; name the dtype explicitly instead.
    return np.array([scores.flesch_kincaid_grade_level,
                     scores.flesch_kincaid_reading_ease,
                     scores.dale_chall,
                     scores.coleman_liau_index,
                     scores.automated_readability_index,
                     scores.forcast], dtype = np.float64)

def sample_text(targets, num_output = 5):
    """Draw ``num_output`` values from a normal distribution described by ``targets``.

    Args:
        targets: Indexable pair; ``targets[0]`` is used as the mean and
            ``targets[1]`` is passed to ``torch.normal`` as the standard
            deviation.
        num_output: Number of values to return.

    Returns:
        A 1-D tensor of length ``num_output``. When ``targets[1]`` equals 0 the
        result is all zeros (float32) and nothing is sampled.
    """
    center, spread = targets[0], targets[1]

    # Zero spread: skip sampling and hand back zeros.
    if spread == 0.:
        return torch.zeros(num_output, dtype = torch.float)

    return torch.normal(center, spread, size = (num_output,))

def convert_examples_to_features(text, tokenizer, max_len, is_test = False, return_tensor = False):
    """Tokenize one text, padded and truncated to ``max_len``.

    Args:
        text: String to encode; newline characters are removed first.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Used as ``max_length``; padding is set to ``'max_length'``
            and truncation is enabled.
        is_test: Unused.
        return_tensor: If True, ``return_tensors='pt'`` is passed as well.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns; attention mask and token
        type ids are requested.
    """
    # Take from https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-fit
    encode_kwargs = {
        'max_length': max_len,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_token_type_ids': True,
    }
    if return_tensor:
        encode_kwargs['return_tensors'] = 'pt'

    return tokenizer.encode_plus(text.replace('\n', ''), **encode_kwargs)

def form_dataset(token, external_features = None, target = None, bins = None):
    """Pack one tokenized example into a dict of tensors.

    Args:
        token: Mapping with ``'input_ids'``, ``'token_type_ids'`` and
            ``'attention_mask'`` entries; each is converted to a long tensor.
        external_features: Optional; stored as a float tensor under
            ``'external_features'`` when given.
        target: Optional; stored unchanged under ``'target'`` when given.
        bins: Optional; stored unchanged under ``'bins'``, but only when
            ``target`` is also given.

    Returns:
        Dict with the three token tensors followed by whichever optional
        entries apply.
    """
    sample = {
        name: torch.tensor(token[name], dtype = torch.long)
        for name in ('input_ids', 'token_type_ids', 'attention_mask')
    }

    if external_features is not None:
        sample['external_features'] = torch.tensor(external_features, dtype = torch.float)

    if target is not None:
        sample['target'] = target
        # bins are only attached to examples that also carry a target
        if bins is not None:
            sample['bins'] = bins

    return sample

# Dataset

In [None]:
class Readability_Dataset(Dataset):
    """Dataset that tokenizes the ``'excerpt'`` field of each row on access.

    Args:
        documents: Table of samples; rows are read with ``.iloc[idx]`` and the
            ``'excerpt'`` entry is used as the text.
        tokenizer: Tokenizer forwarded to ``convert_examples_to_features``.
        max_len: Padded/truncated sequence length.
        mode: Stored on the instance; not read by this class.
    """
    def __init__(self, documents, tokenizer, max_len = 300, mode = 'infer'):
        self.documents, self.tokenizer = documents, tokenizer
        self.max_len, self.mode = max_len, mode

    def __len__(self):
        """Return the number of rows in ``documents``."""
        return len(self.documents)

    def __getitem__(self, idx):
        """Return the tensor dict for row ``idx`` (token tensors only, no target)."""
        excerpt = self.documents.iloc[idx]['excerpt']
        encoded = convert_examples_to_features(excerpt, self.tokenizer, self.max_len)
        return form_dataset(encoded)

# Model

### Utils class

In [None]:
class AttentivePooling(nn.Module):
    """Pool a sequence into one vector with a learned softmax attention.

    Each position is scored by ``context_weight(tanh(word_weight(x)))``; the
    scores are normalised over the sequence axis and used as weights for a
    weighted sum of the inputs.

    Args:
        input_dim: Feature size of the incoming sequence elements.
        attention_dim: Hidden width of the scoring network.
    """
    def __init__(self, input_dim = 768, attention_dim = 1024):
        super(AttentivePooling, self).__init__()
        # Attention pooler
        self.word_weight = nn.Linear(input_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

    def forward(self, x, mask = None):
        '''
        x : Batch_size x Seq_len x input_dim
        mask: optional Batch_size x Seq_len tensor; positions where mask is 1
              get zero weight (weights are multiplied by ``1 - mask``).
              NOTE(review): this is the opposite of the Hugging Face
              ``attention_mask`` convention — confirm what callers pass.

        Returns a Batch_size x input_dim tensor.
        '''
        # Scores keep their trailing singleton axis (Batch x Seq_len x 1) so they
        # broadcast against x. The former .squeeze(1) did nothing for
        # Seq_len > 1 but removed the sequence axis when Seq_len == 1, which
        # made the weights broadcast across the batch.
        u_i = torch.tanh(self.word_weight(x))
        u_w = self.context_weight(u_i)

        # Subtract the global maximum before exponentiating for numerical
        # stability; the constant cancels in the normalisation below.
        att = torch.exp(u_w - u_w.max())

        if mask is not None:
            att = att * (1 - mask.unsqueeze(-1))

        att = att / torch.sum(att, dim = 1, keepdim = True)

        return (x * att).sum(dim = 1)

### RoBERTa base

* Version 11

In [None]:
class Readability_Model_RoBERTa_base_v11(nn.Module):
    """Backbone pooler output followed by a regression head and a category head.

    The backbone's ``pooler_output`` is layer-normalised and passed, through
    one or several dropout samples, to ``output`` (regression logits) and
    ``output_cat`` (category logits); the per-sample results are averaged.

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone config; ``hidden_size`` and
            ``initializer_range`` are read from it.
        benchmark_token: When not None, forward() treats the last batch row as
            a reference and subtracts its logits from the other rows. The value
            itself is only stored.
        is_sampled: Selects the return format of forward().
        num_external_features: Unused in this class.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Unused in this class.
        multisample_dropout: If True, average over 5 dropout(0.5) samples per
            head; otherwise use a single dropout(0.3) per head.
    """
    def __init__(self, backbone, model_config, benchmark_token = None, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True):
        super(Readability_Model_RoBERTa_base_v11, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        self.layer_norm = nn.LayerNorm(self.model_config.hidden_size)    # Applied to the backbone pooler output
        self.output = nn.Linear(self.model_config.hidden_size, num_output)
        self.output_cat = nn.Linear(self.model_config.hidden_size, num_cat)

        # Dropout layers
        if multisample_dropout:
            self.dropouts = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            self.dropouts = nn.ModuleList([nn.Dropout(0.3)])
            # forward() pairs every entry of `dropouts` with one of
            # `dropouts_cat`, so the category list must exist here as well.
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])

        # Initialize weights
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.output_cat)

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and Embedding weights are drawn from
        N(0, ``model_config.initializer_range``); Linear biases and the
        Embedding padding row are zeroed; LayerNorm gets bias 0 and weight 1.
        Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone and both heads.

        Args:
            input_ids, token_type_ids, attention_mask: Forwarded to the backbone.
            external_features: Accepted but not used.

        Returns:
            A 3-tuple. If ``is_sampled``: ``(logits, None, categories)``.
            Otherwise ``(logits[:, 0], exp(0.5 * logits[:, 1]), categories)``
            (the second item presumably turns a log-variance into a standard
            deviation — confirm with the training code). ``categories`` is the
            argmax over the softmaxed category logits.
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Extract output
        output = self.layer_norm(output_backbone.pooler_output)

        # Multiple dropout: accumulate both heads over paired dropout samples
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts, self.dropouts_cat):
            logits = logits + self.output(drop_out(output))
            cats = cats + self.output_cat(drop_cat(output))

        logits = logits / len(self.dropouts)
        cats = cats / len(self.dropouts)

        if self.benchmark_token is not None:
            # Last batch row is the reference: report the others relative to it
            logits = logits[:-1] - logits[-1]

            cats = cats[:-1]

        categories = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, categories
        else:
            return logits[:,0], torch.exp(0.5 * logits[:,1]), categories

### RoBERTa large

* Version 15

In [None]:
class Readability_Model_RoBERTa_large_v15(nn.Module):
    """Backbone with layer-wise mean/max pooling, token attention pooling and two heads.

    The per-layer hidden states (all but ``hidden_states[0]``) are combined by
    a softmax-weighted sum with learned layer weights and by an element-wise
    max. Both results are concatenated (2 * hidden_size features),
    layer-normalised, pooled over the sequence with a learned attention and
    fed to a regression head (``output``) and a category head (``output_cat``).

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone config; ``hidden_size``, ``num_hidden_layers``
            and ``initializer_range`` are read from it.
            NOTE(review): forward() reads ``hidden_states`` from the backbone
            output, so the config presumably enables ``output_hidden_states``
            — confirm.
        is_sampled: Selects the return format of forward().
        num_external_features: Unused in this class.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, average over 5 dropout(0.5) samples per
            head; otherwise use a single dropout(0.3) per head.
        benchmark_token: When not None, forward() treats the last batch row as
            a reference and subtracts its logits from the other rows. The value
            itself is only stored.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_RoBERTa_large_v15, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        self.layer_norm = nn.LayerNorm(self.model_config.hidden_size * 2)    # Concat of mean and max pooling
        self.output = nn.Linear(self.model_config.hidden_size * 2, num_output)
        self.output_cat = nn.Linear(self.model_config.hidden_size * 2, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(self.model_config.hidden_size * 2, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable weight per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature); zeros give a uniform softmax at start.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() reads dropouts_output / dropouts_cat, so both must
            # exist in the single-dropout configuration too.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            # Previous attribute name, kept as an alias.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE: output_cat is not passed through _init_weights and keeps
        # PyTorch's default nn.Linear initialisation.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and Embedding weights are drawn from
        N(0, ``model_config.initializer_range``); Linear biases and the
        Embedding padding row are zeroed; LayerNorm gets bias 0 and weight 1.
        Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool over layers and tokens, and apply both heads.

        Args:
            input_ids, token_type_ids, attention_mask: Forwarded to the backbone.
            external_features: Accepted but not used.

        Returns:
            A 3-tuple. If ``is_sampled``: ``(logits, None, categories)``.
            Otherwise ``(logits[:, 0], exp(0.5 * logits[:, 1]), categories)``
            (the second item presumably turns a log-variance into a standard
            deviation — confirm with the training code). ``categories`` is the
            argmax over the softmaxed category logits.
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Mean/max pooling over hidden layers: stack from the last layer
        # backwards, leaving out hidden_states[0], then concatenate both poolings
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = torch.cat((out_mean, out_max), dim = -1)
        output_backbone = self.layer_norm(output_backbone)

        # Attention Pooling (over time)
        # Scores keep their trailing singleton axis so they broadcast against
        # the token features; the former .squeeze(1) was a no-op for
        # seq_len > 1 and broke the broadcast when seq_len == 1.
        # NOTE: attention_mask is not applied here, so padded positions take
        # part in the pooling.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())
        att = att / torch.sum(att, dim = 1, keepdim = True)

        output = (output_backbone * att).sum(dim = 1)

        # Multiple dropout: accumulate both heads over paired dropout samples
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(output))
            cats = cats + self.output_cat(drop_cat(output))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last batch row is the reference: report the others relative to it
            logits = logits[:-1] - logits[-1]

            cats = cats[:-1]

        categories = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, categories
        else:
            return logits[:,0], torch.exp(0.5 * logits[:,1]), categories

* Version 16

In [None]:
class Readability_Model_RoBERTa_large_v16(nn.Module):
    """Backbone with weighted layer pooling, token attention pooling and two heads.

    The per-layer hidden states (all but ``hidden_states[0]``) are combined by
    a softmax-weighted sum with learned layer weights, layer-normalised,
    pooled over the sequence with a learned attention and fed to a regression
    head (``output``) and a category head (``output_cat``).

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone config; ``hidden_size``, ``num_hidden_layers``
            and ``initializer_range`` are read from it.
            NOTE(review): forward() reads ``hidden_states`` from the backbone
            output, so the config presumably enables ``output_hidden_states``
            — confirm.
        is_sampled: Selects the return format of forward().
        num_external_features: Unused in this class.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, average over 5 dropout(0.5) samples per
            head; otherwise use a single dropout(0.3) per head.
        benchmark_token: When not None, forward() treats the last batch row as
            a reference and subtracts its logits from the other rows. The value
            itself is only stored.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_RoBERTa_large_v16, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        self.layer_norm = nn.LayerNorm(self.model_config.hidden_size)    # Applied to the weighted layer mean
        self.output = nn.Linear(self.model_config.hidden_size, num_output)
        self.output_cat = nn.Linear(self.model_config.hidden_size, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(self.model_config.hidden_size, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable weight per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature); zeros give a uniform softmax at start.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() reads dropouts_output / dropouts_cat, so both must
            # exist in the single-dropout configuration too.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            # Previous attribute name, kept as an alias.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE: output_cat is not passed through _init_weights and keeps
        # PyTorch's default nn.Linear initialisation.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and Embedding weights are drawn from
        N(0, ``model_config.initializer_range``); Linear biases and the
        Embedding padding row are zeroed; LayerNorm gets bias 0 and weight 1.
        Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool over layers and tokens, and apply both heads.

        Args:
            input_ids, token_type_ids, attention_mask: Forwarded to the backbone.
            external_features: Accepted but not used.

        Returns:
            A 3-tuple. If ``is_sampled``: ``(logits, None, categories)``.
            Otherwise ``(logits[:, 0], exp(0.5 * logits[:, 1]), categories)``
            (the second item presumably turns a log-variance into a standard
            deviation — confirm with the training code). ``categories`` is the
            argmax over the softmaxed category logits.
        """
        output_backbone = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Weighted mean pooling (over hidden layers): stack from the last layer
        # backwards, leaving out hidden_states[0]
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        output_backbone = torch.sum(hidden_states * layer_weight, dim = 0)
        output_backbone = self.layer_norm(output_backbone)

        # Attention Pooling (over time)
        # Scores keep their trailing singleton axis so they broadcast against
        # the token features; the former .squeeze(1) was a no-op for
        # seq_len > 1 and broke the broadcast when seq_len == 1.
        # NOTE: attention_mask is not applied here, so padded positions take
        # part in the pooling.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())
        att = att / torch.sum(att, dim = 1, keepdim = True)

        output = (output_backbone * att).sum(dim = 1)

        # Multiple dropout: accumulate both heads over paired dropout samples
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(output))
            cats = cats + self.output_cat(drop_cat(output))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last batch row is the reference: report the others relative to it
            logits = logits[:-1] - logits[-1]

            cats = cats[:-1]

        categories = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, categories
        else:
            return logits[:,0], torch.exp(0.5 * logits[:,1]), categories

### XLNet large cased

* Version 2

In [None]:
class Readability_Model_XLNet_large_cased_v2(nn.Module):
    """Backbone with layer-wise mean/max pooling, token attention pooling and two heads.

    The per-layer hidden states (all but ``hidden_states[0]``) are combined by
    a softmax-weighted sum with learned layer weights and by an element-wise
    max. Both results are concatenated (2 * hidden_size features),
    layer-normalised, pooled over the sequence with a learned attention and
    fed to a regression head (``output``) and a category head (``output_cat``).

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone config; ``hidden_size``, ``num_hidden_layers``
            and ``initializer_range`` are read from it.
            NOTE(review): forward() reads ``hidden_states`` from the backbone
            output, so the config presumably enables ``output_hidden_states``
            — confirm.
        is_sampled: Selects the return format of forward().
        num_external_features: Unused in this class.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, average over 5 dropout(0.5) samples per
            head; otherwise use a single dropout(0.3) per head.
        benchmark_token: When not None, forward() treats the last batch row as
            a reference and subtracts its logits from the other rows. The value
            itself is only stored.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_XLNet_large_cased_v2, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        self.layer_norm = nn.LayerNorm(self.model_config.hidden_size * 2)    # Concat of mean and max pooling
        self.output = nn.Linear(self.model_config.hidden_size * 2, num_output)
        self.output_cat = nn.Linear(self.model_config.hidden_size * 2, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(self.model_config.hidden_size * 2, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable weight per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature); zeros give a uniform softmax at start.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() reads dropouts_output / dropouts_cat, so both must
            # exist in the single-dropout configuration too.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            # Previous attribute name, kept as an alias.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE: output_cat is not passed through _init_weights and keeps
        # PyTorch's default nn.Linear initialisation.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and Embedding weights are drawn from
        N(0, ``model_config.initializer_range``); Linear biases and the
        Embedding padding row are zeroed; LayerNorm gets bias 0 and weight 1.
        Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _as_batch_by_seq(tensor):
        """Collapse all leading axes into one batch axis, keeping the last axis."""
        return tensor.reshape(-1, tensor.size(-1))

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool over layers and tokens, and apply both heads.

        Args:
            input_ids, token_type_ids, attention_mask: Token tensors whose last
                axis is the sequence; any extra singleton axis is folded away
                before they are forwarded to the backbone.
            external_features: Accepted but not used.

        Returns:
            A 3-tuple. If ``is_sampled``: ``(logits, None, categories)``.
            Otherwise ``(logits[:, 0], exp(0.5 * logits[:, 1]), categories)``
            (the second item presumably turns a log-variance into a standard
            deviation — confirm with the training code). ``categories`` is the
            argmax over the softmaxed category logits.
        """
        # A bare .squeeze() also removes the batch axis when the batch holds a
        # single example; reshaping to (-1, seq_len) always keeps it.
        output_backbone = self.backbone(input_ids = self._as_batch_by_seq(input_ids),
                                        token_type_ids = self._as_batch_by_seq(token_type_ids),
                                        attention_mask = self._as_batch_by_seq(attention_mask))

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Mean/max pooling over hidden layers: stack from the last layer
        # backwards, leaving out hidden_states[0], then concatenate both poolings
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = torch.cat((out_mean, out_max), dim = -1)
        output_backbone = self.layer_norm(output_backbone)

        # Attention Pooling (over time)
        # Scores keep their trailing singleton axis so they broadcast against
        # the token features; the former .squeeze(1) was a no-op for
        # seq_len > 1 and broke the broadcast when seq_len == 1.
        # NOTE: attention_mask is not applied here, so padded positions take
        # part in the pooling.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())
        att = att / torch.sum(att, dim = 1, keepdim = True)

        output = (output_backbone * att).sum(dim = 1)

        # Multiple dropout: accumulate both heads over paired dropout samples
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(output))
            cats = cats + self.output_cat(drop_cat(output))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last batch row is the reference: report the others relative to it
            logits = logits[:-1] - logits[-1]

            cats = cats[:-1]

        categories = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, categories
        else:
            return logits[:,0], torch.exp(0.5 * logits[:,1]), categories

* Version 3

In [None]:
class Readability_Model_XLNet_large_cased_v3(nn.Module):
    """Backbone with layer-wise mean/max pooling, token attention pooling and two heads.

    The per-layer hidden states (all but ``hidden_states[0]``) are combined by
    a softmax-weighted sum with learned layer weights and by an element-wise
    max. Both results are concatenated (2 * hidden_size features),
    layer-normalised, pooled over the sequence with a learned attention and
    fed to a regression head (``output``) and a category head (``output_cat``).

    Args:
        backbone: Name or path handed to ``AutoModel.from_pretrained``.
        model_config: Backbone config; ``hidden_size``, ``num_hidden_layers``
            and ``initializer_range`` are read from it.
            NOTE(review): forward() reads ``hidden_states`` from the backbone
            output, so the config presumably enables ``output_hidden_states``
            — confirm.
        is_sampled: Selects the return format of forward().
        num_external_features: Unused in this class.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, average over 5 dropout(0.5) samples per
            head; otherwise use a single dropout(0.3) per head.
        benchmark_token: When not None, forward() treats the last batch row as
            a reference and subtracts its logits from the other rows. The value
            itself is only stored.
    """
    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2,
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super(Readability_Model_XLNet_large_cased_v3, self).__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)
        self.layer_norm = nn.LayerNorm(self.model_config.hidden_size * 2)    # Concat of mean and max pooling
        self.output = nn.Linear(self.model_config.hidden_size * 2, num_output)
        self.output_cat = nn.Linear(self.model_config.hidden_size * 2, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(self.model_config.hidden_size * 2, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable weight per hidden layer, shaped to broadcast over
        # (layer, batch, seq, feature); zeros give a uniform softmax at start.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        # Dropout layers
        if multisample_dropout:
            self.dropouts_output = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
            self.dropouts_cat = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            # forward() reads dropouts_output / dropouts_cat, so both must
            # exist in the single-dropout configuration too.
            self.dropouts_output = nn.ModuleList([nn.Dropout(0.3)])
            self.dropouts_cat = nn.ModuleList([nn.Dropout(0.3)])
            # Previous attribute name, kept as an alias.
            self.dropouts = self.dropouts_output

        # Initialize weights
        # NOTE: output_cat is not passed through _init_weights and keeps
        # PyTorch's default nn.Linear initialisation.
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and Embedding weights are drawn from
        N(0, ``model_config.initializer_range``); Linear biases and the
        Embedding padding row are zeroed; LayerNorm gets bias 0 and weight 1.
        Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _as_batch_by_seq(tensor):
        """Collapse all leading axes into one batch axis, keeping the last axis."""
        return tensor.reshape(-1, tensor.size(-1))

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Run the backbone, pool over layers and tokens, and apply both heads.

        Args:
            input_ids, token_type_ids, attention_mask: Token tensors whose last
                axis is the sequence; any extra singleton axis is folded away
                before they are forwarded to the backbone.
            external_features: Accepted but not used.

        Returns:
            A 3-tuple. If ``is_sampled``: ``(logits, None, categories)``.
            Otherwise ``(logits[:, 0], exp(0.5 * logits[:, 1]), categories)``
            (the second item presumably turns a log-variance into a standard
            deviation — confirm with the training code). ``categories`` is the
            argmax over the softmaxed category logits.
        """
        # A bare .squeeze() also removes the batch axis when the batch holds a
        # single example; reshaping to (-1, seq_len) always keeps it.
        output_backbone = self.backbone(input_ids = self._as_batch_by_seq(input_ids),
                                        token_type_ids = self._as_batch_by_seq(token_type_ids),
                                        attention_mask = self._as_batch_by_seq(attention_mask))

        # Extract output
        hidden_states = output_backbone.hidden_states

        # Mean/max pooling over hidden layers: stack from the last layer
        # backwards, leaving out hidden_states[0], then concatenate both poolings
        hidden_states = torch.stack(tuple(hidden_states[-i-1] for i in range(len(hidden_states) - 1)), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        out_mean = torch.sum(hidden_states * layer_weight, dim = 0)
        out_max, _ = torch.max(hidden_states, dim = 0)
        output_backbone = torch.cat((out_mean, out_max), dim = -1)
        output_backbone = self.layer_norm(output_backbone)

        # Attention Pooling (over time)
        # Scores keep their trailing singleton axis so they broadcast against
        # the token features; the former .squeeze(1) was a no-op for
        # seq_len > 1 and broke the broadcast when seq_len == 1.
        # NOTE: attention_mask is not applied here, so padded positions take
        # part in the pooling.
        u_i = torch.tanh(self.word_weight(output_backbone))
        u_w = self.context_weight(u_i)
        att = torch.exp(u_w - u_w.max())
        att = att / torch.sum(att, dim = 1, keepdim = True)

        output = (output_backbone * att).sum(dim = 1)

        # Multiple dropout: accumulate both heads over paired dropout samples
        logits, cats = 0, 0
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            logits = logits + self.output(drop_out(output))
            cats = cats + self.output_cat(drop_cat(output))

        logits = logits / len(self.dropouts_output)
        cats = cats / len(self.dropouts_output)

        if self.benchmark_token is not None:
            # Last batch row is the reference: report the others relative to it
            logits = logits[:-1] - logits[-1]

            cats = cats[:-1]

        categories = torch.argmax(F.softmax(cats, dim = -1), dim = -1)
        if self.is_sampled:
            return logits, None, categories
        else:
            return logits[:,0], torch.exp(0.5 * logits[:,1]), categories

### GPT2 medium

* Version 1

In [None]:
class Readability_Model_GPT2_medium_v1(nn.Module):
    """GPT2-medium backbone with layer pooling, attention pooling and two heads.

    Every entry of the backbone's `hidden_states` except the first is combined
    by a softmax-weighted sum and by an element-wise max; both results are
    concatenated, layer-normalised, pooled over the sequence axis with additive
    attention and passed to a regression head (`output`) and a category head
    (`output_cat`).

    Args:
        backbone: Path or model id given to `AutoModel.from_pretrained`.
        model_config: Backbone config. Must expose `hidden_size`,
            `num_hidden_layers` and `initializer_range`, and make the backbone
            return `hidden_states`.
        is_sampled: If True, `forward` returns the raw regression logits.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, the heads are averaged over five
            Dropout(0.5) samples; otherwise a single Dropout(0.3) is used.
        benchmark_token: If not None, the last row of each batch is treated
            as a benchmark sample (see `forward`).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2, 
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # All heads consume the concatenation of the mean- and max-pooled layers.
        feature_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per hidden layer, softmax-normalised in forward.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        self._build_dropouts(multisample_dropout)

        # Initialize weights (output_cat keeps PyTorch's default init, as before).
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _build_dropouts(self, multisample_dropout):
        """Create the dropout lists that `forward` iterates over.

        The single-dropout mode previously only defined `self.dropouts`, which
        `forward` never read, so calling the model raised AttributeError.
        """
        num_samples, rate = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts_output = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        self.dropouts_cat = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        if not multisample_dropout:
            # Alias kept for code that still looks up the old attribute name.
            self.dropouts = self.dropouts_output

    def _init_weights(self, module):
        """Initialise `module` with N(0, initializer_range) weights / unit LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _pool_layers(self, hidden_states):
        """Fuse the per-layer hidden states into one tensor of width 2 * hidden_size."""
        # Skip the first entry and put the last layer first, matching the
        # order the layer weights were trained with.
        stacked = torch.stack(tuple(reversed(hidden_states[1:])), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        weighted_mean = torch.sum(stacked * layer_weight, dim = 0)
        layer_max, _ = torch.max(stacked, dim = 0)
        return self.layer_norm(torch.cat((weighted_mean, layer_max), dim = -1))

    def _attention_pool(self, features):
        """Collapse the sequence axis of `features` with additive attention."""
        # Scores keep their trailing singleton axis so they broadcast over the
        # feature axis for every sequence length, including 1.
        scores = self.context_weight(torch.tanh(self.word_weight(features)))
        # attention_mask is not applied here: padded positions take part in
        # the pooling, exactly as in the original implementation.
        att = F.softmax(scores, dim = 1)
        return (features * att).sum(dim = 1)

    def _apply_heads(self, pooled):
        """Average both heads over the configured dropout samples."""
        logits, cats = None, None
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_out(pooled))
            sample_cats = self.output_cat(drop_cat(pooled))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats
        num_samples = len(self.dropouts_output)
        return logits / num_samples, cats / num_samples

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Score a batch.

        Args:
            input_ids, token_type_ids, attention_mask: Forwarded unchanged to
                the backbone.
            external_features: Unused; kept for interface compatibility.

        Returns:
            A 3-tuple. With `is_sampled` it is `(logits, None, bins)`;
            otherwise `(logits[:, 0], exp(0.5 * logits[:, 1]), bins)`, where
            `bins` is the argmax of the category head. When `benchmark_token`
            is set, the last row's logits are subtracted from all other rows
            and the last row is dropped from every output.
        """
        backbone_out = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        features = self._pool_layers(backbone_out.hidden_states)
        pooled = self._attention_pool(features)
        logits, cats = self._apply_heads(pooled)

        if self.benchmark_token is not None:
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        # argmax is invariant under softmax, so the category logits are used directly.
        bins = torch.argmax(cats, dim = -1)

        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### ALBERT xlarge v2

* Version 1

In [None]:
class Readability_Model_ALBERT_xlarge_v2_v1(nn.Module):
    """ALBERT-xlarge-v2 backbone with layer pooling, attention pooling and two heads.

    Every entry of the backbone's `hidden_states` except the first is combined
    by a softmax-weighted sum and by an element-wise max; both results are
    concatenated, layer-normalised, pooled over the sequence axis with additive
    attention and passed to a regression head (`output`) and a category head
    (`output_cat`).

    Args:
        backbone: Path or model id given to `AutoModel.from_pretrained`.
        model_config: Backbone config. Must expose `hidden_size`,
            `num_hidden_layers` and `initializer_range`, and make the backbone
            return `hidden_states`.
        is_sampled: If True, `forward` returns the raw regression logits.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, the heads are averaged over five
            Dropout(0.5) samples; otherwise a single Dropout(0.3) is used.
        benchmark_token: If not None, the last row of each batch is treated
            as a benchmark sample (see `forward`).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2, 
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # All heads consume the concatenation of the mean- and max-pooled layers.
        feature_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per hidden layer, softmax-normalised in forward.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        self._build_dropouts(multisample_dropout)

        # Initialize weights (output_cat keeps PyTorch's default init, as before).
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _build_dropouts(self, multisample_dropout):
        """Create the dropout lists that `forward` iterates over.

        The single-dropout mode previously only defined `self.dropouts`, which
        `forward` never read, so calling the model raised AttributeError.
        """
        num_samples, rate = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts_output = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        self.dropouts_cat = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        if not multisample_dropout:
            # Alias kept for code that still looks up the old attribute name.
            self.dropouts = self.dropouts_output

    def _init_weights(self, module):
        """Initialise `module` with N(0, initializer_range) weights / unit LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _pool_layers(self, hidden_states):
        """Fuse the per-layer hidden states into one tensor of width 2 * hidden_size."""
        # Skip the first entry and put the last layer first, matching the
        # order the layer weights were trained with.
        stacked = torch.stack(tuple(reversed(hidden_states[1:])), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        weighted_mean = torch.sum(stacked * layer_weight, dim = 0)
        layer_max, _ = torch.max(stacked, dim = 0)
        return self.layer_norm(torch.cat((weighted_mean, layer_max), dim = -1))

    def _attention_pool(self, features):
        """Collapse the sequence axis of `features` with additive attention."""
        # Scores keep their trailing singleton axis so they broadcast over the
        # feature axis for every sequence length, including 1.
        scores = self.context_weight(torch.tanh(self.word_weight(features)))
        # attention_mask is not applied here: padded positions take part in
        # the pooling, exactly as in the original implementation.
        att = F.softmax(scores, dim = 1)
        return (features * att).sum(dim = 1)

    def _apply_heads(self, pooled):
        """Average both heads over the configured dropout samples."""
        logits, cats = None, None
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_out(pooled))
            sample_cats = self.output_cat(drop_cat(pooled))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats
        num_samples = len(self.dropouts_output)
        return logits / num_samples, cats / num_samples

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Score a batch.

        Args:
            input_ids, token_type_ids, attention_mask: Forwarded unchanged to
                the backbone.
            external_features: Unused; kept for interface compatibility.

        Returns:
            A 3-tuple. With `is_sampled` it is `(logits, None, bins)`;
            otherwise `(logits[:, 0], exp(0.5 * logits[:, 1]), bins)`, where
            `bins` is the argmax of the category head. When `benchmark_token`
            is set, the last row's logits are subtracted from all other rows
            and the last row is dropped from every output.
        """
        backbone_out = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        features = self._pool_layers(backbone_out.hidden_states)
        pooled = self._attention_pool(features)
        logits, cats = self._apply_heads(pooled)

        if self.benchmark_token is not None:
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        # argmax is invariant under softmax, so the category logits are used directly.
        bins = torch.argmax(cats, dim = -1)

        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### ELECTRA large discriminator

* Version 1

In [None]:
class Readability_Model_ELECTRA_large_discriminator_v1(nn.Module):
    """ELECTRA-large-discriminator backbone with layer pooling, attention pooling and two heads.

    Every entry of the backbone's `hidden_states` except the first is combined
    by a softmax-weighted sum and by an element-wise max; both results are
    concatenated, layer-normalised, pooled over the sequence axis with additive
    attention and passed to a regression head (`output`) and a category head
    (`output_cat`).

    Args:
        backbone: Path or model id given to `AutoModel.from_pretrained`.
        model_config: Backbone config. Must expose `hidden_size`,
            `num_hidden_layers` and `initializer_range`, and make the backbone
            return `hidden_states`.
        is_sampled: If True, `forward` returns the raw regression logits.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, the heads are averaged over five
            Dropout(0.5) samples; otherwise a single Dropout(0.3) is used.
        benchmark_token: If not None, the last row of each batch is treated
            as a benchmark sample (see `forward`).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2, 
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # All heads consume the concatenation of the mean- and max-pooled layers.
        feature_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per hidden layer, softmax-normalised in forward.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        self._build_dropouts(multisample_dropout)

        # Initialize weights (output_cat keeps PyTorch's default init, as before).
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _build_dropouts(self, multisample_dropout):
        """Create the dropout lists that `forward` iterates over.

        The single-dropout mode previously only defined `self.dropouts`, which
        `forward` never read, so calling the model raised AttributeError.
        """
        num_samples, rate = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts_output = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        self.dropouts_cat = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        if not multisample_dropout:
            # Alias kept for code that still looks up the old attribute name.
            self.dropouts = self.dropouts_output

    def _init_weights(self, module):
        """Initialise `module` with N(0, initializer_range) weights / unit LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _pool_layers(self, hidden_states):
        """Fuse the per-layer hidden states into one tensor of width 2 * hidden_size."""
        # Skip the first entry and put the last layer first, matching the
        # order the layer weights were trained with.
        stacked = torch.stack(tuple(reversed(hidden_states[1:])), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        weighted_mean = torch.sum(stacked * layer_weight, dim = 0)
        layer_max, _ = torch.max(stacked, dim = 0)
        return self.layer_norm(torch.cat((weighted_mean, layer_max), dim = -1))

    def _attention_pool(self, features):
        """Collapse the sequence axis of `features` with additive attention."""
        # Scores keep their trailing singleton axis so they broadcast over the
        # feature axis for every sequence length, including 1.
        scores = self.context_weight(torch.tanh(self.word_weight(features)))
        # attention_mask is not applied here: padded positions take part in
        # the pooling, exactly as in the original implementation.
        att = F.softmax(scores, dim = 1)
        return (features * att).sum(dim = 1)

    def _apply_heads(self, pooled):
        """Average both heads over the configured dropout samples."""
        logits, cats = None, None
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_out(pooled))
            sample_cats = self.output_cat(drop_cat(pooled))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats
        num_samples = len(self.dropouts_output)
        return logits / num_samples, cats / num_samples

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Score a batch.

        Args:
            input_ids, token_type_ids, attention_mask: Forwarded unchanged to
                the backbone.
            external_features: Unused; kept for interface compatibility.

        Returns:
            A 3-tuple. With `is_sampled` it is `(logits, None, bins)`;
            otherwise `(logits[:, 0], exp(0.5 * logits[:, 1]), bins)`, where
            `bins` is the argmax of the category head. When `benchmark_token`
            is set, the last row's logits are subtracted from all other rows
            and the last row is dropped from every output.
        """
        backbone_out = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        features = self._pool_layers(backbone_out.hidden_states)
        pooled = self._attention_pool(features)
        logits, cats = self._apply_heads(pooled)

        if self.benchmark_token is not None:
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        # argmax is invariant under softmax, so the category logits are used directly.
        bins = torch.argmax(cats, dim = -1)

        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### DeBERTa large

* Version 1

In [None]:
class Readability_Model_DeBERTa_large_v1(nn.Module):
    """DeBERTa-large backbone with layer pooling, attention pooling and two heads.

    Every entry of the backbone's `hidden_states` except the first is combined
    by a softmax-weighted sum and by an element-wise max; both results are
    concatenated, layer-normalised, pooled over the sequence axis with additive
    attention and passed to a regression head (`output`) and a category head
    (`output_cat`).

    Args:
        backbone: Path or model id given to `AutoModel.from_pretrained`.
        model_config: Backbone config. Must expose `hidden_size`,
            `num_hidden_layers` and `initializer_range`, and make the backbone
            return `hidden_states`.
        is_sampled: If True, `forward` returns the raw regression logits.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, the heads are averaged over five
            Dropout(0.5) samples; otherwise a single Dropout(0.3) is used.
        benchmark_token: If not None, the last row of each batch is treated
            as a benchmark sample (see `forward`).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2, 
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # All heads consume the concatenation of the mean- and max-pooled layers.
        feature_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per hidden layer, softmax-normalised in forward.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        self._build_dropouts(multisample_dropout)

        # Initialize weights (output_cat keeps PyTorch's default init, as before).
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _build_dropouts(self, multisample_dropout):
        """Create the dropout lists that `forward` iterates over.

        The single-dropout mode previously only defined `self.dropouts`, which
        `forward` never read, so calling the model raised AttributeError.
        """
        num_samples, rate = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts_output = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        self.dropouts_cat = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        if not multisample_dropout:
            # Alias kept for code that still looks up the old attribute name.
            self.dropouts = self.dropouts_output

    def _init_weights(self, module):
        """Initialise `module` with N(0, initializer_range) weights / unit LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _pool_layers(self, hidden_states):
        """Fuse the per-layer hidden states into one tensor of width 2 * hidden_size."""
        # Skip the first entry and put the last layer first, matching the
        # order the layer weights were trained with.
        stacked = torch.stack(tuple(reversed(hidden_states[1:])), dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        weighted_mean = torch.sum(stacked * layer_weight, dim = 0)
        layer_max, _ = torch.max(stacked, dim = 0)
        return self.layer_norm(torch.cat((weighted_mean, layer_max), dim = -1))

    def _attention_pool(self, features):
        """Collapse the sequence axis of `features` with additive attention."""
        # Scores keep their trailing singleton axis so they broadcast over the
        # feature axis for every sequence length, including 1.
        scores = self.context_weight(torch.tanh(self.word_weight(features)))
        # attention_mask is not applied here: padded positions take part in
        # the pooling, exactly as in the original implementation.
        att = F.softmax(scores, dim = 1)
        return (features * att).sum(dim = 1)

    def _apply_heads(self, pooled):
        """Average both heads over the configured dropout samples."""
        logits, cats = None, None
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_out(pooled))
            sample_cats = self.output_cat(drop_cat(pooled))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats
        num_samples = len(self.dropouts_output)
        return logits / num_samples, cats / num_samples

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Score a batch.

        Args:
            input_ids, token_type_ids, attention_mask: Forwarded unchanged to
                the backbone.
            external_features: Unused; kept for interface compatibility.

        Returns:
            A 3-tuple. With `is_sampled` it is `(logits, None, bins)`;
            otherwise `(logits[:, 0], exp(0.5 * logits[:, 1]), bins)`, where
            `bins` is the argmax of the category head. When `benchmark_token`
            is set, the last row's logits are subtracted from all other rows
            and the last row is dropped from every output.
        """
        backbone_out = self.backbone(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        features = self._pool_layers(backbone_out.hidden_states)
        pooled = self._attention_pool(features)
        logits, cats = self._apply_heads(pooled)

        if self.benchmark_token is not None:
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        # argmax is invariant under softmax, so the category logits are used directly.
        bins = torch.argmax(cats, dim = -1)

        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### Funnel-transformer/large

* Version 1

In [None]:
class Readability_Model_Funnel_large_v1(nn.Module):
    """Funnel-transformer/large backbone with attention pooling and two heads.

    Unlike the sibling models, this one does not pool over hidden layers: the
    backbone's `last_hidden_state` is layer-normalised, pooled over the
    sequence axis with additive attention and passed to a regression head
    (`output`) and a category head (`output_cat`).

    Args:
        backbone: Path or model id given to `AutoModel.from_pretrained`.
        model_config: Backbone config. Must expose `hidden_size`,
            `num_hidden_layers` and `initializer_range`.
        is_sampled: If True, `forward` returns the raw regression logits.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, the heads are averaged over five
            Dropout(0.5) samples; otherwise a single Dropout(0.3) is used.
        benchmark_token: If not None, the last row of each batch is treated
            as a benchmark sample (see `forward`).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2, 
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # Heads operate directly on the last hidden state (width hidden_size).
        feature_dim = self.model_config.hidden_size
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # Not read by forward; kept so the module's parameter set is unchanged
        # (presumably required for existing checkpoints to load - confirm).
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers).view(-1, 1, 1, 1))

        self._build_dropouts(multisample_dropout)

        # Initialize weights (output_cat keeps PyTorch's default init, as before).
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _build_dropouts(self, multisample_dropout):
        """Create the dropout lists that `forward` iterates over.

        The single-dropout mode previously only defined `self.dropouts`, which
        `forward` never read, so calling the model raised AttributeError.
        """
        num_samples, rate = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts_output = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        self.dropouts_cat = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        if not multisample_dropout:
            # Alias kept for code that still looks up the old attribute name.
            self.dropouts = self.dropouts_output

    def _init_weights(self, module):
        """Initialise `module` with N(0, initializer_range) weights / unit LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _as_2d(tensor):
        """Reshape `tensor` to (rows, last_dim).

        Replaces a bare `.squeeze()`, which also removed the batch axis when
        the batch held a single sample.
        """
        return tensor.reshape(-1, tensor.size(-1))

    def _attention_pool(self, features):
        """Collapse the sequence axis of `features` with additive attention."""
        # Scores keep their trailing singleton axis so they broadcast over the
        # feature axis for every sequence length, including 1.
        scores = self.context_weight(torch.tanh(self.word_weight(features)))
        # attention_mask is not applied here: padded positions take part in
        # the pooling, exactly as in the original implementation.
        att = F.softmax(scores, dim = 1)
        return (features * att).sum(dim = 1)

    def _apply_heads(self, pooled):
        """Average both heads over the configured dropout samples."""
        logits, cats = None, None
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_out(pooled))
            sample_cats = self.output_cat(drop_cat(pooled))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats
        num_samples = len(self.dropouts_output)
        return logits / num_samples, cats / num_samples

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Score a batch.

        Args:
            input_ids, token_type_ids, attention_mask: Tensors whose last axis
                is the sequence; any leading axes are flattened into the batch
                axis before the backbone is called.
            external_features: Unused; kept for interface compatibility.

        Returns:
            A 3-tuple. With `is_sampled` it is `(logits, None, bins)`;
            otherwise `(logits[:, 0], exp(0.5 * logits[:, 1]), bins)`, where
            `bins` is the argmax of the category head. When `benchmark_token`
            is set, the last row's logits are subtracted from all other rows
            and the last row is dropped from every output.
        """
        backbone_out = self.backbone(input_ids = self._as_2d(input_ids),
                                     token_type_ids = self._as_2d(token_type_ids),
                                     attention_mask = self._as_2d(attention_mask))

        features = self.layer_norm(backbone_out.last_hidden_state)
        pooled = self._attention_pool(features)
        logits, cats = self._apply_heads(pooled)

        if self.benchmark_token is not None:
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        # argmax is invariant under softmax, so the category logits are used directly.
        bins = torch.argmax(cats, dim = -1)

        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

### BART large

* Version 1

In [None]:
class Readability_Model_BART_large_v1(nn.Module):
    """BART-large backbone with layer pooling, attention pooling and two heads.

    The last `num_hidden_layers` decoder hidden states and the last
    `num_hidden_layers` encoder hidden states are stacked, combined by a
    softmax-weighted sum and by an element-wise max, concatenated,
    layer-normalised, pooled over the sequence axis with additive attention
    and passed to a regression head (`output`) and a category head
    (`output_cat`).

    Args:
        backbone: Path or model id given to `AutoModel.from_pretrained`.
        model_config: Backbone config. Must expose `hidden_size`,
            `num_hidden_layers` and `init_std`, and make the backbone return
            `encoder_hidden_states` and `decoder_hidden_states`.
        is_sampled: If True, `forward` returns the raw regression logits.
        num_external_features: Unused; kept for interface compatibility.
        num_output: Width of the regression head.
        num_cat: Width of the category head.
        attention_dim: Hidden width of the attention pooler.
        multisample_dropout: If True, the heads are averaged over five
            Dropout(0.5) samples; otherwise a single Dropout(0.3) is used.
        benchmark_token: If not None, the last row of each batch is treated
            as a benchmark sample (see `forward`).
    """

    def __init__(self, backbone, model_config, is_sampled = False, num_external_features = 6, num_output = 2, 
                 num_cat = 7, attention_dim = 1024, multisample_dropout = True, benchmark_token = None):
        super().__init__()
        self.model_config = model_config
        self.is_sampled = is_sampled
        self.benchmark_token = benchmark_token
        self.backbone = AutoModel.from_pretrained(backbone, config = self.model_config)

        # All heads consume the concatenation of the mean- and max-pooled layers.
        feature_dim = self.model_config.hidden_size * 2
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.output = nn.Linear(feature_dim, num_output)
        self.output_cat = nn.Linear(feature_dim, num_cat)

        # Attention pooler
        self.word_weight = nn.Linear(feature_dim, attention_dim)
        self.context_weight = nn.Linear(attention_dim, 1)

        # One learnable scalar per pooled layer: decoder layers, then encoder layers.
        self.hidden_layer_weights = nn.Parameter(torch.zeros(self.model_config.num_hidden_layers * 2).view(-1, 1, 1, 1))

        self._build_dropouts(multisample_dropout)

        # Initialize weights (output_cat keeps PyTorch's default init, as before).
        self._init_weights(self.layer_norm)
        self._init_weights(self.output)
        self._init_weights(self.word_weight)
        self._init_weights(self.context_weight)

    def _build_dropouts(self, multisample_dropout):
        """Create the dropout lists that `forward` iterates over.

        The single-dropout mode previously only defined `self.dropouts`, which
        `forward` never read, so calling the model raised AttributeError.
        """
        num_samples, rate = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts_output = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        self.dropouts_cat = nn.ModuleList([nn.Dropout(rate) for _ in range(num_samples)])
        if not multisample_dropout:
            # Alias kept for code that still looks up the old attribute name.
            self.dropouts = self.dropouts_output

    def _init_weights(self, module):
        """Initialise `module` with N(0, init_std) weights / unit LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.init_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.model_config.init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _as_2d(tensor):
        """Reshape `tensor` to (rows, last_dim).

        Replaces a bare `.squeeze()`, which also removed the batch axis when
        the batch held a single sample.
        """
        return tensor.reshape(-1, tensor.size(-1))

    def _pool_layers(self, decoder_hidden_states, encoder_hidden_states):
        """Fuse decoder and encoder hidden states into one tensor of width 2 * hidden_size."""
        depth = self.model_config.num_hidden_layers
        # Last `depth` states of each stack, newest first; decoder before encoder.
        decoder_layers = tuple(decoder_hidden_states[-i-1] for i in range(depth))
        encoder_layers = tuple(encoder_hidden_states[-i-1] for i in range(depth))
        stacked = torch.stack(decoder_layers + encoder_layers, dim = 0)
        layer_weight = F.softmax(self.hidden_layer_weights, dim = 0)
        weighted_mean = torch.sum(stacked * layer_weight, dim = 0)
        layer_max, _ = torch.max(stacked, dim = 0)
        return self.layer_norm(torch.cat((weighted_mean, layer_max), dim = -1))

    def _attention_pool(self, features):
        """Collapse the sequence axis of `features` with additive attention."""
        # Scores keep their trailing singleton axis so they broadcast over the
        # feature axis for every sequence length, including 1.
        scores = self.context_weight(torch.tanh(self.word_weight(features)))
        # attention_mask is not applied here: padded positions take part in
        # the pooling, exactly as in the original implementation.
        att = F.softmax(scores, dim = 1)
        return (features * att).sum(dim = 1)

    def _apply_heads(self, pooled):
        """Average both heads over the configured dropout samples."""
        logits, cats = None, None
        for drop_out, drop_cat in zip(self.dropouts_output, self.dropouts_cat):
            sample_logits = self.output(drop_out(pooled))
            sample_cats = self.output_cat(drop_cat(pooled))
            logits = sample_logits if logits is None else logits + sample_logits
            cats = sample_cats if cats is None else cats + sample_cats
        num_samples = len(self.dropouts_output)
        return logits / num_samples, cats / num_samples

    def forward(self, input_ids, token_type_ids, attention_mask, external_features = None):
        """Score a batch.

        Args:
            input_ids, attention_mask: Tensors whose last axis is the
                sequence; any leading axes are flattened into the batch axis
                before the backbone is called.
            token_type_ids: Accepted for interface parity; not passed to the
                backbone.
            external_features: Unused; kept for interface compatibility.

        Returns:
            A 3-tuple. With `is_sampled` it is `(logits, None, bins)`;
            otherwise `(logits[:, 0], exp(0.5 * logits[:, 1]), bins)`, where
            `bins` is the argmax of the category head. When `benchmark_token`
            is set, the last row's logits are subtracted from all other rows
            and the last row is dropped from every output.
        """
        backbone_out = self.backbone(input_ids = self._as_2d(input_ids),
                                     attention_mask = self._as_2d(attention_mask))

        features = self._pool_layers(backbone_out.decoder_hidden_states,
                                     backbone_out.encoder_hidden_states)
        pooled = self._attention_pool(features)
        logits, cats = self._apply_heads(pooled)

        if self.benchmark_token is not None:
            logits = logits[:-1] - logits[-1]
            cats = cats[:-1]

        # argmax is invariant under softmax, so the category logits are used directly.
        bins = torch.argmax(cats, dim = -1)

        if self.is_sampled:
            return logits, None, bins
        return logits[:, 0], torch.exp(0.5 * logits[:, 1]), bins

# Inference function

In [None]:
def infer(model, dataloader, device = 'cpu', use_tqdm = True, benchmark_token = None):
    """Run `model` over `dataloader` and collect its first output.

    Args:
        model: Module called with `input_ids`, `attention_mask` and
            `token_type_ids` keyword arguments; it must return a 3-tuple whose
            first element is a tensor of per-sample predictions. It is put
            into eval mode.
        dataloader: Iterable of dicts holding tensors under the keys
            'input_ids', 'token_type_ids' and 'attention_mask'.
        device: Device the batch tensors are moved to.
        use_tqdm: Wrap `dataloader` in a tqdm progress bar.
        benchmark_token: Optional `(input_ids, token_type_ids, attention_mask)`
            triple that is appended along dim 0 to every batch.

    Returns:
        A numpy array built from the per-sample predictions of all batches,
        in dataloader order.
    """
    model.eval()
    batches = tqdm(dataloader) if use_tqdm else dataloader

    if benchmark_token is not None:
        # Move the benchmark tensors once: torch.cat below needs them on the
        # same device as the batch, and the triple does not change per batch.
        bench_ids, bench_types, bench_mask = (t.to(device) for t in benchmark_token)

    pred = []

    for item in batches:
        input_ids = item['input_ids'].to(device)
        token_type_ids = item['token_type_ids'].to(device)
        attention_mask = item['attention_mask'].to(device)

        if benchmark_token is not None:
            input_ids = torch.cat((input_ids, bench_ids), dim = 0)
            token_type_ids = torch.cat((token_type_ids, bench_types), dim = 0)
            attention_mask = torch.cat((attention_mask, bench_mask), dim = 0)

        with torch.no_grad(), autocast():
            # Only the first output is kept; the other two are discarded.
            pred_mean, _, _ = model(input_ids = input_ids,
                                    attention_mask = attention_mask,
                                    token_type_ids = token_type_ids)

        pred.extend(pred_mean.detach().cpu().numpy())

    return np.array(pred)

# Configuration

In [None]:
class config:
    # Static settings for the ensemble inference run.
    #
    # NOTE: max_len, batch_size and num_bins each have 15 slots, two more than
    # model_names has entries, so they are NOT aligned with model_names.
    # The dataloader code in this file reads slots 0, 2, 4 and 5-14; slots 1
    # and 3 are not read there (presumably left over from model versions that
    # were dropped from the ensemble - TODO confirm).

    # --- Inference ---
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    use_tqdm = True
    model_names = [
        'roberta_large_v15_0',
        'roberta_large_v15_3',
        'roberta_large_v16_1',
        'gpt2_medium_v1_0',
        'xlnet_large_cased_v2_0',
        'xlnet_large_cased_v3_0',
        'xlnet_large_cased_v3_1',
        'electra_large_discriminator_v1_0',
        'electra_large_discriminator_v1_1',
        'deberta_large_v1_0',
        'deberta_large_v1_1',
        'funnel_large_v1_0',
        'bart_large_v1_0',
    ]

    # --- Dataloader ---
    # Maximum token length, one entry per slot (all identical).
    max_len = [250 for _ in range(15)]
    # Batch size per slot; the trailing comment names the dataloader that
    # reads the slot in this file.
    batch_size = (
        8,    # 0: roberta_large v15_0
        8,    # 1: not read in this file
        8,    # 2: roberta_large v15_3
        8,    # 3: not read in this file
        8,    # 4: roberta_large v16_1
        6,    # 5: gpt2_medium v1_0
        6,    # 6: xlnet_large_cased v2_0
        6,    # 7: xlnet_large_cased v3_0
        6,    # 8: xlnet_large_cased v3_1
        8,    # 9: electra_large v1_0
        8,    # 10: electra_large v1_1
        4,    # 11: deberta_large v1_0
        4,    # 12: deberta_large v1_1
        8,    # 13: funnel_large v1_0
        8,    # 14: bart_large v1_0
    )
    num_workers = 4

    # --- Models ---
    # Value handed to each model constructor as `num_cat`. It has 15 slots like
    # batch_size (presumably the same layout - verify the slot each
    # constructor call actually reads).
    num_bins = (
        29, 1, 29, 29, 29,    # slots 0-4
        29,                   # slot 5
        29, 1, 1,             # slots 6-8
        1, 1,                 # slots 9-10
        29, 29,               # slots 11-12
        1,                    # slot 13
        1,                    # slot 14
    )


cfg = config()

# Main

In [None]:
# Tokenizers and backbone configurations
def _load_tokenizer(model_dir):
    # All tokenizers are read from a local input directory with the same options.
    return AutoTokenizer.from_pretrained(model_dir, local_files_only = True, checkpoint_file = 'pytorch_model.bin')


def _load_backbone_config(model_dir):
    # Hidden states of every layer are requested from each backbone.
    return AutoConfig.from_pretrained(model_dir, output_hidden_states = True)


# Tokenizers. GPT-2 and XLNet get '[PAD]' registered as their pad token;
# their configurations are not loaded here but inside the fold loop.
tokenizer_roberta_large = _load_tokenizer('../input/robertalarge')

tokenizer_gpt2_medium = _load_tokenizer('../input/gpt2-medium')
tokenizer_gpt2_medium.add_special_tokens({'pad_token': '[PAD]'})

tokenizer_xlnet_large_cased = _load_tokenizer('../input/xlnet-large-cased')
tokenizer_xlnet_large_cased.add_special_tokens({'pad_token': '[PAD]'})

tokenizer_electra_large = _load_tokenizer('../input/electra-large-discriminator')
tokenizer_deberta_large = _load_tokenizer('../input/deberta-large')
tokenizer_funnel_large = _load_tokenizer('../input/funnel-transformer-large')
tokenizer_bart_large = _load_tokenizer('../input/bart-large')

# Backbone configurations shared by all folds of the corresponding models.
model_config_roberta_large = _load_backbone_config('../input/robertalarge')
model_config_electra_large = _load_backbone_config('../input/electra-large-discriminator')
# 'debert' (sic): the name is kept because the fold loop refers to it.
model_config_debert_large = _load_backbone_config('../input/deberta-large')
model_config_funnel_large = _load_backbone_config('../input/funnel-transformer-large')
model_config_bart_large = _load_backbone_config('../input/bart-large')

# Inference datasets and dataloaders
def _infer_pipeline(tokenizer, slot):
    # Build the (dataset, dataloader) pair over the test frame for one model.
    # `slot` selects this model's entry in cfg.max_len and cfg.batch_size.
    dataset = Readability_Dataset(data, tokenizer, max_len = cfg.max_len[slot], mode = 'infer')
    loader = DataLoader(dataset, batch_size = cfg.batch_size[slot], num_workers = cfg.num_workers, shuffle = False)
    return dataset, loader


# RoBERTa-large
infer_dataset_roberta_large_v15_0, infer_dataloader_roberta_large_v15_0 = _infer_pipeline(tokenizer_roberta_large, 0)
infer_dataset_roberta_large_v15_3, infer_dataloader_roberta_large_v15_3 = _infer_pipeline(tokenizer_roberta_large, 2)
infer_dataset_roberta_large_v16_1, infer_dataloader_roberta_large_v16_1 = _infer_pipeline(tokenizer_roberta_large, 4)

# GPT2-medium
infer_dataset_gpt2_medium_v1_0, infer_dataloader_gpt2_medium_v1_0 = _infer_pipeline(tokenizer_gpt2_medium, 5)

# XLNet-large-cased
infer_dataset_xlnet_large_cased_v2_0, infer_dataloader_xlnet_large_cased_v2_0 = _infer_pipeline(tokenizer_xlnet_large_cased, 6)
infer_dataset_xlnet_large_cased_v3_0, infer_dataloader_xlnet_large_cased_v3_0 = _infer_pipeline(tokenizer_xlnet_large_cased, 7)
infer_dataset_xlnet_large_cased_v3_1, infer_dataloader_xlnet_large_cased_v3_1 = _infer_pipeline(tokenizer_xlnet_large_cased, 8)

# ELECTRA-large-discriminator
infer_dataset_electra_large_v1_0, infer_dataloader_electra_large_v1_0 = _infer_pipeline(tokenizer_electra_large, 9)
infer_dataset_electra_large_v1_1, infer_dataloader_electra_large_v1_1 = _infer_pipeline(tokenizer_electra_large, 10)

# DeBERTa-large
infer_dataset_deberta_large_v1_0, infer_dataloader_deberta_large_v1_0 = _infer_pipeline(tokenizer_deberta_large, 11)
infer_dataset_deberta_large_v1_1, infer_dataloader_deberta_large_v1_1 = _infer_pipeline(tokenizer_deberta_large, 12)

# Funnel-large
infer_dataset_funnel_large_v1_0, infer_dataloader_funnel_large_v1_0 = _infer_pipeline(tokenizer_funnel_large, 13)

# BART-large
infer_dataset_bart_large_v1_0, infer_dataloader_bart_large_v1_0 = _infer_pipeline(tokenizer_bart_large, 14)

# Prediction storage: one zero-initialised accumulator per model version,
# with one entry per row of the test frame. Each array is a separate object.
# The roberta_large v15_2 and v16_0 accumulators are allocated as well,
# although the fold loop and the final averaging in this file never use them.
(prediction_roberta_large_v15_0,
 prediction_roberta_large_v15_2,
 prediction_roberta_large_v15_3,
 prediction_roberta_large_v16_0,
 prediction_roberta_large_v16_1,
 prediction_gpt2_medium_v1_0,
 prediction_xlnet_large_cased_v2_0,
 prediction_xlnet_large_cased_v3_0,
 prediction_xlnet_large_cased_v3_1,
 prediction_electra_large_v1_0,
 prediction_electra_large_v1_1,
 prediction_deberta_large_v1_0,
 prediction_deberta_large_v1_1,
 prediction_funnel_large_v1_0,
 prediction_bart_large_v1_0) = [np.zeros(data.shape[0]) for _ in range(15)]

# Tokenize the benchmark text
def _benchmark_features(tokenizer, slot):
    # Encode the first benchmark excerpt with `tokenizer`, truncated/padded per
    # cfg.max_len[slot], and return (input_ids, token_type_ids, attention_mask)
    # already moved to the inference device.
    features = convert_examples_to_features(benchmark['excerpt'].iloc[0], tokenizer, cfg.max_len[slot], return_tensor = True)
    return tuple(features[key].to(cfg.device) for key in ('input_ids', 'token_type_ids', 'attention_mask'))


# One benchmark encoding per tokenizer family; versions of a family share it.
benchmark_token_roberta_large = _benchmark_features(tokenizer_roberta_large, 0)
benchmark_token_gpt2_medium = _benchmark_features(tokenizer_gpt2_medium, 5)
benchmark_token_xlnet_large_cased = _benchmark_features(tokenizer_xlnet_large_cased, 6)
benchmark_token_electra_large = _benchmark_features(tokenizer_electra_large, 9)
benchmark_token_deberta_large = _benchmark_features(tokenizer_deberta_large, 11)
benchmark_token_funnel_large = _benchmark_features(tokenizer_funnel_large, 13)
benchmark_token_bart_large = _benchmark_features(tokenizer_bart_large, 14)

# Number of cross-validation folds; every ensemble member has one checkpoint
# per fold and its fold predictions are averaged with weight 1 / n_folds.
n_folds = 5


def _ensemble_member(name, version, model_cls, backbone, slot, benchmark_token, ckpt_root,
                     dataloader, prediction, config = None, resize_tokenizer = None, swa_folds = ()):
    """Describe one ensemble member for the fold loop.

    Args:
        name: model family name; it is part of the checkpoint file name.
        version: version label, used only in the progress message.
        model_cls: model class, called as
            model_cls(backbone, config, num_cat = ..., benchmark_token = ...).
        backbone: directory of the pretrained backbone.
        slot: index of this member in cfg.num_bins.
        benchmark_token: benchmark encoding passed to the model and to infer().
        ckpt_root: directory holding the per-fold checkpoints.
        dataloader: inference dataloader of this member.
        prediction: accumulator array that is updated in place.
        config: shared backbone config, or None to load a fresh one per model.
        resize_tokenizer: tokenizer whose length the backbone embeddings are
            resized to, or None when no resizing is needed.
        swa_folds: folds whose checkpoint was saved from an AveragedModel.
    """
    return {
        'name': name, 'version': version, 'model_cls': model_cls, 'backbone': backbone,
        'slot': slot, 'benchmark_token': benchmark_token, 'ckpt_root': ckpt_root,
        'dataloader': dataloader, 'prediction': prediction, 'config': config,
        'resize_tokenizer': resize_tokenizer, 'swa_folds': swa_folds,
    }


def _infer_one_fold(member, fold):
    """Build one member, load its checkpoint for `fold` and return its test predictions."""
    config = member['config']
    if config is None:
        # GPT-2 / XLNet: a fresh config is loaded for every model instance, as
        # in the original code. NOTE(review): resize_token_embeddings below may
        # update the config's vocabulary size in place, so sharing one config
        # object across instances looks unsafe - confirm before hoisting.
        config = AutoConfig.from_pretrained(member['backbone'], output_hidden_states = True)

    model = member['model_cls'](member['backbone'], config, num_cat = cfg.num_bins[member['slot']],
                                benchmark_token = member['benchmark_token']).to(cfg.device)

    if member['resize_tokenizer'] is not None:
        # The tokenizer got an extra pad token, so the embedding matrix must grow.
        model.backbone.resize_token_embeddings(len(member['resize_tokenizer']))

    if fold in member['swa_folds']:
        # This fold's state dict has AveragedModel's layout, so wrap before loading.
        model = AveragedModel(model)

    ckp = torch.load(f"{member['ckpt_root']}/model_best_fold_{fold}_{member['name']}.bin", map_location = cfg.device)
    model.load_state_dict(ckp['model_state_dict'])
    # The weights now live in the model; drop the checkpoint before inference
    # so that it does not occupy device memory during the forward passes.
    del ckp

    fold_prediction = infer(model, member['dataloader'], device = cfg.device,
                            use_tqdm = cfg.use_tqdm, benchmark_token = member['benchmark_token'])

    # Free the model before the next (large) one is built.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return fold_prediction


# Ensemble members in inference order.
_ensemble_specs = [
    _ensemble_member('roberta_large', '15-0', Readability_Model_RoBERTa_large_v15, '../input/robertalarge', 0,
                     benchmark_token_roberta_large,
                     '../input/clrroberta-largepretrained-modelsv15/model_best_roberta_large_v15_0',
                     infer_dataloader_roberta_large_v15_0, prediction_roberta_large_v15_0,
                     config = model_config_roberta_large),
    _ensemble_member('roberta_large', '15-3', Readability_Model_RoBERTa_large_v15, '../input/robertalarge', 2,
                     benchmark_token_roberta_large,
                     '../input/clrroberta-largev15/model_best_roberta_large_v15_3',
                     infer_dataloader_roberta_large_v15_3, prediction_roberta_large_v15_3,
                     config = model_config_roberta_large),
    _ensemble_member('roberta_large', '16-1', Readability_Model_RoBERTa_large_v16, '../input/robertalarge', 4,
                     benchmark_token_roberta_large,
                     '../input/clrroberta-largepretrained-modelsv16/model_best_roberta_large_v16_1',
                     infer_dataloader_roberta_large_v16_1, prediction_roberta_large_v16_1,
                     config = model_config_roberta_large),
    _ensemble_member('gpt2_medium', '1-0', Readability_Model_GPT2_medium_v1, '../input/gpt2-medium', 5,
                     benchmark_token_gpt2_medium,
                     '../input/clrgpt2-mediumpretrained-modelsv1',
                     infer_dataloader_gpt2_medium_v1_0, prediction_gpt2_medium_v1_0,
                     resize_tokenizer = tokenizer_gpt2_medium),
    _ensemble_member('xlnet_large_cased', '2-0', Readability_Model_XLNet_large_cased_v2, '../input/xlnet-large-cased', 6,
                     benchmark_token_xlnet_large_cased,
                     '../input/clrxlnet-largepretrained-models/v02',
                     infer_dataloader_xlnet_large_cased_v2_0, prediction_xlnet_large_cased_v2_0,
                     resize_tokenizer = tokenizer_xlnet_large_cased),
    _ensemble_member('xlnet_large_cased', '3-0', Readability_Model_XLNet_large_cased_v3, '../input/xlnet-large-cased', 7,
                     benchmark_token_xlnet_large_cased,
                     '../input/clrxlnet-largepretrained-models/v03',
                     infer_dataloader_xlnet_large_cased_v3_0, prediction_xlnet_large_cased_v3_0,
                     resize_tokenizer = tokenizer_xlnet_large_cased),
    _ensemble_member('xlnet_large_cased', '3-1', Readability_Model_XLNet_large_cased_v3, '../input/xlnet-large-cased', 8,
                     benchmark_token_xlnet_large_cased,
                     '../input/clrxlnet-large-casedv1',
                     infer_dataloader_xlnet_large_cased_v3_1, prediction_xlnet_large_cased_v3_1,
                     resize_tokenizer = tokenizer_xlnet_large_cased),
    _ensemble_member('electra_large_discriminator', '1-0', Readability_Model_ELECTRA_large_discriminator_v1,
                     '../input/electra-large-discriminator', 9,
                     benchmark_token_electra_large,
                     '../input/clrelectra-largepretrained-modelsv1',
                     infer_dataloader_electra_large_v1_0, prediction_electra_large_v1_0,
                     config = model_config_electra_large),
    _ensemble_member('electra_large_discriminator', '1-1', Readability_Model_ELECTRA_large_discriminator_v1,
                     '../input/electra-large-discriminator', 10,
                     benchmark_token_electra_large,
                     '../input/clrelectra-largev1',
                     infer_dataloader_electra_large_v1_1, prediction_electra_large_v1_1,
                     config = model_config_electra_large),
    _ensemble_member('deberta_large', '1-0', Readability_Model_DeBERTa_large_v1, '../input/deberta-large', 11,
                     benchmark_token_deberta_large,
                     '../input/clrdeberta-largepretrained-modelsv1',
                     infer_dataloader_deberta_large_v1_0, prediction_deberta_large_v1_0,
                     config = model_config_debert_large),
    _ensemble_member('deberta_large', '1-1', Readability_Model_DeBERTa_large_v1, '../input/deberta-large', 12,
                     benchmark_token_deberta_large,
                     '../input/clrdeberta-largev1',
                     infer_dataloader_deberta_large_v1_1, prediction_deberta_large_v1_1,
                     config = model_config_debert_large, swa_folds = (3,)),
    # Funnel and BART previously read cfg.num_bins[7] / [8] (the XLNet v3 slots).
    # They now use slots 13 / 14, matching their max_len / batch_size slots;
    # the values are identical (1), so the constructed models do not change.
    _ensemble_member('funnel_large', '1-0', Readability_Model_Funnel_large_v1, '../input/funnel-transformer-large', 13,
                     benchmark_token_funnel_large,
                     '../input/clrfunnel-largepretrained-modelsv1',
                     infer_dataloader_funnel_large_v1_0, prediction_funnel_large_v1_0,
                     config = model_config_funnel_large),
    _ensemble_member('bart_large', '1-0', Readability_Model_BART_large_v1, '../input/bart-large', 14,
                     benchmark_token_bart_large,
                     '../input/clrbart-largepretrained-modelsv1',
                     infer_dataloader_bart_large_v1_0, prediction_bart_large_v1_0,
                     config = model_config_bart_large),
]

for fold in range(n_folds):
    print('*' * 50)
    print(f'Fold: {fold}')

    for member in _ensemble_specs:
        print(f"Inference model, {member['name']} version {member['version']}...")
        # In-place add on the ndarray, so the module-level prediction_* array is updated.
        member['prediction'] += _infer_one_fold(member, fold) / n_folds

# Averaging

In [None]:
# Fold-averaged predictions of the 13 ensemble members, in model-list order.
member_predictions = [
    prediction_roberta_large_v15_0,
    prediction_roberta_large_v15_3,
    prediction_roberta_large_v16_1,
    prediction_gpt2_medium_v1_0,
    prediction_xlnet_large_cased_v2_0,
    prediction_xlnet_large_cased_v3_0,
    prediction_xlnet_large_cased_v3_1,
    prediction_electra_large_v1_0,
    prediction_electra_large_v1_1,
    prediction_deberta_large_v1_0,
    prediction_deberta_large_v1_1,
    prediction_funnel_large_v1_0,
    prediction_bart_large_v1_0,
]

# Stack to (members, samples) and transpose: one row per sample, one column per member.
pred_zoo = np.vstack(member_predictions).T

# The target is the unweighted mean over the members.
# `ss` is defined elsewhere in the file (presumably the sample-submission frame).
ss['target'] = pred_zoo.mean(axis = 1)

# Submission

In [None]:
# Write the submission file without the row index. `index` is documented as a
# boolean flag, so pass False explicitly instead of relying on None being falsy.
ss.to_csv('submission.csv', index = False)
# Last expression of the cell: displays the submission frame in the notebook.
ss