# CommonLit Mean Pooled Regression

This notebook builds an ensemble of 4 base models (DeBERTa, ELECTRA, MPNet and RoBERTa). While the large models will have better scores, this notebook is focused on simplicity and solid performance with the tradeoff of slightly worse accuracy. The models were externally trained and uploaded as a dataset but the training code is included in this notebook.

## Model layout

The models are all defined below and use the same mean pooling logic. Each output head is straightforward given the small training set and varies slightly based on what worked best. Models are defined to maximize compatibility with the HuggingFace Trainer API.

In [None]:
import torch

from torch.nn import LayerNorm, Linear, Module, MSELoss, Sequential, Tanh

from transformers import AutoModelForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.deberta.modeling_deberta import DebertaModel, DebertaPreTrainedModel
from transformers.models.electra.modeling_electra import ElectraModel, ElectraPreTrainedModel
from transformers.models.mpnet.modeling_mpnet import MPNetModel, MPNetPreTrainedModel
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

class MeanPooledRegression(Module):
    """
    Shared forward pass that mean pools a language model's token outputs and runs a regression head.

    Classes combining this with a backbone are expected to provide:
      - model(): returns the language model to run
      - self.regressor: module applied to the mean pooled vector
      - self.config: read for use_return_dict when return_dict is not passed
    """

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):
        """
        Runs the language model, mean pools its first output over the attended tokens and applies the regressor.

        Args:
            input_ids: token ids passed to the language model
            attention_mask: mask of tokens to include in the mean, all tokens are included when None
            token_type_ids: passed through to the language model
            labels: optional targets, when set an MSE loss is calculated
            output_attentions: passed through to the language model
            output_hidden_states: passed through to the language model
            return_dict: return a SequenceClassifierOutput when True, a tuple when False,
                         defaults to self.config.use_return_dict

        Returns:
            SequenceClassifierOutput(loss, logits, hidden_states, attentions) or
            a tuple of (logits,) + outputs[2:], prefixed with the loss when labels are set
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get language model
        model = self.model()

        # Run inputs through language model
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        tokens = outputs[0]

        # pylint: disable=E1101
        # The mask is optional in the signature, without one every token takes part in the mean
        if attention_mask is None:
            attention_mask = torch.ones(tokens.size()[:-1], dtype=torch.long, device=tokens.device)

        # Build mean pooled vector using outputs with input attention mask set
        mask = attention_mask.unsqueeze(-1).expand(tokens.size()).float()

        # Clamp guards against a division by zero for rows with no attended tokens
        pooling = torch.sum(tokens * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

        # Run mean pooled vector through regression function
        logits = self.regressor(pooling)

        # Calculate loss
        loss = None
        if labels is not None:
            loss_fct = MSELoss()
            loss = loss_fct(logits.squeeze(), labels.squeeze())

        # Return outputs
        if not return_dict:
            # NOTE(review): skipping index 1 assumes the language model returns a pooled output there,
            # confirm for backbones that return hidden states directly after the token outputs
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions
        )

class DebertaForReadability(DebertaPreTrainedModel, MeanPooledRegression):
    """
    DeBERTa language model with a mean pooled regression head.

    The head is a hidden size dense layer with a Tanh activation followed by a projection to a single value.
    """

    # Regressor keys are listed as ignorable when missing at load time (per the attribute name)
    _keys_to_ignore_on_load_missing = ["regressor"]

    def __init__(self, config):
        super().__init__(config)

        width = config.hidden_size

        # Language model is created first, followed by the regression head
        self.deberta = DebertaModel(config)
        self.regressor = Sequential(Linear(width, width), Tanh(), Linear(width, 1))

    def model(self):
        """
        Returns the DeBERTa language model run by the shared forward method.
        """

        return self.deberta

class ElectraForReadability(ElectraPreTrainedModel, MeanPooledRegression):
    """
    ELECTRA language model with a mean pooled regression head.

    The head is a single linear projection from the hidden size to one value.
    """

    # Regressor keys are listed as ignorable when missing at load time (per the attribute name)
    _keys_to_ignore_on_load_missing = ["regressor"]

    def __init__(self, config):
        super().__init__(config)

        # Language model is created first, followed by the regression head
        self.electra = ElectraModel(config)
        self.regressor = Linear(in_features=config.hidden_size, out_features=1)

    def model(self):
        """
        Returns the ELECTRA language model run by the shared forward method.
        """

        return self.electra

class MPNetForReadability(MPNetPreTrainedModel, MeanPooledRegression):
    """
    MPNet language model with a mean pooled regression head.

    The head applies layer normalization to the pooled vector before projecting it to a single value.
    """

    # Regressor keys are listed as ignorable when missing at load time (per the attribute name)
    _keys_to_ignore_on_load_missing = ["regressor"]

    def __init__(self, config):
        super().__init__(config)

        width = config.hidden_size

        # Language model is created first, followed by the regression head
        self.mpnet = MPNetModel(config)
        self.regressor = Sequential(LayerNorm(width), Linear(width, 1))

    def model(self):
        """
        Returns the MPNet language model run by the shared forward method.
        """

        return self.mpnet

class RobertaForReadability(RobertaPreTrainedModel, MeanPooledRegression):
    """
    RoBERTa language model with a mean pooled regression head.

    The head is a single linear projection from the hidden size to one value.
    """

    # Regressor keys are listed as ignorable when missing at load time (per the attribute name)
    _keys_to_ignore_on_load_missing = ["regressor"]

    def __init__(self, config):
        super().__init__(config)

        # Language model is created first, followed by the regression head
        self.roberta = RobertaModel(config)
        self.regressor = Linear(in_features=config.hidden_size, out_features=1)

    def model(self):
        """
        Returns the RoBERTa language model run by the shared forward method.
        """

        return self.roberta

class AutoModelForReadability:
    """
    Factory that picks a readability model class based on the model path.
    """

    @staticmethod
    def from_pretrained(path, config=None):
        """
        Loads the model for path.

        The first of "deberta", "electra", "mpnet" and "roberta" found in path selects the matching
        *ForReadability class. Any other path is loaded with AutoModelForSequenceClassification.

        Args:
            path: model path or name
            config: optional model configuration passed to the selected class

        Returns:
            loaded model
        """

        # Checked in order, first substring match wins
        registry = (
            ("deberta", DebertaForReadability),
            ("electra", ElectraForReadability),
            ("mpnet", MPNetForReadability),
            ("roberta", RobertaForReadability),
        )

        loader = AutoModelForSequenceClassification
        for key, candidate in registry:
            if key in path:
                loader = candidate
                break

        return loader.from_pretrained(path, config=config)

## Cross-validation sets

Use 5 folds for model cross-validation.

In [None]:
import csv

from sklearn.model_selection import KFold

def buildcv(path="/kaggle/input/commonlitreadabilityprize/train.csv", folds=5):
    """
    Builds cross-validation files from the training data.

    Rows with a target or standard_error of 0 are skipped. For each fold, train-<fold>.csv and
    valid-<fold>.csv are written to the current directory with a "label,text" header.

    Args:
        path: path to training csv with excerpt, target and standard_error columns
        folds: number of cross-validation folds
    """

    # Read training data and labels
    data, targets = [], []
    with open(path, "r", newline='') as csvf:
        for row in csv.DictReader(csvf):
            # Include complete entries
            if float(row["target"]) != 0 and float(row["standard_error"]) != 0:
                data.append(row["excerpt"])
                targets.append(float(row["target"]))

    for fold, (train_index, test_index) in enumerate(KFold(n_splits=folds).split(data)):
        for prefix, index in (("train", train_index), ("valid", test_index)):
            with open("%s-%d.csv" % (prefix, fold), "w", newline='') as csvfile:
                writer = csv.writer(csvfile)

                writer.writerow(["label", "text"])

                # Look rows up by index instead of testing every row for membership in the index array,
                # sorting keeps the rows in file order
                writer.writerows([targets[i], data[i]] for i in sorted(index))

## Training

The section below trains each model. As noted above, this code was executed externally but it's included here for clarity. The HuggingFace Trainer API is used to train each model on 5 folds.

*Note that running the training code below requires the datasets package to be installed.*
```python
!pip install datasets
from datasets import load_dataset
```

In [None]:
import gc

import numpy as np
import torch

from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoConfig
from transformers import Trainer, TrainerCallback, TrainingArguments, set_seed

# Baseline number of steps between evaluations. Passed to TrainingArguments as eval_steps and restored by
# EvalCallback when rmse is above 0.51 or worse than the previous evaluation. EvalCallback is only
# registered with the Trainer when this value is below 50.
EVAL_STEPS = 50

class EvalCallback(TrainerCallback):
    """
    Trainer callback that adjusts how often evaluation runs based on the latest eval rmse.
    """

    def on_evaluate(self, args, state, control, **kwargs):
        """
        Updates args.eval_steps after an evaluation.

        The schedule resets to EVAL_STEPS when rmse is above 0.51 or worse than the previous log entry's
        eval rmse. Otherwise evaluation runs more often the lower the rmse gets.
        """

        history = state.log_history

        # Latest rmse and, when the prior log entry was also an evaluation, the one before it
        rmse = history[-1]["eval_rmse"]
        prmse = None
        if len(history) > 1 and "eval_rmse" in history[-2]:
            prmse = history[-2]["eval_rmse"]

        regressed = bool(prmse) and rmse > prmse

        if rmse > 0.51 or regressed:
            args.eval_steps = EVAL_STEPS
        else:
            # (rmse ceiling, steps between evaluations), tightest ceiling first
            for ceiling, steps in ((0.48, 1), (0.49, 2), (0.50, 4), (0.51, 8)):
                if rmse <= ceiling:
                    args.eval_steps = steps
                    break

def metrics(pred):
    """
    Computes the root mean squared error between labels and predictions.

    Calculated with NumPy rather than mean_squared_error(..., squared=False), the squared keyword has been
    deprecated and removed in newer scikit-learn releases.

    Args:
        pred: object with label_ids and predictions attributes (presumably the Trainer's evaluation output)

    Returns:
        {"rmse": value}
    """

    # Flatten both sides so (n,) labels line up with (n, 1) predictions instead of broadcasting
    labels = np.asarray(pred.label_ids, dtype=np.float64).reshape(-1)
    predictions = np.asarray(pred.predictions, dtype=np.float64).reshape(-1)

    return {"rmse": float(np.sqrt(np.mean((labels - predictions) ** 2)))}

def regression(options):
    """
    Trains and evaluates a model configuration on each of the 5 cross-validation folds.

    Reads train-<fold>.csv and valid-<fold>.csv and saves each fold's model to <name>-<fold>, where name
    is the lowercased last path segment up to the first dash.

    Args:
        options: (path, epochs, lrate, decay) - model path, training epochs, learning rate and weight decay

    Returns:
        mean eval rmse over the folds or 1.0 when training stops early because the
        first fold's rmse is 0.48 or higher
    """

    path, epochs, lrate, decay = options
    print("Parameters -", options)

    # Get model short name
    name = path.lower().rsplit("/", 1)[-1].split("-", 1)[0]

    scores = []
    for fold in range(5):
        files = {"train": "train-%d.csv" % fold, "validation": "valid-%d.csv" % fold}

        # Initialize training arguments
        args = TrainingArguments(
            "%s-%d" % (name, fold),
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            learning_rate=lrate,
            weight_decay=decay,
            evaluation_strategy="steps",
            eval_steps=EVAL_STEPS,
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="rmse",
            greater_is_better=False
        )

        # Set seed before initializing model.
        set_seed(args.seed)

        # Model config
        config = AutoConfig.from_pretrained(path)
        config.update({"hidden_dropout_prob": 0.0, "layer_norm_eps": 1e-7, "num_labels": 1})

        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForReadability.from_pretrained(path, config=config)

        def tokenize(rows):
            return tokenizer(rows["text"], padding="max_length", max_length=248, truncation=True)

        # Load and tokenize datasets
        datasets = load_dataset("csv", data_files=files)
        datasets = datasets.map(tokenize, batched=True)

        # Dynamic evaluation schedule is only enabled when EVAL_STEPS is lowered below 50
        callbacks = None
        if EVAL_STEPS < 50:
            callbacks = [EvalCallback()]

        trainer = Trainer(
            model=model,
            tokenizer=tokenizer,
            args=args,
            train_dataset=datasets["train"],
            eval_dataset=datasets["validation"],
            compute_metrics=metrics,
            callbacks=callbacks
        )

        trainer.train()
        trainer.save_model()
        trainer.save_state()

        results = trainer.evaluate()
        scores.append(results["eval_rmse"])

        # Clear memory
        tokenizer = model = datasets = trainer = results = None
        gc.collect()
        torch.cuda.empty_cache()

        # Early exit, checked against the first fold's score
        if scores[0] >= 0.48:
            return 1.0

    average = np.mean(scores)
    print(options, scores, "=", average)

    return average

# Training run externally 
#buildcv()
#regression(("roberta-base", 3, 2e-05, 0.01))
#regression(("microsoft/deberta-base", 4, 2e-05, 0))
#regression(("google/electra-base-discriminator", 3, 3e-05, 0.01))
#regression(("microsoft/mpnet-base", 4, 4e-05, 0))

## Inference

Load models and run inference against test set.

In [None]:
import csv

import numpy as np
import torch

from transformers import AutoTokenizer

def batch(texts, size):
    """
    Splits texts into consecutive slices of at most size elements.

    Args:
        texts: sliceable sequence
        size: maximum number of elements per slice

    Returns:
        list of slices, the last slice holds any remainder
    """

    starts = range(0, len(texts), size)
    return [texts[start : start + size] for start in starts]

def encode(chunk):
    """
    Runs a chunk of texts through each (tokenizer, model) pair in the global models list.

    Args:
        chunk: list of texts

    Returns:
        model logits as numpy arrays concatenated along axis 1, None when models is empty
    """

    columns = []
    for tokenizer, model in models:
        # Tokenize sentences into fixed length tensors and move to the global device
        inputs = tokenizer(chunk, max_length=248, padding="max_length",
                           truncation=True, return_tensors="pt")
        inputs.to(device)

        # Run model without tracking gradients
        with torch.no_grad():
            logits = model(**inputs).logits

        # Transformer model logits as features
        columns.append(logits.cpu().numpy())

    if not columns:
        return None

    return columns[0] if len(columns) == 1 else np.concatenate(columns, axis=1)

In [None]:
# Set device, first GPU when available otherwise CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Read test ids and excerpts
# newline="" leaves line breaks inside quoted excerpts to the csv module, the same way buildcv opens
# the training file
ids, data = [], []
with open("/kaggle/input/commonlitreadabilityprize/test.csv", mode="r", newline="") as csvfile:
    for row in csv.DictReader(csvfile):
        ids.append(row["id"])
        data.append(row["excerpt"])

# Predictions for the test set, one column per model type
values = None
models = None
for mtype in ["deberta", "electra", "mpnet", "roberta"]:
    # encode reads the global models list, reset it to hold only this model type
    models = []
    predictions = None

    # Load the tokenizer and model for each of the 5 folds
    for b in range(5):
        path = "/kaggle/input/commonlit-transformers-models/%s-%d" % (mtype, b)
        print(path)

        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForReadability.from_pretrained(path)
        model.to(device)

        models.append((tokenizer, model))

    # Average the loaded models' outputs for each batch of 32 texts
    for chunk in batch(data, 32):
        outputs = np.mean(encode(chunk), axis=1).reshape(-1, 1)
        predictions = outputs if predictions is None else np.concatenate((predictions, outputs), axis=0)

    # Add this model type's predictions as a new column
    values = predictions if values is None else np.concatenate((values, predictions), axis=1)

# Blend the model type columns of values with fixed weights (listed in deberta, electra, mpnet,
# roberta order) and write the submission file
predictions = np.average(values, axis=1, weights=[0.21, 0.17, 0.23, 0.39])
with open("submission.csv", "w") as output:
    lines = ["id,target\n"]
    lines.extend("%s,%.8f\n" % (ids[x], p) for x, p in enumerate(predictions))

    output.writelines(lines)