## What is this

This notebook shows an example of using the [AllenNLP](https://github.com/allenai/allennlp) Jsonnet configuration API for training and inference.

I am publishing this example because I couldn't find one that uses AllenNLP with Jsonnet.
(The AllenNLP examples I found didn't use a Jsonnet config file.)


In a notebook-only competition, it is relatively hard to use AllenNLP with a Jsonnet config file,
because training is launched by running the `allennlp train` command in a shell, and that command loads
modules from a repository (which means that we would have to write scripts outside the notebook).

In this example, I invoke `allennlp.commands.train` from within the notebook to work around this limitation.
All modules are defined in the notebook itself.


Unfortunately, the submission using this notebook failed with the error `Notebook Exceeded Allowed Compute`.
Although I tried to figure out the cause of the memory/disk problem, I couldn't solve it.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas
import numpy


# Split the competition training CSV into a random 80% training file and a
# 20% validation file, both written to the working directory.
df = pandas.read_csv("../input/commonlitreadabilityprize/train.csv")
num_records = len(df)

# Shuffled row positions 0..num_records-1.
# NOTE(review): the shuffle is unseeded, so the split differs between runs.
ids = numpy.arange(num_records)
ids = numpy.random.permutation(ids)

train_size = 0.8
partition = int(num_records * train_size)

train_ids, valid_ids = ids[:partition], ids[partition:]

# `ids` holds row *positions*, so select with the positional indexer `.iloc`.
# Label-based `.loc` only picks the same rows while the frame still carries
# the default 0..n-1 index.
df.iloc[train_ids].to_csv("./processed_train.csv", index=False)
df.iloc[valid_ids].to_csv("./processed_valid.csv", index=False)


## Modules: Dataset reader, Model, Predictor

The model is simple and uses two types of features: one categorical feature (hostname) and one text feature (excerpt).

The hostname is embedded into a 50-dimensional vector and the excerpt is fed into RoBERTa (base).

The two representations are concatenated and projected to a scalar.

In [None]:
from typing import Any, Dict, Iterable, MutableMapping, Optional
from urllib.parse import urlparse

from allennlp.data import DatasetReader
from allennlp.data import Tokenizer
from allennlp.data.fields.field import Field
from allennlp.data.fields import ArrayField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.token_indexers.token_indexer import TokenIndexer
from allennlp.data.tokenizers.token_class import Token
import pandas
import numpy
from overrides import overrides


@DatasetReader.register("commonlit_reader")
class CommonlitDatasetReader(DatasetReader):
    """Read CommonLit readability CSV files into AllenNLP instances.

    Every CSV row becomes one ``Instance`` with:

    * ``excerpt``  -- a ``TextField`` holding the tokenized ``excerpt`` column,
    * ``hostname`` -- a ``TextField`` holding a single token: the hostname of
      the ``url_legal`` column, or a placeholder when no hostname is available,
    * ``target``   -- an ``ArrayField`` with the float32 label, present only
      when the CSV has a ``target`` column.

    Parameters
    ----------
    tokenizer:
        Tokenizer applied to the excerpt text.
    excerpt_token_indexers:
        Indexers for the ``excerpt`` field; defaults to a single
        ``SingleIdTokenIndexer`` under the key ``"tokens"``.
    hostname_token_indexers:
        Indexers for the ``hostname`` field; same default as above.
    """

    # Placeholder used when a row has no URL or the URL has no hostname.
    _EMPTY_HOSTNAME = "EMPTY_HOSTNAME"

    def __init__(
        self,
        tokenizer: Tokenizer,
        excerpt_token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        hostname_token_indexers: Optional[Dict[str, TokenIndexer]] = None,
    ) -> None:

        super().__init__()

        self.tokenizer = tokenizer
        self.excerpt_token_indexers: Dict[str, TokenIndexer] = excerpt_token_indexers or {
            "tokens": SingleIdTokenIndexer(),
        }
        self.hostname_token_indexers: Dict[str, TokenIndexer] = hostname_token_indexers or {
            "tokens": SingleIdTokenIndexer(),
        }

    @classmethod
    def _hostname_from_url(cls, url: Any) -> str:
        """Return the hostname of ``url``, or the placeholder if there is none.

        Non-string values (e.g. the NaN pandas uses for empty cells) and
        strings for which ``urlparse`` finds no hostname both map to the
        placeholder, so the hostname token text is never ``None``.
        """
        if not isinstance(url, str):
            return cls._EMPTY_HOSTNAME
        return urlparse(url).hostname or cls._EMPTY_HOSTNAME

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one instance per row of the CSV file at ``file_path``.

        The CSV must provide ``excerpt`` and ``url_legal`` columns; ``target``
        is optional (the test CSV is read without it in this notebook).
        """
        dataframe = pandas.read_csv(file_path)
        dataframe["hostname"] = dataframe.url_legal.apply(self._hostname_from_url)

        # Whether labels exist is a property of the file, not of each row,
        # so decide it once instead of probing every row.
        has_target = "target" in dataframe.columns

        for _, row in dataframe.iterrows():
            target = row.target if has_target else None
            # Yield lazily rather than collecting a list, as the AllenNLP
            # `_read` contract recommends.
            yield self.text_to_instance(row.excerpt, row.hostname, target)

    @overrides
    def text_to_instance(self, excerpt: str, hostname: str, target: Optional[float] = None) -> Instance:
        """Build an ``Instance`` from raw values.

        Parameters
        ----------
        excerpt:
            Passage text; tokenized with ``self.tokenizer``.
        hostname:
            Used verbatim as the text of a single ``Token``.
        target:
            Optional regression label; when given it is stored as a float32
            ``ArrayField`` under ``"target"``.

        Token indexers are not attached here; ``apply_token_indexers`` does that.
        """
        excerpt_tokens = self.tokenizer.tokenize(excerpt)
        hostname_tokens = [Token(text=hostname)]
        fields: MutableMapping[str, Field[Any]] = {
            "excerpt": TextField(excerpt_tokens),
            "hostname": TextField(hostname_tokens),
        }
        if target is not None:
            fields["target"] = ArrayField(numpy.asarray(target, dtype=numpy.float32))
        return Instance(fields=fields)

    def apply_token_indexers(self, instance: Instance) -> None:
        """Attach the configured token indexers to the instance's two text fields."""
        # The asserts narrow the field type for static checkers.
        assert isinstance(instance.fields["excerpt"], TextField)
        instance.fields["excerpt"].token_indexers = self.excerpt_token_indexers
        assert isinstance(instance.fields["hostname"], TextField)
        instance.fields["hostname"].token_indexers = self.hostname_token_indexers


from typing import Dict, Optional
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder
from allennlp.modules import Seq2VecEncoder
from allennlp.nn.util import get_text_field_mask
from allennlp.data.fields.text_field import TextFieldTensors
from overrides.overrides import overrides
from torch import FloatTensor
from torch.functional import Tensor
from torch.nn.functional import mse_loss
from torch import cat
from torch import sqrt
from torch.nn import Linear


# Added to the MSE before taking the square root in the loss, so the value
# under the root is never exactly zero.
EPS = 1e-8


@Model.register("naive")
class NaiveRegressor(Model):
    """Regress a scalar from an encoded excerpt and an optional hostname embedding.

    The excerpt is embedded and pooled into one vector by ``excerpt_encoder``.
    If a ``hostname_embedder`` is configured, its output is concatenated to
    that vector before a single linear layer projects to one value.
    """

    def __init__(
        self,
        vocab: Vocabulary,
        excerpt_embedder: TextFieldEmbedder,
        excerpt_encoder: Seq2VecEncoder,
        hostname_embedder: Optional[TextFieldEmbedder] = None,
    ) -> None:
        """Store the sub-modules and size the output layer to their combined width."""
        super().__init__(vocab)

        self.vocab = vocab
        self.excerpt_embedder = excerpt_embedder
        self.excerpt_encoder = excerpt_encoder
        self.hostname_embedder = hostname_embedder

        # Input width of the output layer: encoder output, plus the hostname
        # embedding when one is configured.
        feature_dim = excerpt_encoder.get_output_dim()
        if hostname_embedder is not None:
            feature_dim = feature_dim + hostname_embedder.get_output_dim()

        self.classification_layer = Linear(in_features=feature_dim, out_features=1)

    @overrides
    def forward(
        self,
        excerpt: TextFieldTensors,
        hostname: Optional[TextFieldTensors] = None,
        target: Optional[FloatTensor] = None,
    ) -> Dict[str, Tensor]:
        """Compute predictions and, when ``target`` is given, the RMSE loss.

        Returns a dict with ``"logit"`` (the output of the linear layer) and,
        only if ``target`` is not ``None``, ``"loss"``.
        """
        excerpt_mask = get_text_field_mask(excerpt)
        embedded_excerpt = self.excerpt_embedder(excerpt)
        features = self.excerpt_encoder(embedded_excerpt, mask=excerpt_mask)

        hostname_usable = self.hostname_embedder is not None and hostname is not None
        if hostname_usable:
            # Drop dimension 1 of the hostname embedding (presumably the
            # length-1 token axis, since the reader emits one hostname token)
            # so it can be concatenated with the pooled excerpt vector.
            hostname_vector = self.hostname_embedder(hostname).squeeze(dim=1)
            features = cat((features, hostname_vector), dim=1)

        prediction = self.classification_layer(features)

        if target is None:
            return {"logit": prediction}

        # Root of (mean squared error + EPS) between flattened predictions and targets.
        rmse = sqrt(mse_loss(prediction.view(-1), target) + EPS)
        return {"logit": prediction, "loss": rmse}

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Report no extra metrics; always returns an empty dict."""
        return dict()


from allennlp.common.util import JsonDict
from allennlp.data.instance import Instance
from allennlp.predictors import Predictor


@Predictor.register("regressor_predictor")
class RegressorPredictor(Predictor):
    """Predictor whose JSON inputs map directly onto the dataset reader's arguments."""

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Turn one JSON dict into an ``Instance``.

        The dict's keys are passed as keyword arguments to the dataset
        reader's ``text_to_instance``.
        """
        reader = self._dataset_reader
        return reader.text_to_instance(**json_dict)  # type: ignore


## Config file

I'm not sure if this is the best way, but I create the config file from within this notebook.

Executing the following cell creates a Jsonnet config.

In [None]:
# Jsonnet training configuration, kept as a string so the whole experiment
# lives inside the notebook. It wires together the components registered
# above ("commonlit_reader", "naive") and the RoBERTa files under ../input.
jsonnet_text = """\
{
    dataset_reader: {
        type: "commonlit_reader",
        tokenizer: {
            type: "pretrained_transformer",
            model_name: "../input/roberta-base",
        },
        excerpt_token_indexers: {
            tokens: {
                type: "pretrained_transformer",
                model_name: "../input/roberta-base",
            },
        },
    },
    train_data_path: "./processed_train.csv",
    validation_data_path: "./processed_valid.csv",
    model: {
        type: "naive",
        excerpt_embedder: {
            type: "basic",
            token_embedders: {
                tokens: {
                    type: "pretrained_transformer",
                    model_name: "../input/roberta-base",
                },
            },
        },
        excerpt_encoder: {
            type: "bert_pooler",
            pretrained_model: "../input/roberta-base",
        },
        hostname_embedder: {
            type: "basic",
            token_embedders: {
                tokens: {
                    embedding_dim: 50,
                },
            },
        },
    },
    trainer: {
        num_epochs: 15,
        learning_rate_scheduler: {
            type: "slanted_triangular",
            num_epochs: 10,
            num_steps_per_epoch: 3088,
            cut_frac: 0.06
        },
        optimizer: {
            type: "huggingface_adamw",
            lr: 5e-7,
            weight_decay: 0.05,
        },
        validation_metric: "-loss"
    },
    data_loader: {
        batch_size: 8,
        shuffle: true
    }
}
"""

# Write the config with a context manager so the handle is closed (and the
# text flushed to disk) even if the write raises.
with open("baseline.jsonnet", "w") as f:
    f.write(jsonnet_text)

## Training

Instead of running `allennlp train` in a shell,
we can directly invoke `allennlp.commands.train.train_model_from_file`.

In [None]:
import allennlp.commands

# Train from the Jsonnet config written to ./baseline.jsonnet, using
# ./serialization/1 as the serialization directory. The inference cell later
# loads serialization/1/model.tar.gz from that directory.
allennlp.commands.train.train_model_from_file(
    parameter_filename="./baseline.jsonnet",
    serialization_dir="./serialization/1",
)

### Inference

In [None]:
from allennlp.models.archival import load_archive

# Load the model archive from the training serialization directory and wrap
# it in the notebook's RegressorPredictor.
archive_path = "serialization/1/model.tar.gz"
archive = load_archive(archive_path)
predictor = RegressorPredictor.from_archive(archive)

In [None]:
from urllib.parse import urlparse


# Load the test set and turn each row into the JSON-style dict the predictor expects.
test_df = pandas.read_csv("../input/commonlitreadabilityprize/test.csv")
print(test_df.head())

# Same hostname feature as the dataset reader: the hostname of `url_legal`,
# or the placeholder when the cell is empty or the URL has no hostname
# (urlparse returns None in that case).
test_df["hostname"] = test_df \
    .url_legal \
    .apply(lambda url: (urlparse(url).hostname or "EMPTY_HOSTNAME") if isinstance(url, str) else "EMPTY_HOSTNAME")

# One {"excerpt": ..., "hostname": ...} dict per row; also works for an empty frame.
batch_json = test_df[["excerpt", "hostname"]].to_dict(orient="records")

# Preview predictions on the first few rows only. Sending the whole test set
# through the model as a single batch makes memory use grow with the size of
# the test set, and full inference is done in small batches in the next cell.
predictor.predict_batch_json(batch_json[:8])

In [None]:
class BatchIterator:
    """Single-pass iterator over consecutive slices of a sliceable sequence.

    Each step returns ``data[cur:cur + batch_size]``; the last batch may be
    shorter. Iteration stops at the first empty slice. The object is its own
    iterator, so it can be consumed only once.

    Parameters
    ----------
    data:
        Any object supporting slicing and ``len()`` on the slices
        (for example a list).
    batch_size:
        Maximum number of items per batch; must be at least 1.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1. A size of 0 would silently
        yield nothing and a negative size would yield wrong slices.
    """

    def __init__(self, data, batch_size):
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")
        self.data = data
        self.batch_size = batch_size
        self.cur = 0  # start offset of the next batch

    def __iter__(self):
        return self

    def __next__(self):
        batch = self.data[self.cur:self.cur + self.batch_size]
        self.cur += self.batch_size
        # Use len() rather than truthiness so array-like data whose truth
        # value is ambiguous can be batched as well.
        if len(batch) == 0:
            raise StopIteration
        return batch


# Run inference over the test inputs one record per batch, collecting the
# predictor's outputs in input order.
predictions = []
batch_iterator = BatchIterator(batch_json, batch_size=1)

for record_batch in batch_iterator:
    predictions.extend(predictor.predict_batch_json(record_batch))

In [None]:
# The submitted target is the first element of each prediction's "logit" entry.
test_df["target"] = [prediction["logit"][0] for prediction in predictions]
test_df[["id", "target"]].to_csv("submission.csv", index=False)
test_df