# Feedback Prize - BERT Inference

This is the **inference** notebook for the training notebook [Feedback Prize - BERT](https://www.kaggle.com/morodertobias/feedback-prize-bert/).

The current version uses a fine-tuned BERT model on the input ``discourse_type [SEP] discourse_text``.

Note: the data preparation and model creation functions need to be identical to those in the corresponding training notebook. Also be aware that the attached notebook outputs or datasets might change over time.

- Notebook version ``feedback-prize-bert``: v13

In [None]:
import os
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import transformers

In [None]:
# Print the installed TensorFlow and transformers versions.
print(tf.__version__)
print(transformers.__version__)

In [None]:
# Use the currently active distribution strategy; no TPU/multi-GPU strategy
# is set up in the visible cells of this notebook.
strategy = tf.distribute.get_strategy()
# Bare expression: displays the strategy object as the cell output.
strategy

# Config

Let us use a config object holding all parameters, settings and configs.

In [None]:
class Config:
    """Static settings for the inference run: input paths, labels and model parameters."""

    # --- run identification ---
    model_name = "tpu_bert_v7"  # also embedded in the weight file names
    n_fold = 5  # number of per-fold weight files that get averaged

    # --- competition inputs ---
    input_dir = pathlib.Path("/kaggle/input/feedback-prize-effectiveness/")
    path_train = input_dir / "train.csv"
    train_dir = input_dir / "train"
    path_test = input_dir / "test.csv"
    test_dir = input_dir / "test"
    path_submission = input_dir / "sample_submission.csv"

    # --- target definition ---
    labels = ["Ineffective", "Adequate", "Effective"]
    num_classes = len(labels)
    # Maps each label name to its position in ``labels``.
    label_dict = dict(zip(labels, range(num_classes)))
    id_col = "discourse_id"

    # --- model artefacts ---
    model_dir = pathlib.Path("/kaggle/input/feedback-prize-bert")
    pretrained_dir = model_dir / "pretrained"
    path_tokenizer = pretrained_dir
    # Template with a literal ``{fold}`` placeholder, filled in later via ``str.format``.
    fmt_weights_name = f"weights__{model_name}__fold-{{fold}}.h5"
    max_len = 512
    dropout = 0.4

    # --- batching / logging ---
    batch_size = 32
    verbose = 2


cfg = Config()

In [None]:
# Show the contents of the attached model directory in sorted order.
sorted(os.listdir(cfg.model_dir))

# Preparation

Load the data and tokenizer, prepare the main input ``discourse_type [SEP] discourse_text``, and create the dataset.

In [None]:
# Load the tokenizer from the local pretrained directory (cfg.path_tokenizer).
tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.path_tokenizer)
# Bare expression: displays the tokenizer as the cell output.
tokenizer

In [None]:
# Read the test set and build the model input string:
# discourse type, then the tokenizer's separator token, then the discourse text.
data = pd.read_csv(cfg.path_test)
data["text"] = data["discourse_type"] + tokenizer.sep_token + data["discourse_text"]
# Bare expression: displays the frame as the cell output.
data

In [None]:
# Dataset options shared by every dataset built in get_dataset():
# automatic sharding across workers is switched off.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF


def encode_text(text):
    """Tokenize a batch of strings into fixed-length numpy model inputs.

    Every sequence is truncated or padded to ``cfg.max_len`` tokens.

    Args:
        text: List of input strings.

    Returns:
        Dict with keys ``input_ids``, ``attention_masks`` and
        ``token_type_ids``, each holding a numpy array.
    """
    batch = tokenizer.batch_encode_plus(
        text,
        max_length=cfg.max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    # Output key -> tokenizer key. Only the attention mask is renamed
    # (plural), matching the model's input layer name.
    key_map = {
        "input_ids": "input_ids",
        "attention_masks": "attention_mask",
        "token_type_ids": "token_type_ids",
    }
    return {out_key: batch[src_key].numpy() for out_key, src_key in key_map.items()}


def get_dataset(data, batch_size=cfg.batch_size, shuffle=False, cache=False, include_label=True):
    """Build a batched ``tf.data.Dataset`` from a frame with a ``text`` column.

    Args:
        data: DataFrame with a ``text`` column (and a ``label`` column when
            ``include_label`` is true).
        batch_size: Number of examples per batch.
        shuffle: Shuffle examples (buffer of 2048) before batching.
        cache: Cache the batched dataset.
        include_label: Yield ``(features, label)`` pairs instead of features only.

    Returns:
        The prefetched dataset.
    """
    features = encode_text(data['text'].to_list())
    if include_label:
        slices = (features, data["label"].to_list())
    else:
        slices = features

    dataset = tf.data.Dataset.from_tensor_slices(slices).with_options(options)
    if shuffle:
        dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)
    if cache:
        dataset = dataset.cache()
    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Test data has no labels; keep the original row order (no shuffling) because
# predictions are later paired with data[cfg.id_col] by position.
ds = get_dataset(data, include_label=False, shuffle=False)
# Bare expression: displays the dataset as the cell output.
ds

# Load model

Directly copy and paste the ``create_model`` function from the training notebook; compilation can be skipped.

In [None]:
from tensorflow.keras import Model, layers, losses, optimizers, metrics, callbacks, backend

In [None]:
def create_model():
    """Build the (uncompiled) classifier: pretrained encoder plus softmax head.

    The encoder config and weights are read from ``cfg.pretrained_dir``. The
    hidden state at sequence position 0 goes through dropout and a dense
    softmax layer with ``cfg.num_classes`` outputs.

    Returns:
        A Keras ``Model`` named ``cfg.model_name`` taking the three inputs
        ``input_ids``, ``attention_masks`` and ``token_type_ids``.
    """
    # Three int32 inputs of length cfg.max_len; the layer names equal the
    # dictionary keys produced by encode_text().
    input_ids, attention_masks, token_type_ids = (
        layers.Input(shape=(cfg.max_len,), dtype="int32", name=input_name)
        for input_name in ("input_ids", "attention_masks", "token_type_ids")
    )

    # Pretrained encoder loaded from local files.
    encoder = transformers.TFAutoModel.from_pretrained(
        cfg.pretrained_dir / "tf_model.h5",
        config=transformers.AutoConfig.from_pretrained(
            cfg.pretrained_dir / "config.json"
        ),
    )
    encoder_output = encoder(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )

    # Classification head on the first-token hidden state.
    first_token = encoder_output.last_hidden_state[:, 0, :]
    first_token = layers.Dropout(cfg.dropout)(first_token)
    probabilities = layers.Dense(cfg.num_classes, activation="softmax")(first_token)

    # No compile() call: the model is returned uncompiled.
    return Model(
        inputs=[input_ids, attention_masks, token_type_ids],
        outputs=probabilities,
        name=cfg.model_name,
    )

In [None]:
# Reset Keras' global state before building, then create the model inside the
# distribution strategy's scope.
backend.clear_session()
with strategy.scope():
    model = create_model()
model.summary()

# Predict and create submission

In [None]:
# Ensemble the folds: sum each fold's class probabilities, then divide by the
# number of folds.
pred_avg = pd.DataFrame(0.0, index=data[cfg.id_col], columns=cfg.labels)
for fold in range(cfg.n_fold):
    print(f"fold: {fold}")
    path_weights = cfg.model_dir / cfg.fmt_weights_name.format(fold=fold)
    print(f"path_weights: {path_weights}")
    # One model instance is reused; only the weights are swapped per fold.
    model.load_weights(path_weights)
    pred = pd.DataFrame(
        model.predict(ds, verbose=1),
        index=data[cfg.id_col],
        columns=cfg.labels,
    )
    display(pred.head())
    pred_avg = pred_avg + pred
pred_avg /= cfg.n_fold

In [None]:
# Turn the id index (cfg.id_col) back into a regular column so it is written
# as the first column of the submission.
pred_avg = pred_avg.reset_index()
# Bare expression: displays the averaged predictions as the cell output.
pred_avg

In [None]:
# Write the submission file without the positional index, then preview its
# first lines with a shell command (IPython "!" magic).
pred_avg.to_csv("submission.csv", index=False)
!head -4 "submission.csv"