# Feedback Prize - BERT

This is a **training** notebook. The corresponding inference notebook can be found here: [Feedback Prize - BERT Inference](https://www.kaggle.com/code/morodertobias/feedback-prize-bert-inference/notebook).

We fine-tune a pretrained BERT base model and start with a classification acting on ``discourse_type [SEP] discourse_text`` only.


## References

We mainly used the following references, some of which stem from earlier competitions:

- [Semantic Similarity with BERT](https://keras.io/examples/nlp/semantic_similarity_with_bert/)
- [US Phrase Matching: TF-Keras Train [TPU]](https://www.kaggle.com/code/mohamadmerchant/us-phrase-matching-tf-keras-train-tpu/notebook)
- [TensorFlow - LongFormer - NER - [CV 0.633]](https://www.kaggle.com/code/cdeotte/tensorflow-longformer-ner-cv-0-633/notebook)
- [【Tensorflow】FeedBack BERT-Baseline](https://www.kaggle.com/code/imvision12/tensorflow-feedback-bert-baseline/notebook)
- [TFRecord Experiments - Upsample and Coarse Dropout](https://www.kaggle.com/code/cdeotte/tfrecord-experiments-upsample-and-coarse-dropout)

In [None]:
!pip install -q transformers==4.18.0

In [None]:
import os
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import transformers

In [None]:
# Print the installed TensorFlow and transformers versions.
print(tf.__version__)
print(transformers.__version__)

In [None]:
# Connect to a TPU when one is available; otherwise fall back to the default
# distribution strategy. `tpu` stays None in the fallback case so that later
# code can test for it.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as err:
    # Catch `Exception` instead of using a bare `except` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the reason
    # instead of discarding it.
    print("TPU failed!")
    print(f"Reason: {err!r}")
    tpu = None
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

# Config

Let us use a config object holding all parameters, settings and configs.

In [None]:
class Config:
    """Central container for all paths, label definitions and hyper-parameters."""

    # --- general ---
    seed = 887
    model_name = "tpu_bert_v7"
    n_fold = 5
    one_fold = False  # when True the training loop stops after the first fold

    # --- input data ---
    input_dir = pathlib.Path("/kaggle/input/feedback-prize-effectiveness/")
    path_train = input_dir / "train.csv"
    train_dir = input_dir / "train"
    path_test = input_dir / "test.csv"
    test_dir = input_dir / "test"
    path_submission = input_dir / "sample_submission.csv"

    # --- target ---
    labels = ["Ineffective", "Adequate", "Effective"]
    # class name -> integer code, following the order of `labels`
    label_dict = dict(zip(labels, range(len(labels))))
    num_classes = len(labels)
    id_col = "discourse_id"

    # --- model ---
    pretrained = "bert-base-uncased"
    pretrained_dir = pathlib.Path("/kaggle/working/pretrained")
    max_len = 512
    dropout = 0.4

    # --- training ---
    learning_rate = 3e-6  # alternative value noted by the author: 0.001
    batch_size = 128
    epochs = 25  # alternative value noted by the author: 50
    patience = 3
    verbose = 2


cfg = Config()

# Download pretrained files

Let us follow the ideas of [TensorFlow - LongFormer - NER - [CV 0.633]](https://www.kaggle.com/code/cdeotte/tensorflow-longformer-ner-cv-0-633/notebook) and download the tokenizer and model, then store them in a notebook output folder so that they can be used directly in the inference notebook or for creating a versioned dataset.

In [None]:
# Fetch tokenizer, config and base model for `cfg.pretrained` and write each of
# them into `cfg.pretrained_dir`.
tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.pretrained)
tokenizer.save_pretrained(cfg.pretrained_dir)
config = transformers.AutoConfig.from_pretrained(cfg.pretrained)
config.save_pretrained(cfg.pretrained_dir)
# from_pt=True: the TF model is built from the PyTorch checkpoint weights.
base_model = transformers.TFAutoModel.from_pretrained(cfg.pretrained, config=config, from_pt=True)
base_model.save_pretrained(cfg.pretrained_dir)
# Show which files ended up in the output folder.
os.listdir(cfg.pretrained_dir)

# Data Preparation

Load data and prepare the text ``discourse_type [SEP] discourse_text`` used in classification. 

The tokenizer will encode this into ``[CLS] discourse_type [SEP] discourse_text [SEP]`` as can be seen by the example below.

In [None]:
# Load the training table and add an integer `label` column by mapping
# `discourse_effectiveness` through `cfg.label_dict`.
data = pd.read_csv(cfg.path_train)
data["label"] = data["discourse_effectiveness"].map(cfg.label_dict)
data

In [None]:
# Reload the tokenizer from the local copy stored in `cfg.pretrained_dir`.
tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.pretrained_dir)
tokenizer

In [None]:
# Model input text: discourse type, the tokenizer's separator token, then the
# discourse text (concatenated without extra whitespace).
data["text"] = data["discourse_type"] + tokenizer.sep_token + data["discourse_text"]
data

#### Look at a random example

In [None]:
# Draw one random row (no fixed seed, so it differs between runs) as a dict.
rec = data.sample(n=1).iloc[0].to_dict()
rec

In [None]:
# Show the raw text, its tokens, the encoded inputs, and the text obtained by
# decoding the encoded input ids again.
print("original:", rec["text"])
print("tokenized:", tokenizer.tokenize(rec["text"]))
print("encode_plus:", tokenizer.encode_plus(rec["text"]))
print("decoded:", tokenizer.decode(tokenizer.encode_plus(rec["text"])['input_ids']))

# Dataset

We would like to map ``tokenizer.encode_plus`` over all texts in the dataset. This does not seem to work directly on a TensorFlow dataset, so we compute the encodings beforehand, as done in the reference notebooks.

In [None]:
# Options applied to every dataset built by `get_dataset`: automatic sharding
# of the input is switched off.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF


def encode_text(text):
    """Tokenize a batch of texts into fixed-length model inputs.

    Every text is padded or truncated to ``cfg.max_len`` tokens.

    Returns a dict of numpy arrays keyed by ``input_ids``,
    ``attention_masks`` and ``token_type_ids``.
    """
    tokenizer_kwargs = dict(
        max_length=cfg.max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    batch = tokenizer.batch_encode_plus(text, **tokenizer_kwargs)
    # output key -> tokenizer key (the mask is exposed under a plural name)
    key_map = {
        "input_ids": "input_ids",
        "attention_masks": "attention_mask",
        "token_type_ids": "token_type_ids",
    }
    return {out_key: batch[src_key].numpy() for out_key, src_key in key_map.items()}


def get_dataset(data, batch_size=cfg.batch_size, shuffle=False, cache=False, include_label=True):
    """Build a batched ``tf.data.Dataset`` from a dataframe.

    Args:
        data: dataframe with a ``text`` column and, when ``include_label`` is
            true, a ``label`` column.
        batch_size: number of rows per batch.
        shuffle: shuffle the rows using a buffer of 2048 elements.
        cache: cache the dataset elements after the first pass.
        include_label: yield ``(features, label)`` pairs instead of the
            features alone.

    Returns:
        A prefetching dataset whose features are the dict produced by
        ``encode_text``.
    """
    encoded_text = encode_text(data['text'].to_list())
    tensor_slices = encoded_text
    if include_label:
        tensor_slices = (encoded_text, data["label"].to_list())
    ds = tf.data.Dataset.from_tensor_slices(tensor_slices)
    ds = ds.with_options(options)
    # Cache BEFORE shuffling/batching: a cache placed after shuffle() replays
    # the first epoch's order, so later epochs would never be reshuffled.
    if cache:
        ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(2048)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

#### Verify dataset creation

In [None]:
# Preview the first five rows.
data.iloc[:5]

In [None]:
# Build a tiny dataset (five rows, batches of two) and inspect its structure.
ds = get_dataset(data.iloc[:5], batch_size=2)
ds.element_spec

In [None]:
# Pull the first batch: a (features dict, labels) pair.
elem = next(iter(ds))
elem

# Model

The model is straightforward, i.e., inputs > base_model > output head.

In [None]:
from tensorflow.keras import Model, layers, losses, optimizers, metrics, callbacks, backend

In [None]:
def create_model():
    """Build and compile the classifier: token inputs -> BERT encoder -> softmax head.

    The encoder weights and config are read from ``cfg.pretrained_dir``. The
    hidden state at sequence position 0 is passed through dropout and a dense
    softmax layer with ``cfg.num_classes`` units.

    Returns:
        The compiled Keras model.
    """
    def token_input(name):
        # all three inputs share the same shape and dtype
        return layers.Input(shape=(cfg.max_len,), dtype="int32", name=name)

    ids = token_input("input_ids")
    mask = token_input("attention_masks")
    segments = token_input("token_type_ids")

    # Encoder (left trainable, i.e. it is fine-tuned together with the head).
    encoder_config = transformers.AutoConfig.from_pretrained(
        cfg.pretrained_dir / "config.json"
    )
    encoder = transformers.TFAutoModel.from_pretrained(
        cfg.pretrained_dir / "tf_model.h5", config=encoder_config
    )
    encoded = encoder(ids, attention_mask=mask, token_type_ids=segments)
    first_token_state = encoded.last_hidden_state[:, 0, :]

    # Classification head.
    regularized = layers.Dropout(cfg.dropout)(first_token_state)
    probabilities = layers.Dense(cfg.num_classes, activation="softmax")(regularized)

    classifier = Model(
        inputs=[ids, mask, segments],
        outputs=probabilities,
        name=cfg.model_name,
    )
    classifier.compile(
        optimizer=optimizers.Adam(cfg.learning_rate),
        loss=losses.SparseCategoricalCrossentropy(),
        metrics=["acc"],
    )
    return classifier

#### Verify model creation

In [None]:
# Build one model inside the distribution strategy scope and print its layers.
backend.clear_session()
with strategy.scope():
    model = create_model()
model.summary()

In [None]:
# Predict on the sample batch and show the output next to the batch's labels.
model.predict(elem[0]), elem[1]

# Training helper functions

In [None]:
import sklearn.metrics as sk_metrics

In [None]:
def create_callbacks(filepath):
    """Return the callbacks passed to ``model.fit``.

    Args:
        filepath: where the checkpoint callback writes the model weights.

    Returns:
        A list with a best-only, weights-only checkpoint callback followed by
        an early-stopping callback using ``cfg.patience``.
    """
    checkpoint = callbacks.ModelCheckpoint(
        filepath=filepath,
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
    # Early stopping does not restore the best weights itself.
    early_stopping = callbacks.EarlyStopping(
        patience=cfg.patience,
        restore_best_weights=False,
        verbose=1,
    )
    return [checkpoint, early_stopping]


def show_history(history):
    """Display the per-epoch metrics table and plot the loss / accuracy curves.

    Args:
        history: the object returned by ``model.fit``; its ``history`` dict
            and ``epoch`` list are used.
    """
    epoch_table = pd.DataFrame(
        history.history, index=pd.Index(history.epoch, name="epoch")
    )
    # Mark the best epoch per validation metric.
    styled = epoch_table.style.highlight_min(color="green", subset=["val_loss"])
    styled = styled.highlight_max(color="green", subset=["val_acc"])
    display(styled)
    # One panel per metric, training and validation curve in each.
    _, axes = plt.subplots(1, 2, figsize=(16, 8))
    for axis, metric in zip(axes, ("loss", "acc")):
        epoch_table[[metric, f"val_{metric}"]].plot(ax=axis, title=metric)
    plt.tight_layout()
    plt.show()
    
    
def compute_oof(model, valid):
    """Predict on the validation rows and return the out-of-fold frame.

    Args:
        model: model whose ``predict`` output has one column per class.
        valid: validation dataframe containing ``cfg.id_col``, ``text`` and
            ``label`` columns.

    Returns:
        A dataframe indexed by ``cfg.id_col`` with one column per entry of
        ``cfg.labels`` plus the true ``label``.
    """
    probabilities = model.predict(get_dataset(valid), verbose=0)
    frame = pd.DataFrame(
        probabilities, columns=cfg.labels, index=valid[cfg.id_col]
    )
    # The labels are aligned to the predictions through the id index.
    truth = valid.set_index(cfg.id_col)["label"]
    frame["label"] = truth
    return frame
    

def compute_score(x):
    """Return the multi-class log loss of an OOF frame.

    Args:
        x: dataframe with a ``label`` column holding integer class codes and
            one probability column per entry of ``cfg.labels``.

    Returns:
        The log loss as a float.
    """
    # Pass the full class set explicitly: otherwise log_loss infers it from
    # y_true and fails when a subset (e.g. a single fold or a small debug
    # sample) does not contain every class.
    return sk_metrics.log_loss(
        y_true=x["label"],
        y_pred=x[cfg.labels],
        labels=list(range(cfg.num_classes)),
    )

In [None]:
def run_training(train, valid, filename):
    """Train one fold and return its history and out-of-fold predictions.

    Args:
        train: training dataframe (``text`` and ``label`` columns are used).
        valid: validation dataframe (same columns, plus ``cfg.id_col``).
        filename: path where the best weights are checkpointed; they are
            reloaded from there before the OOF predictions are computed.

    Returns:
        A ``(history, oof)`` pair: the object returned by ``model.fit`` and
        the dataframe produced by ``compute_oof``.
    """
    # Re-initialise the TPU before every fold, as done in
    # https://www.kaggle.com/code/cdeotte/tfrecord-experiments-upsample-and-coarse-dropout
    # The resolver created at start-up is passed explicitly so that the same
    # TPU the strategy was built from is re-initialised.
    if tpu:
        tf.tpu.experimental.initialize_tpu_system(tpu)
    # create datasets
    train_ds = get_dataset(train, shuffle=True)
    valid_ds = get_dataset(valid)
    # create model
    backend.clear_session()
    with strategy.scope():
        model = create_model()
    # fit
    hist = model.fit(
        train_ds,
        epochs=cfg.epochs,
        validation_data=valid_ds,
        callbacks=create_callbacks(filename),
        verbose=cfg.verbose,
    )
    # The checkpoint holds the best epoch; early stopping does not restore it.
    model.load_weights(filename)
    # oof
    oof = compute_oof(model, valid)
    return hist, oof

# Run training

We use stratified splitting in creating training and validation folds.

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Seeded, shuffled stratified splitter with `cfg.n_fold` folds.
skf = StratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
skf

In [None]:
%%time
# Train one model per fold, stratified on the label, and collect the
# out-of-fold predictions keyed by fold number.
d_oof = {}
for fold, (iloc_train, iloc_valid) in enumerate(skf.split(data, data['label'])):
    print(f"fold: {fold}")
    train = data.iloc[iloc_train]
    valid = data.iloc[iloc_valid]
    # one weights file per fold
    model_filepath = f"weights__{cfg.model_name}__fold-{fold}.h5"
    print(f"#train: {len(train)},  #valid: {len(valid)} ")
    print(f"model_filepath: {model_filepath}")
    hist, oof = run_training(train, valid, model_filepath)
    print("OOF score:", compute_score(oof))
    show_history(hist)
    d_oof[fold] = oof
    # optional quick run: stop after the first fold
    if cfg.one_fold:
        break    

# OOF

Finalize OOF prediction and score summary.

In [None]:
# Stack the per-fold OOF frames; the dict keys become a `fold` column.
oof = pd.concat(d_oof, names=['fold']).reset_index('fold')
oof.to_csv("oof.csv")
# Log loss per fold, then over all OOF predictions together.
score_by_fold = oof.groupby('fold').apply(compute_score)
display(score_by_fold)
score = compute_score(oof)
print(f"\nOOF score: {score:.6f}")