# BERT for MISL Score Prediction
This code draws mostly from the following repository, with code and hyperparameter changes for our specific dataset: https://github.com/ceshine/pytorch-pretrained-BERT/blob/master/notebooks/Sequence%20Regression%20Model.ipynb

PyTorch (pytorch.org) is assumed to be installed. A large GPU is highly recommended, as BERT requires multiple GB of GPU memory; this paper used an NVIDIA GeForce GTX Titan X for all training. A PyTorch implementation of BERT must also be installed before running this notebook. The repository and installation instructions can be found here: https://github.com/huggingface/pytorch-pretrained-BERT.

Comments mark the places where changes must be made to replicate the results on the user's machine.

In [None]:
import os
import logging

# --- Logging -----------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger("regressor")

# --- Paths (edit these for your machine) -------------------------------------
# Cache directory
PYTORCH_PRETRAINED_BERT_CACHE = "/home/.../bert-output/"
# Path to data set
DATA_PATH = "/home/.../AutomatedNarrativeAnalysisMISLData.csv"

# --- Reproducibility ---------------------------------------------------------
SEED = 42

# --- Training hyperparameters ------------------------------------------------
# BATCH_SIZE = 16 (alternative value, disabled)
BATCH_SIZE = 8
MAX_SEQ_LENGTH = 128      # sequences are truncated/padded to this many tokens
WARMUP_PROPORTION = 0.1   # warm-up fraction passed to the optimizer schedule

# --- Mixed precision ---------------------------------------------------------
FP16 = False
LOSS_SCALE = 0.           # only read on the FP16 path of get_optimizer

## Imports

In [None]:
import gc
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear, SCHEDULES
from fastprogress import master_bar, progress_bar
from sklearn.model_selection import train_test_split
import random
import ml_metrics as metrics

In [None]:
# Pick the compute device: CUDA when available, otherwise the CPU.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
logger.info(
    "device: {} n_gpu: {}, 16-bits training: {}".format(device, n_gpu, FP16))

In [None]:
# Seed every random number generator used in this notebook with SEED:
# Python's `random`, NumPy's global generator and PyTorch (CPU).
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
# Seed the CUDA generators as well when at least one GPU is present.
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

In [None]:
class BertForSequenceRegression(BertPreTrainedModel):
    """BERT encoder with a single-output linear regression head.

    The pooled BERT output goes through dropout and a linear layer that
    produces one value per sequence; that value is clamped to [0, 3].
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_bert_weights)
        self.loss_fct = torch.nn.MSELoss()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, targets=None):
        """Return the MSE loss when `targets` is given, else the predictions.

        Args:
            input_ids: token ids passed to the BERT encoder.
            token_type_ids: optional segment ids passed to the encoder.
            attention_mask: optional attention mask passed to the encoder.
            targets: optional regression targets; when provided, the mean
                squared error against the flattened predictions is returned.
        """
        # The encoder returns a pair; only its second element (pooled) is used.
        encoder_result = self.bert(
            input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled = self.dropout(encoder_result[1])
        predictions = self.regressor(pooled).clamp(0, 3)
        if targets is None:
            return predictions
        return self.loss_fct(predictions.view(-1), targets.view(-1))

In [None]:
class InputExample:
    """One raw example: an identifier, its text and an optional target value."""

    def __init__(self, guid, text, target=None):
        # Plain value holder; nothing is validated or copied.
        self.guid, self.text, self.target = guid, text, target


class InputFeatures:
    """Model-ready features for one example, plus its target value."""

    def __init__(self, input_ids, input_mask, segment_ids, target):
        # The inputs are stored as given (same objects, no copies).
        self.target = target
        self.segment_ids = segment_ids
        self.input_mask = input_mask
        self.input_ids = input_ids

In [None]:
class MISLMacroProcessor:
    """Load the data set at DATA_PATH and split it into train and dev parts.

    Narratives are read from the "vecOfNarratives" column and the scores from
    the column named by `macro_score`; 20% of the rows are held out as the
    dev set.
    """

    def __init__(self, macro_score):
        """Read the CSV and create the train/dev split.

        Args:
            macro_score: name of the score column to use as regression target.
        """
        df = pd.read_csv(DATA_PATH)
        texts = df["vecOfNarratives"].values
        scores = df[macro_score].values
        # The split is pinned to SEED. This notebook builds a processor once
        # for training and again for evaluation; without a fixed random_state
        # the second instance would draw a different split, so "dev" rows
        # could have been seen during training.
        self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(
            texts, scores, test_size=0.2, random_state=SEED)

    def get_train_examples(self):
        """Return the training portion as a list of InputExample."""
        return self._create_examples(self.x_train, self.y_train)

    def get_dev_examples(self):
        """Return the held-out (dev) portion as a list of InputExample."""
        return self._create_examples(self.x_valid, self.y_valid)

    def _create_examples(self, x, y):
        """Creates examples for the training and dev sets."""
        # guid is simply the position of the example within its split.
        return [
            InputExample(guid=i, text=text, target=target)
            for i, (text, target) in enumerate(zip(x, y))
        ]

In [None]:
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    """Turn a list of examples into a list of fixed-length `InputFeatures`.

    Each example's text is tokenized, truncated so that it fits together with
    the [CLS] and [SEP] markers, converted to ids and zero-padded up to
    `max_seq_length`. The first five examples are logged for inspection.
    """
    # Two positions are reserved for the [CLS] and [SEP] markers.
    text_budget = max_seq_length - 2

    features = []
    for ex_index, example in enumerate(examples):
        word_pieces = tokenizer.tokenize(example.text)[:text_budget]

        # Single-sequence layout used here:
        #   tokens:   [CLS] the dog is hairy . [SEP]
        #   type_ids: 0     0   0   0  0     0 0
        # Every position gets segment id 0 because there is only one sequence.
        tokens = ["[CLS]", *word_pieces, "[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Mask is 1 over real tokens and 0 over padding; ids and segment ids
        # are zero-padded to the same length.
        n_real = len(input_ids)
        n_pad = max_seq_length - n_real
        input_ids = input_ids + [0] * n_pad
        input_mask = [1] * n_real + [0] * n_pad
        segment_ids = [0] * len(tokens) + [0] * n_pad

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("tokens: %s", " ".join(str(t) for t in tokens))
            logger.info("input_ids: %s", " ".join(str(v) for v in input_ids))
            logger.info("input_mask: %s", " ".join(str(v) for v in input_mask))
            logger.info("segment_ids: %s", " ".join(str(v) for v in segment_ids))
            logger.info("target: %s", example.target)

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                target=example.target,
            )
        )
    return features

In [None]:
class FreezableBertAdam(BertAdam):
    """BertAdam whose `get_lr` skips parameters that have no optimizer state."""

    def get_lr(self):
        """Return the currently scheduled learning rate of every parameter
        that has optimizer state.

        Parameters with empty state are skipped (presumably those that are
        frozen and never stepped; verify against BertAdam.step).
        """
        rates = []
        for group in self.param_groups:
            base_lr = group['lr']
            for param in group['params']:
                param_state = self.state[param]
                if not param_state:
                    continue
                if group['t_total'] == -1:
                    # No schedule configured: the base rate applies as-is.
                    rates.append(base_lr)
                    continue
                schedule = SCHEDULES[group['schedule']]
                progress = param_state['step'] / group['t_total']
                rates.append(base_lr * schedule(progress, group['warmup']))
        return rates

In [None]:
def children(m):
    """Return `m` itself if it is a list/tuple, else a list of `m.children()`."""
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())


def set_trainable_attr(m, b):
    """Flag module `m` as trainable (or not) and apply `b` to all its parameters.

    Sets `m.trainable = b` and `requires_grad = b` on everything yielded by
    `m.parameters()`.
    """
    m.trainable = b
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Call `f` on `m` (when it is an nn.Module) and, recursively, on
    everything returned by `children(m)`."""
    # Children are collected before `f` runs, as in a pre-order walk.
    sub_items = children(m)
    if isinstance(m, nn.Module):
        f(m)
    for sub_item in sub_items:
        apply_leaf(sub_item, f)


def set_trainable(l, b):
    """Recursively mark `l` (a module, or a list/tuple of modules) as
    trainable when `b` is true, frozen otherwise."""
    def _toggle(module):
        set_trainable_attr(module, b)

    apply_leaf(l, _toggle)

In [None]:
def count_model_parameters(model):
    """Log the total and the trainable parameter counts of `model`.

    A parameter counts as trainable when its `requires_grad` flag is set.
    """
    total = 0
    trainable = 0
    # Single pass over the parameters for both counts.
    for p in model.parameters():
        n = p.numel()
        total += n
        if p.requires_grad:
            trainable += n
    logger.info("# of parameters: {:,d}".format(total))
    logger.info("# of trainable parameters: {:,d}".format(trainable))

In [None]:
# Tokenizer for the uncased base model (input is lower-cased); files are
# cached under PYTORCH_PRETRAINED_BERT_CACHE.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE,
    do_lower_case=True,
)

In [None]:
def get_optimizer(num_train_optimization_steps: int, learning_rate: float):
    """Build a fresh optimizer over the currently trainable parameter groups.

    Args:
        num_train_optimization_steps: total number of optimizer steps of the
            upcoming training run (used as `t_total` for the LR schedule).
        learning_rate: peak learning rate for every parameter group.

    Returns:
        A `FreezableBertAdam`, or an apex `FP16_Optimizer` wrapping
        `FusedAdam` when FP16 is enabled.

    Raises:
        ImportError: if FP16 is enabled but apex is not installed.
    """
    # Build new group dicts instead of passing the module-level ones. The
    # optimizer constructor fills its defaults (t_total, warmup, ...) into the
    # dicts it receives, so reusing the shared dicts would make every later
    # call inherit the first call's schedule length.
    # Groups with no trainable parameter at all are dropped.
    grouped_parameters = [
        {
            "params": group["params"],
            "weight_decay": group["weight_decay"],
            "lr": learning_rate,
        }
        for group in optimizer_grouped_parameters
        if any(p.requires_grad for p in group["params"])
    ]
    if FP16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError as err:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.") from err

        optimizer = FusedAdam(grouped_parameters,
                              lr=learning_rate, bias_correction=False,
                              max_grad_norm=1.0)
        # LOSS_SCALE == 0 selects dynamic loss scaling, any other value a
        # static scale. (The original referenced an undefined `args` object.)
        if LOSS_SCALE == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=LOSS_SCALE)

    else:
        optimizer = FreezableBertAdam(grouped_parameters,
                                      lr=learning_rate, warmup=WARMUP_PROPORTION,
                                      t_total=num_train_optimization_steps)
    return optimizer

In [None]:
def train(model: nn.Module, num_epochs: int, learning_rate: float):
    """Train `model` on the module-level `train_dataloader`.

    A new optimizer is created for every call, so the learning-rate schedule
    restarts each time.

    Args:
        model: model whose forward pass returns the loss when given targets.
        num_epochs: number of passes over `train_dataloader`.
        learning_rate: peak learning rate handed to the optimizer.

    Returns:
        The exponentially smoothed training loss after the last step
        (0 if no step was run).
    """
    num_train_optimization_steps = len(train_dataloader) * num_epochs
    optimizer = get_optimizer(num_train_optimization_steps, learning_rate)
    assert all([x["lr"] == learning_rate for x in optimizer.param_groups])
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_features))
    logger.info("  Batch size = %d", BATCH_SIZE)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    global_step = 0
    tr_loss = 0
    mb = master_bar(range(num_epochs))
    for _ in mb:
        for batch in progress_bar(train_dataloader, parent=mb):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, target = batch
            loss = model(input_ids, segment_ids, input_mask, target)
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.

            if FP16:
                optimizer.backward(loss)
            else:
                loss.backward()

            # Exponential moving average of the loss; the very first value
            # seeds the average.
            if tr_loss == 0:
                tr_loss = loss.item()
            else:
                tr_loss = tr_loss * 0.9 + loss.item() * 0.1

            if FP16:
                # The FP16 optimizer has no built-in schedule, so the warm-up
                # schedule is applied by hand. (The original used an undefined
                # name `LR` here instead of `learning_rate`.)
                lr_this_step = (
                    learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps, WARMUP_PROPORTION))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            # get_lr() is defined on FreezableBertAdam only; on the FP16 path
            # report the rate that was just set manually.
            current_lr = lr_this_step if FP16 else optimizer.get_lr()[0]
            mb.child.comment = f'loss: {tr_loss:.4f} lr: {current_lr:.2E}'
    logger.info("  train loss = %.4f", tr_loss)
    return tr_loss

### Model Training

In [None]:
# Change the argument passed to MISLMacroProcessor to whichever MISL element
# you would like to score. These are the columns of the dataset.
train_examples = MISLMacroProcessor("Char").get_train_examples()

In [None]:
# Tokenize/pad the training examples, then drop the raw examples to free memory.
train_features = convert_examples_to_features(
    examples=train_examples,
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
)
del train_examples
gc.collect()

In [None]:
# Prepare model: load the pretrained "bert-base-uncased" weights into the
# regression model, optionally switch to half precision, then move it to
# the selected device.
model = BertForSequenceRegression.from_pretrained(
    "bert-base-uncased", cache_dir=PYTORCH_PRETRAINED_BERT_CACHE,
)
if FP16:
    model.half()
model.to(device)

In [None]:
# Prepare optimizer: split the parameters into two groups. Parameters whose
# name contains one of the `no_decay` fragments get no weight decay; all
# others use a weight decay of 0.01.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
param_optimizer = list(model.named_parameters())
_with_decay, _without_decay = [], []
for _name, _param in param_optimizer:
    _exempt = any(nd in _name for nd in no_decay)
    (_without_decay if _exempt else _with_decay).append(_param)
optimizer_grouped_parameters = [
    {'params': _with_decay, 'weight_decay': 0.01},
    {'params': _without_decay, 'weight_decay': 0.0},
]

In [None]:
# Stack the per-example feature lists into tensors (integer features as long,
# targets as float) and serve them in random order, BATCH_SIZE at a time.
all_input_ids, all_input_mask, all_segment_ids = (
    torch.tensor([getattr(f, field) for f in train_features], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids")
)
all_targets = torch.tensor([f.target for f in train_features], dtype=torch.float)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_targets)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)

In [None]:
# Stage 1: train only the "pooler" and the final linear layer.
# Everything is made trainable first, then the embeddings and the encoder
# stack are frozen again.
set_trainable(model, True)
for frozen_part in (model.bert.embeddings, model.bert.encoder):
    set_trainable(frozen_part, False)
count_model_parameters(model)
# The number of epochs and learning rate varied depending on the element being
# scored. The hyperparameters for each element are provided below.
train(model, num_epochs=16, learning_rate=5e-4)

# Char: num_epochs = 16, learning_rate = 5e-4
# Sett: num_epochs = 16, learning_rate = 5e-4
# IE: num_epochs = 16, learning_rate = 1e-3
# Plan: num_epochs = 16, learning_rate = 1e-3
# Act: num_epochs = 16, learning_rate = 1e-3
# Con: num_epochs = 16, learning_rate = 5e-4
# ENP: num_epochs = 16, learning_rate = 1e-3

In [None]:
# Save a trained model (stage 1). Saving is disabled by default: uncomment the
# torch.save line to write the weights to `output_model_file`.
output_model_file = "./regressor_stage1.pth"
# Unwrap the model if it is held inside a wrapper exposing `.module`.
model_to_save = getattr(model, 'module', model)
# torch.save(model_to_save.state_dict(), output_model_file)

In [None]:
# Run a garbage-collection pass between training stages.
gc.collect()

In [None]:
# Stage 2: additionally train the last two encoder layers (indices 11 and 10).
for layer_index in (11, 10):
    set_trainable(model.bert.encoder.layer[layer_index], True)
count_model_parameters(model)
# The number of epochs and learning rate varied depending on the element being
# scored. The hyperparameters for each element are provided below.
# NOTE(review): this call uses num_epochs=8, whereas the "Char" row of the
# table below lists num_epochs = 16; confirm which value is intended.
train(model, num_epochs=8, learning_rate=5e-5)

# Char: num_epochs = 16, learning_rate = 5e-5
# Sett: num_epochs = 16, learning_rate = 5e-5
# IE: num_epochs = 8, learning_rate = 5e-5
# Plan: num_epochs = 8, learning_rate = 5e-5
# Act: num_epochs = 8, learning_rate = 5e-5
# Con: num_epochs = 8, learning_rate = 1e-6
# ENP: num_epochs = 8, learning_rate = 5e-5

In [None]:
# Save a trained model (stage 2). Saving is disabled by default: uncomment the
# torch.save line to write the weights to `output_model_file`.
output_model_file = "./regressor_stage2.pth"
# Unwrap the model if it is held inside a wrapper exposing `.module`.
model_to_save = getattr(model, 'module', model)
# torch.save(model_to_save.state_dict(), output_model_file)

In [None]:
# Stage 3: unfreeze everything and train all layers.
set_trainable(model, True)
count_model_parameters(model)
# The number of epochs and learning rate varied depending on the element being
# scored. The hyperparameters for each element are provided below.
# NOTE(review): this call uses num_epochs=8 and learning_rate=1e-5, whereas
# the "Char" row of the table below lists 16 epochs and 5e-5; confirm which
# values are intended.
train(model, num_epochs=8, learning_rate=1e-5)

# Char: num_epochs = 16, learning_rate = 5e-5
# Sett: num_epochs = 16, learning_rate = 1e-5
# IE: num_epochs = 8, learning_rate = 1e-5
# Plan: num_epochs = 8, learning_rate = 1e-5
# Act: num_epochs = 8, learning_rate = 1e-5
# Con: num_epochs = 8, learning_rate = 1e-7
# ENP: num_epochs = 8, learning_rate = 1e-5

In [None]:
# Save a trained model (stage 3). Saving is disabled by default: uncomment the
# torch.save line to write the weights to `output_model_file`.
output_model_file = "./regressor_stage3.pth"
# Unwrap the model if it is held inside a wrapper exposing `.module`.
model_to_save = getattr(model, 'module', model)
# torch.save(model_to_save.state_dict(), output_model_file)

In [None]:
# Training is finished: drop the training features and collect garbage
# before the evaluation data is built.
del train_features
gc.collect()

### Model Evaluation

In [None]:
# Set the argument to the MISLMacroProcessor to whatever element the model was
# trained on.
# NOTE(review): this builds a new MISLMacroProcessor, which re-reads the CSV
# and runs the train/dev split again; the dev examples are only truly held out
# if that split reproduces the partition used for training - confirm.
eval_examples = MISLMacroProcessor("Char").get_dev_examples()
eval_features = convert_examples_to_features(
    examples=eval_examples,
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
)

In [None]:
# Evaluate the model on the dev examples: collect rounded predictions and
# targets (for the QWK computation) and track the mean per-batch MSE loss.
logger.info("***** Running evaluation *****")
logger.info("  Num examples = %d", len(eval_examples))
logger.info("  Batch size = %d", BATCH_SIZE * 5)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_targets = torch.tensor([f.target for f in eval_features], dtype=torch.float)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_targets)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE * 5)

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

mb = progress_bar(eval_dataloader)
pred = []
real = []
for input_ids, input_mask, segment_ids, targets in mb:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    targets = targets.to(device)

    with torch.no_grad():
        # One forward pass per batch: the loss is computed from the same
        # predictions instead of running the model a second time. This is the
        # same mean-squared error the model computes when given targets.
        outputs = model(input_ids, segment_ids, input_mask)
        tmp_eval_loss = torch.nn.functional.mse_loss(outputs.view(-1), targets.view(-1))

    # Round to the nearest integer score and flatten to plain Python lists.
    outputs = np.round(outputs.detach().cpu().numpy(), 0).astype(int).reshape(-1).tolist()
    targets = np.round(targets.to('cpu').numpy(), 0).astype(int).tolist()
    pred.extend(outputs)
    real.extend(targets)

    eval_loss += tmp_eval_loss.mean().item()

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1
    mb.comment = f'{eval_loss / nb_eval_steps:.4f}'

# Mean of the per-batch losses (displayed as the cell result).
eval_loss / nb_eval_steps

In [None]:
# Calculate quadratic weighted kappa (QWK) between the rounded predictions
# and the rounded targets collected by the evaluation loop above, i.e. on the
# held-out dev split. Ratings are treated as integers in the range 0..3.
metrics.quadratic_weighted_kappa(pred, real, max_rating=3, min_rating=0)

## Compare predictions to Expert Scores

In [None]:
# Change to path of expert scored data
expert_df = pd.read_csv("/home/.../ExpertScores.csv")
texts = expert_df['vecOfNarratives'].values
# Set the selected column of the expert data frame to whatever element BERT
# was trained on.
scores = expert_df['Char'].values
# Distinct loop names are used so that the `texts` array is not overwritten
# by its own elements while it is being iterated.
examples = [
    InputExample(guid=i, text=text, target=target)
    for i, (text, target) in enumerate(zip(texts, scores))
]

test_features = convert_examples_to_features(
    examples, MAX_SEQ_LENGTH, tokenizer)
# `examples` is kept alive on purpose: the next cell logs its length.
gc.collect()

In [None]:
# Score the expert-rated narratives: collect rounded predictions and targets
# (for the QWK computation) and track the mean per-batch MSE loss.
logger.info("***** Running Testing *****")
logger.info("  Num examples = %d", len(examples))
logger.info("  Batch size = %d", BATCH_SIZE * 5)
all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
all_targets = torch.tensor([f.target for f in test_features], dtype=torch.float)
test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_targets)
# Run prediction for full data
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE * 5)

model.eval()
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0

mb = progress_bar(test_dataloader)
pred = []
real = []
for input_ids, input_mask, segment_ids, targets in mb:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    targets = targets.to(device)

    with torch.no_grad():
        # One forward pass per batch: the loss is computed from the same
        # predictions instead of running the model a second time. This is the
        # same mean-squared error the model computes when given targets.
        outputs = model(input_ids, segment_ids, input_mask)
        tmp_test_loss = torch.nn.functional.mse_loss(outputs.view(-1), targets.view(-1))

    # Round to the nearest integer score and flatten to plain Python lists.
    outputs = np.round(outputs.detach().cpu().numpy(), 0).astype(int).reshape(-1).tolist()
    targets = np.round(targets.to('cpu').numpy(), 0).astype(int).tolist()
    pred.extend(outputs)
    real.extend(targets)

    test_loss += tmp_test_loss.mean().item()

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1
    # Show the running *test* loss; the evaluation cell's counters were used
    # here by mistake.
    mb.comment = f'{test_loss / nb_test_steps:.4f}'

# Mean of the per-batch losses (displayed as the cell result).
test_loss / nb_test_steps

In [None]:
# Calculate quadratic weighted kappa (QWK) between the rounded model
# predictions and the rounded expert scores collected by the loop above.
# Ratings are treated as integers in the range 0..3.
metrics.quadratic_weighted_kappa(pred, real, max_rating=3, min_rating=0)