# Train and validate model from Scratch

In [3]:
#https://github.com/cambridgeltl/MTL-Bioinformatics-2016/tree/master/data -- for data

In [4]:
from torch import nn
from transformers import BertForTokenClassification


class BertForTokenClassificationCustom(nn.Module):
    def __init__(self,
                 model_name: str,
                 num_labels: int,
                 hidden_dropout_prob: float,
                 attention_probs_dropout_prob: float):
        """
        This model is a replica of BertForTokenClassification class but instead of being
        a subclass of `PreTrainedModel` (transformers library) it is a subclass of `nn.Module`
        from Pytorch. In fact, `BertForTokenClassification` is instantiated through `from_pretrained`
        within the `init` method.
        The only difference in functionallity is within the `forward` method.
        Here the output of BERT model is reshaped to be directly compatible with
        `nn.module.CrossEntropyLoss` (batch_size, num_classes, sequence_len). This is done in
        order to make compatible this BERT model with the `torch_lr_finder` library.
        Args:
            model_name (`str`): model name as expected in Transformers library
            num_labels (`int`): number of labels
            hidden_dropout_prob(`float`): droput
            attention_probs_dropout_prob (`float`): dropout in attention
        """
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained(model_name,
                                                               num_labels=num_labels,
                                                               hidden_dropout_prob=hidden_dropout_prob,
                                                               attention_probs_dropout_prob=attention_probs_dropout_prob)

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path: str,
                        num_labels: int = 2,
                        hidden_dropout_prob: float = 0.,
                        attention_probs_dropout_prob: float = 0.):

        return BertForTokenClassificationCustom(model_name=pretrained_model_name_or_path,
                                                num_labels=num_labels,
                                                hidden_dropout_prob=hidden_dropout_prob,
                                                attention_probs_dropout_prob=attention_probs_dropout_prob)

    def forward(self, input_bert):
        outputs = self.bert(*input_bert)

        # Prepare output to be compatible with nn.module.CrossEntropyLoss()
        bert_shape = outputs[0].shape
        return outputs[0].view(bert_shape[0], bert_shape[-1], -1)

In [5]:
from transformers import PreTrainedModel
import torch
from typing import Union


def get_optimizer_with_weight_decay(model: PreTrainedModel,
                                    optimizer: torch.optim.Optimizer,
                                    learning_rate: Union[float, int],
                                    weight_decay: Union[float, int]) -> torch.optim.Optimizer:
    """
    Apply weight decay to all the network parameters but those called `bias` or  `LayerNorm.weight`.
    Args:
        model (`PreTrainedModel`): model to apply weight decay.
        optimizer (`torch.optim.Optimizer`): The optimizer to use during training.
        learning_rate (`float` or `int`): value of the learning rate to use during training.
        weight_decay (`float` or `int`): value of the weight decay to apply.
    Returns:
        optimizer (`torch.optim.Optimizer`): the optimizer instantiated with the selected
        learning rate and the parameters with and without weight decay.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    params = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]
    params_nd = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]
    optimizer_grouped_parameters = [{"params": params, "weight_decay": weight_decay},
                                    {"params": params_nd, "weight_decay": 0.0}]

    return optimizer(optimizer_grouped_parameters, lr=learning_rate)

In [6]:
import itertools
import os
from typing import List, Tuple, Union, Dict, Optional

import numpy as np
import torch
from matplotlib import pyplot as plt
from seqeval.metrics import classification_report as seqeval_report
from seqeval.metrics import f1_score
from sklearn.metrics import classification_report as sklearn_report
from torch import nn
from torch.nn.modules.loss import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from transformers import PreTrainedModel, PreTrainedTokenizer


class BertTrainer:
    def __init__(self,
                 model: PreTrainedModel,
                 tokenizer: PreTrainedTokenizer,
                 optimizer: torch.optim.Optimizer,
                 n_epochs: int,
                 labels2ind: Dict[str, int],
                 scheduler: Optional[torch.optim.lr_scheduler.LambdaLR] = None,
                 device: str = 'cpu',
                 clipping: Optional[Union[int, float]] = None,
                 accumulate_grad_every: int = 1,
                 print_every: int = 10,
                 print_val_mistakes: bool = False,
                 output_dir: str = './'):

        """
        Complete training and evaluation loop in Pytorch specially designed for
        BERT-based models from transformers library. It allows to save the model
        from the epoch with the best F1-score and the tokenizer. The class
        optionally generates reports and figures with the obtained results that
        are automatically stored in disk.
        Args:
            model (`PreTrainedModel`): Pre-trained model from transformers library.
                For NER, usually loaded as `BertForTokenClassification.from_pretrained(...)`
            tokenizer (`PreTrainedTokenizer`): Pre-trained tokenizer from transformers library.
                Usually loaded as `AutoTokenizer.from_pretrained(...)`
            optimizer (`torch.optim.Optimizer`): Pytorch Optimizer
            n_epochs (`int`): Number of epochs to train.
            labels2ind (`dict`): maps `str` class labels into `int` indexes.
            scheduler (`torch.optim.lr_scheduler.LambdaLR`, `Optional`): Pytorch scheduler. It sets a
                different learning rate for each training step to update the network weights.
            device (`str`): Type of device where to train the network. It must be `cpu` or `cuda`.
            clipping (`int` or `float`, `Optional`): max norm to apply to the gradients. If None,
                no graddient clipping is applied.
            accumulate_grad_every (`int`): How often you want to accumulate the gradient. This is useful
                when there are limitations in the batch size due to memory issues. Let's say that in your
                GPU only fits a model with batch size of 8 and you want to try a batch size of 32. Then,
                you should set this parameter to 4 (8*4=32). Internally, a loop will be ran 4 times
                accumulating the gradient for each step. Later, the network parameters will be updated.
                So at the end, this is equivalent to train your network with a batch size of 32. The batch
                size is inferred from `dataloader_train` argument.
            print_every (`int`): How often you want to print loss. Measured in batches where a batch is
                considered batch_size * accumulate_grad_every.
            print_val_mistakes (`bool`): whether to print validation examples (sentences) where the model
                commits at least one mistake. It is printed after each epoch. The printed info is the word
                within each sentence, its predicted label and the real label. This is very useful to
                inspect the behaviour of your model.
            output_dir (`str`): Directory where file reports and images are saved.
        Methods:
            train(dataloader_train: DataLoader, dataloader_val: Optional[DataLoader] = None)
                Complete training and evaluation (optional) loop in Pytorch.
            evaluate(dataloader_val: DataLoader, epoch: int = 0, verbose: bool = False)
                Evaluation on test data.
        """

        self.tokenizer = tokenizer
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.n_epochs = n_epochs
        self.labels2ind = labels2ind
        self.inds2labels = {v: k for k, v in self.labels2ind.items()}
        self.device = device
        self.clipping = clipping
        self.accumulate_grad_every = accumulate_grad_every
        self.print_every = print_every
        self.print_val_mistakes = print_val_mistakes
        self.output_dir = output_dir

        os.makedirs(self.output_dir, exist_ok=True)

    def _reformat_predictions(self,
                              y_true: List[List[int]],
                              y_pred: List[List[int]],
                              input_ids: List[List[str]]
                              ) -> Tuple[List[List[str]],
                                         List[List[str]],
                                         List[List[str]]]:
        """
        Takes batch of tokens, labels (class indexes) and predictions (class indexes)
        and get rid of unwanted tokens, that is, those that have as label the index
        to ignore (i.e. padding tokens).  It also converts the label and prediction
        indexes into their corresponding class name.
        Args:
            y_true (list of lists `int`): indexes of the real labels
            y_pred (list of lists `int`): indexes of the predicted classes
            input_ids (list of lists `str`) : tokens
        Returns:
            Tuple that contains the transformed input arguments
        """
        # Map indexes to labels and remove ignored indexes
        true_list = [[] for _ in range(len(y_true))]
        pred_list = [[] for _ in range(len(y_pred))]
        input_list = [[] for _ in range(len(input_ids))]

        for i in range(len(y_true)):
            for j in range(len(y_true[0])):
                if y_true[i][j] != CrossEntropyLoss().ignore_index:
                    true_list[i].append(self.inds2labels[y_true[i][j]])
                    pred_list[i].append(self.inds2labels[y_pred[i][j]])
                    input_list[i].append(input_ids[i][j])

        return true_list, pred_list, input_list

    def _print_missclassified_val_examples(self,
                                           y_true: List[List[str]],
                                           y_pred: List[List[str]],
                                           input_ids: List[List[str]]):
        """
        print validation examples (sentences) where the model commits at least
        one mistake. It is printed after each epoch. This is very useful to
        inspect the behaviour of your model.
        Args:
            y_true (list of lists `str`): real labels
            y_pred (list of lists `str`): predicted classes
            input_ids (list of lists `str`) : tokens
        Examples::
                TOKEN          LABEL          PRED
                immunostaining O              O
                showed         O              O
                the            O              O
                estrogen       B-cell_type    B-cell_type
                receptor       I-cell_type    I-cell_type
                cells          I-cell_type    O
                                  ·
                                  ·
                                  ·
                synovial       O              O
                tissues        O              O
                .              O              O
        """
        # Print some examples (where the model fails)
        for i in range(len(input_ids)):
            if y_true[i] != y_pred[i]:
                tokens = self.tokenizer.convert_ids_to_tokens(input_ids[i])
                max_len_token = max([len(t) for t in tokens] +
                                    [len(la) for la in self.labels2ind.keys()])

                print(f"\n{'TOKEN':<{max_len_token}}",
                      f"{'LABEL':<{max_len_token}}",
                      f"{'PRED':<{max_len_token}}")

                for token, label_true, label_pred in zip(tokens, y_true[i], y_pred[i]):
                    print(f"{token:<{max_len_token}}",
                          f"{label_true:<{max_len_token}}",
                          f"{label_pred:<{max_len_token}}")

    def _write_report_to_file(self,
                              report_entities: str,
                              report_tokens: str,
                              epoch: int,
                              tr_loss: float,
                              val_loss: float):
        """
        Writes and saves the following info into a file called `classification_report.txt`
        within the directory `output_dir` for the model from the best epoch:
            - Classification report at span/entity level (for validation dataset).
            - Classification report at word level (for validation dataset).
            - Epoch where the best model was found (best F1-score in validation dataset)
            - Training loss from the best epoch.
            - Validation loss from the best epoch.
        Args:
            report_entities (`str`): classification report at entity/span level.
            report_tokens (`str`): classification report at word level.
            epoch (`int`): epoch
            tr_loss (`float`): training loss
            val_loss (`float`): validation loss
        Returns:
        """
        with open(os.path.join(self.output_dir, 'classification_report.txt'), 'w') as f:
            f.write(report_entities)
            f.write(f'\n{report_tokens}')
            f.write(f"\nEpoch: {epoch} "
                    f"\n- Training Loss: {tr_loss}"
                    f"\n- Validation Loss: {val_loss}")

    def _save_model(self):
        if not isinstance(self.model, PreTrainedModel):
            raise ValueError("Trainer.model appears to not be a PreTrainedModel")
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

    def _estimate_gradients(self, batch: Dict[str, torch.Tensor]) -> float:
        # Send tensors to device
        batch = {k: v.to(self.device) for k, v in batch.items()}

        # estimate loss and gradient
        loss, _ = self.model(**batch)
        loss.backward()

        return loss.item()

    def _update_network_params(self):
        # Graddient clipping
        if self.clipping is not None:
            nn.utils.clip_grad_norm_(self.model.parameters(), self.clipping)

        # Udate parameters (accumulated gradient based on accumulated grad)
        self.optimizer.step()
        if self.scheduler is not None:
            self.scheduler.step()
        self.model.zero_grad()

    def _validation_step(self,
                         batch: Dict[str, torch.Tensor]
                         ) -> Tuple[float, np.ndarray]:
        # Send tensors to device
        batch = {k: v.to(self.device) for k, v in batch.items()}

        # Predict and estimate error
        with torch.no_grad():
            loss, pred = self.model(**batch)

        return loss.item(), pred.detach().cpu().numpy()

    def evaluate(self,
                 dataloader_val: DataLoader,
                 epoch: int = 0,
                 verbose: bool = False) -> Tuple[float, float, str, str]:
        """
        Args:
            dataloader_val:
            epoch:
            verbose:
        Returns:
        """
        n_steps_val = len(dataloader_val)
        self.model.eval()

        val_loss_cum = .0
        y_pred = []
        y_true = []
        input_ids = []
        for step, batch in enumerate(dataloader_val):
            val_loss, pred = self._validation_step(batch)
            val_loss_cum += val_loss
            y_true.extend(batch['labels'].tolist())
            y_pred.extend(pred.argmax(axis=-1).tolist())
            input_ids.extend(batch['input_ids'].tolist())

        y_true, y_pred, input_ids = self._reformat_predictions(y_true, y_pred, input_ids)

        # Performance Reports and loss
        report_entities = seqeval_report(y_true=y_true, y_pred=y_pred, digits=4)
        report_tokens = sklearn_report(y_true=list(itertools.chain(*y_true)),
                                       y_pred=list(itertools.chain(*y_pred)), digits=4)

        loss_val_epoch = val_loss_cum / n_steps_val

        if verbose:
            print(f"- Epoch: {epoch}/{self.n_epochs - 1} - Validation Loss: {loss_val_epoch}")
            print(report_entities)
            print(report_tokens)

        # Print some examples (where the model fails)
        if self.print_val_mistakes and verbose:
            self._print_missclassified_val_examples(y_true, y_pred, input_ids)

        # Save model and write report to txt file
        f1 = f1_score(y_true=y_true, y_pred=y_pred)

        return loss_val_epoch, f1, report_entities, report_tokens

    def train(self,
              dataloader_train: DataLoader,
              dataloader_val: Optional[DataLoader] = None
              ) -> Tuple[List[float], List[float]]:
        """
        Complete training and evaluation (optional) loop in Pytorch.
        Args:
            dataloader_train (`torch.utils.data.dataloader.DataLoader`): Pytorch dataloader.
            dataloader_val (`torch.utils.data.dataloader.DataLoader`, `Optional`):
                Pytorch dataloader. If `None` no validation will be performed.
        Returns:
            loss_tr_epochs (list of `float`): training loss for each epoch
            loss_val_epochs (list of `float`): validation loss for each epoch
        """
        loss_tr_epochs = []
        loss_val_epochs = []
        f1_best = .0
        lrs = []
        self.model.to(self.device)

        for epoch in range(self.n_epochs):
            tr_loss_mean = .0
            tr_loss_cum = .0
            step = -1

            # Training
            # -----------------------------
            self.model.train()
            self.model.zero_grad()
            for i, batch in enumerate(dataloader_train):
                # Estimate gradients and accumulate them
                tr_loss = self._estimate_gradients(batch)
                tr_loss_cum += tr_loss

                # Update params every acumulated steps
                if (i + 1) % self.accumulate_grad_every == 0:
                    self._update_network_params()
                    if self.scheduler is not None:
                        lrs.append(self.scheduler.get_last_lr()[0])
                    step += 1
                else:
                    continue

                if step % self.print_every == 0:
                    tr_loss_mean = tr_loss_cum/(i+1)
                    print(f"- Epoch: {epoch}/{self.n_epochs - 1}",
                          f"- Step: {step:3}/{(len(dataloader_train)// self.accumulate_grad_every) - 1}",
                          f"- Training Loss: {tr_loss_mean:.6f}")

            loss_tr_epochs.append(tr_loss_mean)
            print(f"- Epoch: {epoch}/{self.n_epochs - 1} - Training Loss: {tr_loss_mean}")

            # Plot training curve
            plt.plot(loss_tr_epochs)
            plt.xlabel('#Epochs')
            plt.ylabel('Error')
            plt.legend(['training'])

            # Validation
            # -----------------------------
            if dataloader_val is not None:
                val_loss, f1, report_ent, report_toks = self.evaluate(dataloader_val,
                                                                      epoch=epoch,
                                                                      verbose=True)
                loss_val_epochs.append(val_loss)

                if f1 > f1_best:
                    f1_best = f1
                    self._save_model()
                    self._write_report_to_file(report_ent, report_toks, epoch,
                                               tr_loss_mean, val_loss)

                # Plot val curve
                plt.plot(loss_val_epochs)
                plt.legend(['training', 'validation'])

            plt.tight_layout()
            plt.savefig(os.path.join(self.output_dir, 'error_curves.jpg'))
            plt.close()

            # Plot learning rate curve
            plt.plot(lrs)
            plt.xlabel('#Batches')
            plt.ylabel('Learning rate')
            plt.tight_layout()
            plt.savefig(os.path.join(self.output_dir, 'learning_rate.jpg'))
            plt.close()
        return loss_tr_epochs, loss_val_epochs

In [7]:
from collections import defaultdict
from dataclasses import dataclass, asdict
from typing import List, Optional, Dict, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.utils.data.dataset import Dataset
from transformers import PreTrainedTokenizer


@dataclass
class DataSample:
    """
    A single training/test example (sentence) for token classification.
    """
    words: List[str]
    labels: List[str]


@dataclass
class InputBert:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a BERT model.
    """
    input_ids: torch.tensor
    attention_mask: torch.tensor
    token_type_ids: torch.tensor
    labels: Optional[torch.tensor] = None


class NerDataset(Dataset):
    def __init__(self,
                 dataset: List[DataSample],
                 tokenizer: PreTrainedTokenizer,
                 labels2ind: Dict[str, int],
                 max_len_seq: int = 512,
                 bert_hugging: bool = True):
        """
        Class that builds a torch Dataset specially designed for NER data.
        Args:
            dataset (list of `DataSample` instances): Each data sample is a dataclass
                that contains two fields: `words` and `labels`. Both are lists of `str`.
            tokenizer (`PreTrainedTokenizer`): Pre-trained tokenizer from transformers
                library. Usually loaded as `AutoTokenizer.from_pretrained(...)`.
            labels2ind (`dict`): maps `str` class labels into `int` indexes.
            max_len_seq (`int`): Max length sequence for each example (sentence).
            bert_hugging (`bool`):
        """
        super(NerDataset).__init__()
        self.bert_hugging = bert_hugging
        self.max_len_seq = max_len_seq
        self.label2ind = labels2ind
        self.features = data2tensors(data=dataset,
                                     tokenizer=tokenizer,
                                     label2idx=self.label2ind,
                                     max_seq_len=max_len_seq,
                                     pad_token_label_id=nn.CrossEntropyLoss().ignore_index)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> Union[Dict[str, torch.tensor],
                                      Tuple[List[torch.tensor], torch.tensor]]:
        if self.bert_hugging:
            return asdict(self.features[i])
        else:
            inputs = asdict(self.features[i])
            labels = inputs.pop('labels')
            return list(inputs.values()), labels


def get_labels(data: List[DataSample]) -> Tuple[Dict[str, int], Dict[str, int]]:
    """
    Automatically extract labels types from the data and its count.
    Args:
        data (list of `DataSample`): Each data sample is a dataclass that contains
            two fields: `words` and `labels`. Both are lists of `str`.
    Returns:
        labels2idx (`dict`): maps `str` class labels into `int` indexes.
        labels_count(`dict`): The number of words for each class label that appears in
            the dataset. Usufull information if you want to apply class weights on
            imbalanced data.
    """
    labels = set()
    labels_counts = defaultdict(int)
    for sent in data:
        labels.update(sent.labels)

        for label_ in sent.labels:
            labels_counts[label_] += 1

    if "O" not in labels:
        labels.add('O')
        labels_counts['0'] = 0

    # Convert list of labels ind a mapping labels -> index
    labels2idx = {label_: i for i, label_ in enumerate(labels)}
    return labels2idx, dict(labels_counts)


def get_class_weight_tensor(labels2ind: Dict[str, int],
                            labels_count: Dict[str, int]) -> torch.Tensor:
    """
    Get the class weights based on the class labels frequency within the dataset.
    Args:
        labels2ind (`dict`): maps `str` class labels into `int` indexes.
        labels_count (`dict`): The number of words for each class label that appears in
            the dataset.
    Returns:
        torch.Tensor with the class weights. Size (num_classes).
    """
    label2ind_list = [(k, v) for k, v in labels2ind.items()]
    label2ind_list.sort(key=lambda x: x[1])
    total_labels = sum([count for label, count in labels_count.items()])
    class_weights = [total_labels/labels_count[label] for label, _ in label2ind_list]
    return torch.tensor(np.array(class_weights)/max(class_weights), dtype=torch.float32)


def read_data_from_file(file_path: str, sep: str = '\t') -> List[DataSample]:
    """
    Load data from a txt file (BIO tagging format) and transform it into the
    required format (list of `DataSample` instances).
    Args:
        file_path (`str`): complete path where the data is located (path + filename).
        sep (`str`): Symbol used to separete word from label at each line. Default `\t`.
    Returns:
        List of `DataSample` instances containing words and labels.
    """
    examples = []
    words = []
    labels = []
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            splits = line.split(sep)
            if len(splits) > 1:
                words.append(splits[0])
                labels.append(splits[-1].replace('\n', ''))
            else:
                examples.append(DataSample(words=words, labels=labels))
                words = []
                labels = []
    return examples


def data2tensors(data: List[DataSample],
                 tokenizer: PreTrainedTokenizer,
                 label2idx: Dict[str, int],
                 pad_token_label_id: int = -100,
                 max_seq_len: int = 512) -> List[InputBert]:
    """
    Takes data and converts it into tensors to feed the neural network.
    Args:
        data (`list`): List of `DataSample` instances containing words and labels.
        tokenizer (`PreTrainedTokenizer`): Pre-trained tokenizer from transformers
            library. Usually loaded as `AutoTokenizer.from_pretrained(...)`.
        label2idx (`dict`): maps `str` class labels into `int` indexes.
        pad_token_label_id (`int`): index to define the special token [PAD]
        max_seq_len (`int`): Max sequence length.
    Returns:
        List of `InputBert` instances. `InputBert` is a dataclass that contains
        `input_ids`, `attention_mask`, `token_type_ids` and `labels` (Optional).
    """

    features = []
    for sentence in data:
        tokens = []
        label_ids = []
        for word, label in zip(sentence.words, sentence.labels):
            subword_tokens = tokenizer.tokenize(text=word)

            # BERT could return an empty list of subtokens
            if len(subword_tokens) > 0:
                tokens.extend(subword_tokens)

                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                label_ids.extend([label2idx[label]] + [pad_token_label_id] * (len(subword_tokens) - 1))
                # if label.startswith('B'):
                #     label_ids.extend([label2idx[label]] + [label2idx[f"I{label[1:]}"]] * (len(subword_tokens) - 1))
                # else:
                #     label_ids.extend([label2idx[label]] + [label2idx[label]] * (len(subword_tokens) - 1))

        # Drop part of the sequence longer than max_seq_len (account also for [CLS] and [SEP])
        if len(tokens) > max_seq_len - 2:
            tokens = tokens[:max_seq_len - 2]
            label_ids = label_ids[: max_seq_len - 2]

        # Add special tokens  for the list of tokens and its corresponding labels.
        # For BERT: cls_token = '[CLS]' and sep_token = '[SEP]'
        # For RoBERTa: cls_token = '<s>' and sep_token = '</s>'
        tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        label_ids = [pad_token_label_id] + label_ids + [pad_token_label_id]

        # Create an attention mask (used to locate the padding)
        padding_len = (max_seq_len - len(tokens))
        attention_mask = [1] * len(tokens) + [0] * padding_len

        # Add padding
        tokens += [tokenizer.pad_token] * padding_len
        label_ids += [pad_token_label_id] * padding_len

        # Convert tokens to ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Create segment_id. All zeros since we only have one sentence
        segment_ids = [0] * max_seq_len

        # Assert all the input has the expected length
        assert len(input_ids) == max_seq_len
        assert len(label_ids) == max_seq_len
        assert len(attention_mask) == max_seq_len
        assert len(segment_ids) == max_seq_len

        # Append input features for each sequence/sentence
        features.append((InputBert(input_ids=torch.tensor(input_ids),
                                   attention_mask=torch.tensor(attention_mask),
                                   token_type_ids=torch.tensor(segment_ids),
                                   labels=torch.tensor(label_ids))))
    return features

In [2]:
#https://github.com/fran-martinez/bio_ner_bert
import random

from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import (get_linear_schedule_with_warmup,
                          BertForTokenClassification,
                          AutoTokenizer)

In [8]:
DATA_TR_PATH = '/Users/sdeshpande/Desktop/bioinformatices/MTL-Bioinformatics-2016/data/JNLPBA/original-data/train/Genia4ERtask1.iob2'
DATA_VAL_PATH = '/Users/sdeshpande/Desktop/bioinformatices/MTL-Bioinformatics-2016/data/JNLPBA/original-data/test/Genia4EReval1.iob2'
DATA_TEST_PATH = None
SEED = 42

In [9]:
# MODEL
MODEL_NAME = 'allenai/scibert_scivocab_uncased'
MAX_LEN_SEQ = 128
DEVICE = 'cpu'

In [10]:
# Optimization parameters
N_EPOCHS = 6
BATCH_SIZE = 8
BATCH_SIZE_VAL = 28
WEIGHT_DECAY = 0
LEARNING_RATE = 1e-4  # 2e-4
RATIO_WARMUP_STEPS = .1
DROPOUT = .3
ACUMULATE_GRAD_EVERY = 4
OPTIMIZER = Adam


In [11]:
# Seeds
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# get data
training_set = read_data_from_file(DATA_TR_PATH)
val_set = read_data_from_file(DATA_VAL_PATH)

In [12]:
# Automatically extract labels and their indexes from data.
labels2ind, labels_count = get_labels(training_set + val_set)

In [13]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [14]:
# Create loaders for datasets
training_set = NerDataset(dataset=training_set,
                          tokenizer=tokenizer,
                          labels2ind=labels2ind,
                          max_len_seq=MAX_LEN_SEQ)

In [15]:
val_set = NerDataset(dataset=val_set,
                     tokenizer=tokenizer,
                     labels2ind=labels2ind,
                     max_len_seq=MAX_LEN_SEQ)


In [16]:
dataloader_tr = DataLoader(dataset=training_set,
                           batch_size=BATCH_SIZE,
                           shuffle=True)

dataloader_val = DataLoader(dataset=val_set,
                            batch_size=BATCH_SIZE_VAL,
                            shuffle=False)

In [17]:
# Load model
nerbert = BertForTokenClassification.from_pretrained(MODEL_NAME,
                                                     hidden_dropout_prob=DROPOUT,
                                                     attention_probs_dropout_prob=DROPOUT,
                                                     num_labels=len(labels2ind),
                                                     id2label={str(v): k for k, v in labels2ind.items()})

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [18]:
# Prepare optimizer and schedule (linear warmup and decay)
optimizer = get_optimizer_with_weight_decay(model=nerbert,
                                            optimizer=OPTIMIZER,
                                            learning_rate=LEARNING_RATE,
                                            weight_decay=WEIGHT_DECAY)

training_steps = (len(dataloader_tr)//ACUMULATE_GRAD_EVERY) * N_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=training_steps * RATIO_WARMUP_STEPS,
                                            num_training_steps=training_steps)


In [19]:
# Trainer
trainer = BertTrainer(model=nerbert,
                      tokenizer=tokenizer,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      labels2ind=labels2ind,
                      device=DEVICE,
                      n_epochs=N_EPOCHS,
                      accumulate_grad_every=ACUMULATE_GRAD_EVERY,
                      output_dir='/Users/sdeshpande/Desktop/bioinformatices/bio_ner_bert/trained_models')

In [20]:
# Train and validate model
trainer.train(dataloader_train=dataloader_tr,
              dataloader_val=dataloader_val)

- Epoch: 0/5 - Step:   0/578 - Training Loss: 2.409743
- Epoch: 0/5 - Step:  10/578 - Training Loss: 2.291913
- Epoch: 0/5 - Step:  20/578 - Training Loss: 2.035993
- Epoch: 0/5 - Step:  30/578 - Training Loss: 1.754588
- Epoch: 0/5 - Step:  40/578 - Training Loss: 1.574826
- Epoch: 0/5 - Step:  50/578 - Training Loss: 1.451670
- Epoch: 0/5 - Step:  60/578 - Training Loss: 1.351268
- Epoch: 0/5 - Step:  70/578 - Training Loss: 1.264252
- Epoch: 0/5 - Step:  80/578 - Training Loss: 1.180885
- Epoch: 0/5 - Step:  90/578 - Training Loss: 1.111903
- Epoch: 0/5 - Step: 100/578 - Training Loss: 1.048262
- Epoch: 0/5 - Step: 110/578 - Training Loss: 0.988361
- Epoch: 0/5 - Step: 120/578 - Training Loss: 0.936783
- Epoch: 0/5 - Step: 130/578 - Training Loss: 0.891917
- Epoch: 0/5 - Step: 140/578 - Training Loss: 0.849834
- Epoch: 0/5 - Step: 150/578 - Training Loss: 0.812592
- Epoch: 0/5 - Step: 160/578 - Training Loss: 0.780114
- Epoch: 0/5 - Step: 170/578 - Training Loss: 0.751390
- Epoch: 0

- Epoch: 2/5 - Step:   0/578 - Training Loss: 0.121493
- Epoch: 2/5 - Step:  10/578 - Training Loss: 0.138313
- Epoch: 2/5 - Step:  20/578 - Training Loss: 0.136047
- Epoch: 2/5 - Step:  30/578 - Training Loss: 0.140055
- Epoch: 2/5 - Step:  40/578 - Training Loss: 0.140122
- Epoch: 2/5 - Step:  50/578 - Training Loss: 0.138909
- Epoch: 2/5 - Step:  60/578 - Training Loss: 0.145421
- Epoch: 2/5 - Step:  70/578 - Training Loss: 0.146230
- Epoch: 2/5 - Step:  80/578 - Training Loss: 0.144953
- Epoch: 2/5 - Step:  90/578 - Training Loss: 0.144989
- Epoch: 2/5 - Step: 100/578 - Training Loss: 0.146096
- Epoch: 2/5 - Step: 110/578 - Training Loss: 0.144722
- Epoch: 2/5 - Step: 120/578 - Training Loss: 0.144695
- Epoch: 2/5 - Step: 130/578 - Training Loss: 0.145071
- Epoch: 2/5 - Step: 140/578 - Training Loss: 0.145181
- Epoch: 2/5 - Step: 150/578 - Training Loss: 0.144830
- Epoch: 2/5 - Step: 160/578 - Training Loss: 0.145522
- Epoch: 2/5 - Step: 170/578 - Training Loss: 0.146039
- Epoch: 2

- Epoch: 4/5 - Step:   0/578 - Training Loss: 0.099437
- Epoch: 4/5 - Step:  10/578 - Training Loss: 0.109565
- Epoch: 4/5 - Step:  20/578 - Training Loss: 0.110913
- Epoch: 4/5 - Step:  30/578 - Training Loss: 0.110964
- Epoch: 4/5 - Step:  40/578 - Training Loss: 0.105466
- Epoch: 4/5 - Step:  50/578 - Training Loss: 0.105688
- Epoch: 4/5 - Step:  60/578 - Training Loss: 0.105968
- Epoch: 4/5 - Step:  70/578 - Training Loss: 0.108394
- Epoch: 4/5 - Step:  80/578 - Training Loss: 0.109063
- Epoch: 4/5 - Step:  90/578 - Training Loss: 0.110021
- Epoch: 4/5 - Step: 100/578 - Training Loss: 0.110566
- Epoch: 4/5 - Step: 110/578 - Training Loss: 0.109592
- Epoch: 4/5 - Step: 120/578 - Training Loss: 0.108561
- Epoch: 4/5 - Step: 130/578 - Training Loss: 0.108495
- Epoch: 4/5 - Step: 140/578 - Training Loss: 0.108864
- Epoch: 4/5 - Step: 150/578 - Training Loss: 0.108699
- Epoch: 4/5 - Step: 160/578 - Training Loss: 0.108431
- Epoch: 4/5 - Step: 170/578 - Training Loss: 0.109087
- Epoch: 4

([0.37590606879906147,
  0.165706807995401,
  0.14048133191983697,
  0.12330803851843297,
  0.10823663306339348,
  0.0962954140582241],
 [0.21983185259328372,
  0.2029390742705352,
  0.21392140464614268,
  0.21154211180797522,
  0.21867519077183545,
  0.22014311236747797])

In [21]:
# Test the model on test set if any
if DATA_TEST_PATH is not None:
    print(f"{'*'*40}\n\t\tEVALUATION ON TEST SET\n{'*'*40}")
    test_set = read_data_from_file(DATA_TEST_PATH)

    test_set = NerDataset(dataset=test_set,
                          tokenizer=tokenizer,
                          labels2ind=labels2ind,
                          max_len_seq=MAX_LEN_SEQ)

    dataloader_test = DataLoader(dataset=test_set,
                                 batch_size=BATCH_SIZE_VAL)

    trainer.evaluate(dataloader_test, verbose=True)

In [None]:
# #Find learning rate ----- 
# import random

# from torch.optim import Adam
# from torch.utils.data import DataLoader
# from torch_lr_finder import LRFinder
# from transformers import AutoTokenizer

# from data_utils.data_utils import *
# from nn_utils.neural_architectures import *
# from nn_utils.neural_architectures import BertForTokenClassificationCustom
# from nn_utils.optimizers import get_optimizer_with_weight_decay

# DATA_TR_PATH = '/Users/sdeshpande/Desktop/bioinformatices/MTL-Bioinformatics-2016/data/JNLPBA/original-data/train/Genia4ERtask1.iob2'
# SEED = 42

# # MODEL
# MODEL_NAME = 'allenai/scibert_scivocab_cased'
# MAX_LEN_SEQ = 128

# # Optimization parameters
# BATCH_SIZE_TR = 32
# LEARNING_RATE = 1e-6
# CLIPPING = None
# OPTIMIZER = Adam

# # get data
# training_set = read_data_from_file(DATA_TR_PATH)

# # Automatically extract labels and their indexes from data.
# labels2ind, labels_count = get_labels(training_set)

# # Load data
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# training_set = NerDataset(dataset=training_set,
#                           tokenizer=tokenizer,
#                           labels2ind=labels2ind,
#                           max_len_seq=MAX_LEN_SEQ,
#                           bert_hugging=False)


# dataloader_tr = DataLoader(dataset=training_set,
#                            batch_size=BATCH_SIZE_TR,
#                            shuffle=True)

# # Seeds
# random.seed(SEED)
# np.random.seed(SEED)
# torch.manual_seed(SEED)
# torch.cuda.manual_seed_all(SEED)

# legend = []
# fig = None

# for wd in [0, .1, 1e-2, 1e-3, 1e-4]:
#     for dp in [.1, 0.2, .3]:
#         nerbert = BertForTokenClassificationCustom.from_pretrained(pretrained_model_name_or_path=MODEL_NAME,
#                                                                    num_labels=len(labels2ind),
#                                                                    hidden_dropout_prob=dp,
#                                                                    attention_probs_dropout_prob=dp)

#         # Prepare optimizer and schedule (linear warmup and decay)
#         optimizer = get_optimizer_with_weight_decay(model=nerbert,
#                                                     optimizer=OPTIMIZER,
#                                                     learning_rate=LEARNING_RATE,
#                                                     weight_decay=wd)

#         lr_finder = LRFinder(nerbert, optimizer, nn.CrossEntropyLoss(), device='cuda')
#         lr_finder.range_test(train_loader=dataloader_tr, end_lr=1, num_iter=100)
#         fig = lr_finder.plot(ax=fig)
#         legend.append(f"wd: {wd}")

# fig.figure.legend(legend, loc='best')
# fig.figure.tight_layout()
# fig.figure.show()
# fig.figure.savefig('lr_finder.png')

In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Example
text = "Mouse thymus was used as a source of glucocorticoid receptor from normal CS lymphocytes."

# Load model
tokenizer = AutoTokenizer.from_pretrained("/Users/sdeshpande/Desktop/bioinformatices/bio_ner_bert/trained_models/")
model = AutoModelForTokenClassification.from_pretrained("/Users/sdeshpande/Desktop/bioinformatices/bio_ner_bert/trained_models/")

# Get input for BERT
input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)

# Predict
with torch.no_grad():
    outputs = model(input_ids)

# From the output let's take the first element of the tuple.
# Then, let's get rid of [CLS] and [SEP] tokens (first and last)
predictions = outputs[0].argmax(axis=-1)[0][1:-1]

# Map label class indexes to string labels.
for token, pred in zip(tokenizer.tokenize(text), predictions):
    print(token, '->', model.config.id2label[pred.numpy().item()])

mouse -> O
thymus -> O
was -> O
used -> O
as -> O
a -> O
source -> O
of -> O
glucocorticoid -> B-protein
receptor -> I-protein
from -> O
normal -> B-cell_type
cs -> B-cell_type
lymphocytes -> I-cell_type
. -> O


In [25]:
from transformers import pipeline

text = "Mouse thymus was used as a source of glucocorticoid receptor from normal CS lymphocytes."

nlp_ner = pipeline("ner",
                   model='/Users/sdeshpande/Desktop/bioinformatices/bio_ner_bert/trained_models/',
                   tokenizer='/Users/sdeshpande/Desktop/bioinformatices/bio_ner_bert/trained_models/')

nlp_ner(text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'word': 'glucocorticoid',
  'score': 0.9929465651512146,
  'entity': 'B-protein',
  'index': 9},
 {'word': 'receptor',
  'score': 0.9913542866706848,
  'entity': 'I-protein',
  'index': 10},
 {'word': 'normal',
  'score': 0.6053049564361572,
  'entity': 'B-cell_type',
  'index': 12},
 {'word': 'cs',
  'score': 0.5391853451728821,
  'entity': 'B-cell_type',
  'index': 13},
 {'word': 'lymphocytes',
  'score': 0.9937543869018555,
  'entity': 'I-cell_type',
  'index': 14}]

# use pretrained one from transformers directly

In [22]:
from transformers import pipeline

text = "Mouse thymus was used as a source of glucocorticoid receptor from normal CS lymphocytes."

nlp_ner = pipeline("ner",
                   model='/Users/sdeshpande/Desktop/bioinformatices/bio_ner_bert/trained_models/pytorch',
                   tokenizer='fran-martinez/scibert_scivocab_cased_ner_jnlpba')

nlp_ner(text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'word': 'glucocorticoid',
  'score': 0.9894881248474121,
  'entity': 'B-protein',
  'index': 9},
 {'word': 'receptor',
  'score': 0.989505410194397,
  'entity': 'I-protein',
  'index': 10},
 {'word': 'normal',
  'score': 0.7680374383926392,
  'entity': 'B-cell_type',
  'index': 12},
 {'word': 'cs',
  'score': 0.5176804065704346,
  'entity': 'I-cell_type',
  'index': 13},
 {'word': 'lymphocytes',
  'score': 0.9898492097854614,
  'entity': 'I-cell_type',
  'index': 14}]

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Example
text = "Mouse thymus was used as a source of glucocorticoid receptor from normal CS lymphocytes."

# Load model
tokenizer = AutoTokenizer.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba")
model = AutoModelForTokenClassification.from_pretrained("fran-martinez/scibert_scivocab_cased_ner_jnlpba")

# Get input for BERT
input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)

# Predict
with torch.no_grad():
    outputs = model(input_ids)

# From the output let's take the first element of the tuple.
# Then, let's get rid of [CLS] and [SEP] tokens (first and last)
predictions = outputs[0].argmax(axis=-1)[0][1:-1]

# Map label class indexes to string labels.
for token, pred in zip(tokenizer.tokenize(text), predictions):
    print(token, '->', model.config.id2label[pred.numpy().item()])

mouse -> O
thymus -> O
was -> O
used -> O
as -> O
a -> O
source -> O
of -> O
glucocorticoid -> B-protein
receptor -> I-protein
from -> O
normal -> B-cell_type
cs -> I-cell_type
lymphocytes -> I-cell_type
. -> O
