<a href="https://colab.research.google.com/github/siddharthtumre/biobert-finetune-ner/blob/main/biobert_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import required packages

In [1]:
! pip install transformers install datasets evaluate seqeval huggingface_hub wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 27.7 MB/s 
[?25hCollecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 62.5 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 9.3 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 56.7 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.13.2-py2.py3-none-any.whl (1.8 MB)
[K     |██████████████████████████████

In [2]:
import os
import wandb
import torch
import numpy as np
from torch import nn
from datasets import DatasetDict

## Set Weights & Biases (W&B) variables and select the CUDA device

In [3]:
# Select the Weights & Biases entity and project that the training runs report to.
os.environ["WANDB_ENTITY"]="siddharthtumre"
os.environ["WANDB_PROJECT"]="finetune-biobert-jnlpba"

# Credentials must never be stored in the notebook. wandb.login() uses WANDB_API_KEY
# when it is already set in the environment and otherwise prompts for the key.
# NOTE: the API key that was previously committed here is exposed and should be revoked.
wandb.login()

In [4]:
# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Load dataset

In [5]:
from datasets import load_dataset
# Fetch the JNLPBA dataset by name; the captured output below shows it being generated
# with a train split and a validation split.
dataset = load_dataset("jnlpba")

Downloading builder script:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading and preparing dataset jnlpba/jnlpba (download: 3.02 MiB, generated: 20.19 MiB, post-processed: Unknown size, total: 23.21 MiB) to /root/.cache/huggingface/datasets/jnlpba/jnlpba/1.0.0/3062f220823930cffde7976b694aa67bac3b06c322a02ced92d3761519810ce4...


Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/863k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37094 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7714 [00:00<?, ? examples/s]

Dataset jnlpba downloaded and prepared to /root/.cache/huggingface/datasets/jnlpba/jnlpba/1.0.0/3062f220823930cffde7976b694aa67bac3b06c322a02ced92d3761519810ce4. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Split the original validation set 50/50 into a validation half and a test half.
# The fixed seed keeps the split, and therefore every reported metric, reproducible
# across kernel restarts. The guard makes the cell safe to re-run: without it a second
# execution would split the already-halved validation set again.
if 'test' not in dataset:
    test_valid = dataset['validation'].train_test_split(test_size=0.5, seed=42)
    dataset = DatasetDict(
            {'train': dataset['train'], 'validation': test_valid['test'], 'test': test_valid['train']})

In [7]:
# Display the splits and their row counts to confirm the train/validation/test layout.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 37094
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3857
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3857
    })
})

In [8]:
# Read the tag names from the "ner_tags" feature of the train split;
# a label id is an index into this list.
label_names = dataset["train"].features["ner_tags"].feature.names
label_names

['O',
 'B-DNA',
 'I-DNA',
 'B-RNA',
 'I-RNA',
 'B-cell_line',
 'I-cell_line',
 'B-cell_type',
 'I-cell_type',
 'B-protein',
 'I-protein']

## Download the model

In [23]:
# Initialise Git LFS before cloning the model repository.
# The "not a git repository" error in the captured output is printed because the command
# runs outside a repository; the same output still ends with "Git LFS initialized."
! git lfs install

Error: Failed to call git rev-parse --git-dir --show-toplevel: "fatal: not a git repository (or any of the parent directories): .git\n"
Git LFS initialized.


In [25]:
! git clone https://huggingface.co/dmis-lab/biobert-v1.1

Cloning into 'biobert-v1.1'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects:   3% (1/29)[Kremote: Counting objects:   6% (2/29)[Kremote: Counting objects:  10% (3/29)[Kremote: Counting objects:  13% (4/29)[Kremote: Counting objects:  17% (5/29)[Kremote: Counting objects:  20% (6/29)[Kremote: Counting objects:  24% (7/29)[Kremote: Counting objects:  27% (8/29)[Kremote: Counting objects:  31% (9/29)[Kremote: Counting objects:  34% (10/29)[Kremote: Counting objects:  37% (11/29)[Kremote: Counting objects:  41% (12/29)[Kremote: Counting objects:  44% (13/29)[Kremote: Counting objects:  48% (14/29)[Kremote: Counting objects:  51% (15/29)[Kremote: Counting objects:  55% (16/29)[Kremote: Counting objects:  58% (17/29)[Kremote: Counting objects:  62% (18/29)[Kremote: Counting objects:  65% (19/29)[Kremote: Counting objects:  68% (20/29)[Kremote: Counting objects:  72% (21/29)[Kremote: Counting objects:  75% (22/29)[Kremote: Coun

## Perform tokenization

In [26]:
from transformers import AutoTokenizer
# Load the tokenizer from the locally cloned checkpoint directory; local_files_only=True
# restricts the lookup to files already on disk.
tokenizer = AutoTokenizer.from_pretrained("./biobert-v1.1", local_files_only=True)

In [46]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: integer label ids, one per original word.
        word_ids: for each token, the index of the word it belongs to, or
            ``None`` when the token belongs to no word (special token).

    Returns:
        A list with one entry per element of ``word_ids``:
        ``-100`` for ``None`` entries, the word's label for the first token of a
        word, and for the following tokens of the same word the word's label
        with odd ids incremented by one (B-XXX becomes I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored by the loss, and it ends the current word.
            aligned.append(-100)
            previous_word = None
            continue

        word_label = labels[word_id]
        if word_id == previous_word and word_label % 2 == 1:
            # Continuation piece of a word tagged B-XXX: switch to I-XXX.
            word_label += 1
        previous_word = word_id
        aligned.append(word_label)

    return aligned

In [47]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: a batch with a "tokens" entry (one list of words per sentence)
            and a "ner_tags" entry (one list of label ids per sentence).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry holding
        the labels produced by ``align_labels_with_tokens`` for every sentence.
    """
    # Sequences are truncated to at most 128 tokens.
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, max_length=128,
    )
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

# Apply the tokenization to every split in batches and drop the original columns,
# leaving only the tokenizer outputs and the aligned labels.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

  0%|          | 0/38 [00:00<?, ?ba/s]



In [48]:
# Display the tokenized splits to check their columns and row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 37094
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3857
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3857
    })
})

In [49]:
from transformers import DataCollatorForTokenClassification
# Batch collator built from the tokenizer; it is handed to every Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [50]:
import evaluate
# Load the seqeval metric, which compute_metrics uses to score the predicted tag sequences.
metric = evaluate.load("seqeval")

In [51]:
# Two-way mapping between label ids and label names, later stored in the model config.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [52]:
# Display the id -> label mapping for inspection.
id2label

{0: 'O',
 1: 'B-DNA',
 2: 'I-DNA',
 3: 'B-RNA',
 4: 'I-RNA',
 5: 'B-cell_line',
 6: 'I-cell_line',
 7: 'B-cell_type',
 8: 'I-cell_type',
 9: 'B-protein',
 10: 'I-protein'}

In [53]:
def compute_metrics(eval_preds):
    """Score predictions with seqeval and return the metrics as a flat dict.

    Args:
        eval_preds: a ``(logits, labels)`` pair. The predicted label id is the
            argmax of ``logits`` over axis 2 (presumably (batch, token, label)
            scores; confirm against the Trainer).

    Returns:
        A dict of metric values. Nested per-entity dicts returned by the metric
        are flattened into ``"<entity>_<name>"`` keys; scalar entries are copied
        unchanged.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=2)

    # Drop positions labelled -100 (ignored index) and map ids to label names.
    true_labels, true_predictions = [], []
    for pred_row, label_row in zip(predictions, labels):
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        true_labels.append([label_names[l] for _, l in kept])
        true_predictions.append([label_names[p] for p, _ in kept])

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    final_results = {}
    for key, value in all_metrics.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

In [54]:
from transformers import AutoModelForTokenClassification

# Load the cloned BioBERT checkpoint as a token-classification model, with the label
# mappings stored in its config, then move it to the selected device.
model = AutoModelForTokenClassification.from_pretrained(
    "./biobert-v1.1",
    id2label=id2label,
    label2id=label2id,
    local_files_only=True,
)
model = model.to(device)

loading configuration file ./biobert-v1.1/config.json
Model config BertConfig {
  "_name_or_path": "./biobert-v1.1",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-DNA",
    "2": "I-DNA",
    "3": "B-RNA",
    "4": "I-RNA",
    "5": "B-cell_line",
    "6": "I-cell_line",
    "7": "B-cell_type",
    "8": "I-cell_type",
    "9": "B-protein",
    "10": "I-protein"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DNA": 1,
    "B-RNA": 3,
    "B-cell_line": 5,
    "B-cell_type": 7,
    "B-protein": 9,
    "I-DNA": 2,
    "I-RNA": 4,
    "I-cell_line": 6,
    "I-cell_type": 8,
    "I-protein": 10,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_lay

In [55]:
# Check the number of labels in the model config (11 in the captured output, matching label_names).
model.config.num_labels

11

## Prediction before any fine-tuning (zero-epoch baseline)

In [56]:
from transformers import Trainer

# Trainer used only for the untrained baseline prediction. No TrainingArguments are
# passed, so the defaults apply (the log below reports output_dir=tmp_trainer).
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [57]:
# Run the not-yet-fine-tuned model on the test split; metric names get the "predict" prefix.
predictions, labels, metrics = trainer.predict(tokenized_dataset["test"], metric_key_prefix="predict")

***** Running Prediction *****
  Num examples = 3857
  Batch size = 8


In [58]:
# Convert the raw prediction scores into label ids. The result gets its own name so
# that `predictions` keeps the scores returned by trainer.predict and this cell can be
# re-run: overwriting `predictions` made a second run fail, because argmax over axis 2
# is not possible on the already-reduced array.
predicted_ids = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predicted_ids, labels)
]


In [59]:
# Print the baseline test metrics under the "predict" heading.
trainer.log_metrics("predict", metrics)

***** predict metrics *****
  predict_DNA_f1              =     0.0005
  predict_DNA_number          =       1079
  predict_DNA_precision       =     0.0002
  predict_DNA_recall          =     0.0148
  predict_RNA_f1              =        0.0
  predict_RNA_number          =        105
  predict_RNA_precision       =        0.0
  predict_RNA_recall          =        0.0
  predict_cell_line_f1        =        0.0
  predict_cell_line_number    =        491
  predict_cell_line_precision =        0.0
  predict_cell_line_recall    =        0.0
  predict_cell_type_f1        =        0.0
  predict_cell_type_number    =       1948
  predict_cell_type_precision =        0.0
  predict_cell_type_recall    =        0.0
  predict_loss                =     2.3339
  predict_overall_accuracy    =      0.143
  predict_overall_f1          =     0.0003
  predict_overall_precision   =     0.0001
  predict_overall_recall      =     0.0018
  predict_protein_f1          =        0.0
  predict_protein_number  

In [61]:
# Save predictions: one line per test sentence, labels separated by single spaces.
output_predictions_file = os.path.join("tmp_trainer", "predictions.txt")
if trainer.is_world_process_zero():
    # Create the target directory explicitly instead of relying on the Trainer
    # having created it already.
    os.makedirs(os.path.dirname(output_predictions_file), exist_ok=True)
    with open(output_predictions_file, "w") as writer:
        writer.writelines(" ".join(prediction) + "\n" for prediction in true_predictions)

In [42]:
# trainer.save_metrics("predict", metrics)

## Train, eval and predict for 1 epoch

In [64]:
from transformers import TrainingArguments

# Settings for the one-epoch run: evaluation and checkpointing happen once per epoch,
# and outputs are written to ./oneepoch.
args = TrainingArguments(
    output_dir="./oneepoch",
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [65]:
from transformers import Trainer

# Trainer for the one-epoch run: trains on the train split and evaluates on the
# validation split with the metrics from compute_metrics.
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [67]:
# Run the one-epoch fine-tuning; the per-epoch evaluation results appear in the output table.
trainer.train()

***** Running training *****
  Num examples = 37094
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1160
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33msiddharthtumre[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Dna Precision,Dna Recall,Dna F1,Dna Number,Rna Precision,Rna Recall,Rna F1,Rna Number,Cell Line Precision,Cell Line Recall,Cell Line F1,Cell Line Number,Cell Type Precision,Cell Type Recall,Cell Type F1,Cell Type Number,Protein Precision,Protein Recall,Protein F1,Protein Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.1645,0.261658,0.661249,0.751215,0.703367,1029,0.680851,0.732824,0.705882,131,0.493369,0.730845,0.589074,509,0.741417,0.707717,0.724175,1892,0.660249,0.822489,0.732493,5087,0.662682,0.782146,0.717475,0.917647


***** Running Evaluation *****
  Num examples = 3857
  Batch size = 32
Saving model checkpoint to ./oneepoch/checkpoint-1160
Configuration saved in ./oneepoch/checkpoint-1160/config.json
Model weights saved in ./oneepoch/checkpoint-1160/pytorch_model.bin
tokenizer config file saved in ./oneepoch/checkpoint-1160/tokenizer_config.json
Special tokens file saved in ./oneepoch/checkpoint-1160/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1160, training_loss=0.20234234744104845, metrics={'train_runtime': 644.6325, 'train_samples_per_second': 57.543, 'train_steps_per_second': 1.799, 'total_flos': 1768308404925576.0, 'train_loss': 0.20234234744104845, 'epoch': 1.0})

In [68]:
# Run the model fine-tuned for one epoch on the test split; metric names get the "predict" prefix.
predictions, labels, metrics = trainer.predict(tokenized_dataset["test"], metric_key_prefix="predict")

***** Running Prediction *****
  Num examples = 3857
  Batch size = 32


In [69]:
# Convert the raw prediction scores into label ids. The result gets its own name so
# that `predictions` keeps the scores returned by trainer.predict and this cell can be
# re-run: overwriting `predictions` made a second run fail, because argmax over axis 2
# is not possible on the already-reduced array.
predicted_ids = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predicted_ids, labels)
]


In [70]:
# Print the test metrics of the one-epoch model under the "predict" heading.
trainer.log_metrics("predict", metrics)

***** predict metrics *****
  predict_DNA_f1              =     0.7186
  predict_DNA_number          =       1079
  predict_DNA_precision       =     0.6707
  predict_DNA_recall          =     0.7739
  predict_RNA_f1              =     0.6949
  predict_RNA_number          =        105
  predict_RNA_precision       =      0.626
  predict_RNA_recall          =      0.781
  predict_cell_line_f1        =      0.576
  predict_cell_line_number    =        491
  predict_cell_line_precision =     0.4761
  predict_cell_line_recall    =     0.7291
  predict_cell_type_f1        =     0.7322
  predict_cell_type_number    =       1948
  predict_cell_type_precision =     0.7571
  predict_cell_type_recall    =     0.7089
  predict_loss                =     0.2711
  predict_overall_accuracy    =     0.9167
  predict_overall_f1          =     0.7238
  predict_overall_precision   =     0.6692
  predict_overall_recall      =     0.7882
  predict_protein_f1          =      0.739
  predict_protein_number  

In [71]:
# Save predictions: one line per test sentence, labels separated by single spaces.
output_predictions_file = os.path.join("./oneepoch", "predictions.txt")
if trainer.is_world_process_zero():
    # Create the target directory explicitly instead of relying on the Trainer
    # having created it already.
    os.makedirs(os.path.dirname(output_predictions_file), exist_ok=True)
    with open(output_predictions_file, "w") as writer:
        writer.writelines(" ".join(prediction) + "\n" for prediction in true_predictions)

In [72]:
# Save the test metrics of the one-epoch model through the Trainer, under the "predict" name.
trainer.save_metrics("predict", metrics)

## Train and eval for 20 epochs

In [None]:
from transformers import TrainingArguments

# Settings for the 20-epoch run: evaluate and checkpoint every epoch, report to
# Weights & Biases, and push the result to the Hugging Face Hub.
#
# The Hub token must not be stored in the notebook. It is read from the HF_TOKEN
# environment variable; when that is unset, None is passed so the credentials cached by
# `huggingface-cli login` / `notebook_login()` are used instead.
# NOTE: the token that was previously committed here is exposed and should be revoked.
args = TrainingArguments(
    output_dir="biobert-complete-ner",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=20,
    weight_decay=0.01,
    report_to="wandb",
    push_to_hub=True,
    hub_token=os.environ.get("HF_TOKEN"),
)

In [None]:
from transformers import Trainer

# Trainer for the 20-epoch run, using the same datasets, collator and metrics as before.
# NOTE(review): `model` is the same object that was already fine-tuned for one epoch in
# the previous section, so this run continues from those weights rather than starting
# from the pretrained checkpoint. Confirm that this is intended, or reload the model first.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Run the 20-epoch training, then close the Weights & Biases run.
trainer.train()
wandb.finish()