In [1]:
import os

import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertModel
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoModelForSequenceClassification
from transformers import DataCollatorForTokenClassification
from datasets import load_metric, load_from_disk
from datasets import Dataset, DatasetDict

from src.controllers import Controller
from src.NLP.datasets.pytorch import PytorchProcessor
from src.NLP.datasets.bert import BertProcessor
from src.NER.pytorch_dataloader import Collator, PytorchDataset, get_dataloader
from src.tools.general_tools import load_pickled_data
from config.nlp_models import BERT_MODEL_NAME

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the PyTorch-format dataset only when the cache directory is missing or empty.
c = Controller('pytorch')
pt_dataset_dir = os.path.join("..", c._dataset_base_path)
# os.listdir raises FileNotFoundError on a directory that does not exist yet,
# so a missing directory is treated the same way as an empty one.
if not os.path.isdir(pt_dataset_dir) or not os.listdir(pt_dataset_dir):
    trainset = PytorchProcessor('../data/trainset/annotations.jsonl')
    # NOTE(review): the emptiness check above (and the loading cell below) look
    # under "../<base path>", while the export receives the base path without
    # the "../" prefix — confirm to_pt_format resolves it to the same directory.
    trainset.to_pt_format(c._dataset_base_path)

In [3]:
# Resolve the three dataset artifacts below the controller's dataset directory.
train_path, word2idx_path, label2idx_path = (
    os.path.join("..", c._dataset_base_path, filename)
    for filename in ('train.pk', 'word2idx.pk', 'label2idx.pk')
)

# Fail fast, naming the first artifact that is missing on disk.
for description, artifact_path in (
    ('Train dataset', train_path),
    ('Word2idx', word2idx_path),
    ('Label2idx', label2idx_path),
):
    assert os.path.isfile(artifact_path), f'{description} not found at {artifact_path}'

In [4]:
# Build the training dataset object from the three artifact files whose
# existence was asserted in the previous cell (train split, word index, label index).
# NOTE(review): `ds` is not referenced by any later cell shown here — confirm it is still needed.
ds = PytorchDataset(train_path, word2idx_path, label2idx_path)

In [5]:
# Loader over the same three artifact files: shuffled batches of two,
# with num_workers=0 (no worker subprocesses requested).
dl = get_dataloader(
    train_path,
    word2idx_path,
    label2idx_path,
    batch_size=2,
    shuffle=True,
    num_workers=0,
)

## BERT format

In [6]:
# seqeval scorer; compute_metrics below calls metric.compute() on the
# label-name sequences it reconstructs from the model output.
# NOTE(review): the echoed line in this cell's output looks like a deprecation
# warning for load_metric — recent `datasets` releases point to the separate
# `evaluate` package instead; confirm against the pinned version before upgrading.
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [7]:
# Reload the cached BERT-format artifacts instead of rebuilding them.
# NOTE(review): the next cell assigns `dataset`, `label_names` and `tokenizer`
# again from the raw annotations, overwriting everything loaded here.
tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)
dataset = load_from_disk("../results/dataset/bert")
pickled_labels = load_pickled_data("../results/dataset/bert/labels.pkl")
label_names = list(pickled_labels)

In [7]:
# Rebuild the BERT-format dataset directly from the raw annotation file.
# This reassigns `label_names`, `dataset` and `tokenizer`, replacing whatever
# the previous cell loaded from disk.
bertp = BertProcessor('../data/trainset/annotations.jsonl')
# Position in this list is the class id used to index label_names in compute_metrics.
label_names = list(bertp.labels)
# NOTE(review): None is passed as the output location; the cell log reports
# "Saved data under the None directory" — confirm whether skipping the save
# (or saving to a literal "None" path) is intended.
dataset = bertp.to_bert_format(None)
# Reuse the processor's own tokenizer for the collator and the Trainer below.
tokenizer = bertp.tokenizer.tokenizer

2023-03-22 13:30:03.205 | INFO     | src.NLP.tokenizers.bert:__init__:14 - Loading BERT Tokenizer...
1866it [00:11, 167.69it/s]
2023-03-22 13:30:15.329 | INFO     | src.NLP.datasets.bert:load_data:113 - Loaded 1856 entries from ../data/trainset/annotations.jsonl and ignored 10 entries.
2023-03-22 13:30:29.910 | INFO     | src.NLP.datasets.bert:to_bert_format:184 - Saved data under the None directory.


In [8]:
# Show the label vocabulary; compute_metrics indexes this list with the
# predicted and reference class ids, so list position is the class id.
label_names

['Substance',
 'Unit',
 'Type',
 'O',
 'E Number',
 'Usage',
 'Substance Specification',
 'INS Number',
 'Comment',
 'Operator',
 'SKIP',
 'Expressed As',
 'Function',
 'Value',
 'Other Identifiers',
 'CAS',
 'Synonym',
 'Class',
 'Conditions Of Use']

In [9]:
# Batch collator handed to the Trainer below.
# Per the transformers documentation it pads inputs and labels per batch and
# fills padded label positions with -100 by default — the same value that
# compute_metrics filters out (confirm against the installed version).
data_collator = DataCollatorForTokenClassification(tokenizer)

In [10]:
def compute_metrics(p):
    """Score token-classification predictions with the module-level `metric`.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 to obtain one class id per position;
            ``labels`` holds the reference class ids, where ``-100`` marks
            positions to ignore.

    Returns:
        dict: The four ``overall_*`` scores reported by the metric, followed by
        one ``"<key>_f1"`` entry for every other key in the metric result.
    """
    raw_scores, reference_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only the positions whose reference id is not the ignore marker (-100).
        kept_pairs = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_names[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_names[reference_id] for _, reference_id in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    flattened_results = {key: results[key] for key in overall_keys}

    # Every remaining entry is reduced to its "f1" value under "<key>_f1".
    for key, entry in results.items():
        if key not in flattened_results:
            flattened_results[key + "_f1"] = entry["f1"]

    return flattened_results

In [11]:
# Seed the RNG before the model is created. The load warning in this cell's
# output reports weights that are not initialised from the checkpoint, so
# without a seed every run starts from different random weights for them.
SEED = 42
torch.manual_seed(SEED)

# Pretrained BERT encoder with a token-classification head sized to the label list,
# kept on the CPU.
model = AutoModelForTokenClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=len(label_names),
).to('cpu')

# Single-epoch CPU fine-tuning run, evaluated once per epoch.
training_args = TrainingArguments(
    output_dir="../results/model/fine_tune_bert_output_cased",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1000,
    save_strategy='steps',
    no_cuda=True,
    # 42 is the documented TrainingArguments default, stated explicitly so the
    # seed used during training matches the one used for model initialisation.
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['eval'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

KeyboardInterrupt: 