In [None]:
import json
import os
import re

import numpy as np
import pandas as pd
import spacy
import transformers
from datasets import ClassLabel, load_dataset, load_metric
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [1]:
from spacy.training import Example, offsets_to_biluo_tags, biluo_to_iob

# 1. Preparation

## 1.1. Preparing Train-test set

In [13]:
# Load the raw training samples: the file holds one JSON document per line.
training_data = []
with open('./data/food_ner_dataset_training.jsonl', 'r', encoding='utf-8') as file:
    training_data.extend(json.loads(raw_line) for raw_line in tqdm(file.readlines()))

HBox(children=(FloatProgress(value=0.0, max=2752.0), HTML(value='')))




In [14]:
# Load the raw evaluation samples: the file holds one JSON document per line.
eval_data = []
with open('./data/food_ner_dataset_test.jsonl', 'r', encoding='utf-8') as file:
    eval_data.extend(json.loads(raw_line) for raw_line in tqdm(file.readlines()))

HBox(children=(FloatProgress(value=0.0, max=4128.0), HTML(value='')))




## 1.2. Loading Spacy Model for preprocessing

In [None]:
# spaCy pipeline used for tokenization; the cells shown in this notebook only
# ever call `nlp.make_doc` on it.
nlp = spacy.load('en_core_web_lg')

In [None]:
def preprocessing_for_trf(dataset):
    """Convert spaCy-style NER samples into token-level records.

    Args:
        dataset: Iterable of items where ``item[0]`` is the raw text and
            ``item[1]`` is the annotation dict handed to ``Example.from_dict``.

    Returns:
        list[dict]: One record per item with the keys ``text`` (the example
        text), ``ner_tags`` (the example's entity tags after ``biluo_to_iob``)
        and ``tokens`` (the ``ORTH`` strings of the example's tokens).
    """
    processed_dataset = []
    for item in tqdm(dataset):
        doc = nlp.make_doc(item[0])
        example = Example.from_dict(doc, item[1])
        # Serialise the example once and read both annotation layers from the
        # same dict instead of calling to_dict() twice per sample.
        annotations = example.to_dict()
        processed_dataset.append(
            dict(
                text=example.text,
                ner_tags=biluo_to_iob(annotations['doc_annotation']['entities']),
                tokens=annotations['token_annotation']['ORTH'],
            )
        )

    return processed_dataset

In [None]:
# Turn the raw training samples into {text, ner_tags, tokens} records.
processed_training = preprocessing_for_trf(training_data)

In [None]:
# Persist the processed training records as JSON Lines (one record per line).
with open('./data/food_ner_dataset_trf_v1_training.jsonl', 'w', encoding='utf-8') as file:
    file.writelines(json.dumps(record) + '\n' for record in tqdm(processed_training))

In [None]:
# Turn the raw evaluation samples into {text, ner_tags, tokens} records.
processed_eval = preprocessing_for_trf(eval_data)

In [None]:
# Persist the processed evaluation records as JSON Lines (one record per line).
with open('./data/food_ner_dataset_trf_v1_eval.jsonl', 'w', encoding='utf-8') as file:
    file.writelines(json.dumps(record) + '\n' for record in tqdm(processed_eval))

## 3.2. Loading Dataset

In [None]:
from datasets import load_dataset

In [None]:
# Locations of the JSON Lines files written by the preprocessing cells above.
train_file_path = './data/food_ner_dataset_trf_v1_training.jsonl'
eval_file_path = './data/food_ner_dataset_trf_v1_eval.jsonl'

In [None]:
# Load both files with the 'json' builder; the splits are named 'train' and 'validation'.
datasets = load_dataset('json', data_files={'train': train_file_path, 'validation': eval_file_path} )

In [None]:
# Display the loaded dataset object for a quick sanity check.
datasets

## 3.3. Training

In [None]:
# Folder names of the form "checkpoint-<step>"; the group captures the step number.
_re_checkpoint = re.compile(r"^checkpoint\-(\d+)$")


def get_last_checkpoint(folder):
    """Return the checkpoint sub-folder of ``folder`` with the highest step number.

    Args:
        folder: Directory to scan for sub-directories named ``checkpoint-<step>``.

    Returns:
        The joined path ``folder/checkpoint-<step>`` for the largest ``<step>``,
        or ``None`` when ``folder`` contains no such sub-directory.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder`` cannot be listed.
    """
    step_by_name = {}
    for name in os.listdir(folder):
        match = _re_checkpoint.search(name)
        # Plain files that merely look like checkpoints are ignored.
        if match is not None and os.path.isdir(os.path.join(folder, name)):
            step_by_name[name] = int(match.group(1))
    if not step_by_name:
        return None
    return os.path.join(folder, max(step_by_name, key=step_by_name.get))


In [None]:
# Fix the random seed via transformers' `set_seed` before the model is
# instantiated further down, so that runs are repeatable.
set_seed(42)

In [None]:
# Column names and feature schema of the training split; used below to pick
# the text/label columns and to inspect the label feature type.
column_names = datasets["train"].column_names
features = datasets["train"].features

In [None]:
# Prefer the conventional "tokens" / "ner_tags" columns; otherwise fall back
# to the first and second column of the training split respectively.
if "tokens" in column_names:
    text_column_name = "tokens"
else:
    text_column_name = column_names[0]

if "ner_tags" in column_names:
    label_column_name = "ner_tags"
else:
    label_column_name = column_names[1]

In [None]:
# Show which columns were selected for the text and the labels.
text_column_name, label_column_name

In [None]:
# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
    """Return the distinct labels found in ``labels`` as a sorted list.

    Args:
        labels: Iterable of label sequences (one sequence per example).

    Returns:
        list: Every distinct label that occurs in any sequence, in ascending order.
    """
    seen = set()
    for sequence in labels:
        seen.update(sequence)
    return sorted(seen)

if not isinstance(features[label_column_name].feature, ClassLabel):
    # Labels are not ClassLabel-encoded: collect them from the training split
    # and number them in sorted order.
    label_list = get_label_list(datasets["train"][label_column_name])
    label_to_id = {name: index for index, name in enumerate(label_list)}
else:
    # Labels are already integer class ids, so the mapping is the identity.
    label_list = features[label_column_name].feature.names
    label_to_id = {index: index for index in range(len(label_list))}

num_labels = len(label_list)

In [None]:
# Show how many distinct labels the classification head will have.
num_labels

In [None]:
# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
#
# All three objects come from the same "bert-base-uncased" checkpoint; the
# config carries `num_labels` so the token-classification head is sized to the
# label list computed above.
config = AutoConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    finetuning_task='ner',
    cache_dir=None,
    revision='main',
    use_auth_token=None,
)
# A fast tokenizer is requested explicitly; the alignment code below relies on `word_ids`.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir=None,
    use_fast=True,
    revision='main',
    use_auth_token=None,
)
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    from_tf=False,
    config=config,
    cache_dir=None,
    revision="main",
    use_auth_token=None,
)

In [None]:
# Tokenizer check: this script requires a fast tokenizer.
# Fail early with a clear message if the loaded tokenizer is not a
# PreTrainedTokenizerFast instance.
if not isinstance(tokenizer, PreTrainedTokenizerFast):
    raise ValueError(
        "This example script only works for models that have a fast tokenizer."
    )

In [None]:
# Passed as `num_proc` to the `map` calls below.
preprocessing_num_workers = 6
# Negated and passed as `load_from_cache_file` to the `map` calls below.
overwrite_cache = False
# When True, tokenize_and_align_labels also assigns a label to the non-first
# sub-tokens of a word instead of masking them with -100.
label_all_tokens = True

In [None]:
# Padding strategy: forwarded as the tokenizer's `padding` argument inside
# tokenize_and_align_labels.
padding = False

# Tokenize all texts and align the labels with them.
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build per-sub-token label ids.

    Args:
        examples: Batch mapping that holds the word lists under
            ``text_column_name`` and the matching label lists under
            ``label_column_name``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of ids
        per example, with -100 at every position the loss must ignore.
    """
    # For each label id, the id to use on the *continuation* pieces of a word:
    # B-xxx becomes I-xxx when that label exists, everything else maps to itself.
    # Repeating B-xxx on every piece would make one word look like several
    # entities to an IOB-aware scorer.
    continuation_label_id = []
    for position, name in enumerate(label_list):
        inside_name = "I-" + name[2:]
        if name.startswith("B-") and inside_name in label_list:
            continuation_label_id.append(label_list.index(inside_name))
        else:
            continuation_label_id.append(position)

    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            # Remaining pieces of the same word: either the continuation label or -100,
            # depending on the label_all_tokens flag.
            elif label_all_tokens:
                label_ids.append(continuation_label_id[label_to_id[label[word_idx]]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization + label alignment to every split, 2000 examples per call,
# using `preprocessing_num_workers` processes; cached results are reused unless
# `overwrite_cache` is set.
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=preprocessing_num_workers,
    load_from_cache_file=not overwrite_cache,
    batch_size=2000
)

In [None]:
# Data collator
# NOTE: `fp16` is only consulted here, to choose `pad_to_multiple_of`; it is not
# passed to the TrainingArguments created in this notebook.
fp16 = False # Whether to use 16-bit (mixed) precision training instead of 32-bit training.
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if fp16 else None)

In [None]:
# Metrics: the "seqeval" metric is used by compute_metrics below.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval")

In [None]:
# When True, compute_metrics returns the flattened full result dict instead of
# only the four overall scores.
return_entity_level_metrics = True

In [None]:
def compute_metrics(p):
    """Score predictions with the module-level ``metric``.

    Args:
        p: Pair ``(predictions, labels)``; the arg-max of ``predictions`` along
            axis 2 is taken as the predicted label id, and positions whose
            label is -100 are excluded.

    Returns:
        dict: With ``return_entity_level_metrics`` set, every entry of the
        metric result, with nested dicts flattened to ``"<key>_<subkey>"``;
        otherwise only ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label -100) and translate ids back to label names.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_pairs = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept_pairs])
        true_labels.append([label_list[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Flatten one level of nesting so every value sits under a single key.
    final_results = {}
    for key, value in results.items():
        if not isinstance(value, dict):
            final_results[key] = value
            continue
        for sub_key, sub_value in value.items():
            final_results[f"{key}_{sub_key}"] = sub_value
    return final_results

In [None]:
# Directory that receives the model, trainer state and result text files.
output_dir = './models/transformers-ner-v01'

In [None]:
# 5 epochs with a per-device batch size of 16 for both training and evaluation;
# every other training argument keeps its library default.
training_args = TrainingArguments(output_dir, num_train_epochs=5, per_device_train_batch_size=16, per_device_eval_batch_size=16)

In [None]:
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Training
# NOTE(review): `checkpoint` is assigned but never passed to `trainer.train()`,
# so training always starts from the freshly loaded model. The checkpoint
# selection logic that used to live here was disabled.
checkpoint = None

train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too for easy upload

# Write the training metrics as "key = value" lines, sorted by key. Only the
# process for which `is_world_process_zero()` is true writes the files.
output_train_file = os.path.join(output_dir, "train_results.txt")
if trainer.is_world_process_zero():
    with open(output_train_file, "w") as writer:
        for key, value in sorted(train_result.metrics.items()):
            writer.write(f"{key} = {value}\n")

    # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
    trainer.state.save_to_json(os.path.join(output_dir, "trainer_state.json"))


In [None]:
# Evaluation
# `results` starts as an empty dict and is then replaced by the metrics that
# `trainer.evaluate()` returns for the eval dataset given to the Trainer.
results = {}

results = trainer.evaluate()

# Write the evaluation metrics as "key = value" lines in the order returned.
output_eval_file = os.path.join(output_dir, "eval_results_ner.txt")
if trainer.is_world_process_zero():
    with open(output_eval_file, "w") as writer:
        for key, value in results.items():
            writer.write(f"{key} = {value}\n")

## 3.4. Prediction

In [None]:
# `test_file_path` was never assigned anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. The notebook only produces a training and
# an evaluation file, so the evaluation file doubles as the test set here.
# NOTE(review): point this at a dedicated held-out file if one exists.
test_file_path = eval_file_path
test_dataset = load_dataset('json', data_files={'test': test_file_path} )

In [None]:
# Apply the same tokenization + label alignment to the test dataset.
# NOTE(review): the cells shown below never read `tokenized_test_dataset`;
# prediction runs on `tokenized_datasets["validation"]` instead.
tokenized_test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=preprocessing_num_workers,
    load_from_cache_file=not overwrite_cache,
    batch_size=2000
)

In [None]:
# Predict
# NOTE: `test_dataset` is rebound here to the tokenized validation split, which
# replaces the object loaded from `test_file_path` earlier in this section.
test_dataset = tokenized_datasets["validation"]
predictions, labels, metrics = trainer.predict(test_dataset)
# Reduce the returned predictions to label ids by taking the arg-max along axis 2.
predictions = np.argmax(predictions, axis=2)

In [None]:
# Keep only positions whose gold label is not -100 and map the predicted ids
# back to their label names.
true_predictions = [
    [label_list[predicted_id] for predicted_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
    for predicted_row, gold_row in zip(predictions, labels)
]

In [None]:
# Write the prediction metrics as "key = value" lines, sorted by key.
output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
if trainer.is_world_process_zero():
    with open(output_test_results_file, "w") as writer:
        writer.writelines(f"{name} = {score}\n" for name, score in sorted(metrics.items()))

In [None]:
# Save predictions: one example per line, its predicted labels separated by spaces.
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
if trainer.is_world_process_zero():
    with open(output_test_predictions_file, "w") as writer:
        writer.writelines(" ".join(tag_sequence) + "\n" for tag_sequence in true_predictions)

In [None]:
def convert_trf_to_spacy_preds(texts, preds):
    # NOTE(review): unfinished stub. It tokenizes each text with `nlp.make_doc`
    # but discards the result, never reads `pred`, and returns None.
    for text, pred in zip(texts,preds):
        doc = nlp.make_doc(text)
        

In [None]:
from spacy.tokens import Doc

In [None]:
# Pick one example and rebuild word-level predictions for displaCy.
idx = 3
doc = nlp.make_doc(test_dataset['text'][idx])
tokens = test_dataset['tokens'][idx]
# `true_predictions[idx]` holds one tag per word-piece, while `tokens` holds one
# entry per word, so the two lists do not line up whenever a word is split.
# Re-derive the word ids and keep the prediction of the first piece of each word.
word_ids = tokenizer(tokens, truncation=True, is_split_into_words=True).word_ids()
ents = [
    label_list[predictions[idx][position]]
    for position, word_id in enumerate(word_ids)
    if word_id is not None and (position == 0 or word_id != word_ids[position - 1])
]
# Words cut off by truncation have no prediction; keep both lists the same length.
tokens = tokens[:len(ents)]

In [None]:
# Build a Doc from the word list and the per-word tags, then render its
# entities with displaCy's "ent" style.
spacy.displacy.render(Doc(doc.vocab, words=tokens, ents=ents), style="ent")