In [None]:
! pip install git+https://github.com/huggingface/transformers

! pip install datasets

! git clone 'https://github.com/rpsfilho93/HAREM.git'

In [None]:
import numpy as np
from datasets import load_dataset, load_metric

from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    PreTrainedTokenizerFast,
    DataCollatorForTokenClassification,
    Trainer,
)

In [None]:
# --- Data -------------------------------------------------------------------
# NOTE(review): the setup cell clones the HAREM repository; confirm these
# relative paths resolve from the notebook's working directory.
train_file, validation_file = (
    "./data/json/harem_train_total_conll2003.json",
    "./data/json/harem_test_total_conll2003.json",
)
cache_dir = "./cache"
overwrite_cache = True

# --- Task / model -----------------------------------------------------------
task_name = "ner"
# TODO: left empty in the notebook; must be set to a model id or local path
# before the from_pretrained cells can run.
model_name_or_path = ""
model_revision = "main"

# --- Tokenization -----------------------------------------------------------
max_seq_length = 512
pad_to_max_length = False
label_all_tokens = False

# --- Precision --------------------------------------------------------------
fp16 = True

In [None]:
# Split name -> JSON file path, in the form expected by load_dataset's
# `data_files` argument.
data_files = dict(train=train_file, validation=validation_file)

In [None]:
# Read the train/validation JSON files with the generic 'json' loader of
# `datasets`; the result is indexed by split name in the following cells.
raw_datasets = load_dataset('json', data_files=data_files, cache_dir=cache_dir)

In [None]:
# Inspect the schema of the training split.
features = raw_datasets["train"].features
column_names = raw_datasets["train"].column_names

# Use the "tokens" column as model input when present; otherwise fall back to
# the first column of the dataset.
text_column_name = "tokens" if "tokens" in column_names else column_names[0]


In [None]:
# The label vocabulary is not given up front, so it is collected by scanning
# every tag sequence in the dataset.
def get_label_list(labels):
    """Return the sorted list of distinct tags found across all tag sequences.

    Args:
        labels: iterable of tag sequences, one sequence per example.

    Returns:
        A list holding each distinct tag once, in ascending sort order.
    """
    distinct_tags = {tag for sequence in labels for tag in sequence}
    return sorted(distinct_tags)


In [None]:
# Name of the column that holds the per-word tags; with task_name = "ner"
# this evaluates to "ner_tags".
label_column_name = f"{task_name}_tags"

In [None]:
# Build the label vocabulary from the training split, then index it:
# each tag maps to its position in the sorted label list.
label_list = get_label_list(raw_datasets["train"][label_column_name])
num_labels = len(label_list)
label_to_id = {tag: position for position, tag in enumerate(label_list)}


In [None]:
# Map that sends each B-Xxx label id to the id of its I-Xxx counterpart.
# Labels that are not B- labels, or whose I- counterpart is absent from
# label_list, map to themselves.
b_to_i_label = []
for idx, label in enumerate(label_list):
    # Swap only the leading "B-" prefix. An unbounded str.replace would also
    # rewrite any later "B-" occurring inside the entity name itself.
    inside_label = "I-" + label[2:] if label.startswith("B-") else None
    if inside_label is not None and inside_label in label_list:
        b_to_i_label.append(label_list.index(inside_label))
    else:
        b_to_i_label.append(idx)


In [None]:
# Load the model configuration and attach the label setup: the number of
# labels plus both directions of the tag <-> id mapping (id2label is the
# inverse of label_to_id).
# NOTE(review): recent transformers releases deprecate `use_auth_token` in
# favour of `token` — confirm against the installed version.
config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        label2id=label_to_id,
        id2label={i: l for l, i in label_to_id.items()},
        finetuning_task=task_name,
        cache_dir=cache_dir,
        revision=model_revision,
        use_auth_token=False
    )

In [None]:
# Load the tokenizer for the same checkpoint and revision as the config,
# requesting the fast implementation (a later cell asserts the result is a
# PreTrainedTokenizerFast).
# NOTE(review): recent transformers releases deprecate `use_auth_token` in
# favour of `token` — confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            cache_dir=cache_dir,
            use_fast=True,
            revision=model_revision,
            use_auth_token=False,
        )

In [None]:
# Load the token-classification model from the checkpoint, using the config
# built above (which carries num_labels and the label mappings).
# from_tf=False: the checkpoint is not loaded from TensorFlow weights.
# NOTE(review): recent transformers releases deprecate `use_auth_token` in
# favour of `token` — confirm against the installed version.
model = AutoModelForTokenClassification.from_pretrained(
        model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=cache_dir,
        revision=model_revision,
        use_auth_token=False,
    )

In [None]:
# Guard: tokenize_and_align_labels calls tokenized_inputs.word_ids(...) to
# align tags with sub-word tokens.
# NOTE(review): presumably only fast tokenizers provide that mapping — confirm.
assert isinstance(tokenizer, PreTrainedTokenizerFast)

In [None]:
# Padding strategy handed to the tokenizer: pad to "max_length" when
# pad_to_max_length is set, otherwise do not pad at tokenization time
# (a DataCollatorForTokenClassification is created later and given to the
# Trainer — presumably it pads per batch; confirm).
padding = "max_length" if pad_to_max_length else False

In [None]:
# Tokenize a batch of pre-split sentences and build a per-token label row
# for each one.
def tokenize_and_align_labels(examples):
    """Tokenize word-split examples and align their word-level tags to tokens.

    Args:
        examples: batch mapping that holds the word lists under
            ``text_column_name`` and the matching tag lists under
            ``label_column_name``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per example, using -100 for positions that must not
        contribute to the loss.
    """
    # The inputs are already lists of words (one tag per word), hence
    # is_split_into_words=True.
    encoded = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=max_seq_length,
        is_split_into_words=True,
    )

    aligned = []
    for batch_pos, word_tags in enumerate(examples[label_column_name]):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_pos):
            if current_word is None:
                # Special tokens carry no word id; -100 makes the loss skip them.
                target = -100
            elif current_word != last_word:
                # First token of a word receives that word's label.
                target = label_to_id[word_tags[current_word]]
            elif label_all_tokens:
                # Continuation token, labelled with the I- variant of the word's tag.
                target = b_to_i_label[label_to_id[word_tags[current_word]]]
            else:
                # Continuation token, ignored by the loss.
                target = -100
            row.append(target)
            last_word = current_word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Tokenize the training split and attach aligned labels, batch by batch.
# The on-disk cache is reused only when overwrite_cache is off.
train_dataset = raw_datasets["train"].map(
    tokenize_and_align_labels,
    batched=True,
    load_from_cache_file=not overwrite_cache,
)

In [None]:
# Tokenize the validation split and attach aligned labels, batch by batch.
# The on-disk cache is reused only when overwrite_cache is off.
eval_dataset = raw_datasets["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    load_from_cache_file=not overwrite_cache,
)

In [None]:
# Data collator handed to the Trainer. When fp16 is enabled it is asked to pad
# to a multiple of 8; otherwise no multiple is requested (None).
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if fp16 else None)


In [None]:
# Metrics: load the "seqeval" metric; compute_metrics later calls
# metric.compute(predictions=..., references=...) on sequences of tag strings.
# NOTE(review): this requires the seqeval package to be installed, and newer
# `datasets` releases are reported to have dropped `load_metric` — confirm
# against the installed versions.
metric = load_metric("seqeval")


In [None]:
def compute_metrics(p, return_entity_level_metrics=False):
    """Turn raw model outputs into seqeval scores.

    The previous body read ``data_args.return_entity_level_metrics``, but no
    ``data_args`` object is defined anywhere in the visible notebook, so the
    first evaluation raised ``NameError``. The switch is now a keyword
    argument whose default keeps the single-argument call made by the Trainer
    working and selects the overall-metrics branch.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores with the class dimension on axis 2; ``labels`` holds
            the gold label ids, with -100 marking positions to ignore.
        return_entity_level_metrics: when True, return every value reported
            by the metric, flattening nested per-entity dictionaries into
            ``"<entity>_<metric>"`` keys. When False (default), return only
            the overall precision, recall, F1 and accuracy.

    Returns:
        A flat dict of metric names to values.
    """
    predictions, labels = p
    # Highest-scoring class id for every token.
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (label -100) and convert ids back to tag strings.
    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(prediction, label) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if return_entity_level_metrics:
        # Unpack nested dictionaries into a single flat level.
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [None]:
# Initialize our Trainer with the model, both tokenized splits, the tokenizer,
# the padding collator and the seqeval-based metric function.
# NOTE(review): no `args` is passed, so the Trainer runs with its default
# training arguments; the `fp16` flag from the configuration cell is only used
# for the collator's padding multiple and is not forwarded here — confirm this
# is intended.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Training: run the fine-tuning loop, then persist the model, the training
# metrics and the trainer state.
train_result = trainer.train()
metrics = train_result.metrics
trainer.save_model() # Saves the tokenizer too for easy upload

# Record the training-set size alongside the metrics reported by train().
metrics["train_samples"] = len(train_dataset)

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# Evaluation: score the model on the eval_dataset given to the Trainer.
# Note that `metrics` is rebound here, replacing the training metrics dict.
metrics = trainer.evaluate()

# Record the evaluation-set size alongside the reported scores.
metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)