# Token classification (PyTorch)

# My data

In [None]:
from moralization import spacy_model
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import datasets
import evaluate
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import get_scheduler
# NOTE(review): the return value is discarded here because this is not the
# last expression of the cell; wrap it in print() if it should be shown.
torch.cuda.is_available()
# Import the annotated data as spaCy docs and take it from there.
data_dir = "../data/All_Data/XMI_11"
test_setup = spacy_model.SpacySetup(data_dir, working_dir="./test")
# doc_dict is indexed below as data_doc[name]["train"]; presumably one entry
# per input file -- TODO confirm against moralization.spacy_model.
data_doc = test_setup.doc_dict
# Only the first entry is used for the rest of the notebook.
example_name = list(data_doc.keys())[0]

In [None]:
# Build per-sentence word lists and per-sentence integer labels from the
# training doc of the selected example.
# Label ids: 0 = outside, 1 = inside a selected span, 2 = first token of a
# selected span, -100 = ignored (punctuation).
train_doc = data_doc[example_name]["train"]
sentence_list = [[token.text for token in sent] for sent in train_doc.sents]
token_list = [[token for token in sent] for sent in train_doc.sents]
# initialize nested label list to 0
label_list = [[0 for token in sent] for sent in train_doc.sents]
# Preview the first sentences; slicing keeps this safe for very short docs.
for sentence, sentence_labels in zip(sentence_list[:5], label_list[:5]):
    print(sentence)
    print(sentence_labels)
# generate the labels based on the current list of tokens
# now set all Moralisierung, Moralisierung Kontext,
# Moralisierung explizit, Moralisierung interpretativ, Moralisierung Weltwissen to 1
selected_labels = ["Moralisierung", "Moralisierung Kontext", "Moralisierung Weltwissen",
                   "Moralisierung explizit", "Moralisierung interpretativ"]
# create a flat list as long as the doc has tokens
labels = [0 for token in train_doc]
for span in train_doc.spans["task1"]:
    if span.label_ in selected_labels:
        # The slice start+1:end holds (end - start - 1) tokens. The replacement
        # must have exactly that length: a longer list would be *inserted*,
        # growing `labels` and shifting every label after this span.
        labels[span.start + 1:span.end] = [1] * (span.end - span.start - 1)
        # mark the beginning of a span with 2
        labels[span.start] = 2

# labels now needs to be structured the same way as label_list
# set the label at beginning of sentence to 2 if it is 1
# also punctuation is included in the moralization label - we
# definitely need to set those labels to -100
# j walks the flat doc-level label list while (m, i) walk sentence/token.
# Assumes the sentences together cover every token of the doc in order.
j = 0
for m, sentence_tokens in enumerate(token_list):
    for i, token in enumerate(sentence_tokens):
        if token.is_punct:
            # punctuation is excluded from the loss
            label_list[m][i] = -100
        elif i == 0 and labels[j] == 1:
            # a span continuing across a sentence boundary restarts here
            label_list[m][i] = 2
        else:
            label_list[m][i] = labels[j]
        j += 1


for sentence, sentence_labels in zip(sentence_list[:10], label_list[:10]):
    print(sentence)
    print(sentence_labels)

In [None]:
# At this point the sentences and labels can be put into a table and loaded
# into `datasets`; later it could be published as such on Hugging Face datasets.
# Column heads are "Sentences" (list of words) and "Labels" (list of ints).
df = pd.DataFrame(zip(sentence_list, label_list), columns=["Sentences", "Labels"])
print(df.head(10))
data_set = datasets.Dataset.from_pandas(df)
# Split into train and test (10 % test).
# NOTE(review): no seed is passed, so the split differs between runs.
train_test_set = data_set.train_test_split(test_size=0.1)

In [None]:
# Inspect the label lists of the first 100 training sentences.
train_test_set["train"][0:100]["Labels"]

In [None]:
# Checkpoint used for both the tokenizer and the model below.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# word_ids() is used further down when aligning labels; check here that a
# fast tokenizer was loaded.
tokenizer.is_fast

In [None]:
# Tokenize one pre-split training sentence to inspect the sub-word tokens.
inputs = tokenizer(train_test_set["train"][9]["Sentences"], is_split_into_words=True)
inputs.tokens()

In [None]:
# label_list needs to be expanded to cover the new tokens
# beginning of a span needs a different label than inside of a span
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per sub-word token.

    Args:
        labels: Word-level label ids, indexable by word id.
        word_ids: For every token, the id of the word it belongs to, or
            ``None`` for special tokens.

    Returns:
        A list with one label per entry of ``word_ids``. Special tokens get
        -100. The first token of a word keeps the word's label. Further
        tokens of the same word keep it too, except that the begin label 2
        is turned into the inside label 1.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored by the loss.
            aligned.append(-100)
        elif word_id != previous_word:
            # First sub-word token of a new word.
            aligned.append(labels[word_id])
        else:
            # Continuation of the previous word: begin (2) becomes inside (1).
            word_label = labels[word_id]
            aligned.append(word_label - 1 if word_label == 2 else word_label)
        previous_word = word_id
    return aligned


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: A batch with the columns ``"Sentences"`` (lists of words)
            and ``"Labels"`` (lists of word-level label ids).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, per
        example, the labels expanded by ``align_labels_with_tokens``.
    """
    tokenized_inputs = tokenizer(
        examples["Sentences"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) maps every token of example i back to its source word.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples["Labels"])
    ]
    return tokenized_inputs

In [None]:
# Tokenize both splits in batches. The original columns are dropped so only
# the tokenizer output and the aligned "labels" remain.
tokenized_datasets = train_test_set.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_test_set["train"].column_names,
)

In [None]:
# Collator used as collate_fn for the dataloaders below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Collate the first two training examples and inspect the resulting labels.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

In [None]:
# Print the aligned labels of the first 20 training examples.
# NOTE(review): raises IndexError if the train split has fewer than 20 rows.
for i in range(20):
    print(tokenized_datasets["train"][i]["labels"])

In [None]:
# Sequence-labelling metric; used for the sanity check, compute_metrics and
# the evaluation part of the training loop.
metric = evaluate.load("seqeval")

In [None]:
# Tag names indexed by the integer label ids produced during label
# generation: 0 = outside, 1 = inside a moralization span, 2 = first token
# of a span.
# The names follow the IOB2 convention ("O", "I-<type>", "B-<type>") because
# seqeval and the token-classification pipeline derive entity boundaries from
# these prefixes; other spellings (e.g. the digit "0") are treated as entity
# tags rather than as "outside".
label_names = ["O", "I-M", "B-M"]
# Convert the first training sentence to tag names, skipping ignored (-100)
# positions, as a reference sequence for the metric sanity check.
labels = train_test_set["train"][0]["Labels"]
labels = [label_names[i] for i in labels if i != -100]
labels


In [None]:
# Sanity-check the metric: copy the reference tags, overwrite one position
# with the IOB "outside" tag "O", and score the result.
# NOTE(review): index 2 assumes the sentence has at least three labelled tokens.
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

In [None]:
def compute_metrics(eval_preds):
    """Score a batch of logits against reference label ids.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from the metric's ``overall_*`` results.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Positions whose reference label is -100 are ignored on both sides.
    reference_tags = [
        [label_names[gold] for gold in gold_row if gold != -100]
        for gold_row in labels
    ]
    predicted_tags = [
        [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predicted_ids, labels)
    ]
    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {
        name: scores[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Lookup tables between label ids and tag names, passed to the model config.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Load the pretrained checkpoint with a token-classification head whose
# label mapping comes from id2label / label2id.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Check that the head has one output per entry in label_names.
model.config.num_labels

In [None]:
# Batches of 8, collated by data_collator. Only the training data is shuffled.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"],
    collate_fn=data_collator,
    batch_size=8,
)

In [None]:
# Re-instantiate the model from the pretrained checkpoint before training.
# NOTE(review): this repeats the earlier load with identical arguments;
# presumably kept so training always starts from fresh weights -- confirm.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Hand model, optimizer and both dataloaders to accelerate. The returned
# objects replace the originals and are the ones used in the training loop.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
# Total number of optimizer steps = epochs * batches per epoch. The batch
# count is read from the dataloader returned by accelerator.prepare.
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule without warm-up over the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Directory the model and tokenizer are saved to (current working directory).
output_dir = "."

In [None]:
def postprocess(predictions, labels):
    """Convert predicted and reference label ids into lists of tag names.

    Args:
        predictions: Tensor of predicted label ids, one row per sequence.
        labels: Tensor of reference label ids, same layout; -100 marks
            positions to ignore.

    Returns:
        A tuple ``(true_labels, true_predictions)`` -- references first,
        predictions second -- each a list of tag-name lists with the ignored
        positions removed.
    """
    predicted_ids = predictions.detach().cpu().clone().numpy()
    reference_ids = labels.detach().cpu().clone().numpy()

    # Reference tags, skipping ignored positions.
    true_labels = []
    for gold_row in reference_ids:
        true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted tags at exactly the positions kept above.
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, reference_ids):
        kept = []
        for pred, gold in zip(pred_row, gold_row):
            if gold != -100:
                kept.append(label_names[pred])
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
# Train for num_train_epochs epochs. After each epoch, evaluate on the test
# split with seqeval and save the model and tokenizer to output_dir.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (references, predictions) in that order.
        # Unpacking them the other way round would swap precision and recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
#         repo.push_to_hub(
#             commit_message=f"Training in progress epoch {epoch}", blocking=False
#         )

In [None]:
# Final save after training: wait for all processes, unwrap the model from
# accelerate, and write it to output_dir.
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [None]:
# flake8-noqa-cell
from transformers import pipeline

# Replace this with your own checkpoint. "." is the directory the training
# loop saved the model and tokenizer to.
# NOTE: this rebinds model_checkpoint, which earlier named the pretrained base model.
model_checkpoint = "."
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# Try the fine-tuned classifier on one German example sentence.
token_classifier("Das Arbeitslosengeld ist nicht hoch genug da man ungleiche Standards propagiert.")