# Notebook for transformers exploration

In [None]:
# Please ignore this cell: extra install steps that are only executed when running the notebook on Google Colab
# flake8-noqa-cell
import os
# Colab-only bootstrap: the body runs only when the IPython shell reports
# 'google.colab' and no 'Test_Data' directory exists in the working directory.
# NOTE(review): the last shell line moves the extracted '*_Data' folders into
# ./data/, so this cell never creates 'Test_Data' in the working directory and
# the guard stays true on every run - presumably it should test 'data/Test_Data'; confirm.
if 'google.colab' in str(get_ipython()) and not os.path.isdir('Test_Data'):
    # we're running on colab and we haven't already downloaded the test data
    # first install pinned version of setuptools (latest version doesn't seem to work with this package on colab)
    !pip install setuptools==61 -qqq
    # install the moralization package
    !pip install git+https://github.com/ssciwr/moralization.git -qqq
    # download test data sets
    !wget https://github.com/ssciwr/moralization/archive/refs/heads/test_data.zip -q
    # unpack the archive, move its '*_Data' folders into ./data and remove the leftovers
    !mkdir -p data && unzip -qq test_data.zip && mv -f moralization-test_data/*_Data ./data/. && rm -rf moralization-test_data test_data.zip

In [None]:
# flake8-noqa-cell
import datasets
import evaluate
import numpy as np
import pandas as pd
import torch
from accelerate import Accelerator
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)

from moralization import analyse as ae
from moralization import input_data as inp
from moralization import spacy_model
# Report whether PyTorch can see a CUDA device (displayed as the cell result).
torch.cuda.is_available()

In [None]:
# Run the nvidia-smi command line tool in a shell and show its output.
!nvidia-smi

- Either use a pipeline (to simplify things) or load the components manually: the tokenizer (converts text to numbers) and an AutoModel with the correct head (i.e. classification), which provides the model architecture and the weights from pre-training  
- UNK (unknown) token for words not in vocab  
- tokenizer is model-specific and contains certain algorithm and vocabulary for each model  
- tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
- tokenizer("Using a Transformer network is simple")  
- tokenizer.save_pretrained("directory_on_my_computer")
- tokenization is followed by encoding
- batches of text need to be padded, and attention mask indicates which tokens are padded

# Second try

## Data
Each token needs exactly one label for training. If a sequence is classified as a "moralization", all tokens in that sequence are assigned the label "1", and all other tokens the label "0". The beginning of a moralization, i.e. its first token, is assigned the label "2".

In [None]:
# flake8-noqa-cell
# import data as spacy doc and take it from there
# NOTE(review): relative path - presumably resolved against the notebook's working directory; confirm
data_dir = "../data/All_Data/XMI_11"
# SpacySetup comes from moralization.spacy_model; "./test" is passed as its working directory.
test_setup = spacy_model.SpacySetup(data_dir, working_dir="./test")
# The following cells index the result as data_doc[<name>]["train"].
data_doc = test_setup.convert_data_to_spacy_doc()

In [None]:
# DEBUG
# convert doc and span objects into list of tokens and labels
# span.start returns the token id in the doc
# data_doc = test_setup.doc_dict
# print(data_doc.keys())
# Use the first key of data_doc as the example document for all following cells.
example_name = list(data_doc.keys())[0]
# Disabled inspection of the "task1" spans of the example's "train" doc:
# for span in data_doc[example_name]["train"].spans["task1"]:
#     print("**********")
#     print(span)
#     print(span.label_)
#     print(span.start)
#     print(data_doc[example_name]["train"][span.start], data_doc[example_name]["train"][span.end-1], "mmm")

In [None]:
# Tokenize and prepare the label containers.
# Two possible units: sentences (spaCy then needs sentence boundaries, e.g. from
# a sentencizer) or annotated instances; the units must not become too long.
# Sentence-based variant:
# one list of spaCy tokens per sentence
token_list = [list(sent) for sent in data_doc[example_name]["train"].sents]
# the same structure, holding only the token texts
sentence_list = [[tok.text for tok in sent_tokens] for sent_tokens in token_list]
# the same structure again, every label initialised to 0
label_list = [[0] * len(sent_tokens) for sent_tokens in token_list]
for k in range(5):
    print(sentence_list[k])
    print(label_list[k])

# with instances:
# find the pre-defined headlines and split there

In [None]:
# Generate the per-token labels for the example document.
# Tokens inside a "task1" span whose label is listed in `selected_labels` get 1,
# the first token of such a span gets 2, every other token keeps 0.
selected_labels = ["Moralisierung", "Moralisierung Kontext", "Moralisierung Weltwissen",
                   "Moralisierung explizit", "Moralisierung interpretativ"]
# flat list with one label per token of the doc
labels = [0] * len(data_doc[example_name]["train"])
for span in data_doc[example_name]["train"].spans["task1"]:
    if span.label_ in selected_labels:
        # The replacement must have exactly as many entries as the slice it
        # replaces: a longer list is *inserted* by Python and would shift the
        # labels of all following tokens by one position per span.
        labels[span.start:span.end] = [1] * (span.end - span.start)
        # mark the beginning of a span with 2
        labels[span.start] = 2

# `labels` is flat, `label_list` is nested per sentence; both must describe
# the same tokens, otherwise the running index below drifts.
assert sum(len(sent_labels) for sent_labels in label_list) == len(labels), (
    "sentence split and flat label list cover a different number of tokens"
)

# Copy the flat labels into the nested structure:
# - punctuation is part of the annotated spans but should be ignored in
#   training, so it gets -100
# - a sentence that starts inside a moralization gets a fresh "beginning" (2)
j = 0
for m, sent_tokens in enumerate(token_list):
    for i, token in enumerate(sent_tokens):
        if token.is_punct:
            label_list[m][i] = -100
        elif i == 0 and labels[j] == 1:
            label_list[m][i] = 2
        else:
            label_list[m][i] = labels[j]
        j += 1


for i in range(0, 10):
    print(sentence_list[i])
    print(label_list[i])

In [None]:
# The sentences and their labels are now collected in a table that can be
# written to csv / loaded into datasets (and later published on the
# huggingface hub). One row per sentence, columns "Sentences" and "Labels".
df = pd.DataFrame({"Sentences": sentence_list, "Labels": label_list})

In [None]:
# show the first ten sentence / label rows
df.head(10)
# Problem is the sentence split - does not get token 101 / 102

In [None]:
# now load this into datasets
data_set = datasets.Dataset.from_pandas(df)

In [None]:
# display the dataset object
data_set

In [None]:
# display a single row (index 4)
data_set[4]

In [None]:
# split in train test validation
train_test_set = data_set.train_test_split(test_size=0.1)

In [None]:
train_test_set["train"]

In [None]:
# flake8-noqa-cell
# model_name = "xlm-roberta-large"
model_name = "bert-base-cased"
# load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# NOTE(review): presumably checked because word_ids() used below needs a fast tokenizer - confirm
tokenizer.is_fast

In [None]:
# now we can feed sentences into the tokenizer
# is_split_into_words=True: each "Sentences" entry is already a list of token strings
inputs = tokenizer(train_test_set["train"][9]["Sentences"], is_split_into_words=True)
inputs.tokens()

In [None]:
# word-level labels of the same training example
train_test_set["train"][9]["Labels"]

In [None]:
# for every produced token, the index of the word it came from
inputs.word_ids()

In [None]:
# label_list needs to be expanded to cover the new tokens
# beginning of a span needs a different label than inside of a span
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer token.

    Args:
        labels: word-level labels, indexable by word id.
        word_ids: for every token the id of the word it belongs to, or None
            for tokens that do not belong to any word.

    Returns:
        List with one label per entry of ``word_ids``: -100 where the word id
        is None, the word's label for the first token of a word, and for the
        remaining tokens of a word the same label with 2 (beginning) turned
        into 1 (inside).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        continues_word = word_id == previous_word
        previous_word = word_id
        if word_id is None:
            # token without a source word
            aligned.append(-100)
            continue
        word_label = labels[word_id]
        if continues_word and word_label == 2:
            # only the first token of a word may carry the "beginning" label
            word_label = 1
        aligned.append(word_label)
    return aligned

In [None]:
# align the word-level labels of training example 9 with the tokens produced above
word_ids = inputs.word_ids()
print(align_labels_with_tokens(train_test_set["train"][9]["Labels"], word_ids))

In [None]:
# the original word-level labels of the same example, for comparison
train_test_set["train"][9]["Labels"]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align their labels.

    Args:
        examples: batch with the columns "Sentences" (one list of word strings
            per sentence) and "Labels" (one list of word-level labels per
            sentence).

    Returns:
        The tokenizer output for the batch with an additional "labels" entry
        that holds one token-level label list per sentence.
    """
    tokenized_inputs = tokenizer(
        examples["Sentences"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) gives, for sentence i, the source word of every token.
    # (The token strings themselves are not needed here, so they are no
    # longer collected for every row.)
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(i))
        for i, word_labels in enumerate(examples["Labels"])
    ]
    return tokenized_inputs

In [None]:
# Tokenize both splits batch-wise; the original columns are removed so that
# only the entries returned by tokenize_and_align_labels remain.
tokenized_datasets = train_test_set.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_test_set["train"].column_names,
)

In [None]:
# first ten rows of token ids
print(tokenized_datasets["train"]["input_ids"][0:10])

In [None]:
# tokenized_datasets["train"]["tokens"][0:10]

In [None]:
# attention masks of the same ten rows
print(tokenized_datasets["train"]["attention_mask"][0:10])

In [None]:
# aligned labels of rows 5-7
print(tokenized_datasets["train"]["labels"][5:8])

In [None]:
# collator that turns a list of tokenized examples into one batch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# collate the first two training examples and show the resulting labels
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
# batch = data_collator(tokenized_datasets["train"])
batch["labels"]

In [None]:
# print the un-collated labels of the same two examples, for comparison with
# batch["labels"] above (the collator is expected to pad them to a common length)
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

In [None]:
# load the seqeval metric used by compute_metrics and the training loop
metric = evaluate.load("seqeval")

In [None]:
def compute_metrics(eval_preds):
    """Compute the overall seqeval scores for a set of evaluation predictions.

    Args:
        eval_preds: pair ``(logits, labels)``; the index of the largest logit
            along the last axis is used as the predicted class.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Positions labelled -100 are ignored; the remaining ids are mapped to names.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    true_predictions = []
    for predicted_row, label_row in zip(predictions, labels):
        true_predictions.append(
            [label_names[pred] for pred, tag in zip(predicted_row, label_row) if tag != -100]
        )

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: scores[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Names for the integer labels created above, in id order:
# 0 = outside, 1 = inside a moralization, 2 = first token of a moralization.
# IOB2-style tags ("O", "I-...", "B-...") are used so that seqeval can group
# consecutive tags into spans.
# The names are defined here rather than read from the CoNLL data, which is
# only loaded further down in the notebook.
label_names = ["O", "I-MORAL", "B-MORAL"]
labels = tokenized_datasets["train"][8]["labels"]
print(labels)
# drop the ignored positions (-100) and translate the ids to names
labels = [label_names[i] for i in labels if i != -100]
print(labels)

In [None]:
predictions = labels.copy()
predictions[2] = "0"
print(predictions)

In [None]:
metric.compute(predictions=[predictions], references=[labels])

In [None]:
# set the label names
# label_names = ["0", "M", "M-BEG"]
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
id2label

In [None]:
label2id

In [None]:
# The labels are per token, so the model needs a token-classification head
# (one prediction per token). A sequence-classification head produces a single
# prediction per sentence and cannot be trained against per-token labels.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# optimizer over all model parameters
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Batches of 8, built with the collator defined above; only the training data is shuffled.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], collate_fn=data_collator, batch_size=8
)

In [None]:
# for batch in train_dataloader:
#     print(batch)

In [None]:
# Hand model, optimizer and dataloaders to accelerate; the returned objects
# replace the originals and are the ones used by the training loop.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
# one optimizer step per batch (len is taken after accelerator.prepare)
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule without warm-up steps over the whole training
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess(predictions, labels):
    """Convert batched prediction and label tensors into lists of label names.

    Positions whose label is -100 are dropped from both results.

    Args:
        predictions: tensor of predicted class ids, iterated row by row.
        labels: tensor of reference class ids, iterated row by row.

    Returns:
        Tuple ``(true_labels, true_predictions)`` - the labels come first.
    """
    predicted_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    true_predictions = []
    for predicted_row, label_row in zip(predicted_rows, label_rows):
        true_predictions.append(
            [label_names[pred] for pred, tag in zip(predicted_row, label_row) if tag != -100]
        )

    return true_labels, true_predictions

In [None]:
# NOTE(review): scratch calculation (evaluates to 8.0, which equals the batch
# size used above) - presumably samples / batches; confirm or remove.
312/39

In [None]:
from tqdm.auto import tqdm

# Directory that receives the model and the tokenizer at the end of every epoch.
output_dir = "./test/bert-moralization-token-classification"

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (labels, predictions) - in that order. Unpacking
        # them the other way round swaps precision and recall in the report.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    # compute() evaluates everything added since the last call
    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save model
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # the tokenizer is written by the main process only
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

In [None]:
# check with standard data
# From here on the notebook repeats the preparation steps with the CoNLL-2003 dataset.
raw_datasets = datasets.load_dataset("conll2003")

In [None]:
# NOTE: this rebinds label_names, which the metric helpers above read as a global.
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
label_names

In [None]:
# same checkpoint and tokenizer as in the first part of the notebook
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per tokenizer token.

    This definition replaces the function of the same name defined earlier in
    the notebook; it differs in how the continuation of a word is labelled.

    Args:
        labels: word-level tag ids, indexable by word id.
        word_ids: for every token the id of the word it belongs to, or None
            for tokens that do not belong to any word.

    Returns:
        List with one tag id per entry of ``word_ids``: -100 where the word id
        is None, the word's tag for the first token of a word, and for the
        remaining tokens of a word the same tag, raised by one if it is odd
        (odd ids are the B- tags, the following id is the matching I- tag).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        starts_word = word_id != previous_word
        previous_word = word_id
        if word_id is None:
            # token without a source word
            aligned.append(-100)
        elif starts_word:
            aligned.append(labels[word_id])
        else:
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
    return aligned

In [None]:
# tokenize the first CoNLL training example (already split into words)
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

In [None]:
# compare the word-level tags with the tags aligned to the tokens
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split CoNLL examples and align their tags.

    This definition replaces the function of the same name defined earlier in
    the notebook; it reads the columns "tokens" and "ner_tags".

    Args:
        examples: batch with the columns "tokens" (one list of word strings
            per example) and "ner_tags" (one list of word-level tag ids per
            example).

    Returns:
        The tokenizer output for the batch with an additional "labels" entry
        that holds one token-level tag list per example.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(row) gives, for that example, the source word of every token
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_tags, tokenized_inputs.word_ids(row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [None]:
# Tokenize all CoNLL splits batch-wise and drop the original columns.
# NOTE: this rebinds tokenized_datasets, which held the moralization data before.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)