# Notebook for transformers exploration

In [None]:
# Please ignore this cell: extra install steps that are only executed when running the notebook on Google Colab
# flake8-noqa-cell
import os
if 'google.colab' in str(get_ipython()) and not os.path.isdir('Test_Data'):
    # we're running on colab and we haven't already downloaded the test data
    # first install pinned version of setuptools (latest version doesn't seem to work with this package on colab)
    !pip install setuptools==61 -qqq
    # install the moralization package
    !pip install git+https://github.com/ssciwr/moralization.git -qqq
    # download test data sets
    !wget https://github.com/ssciwr/moralization/archive/refs/heads/test_data.zip -q
    !mkdir -p data && unzip -qq test_data.zip && mv -f moralization-test_data/*_Data ./data/. && rm -rf moralization-test_data test_data.zip

In [None]:
# flake8-noqa-cell
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from moralization import input_data as inp
from moralization import analyse as ae

In [None]:
!nvidia-smi

In [None]:

! ls data

In [None]:
# flake8-noqa-cell
# Load the two inputs used in the rest of the notebook:
#  * a CSV file that is read as-is into a DataFrame, and
#  * the contents of the XMI_11 directory, read through the moralization package
#    and turned into a DataFrame by AnalyseOccurrence in "spans" mode.
raw_data_no_moralization = "data/All_Data/Alle_bearbeiteten_Annotationen-0_label.csv"
df_raw_no_moralization = pd.read_csv(raw_data_no_moralization)
data_dict = inp.InputOutput.read_data("data/All_Data/XMI_11")
# only the resulting frame (.df) is kept; the AnalyseOccurrence object is discarded
df_spans = ae.AnalyseOccurrence(data_dict, mode="spans").df

In [None]:
df_spans.head(10)

In [None]:
df_spans["Gerichtsurteile-neg-AW-neu-optimiert-BB"].loc["KAT2-Moralwerte","Care"]

In [None]:
df_spans["Gerichtsurteile-neg-AW-neu-optimiert-BB"].loc["KAT1-Moralisierendes Segment","Moralisierung"]

In [None]:
df_spans["Gerichtsurteile-neg-AW-neu-optimiert-BB"].loc["KAT1-Moralisierendes Segment","Keine Moralisierung"].split("###")

In [None]:
# Rename the CSV's "Label" column to "Label_moralization", the label column name
# that the span-derived frames built further down use as well, so that all frames
# share one label column when they are concatenated.
df_raw_no_moralization = df_raw_no_moralization.rename(
    columns={"Label": "Label_moralization"}
)

In [None]:
df_raw_no_moralization.head(10)

In [None]:
df_spans.loc["KAT1-Moralisierendes Segment"]
# all that are not "Keine Moralisierung" shall be "Moralisierung"

In [None]:
# flake8-noqa-cell
# Build one labelled sentence frame per "KAT1-Moralisierendes Segment" category.
df_new = df_spans.loc[["KAT1-Moralisierendes Segment"]]
df_new = df_new.fillna("")
# drop the multiindex
df_new = df_new.droplevel(0)
# Concatenate the text of all source columns into one "###"-separated string per
# category. The column list is taken before "All sources" is added so that the
# new column is never appended to itself.
source_columns = list(df_new.columns)
df_new["All sources"] = ""
for source_column in source_columns:
    df_new["All sources"] += df_new[source_column] + "###"


def _sentences_for_category(all_sources, category, label):
    """Return the non-empty sentences of one category with a constant label.

    Args:
        all_sources: Series indexed by category name whose values are
            "###"-separated sentence strings.
        category: index entry of ``all_sources`` to extract.
        label: value written to the ``Label_moralization`` column.

    Returns:
        DataFrame with the columns ``Sentences`` and ``Label_moralization``.
    """
    sentences = pd.DataFrame(
        all_sources.loc[category].split("###"), columns=["Sentences"]
    )
    # drop the empty rows produced by trailing or doubled separators; the explicit
    # copy makes the label assignment below act on an independent frame
    sentences = sentences[sentences["Sentences"].astype(bool)].copy()
    sentences["Label_moralization"] = label
    return sentences


all_sources = df_new["All sources"]
# no moralization -> label 0
df_no_moralization = _sentences_for_category(all_sources, "Keine Moralisierung", 0)
# every kind of moralization -> label 1
df_moralization = _sentences_for_category(all_sources, "Moralisierung", 1)
df_moralization_kontext = _sentences_for_category(
    all_sources, "Moralisierung Kontext", 1
)
df_moralization_ww = _sentences_for_category(
    all_sources, "Moralisierung Weltwissen", 1
)
df_moralization_exp = _sentences_for_category(
    all_sources, "Moralisierung explizit", 1
)
df_moralization_int = _sentences_for_category(
    all_sources, "Moralisierung interpretativ", 1
)
df_new.to_csv("df_new.csv")

In [None]:
df_moralization.head()

In [None]:
df_new.head()

In [None]:
df_raw_no_moralization.head(10)

In [None]:
# merge all the data frames into one
# The CSV-based frame comes first, followed by the span-derived frames for the
# "no moralization" category and the five moralization categories.
frames = [
    df_raw_no_moralization,
    df_no_moralization,
    df_moralization,
    df_moralization_kontext,
    df_moralization_ww,
    df_moralization_exp,
    df_moralization_int,
]
# no ignore_index here: every frame keeps its own index values, so the combined
# index is not unique until it is reset during the shuffle further down
all_data = pd.concat(frames)

In [None]:
all_data.head(100)

### Inspect the data

In [None]:
# Add the complementary label column: 0 where Label_moralization equals 1 and
# 1 for every other value (missing values therefore count as "no moralization").
all_data["Label_no_moralization"] = np.where(all_data["Label_moralization"] == 1, 0, 1)
# bar chart of the column sums, i.e. the number of rows per class
all_data[["Label_moralization", "Label_no_moralization"]].sum().plot.bar()

In [None]:
all_data[["Label_moralization", "Label_no_moralization"]].sum().plot.bar(
    ylim=([0, 2000])
)

## Now that we have the data in one frame, let's shuffle it and split it into train, validation and test sets

In [None]:
# Shuffle all rows and renumber them 0..n-1. The seed is fixed because the split
# below selects rows by position: without it, the seeded split would still pick
# different rows on every run.
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
all_data.head(100)

In [None]:
# split into train, test, validate with 60% train, 20% validation, 20% test
# Plain positional slicing is used instead of np.split, which goes through the
# deprecated DataFrame.swapaxes when handed a DataFrame. The explicit copies make
# the three parts independent frames, so columns can be added to them later.
shuffled = all_data.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
validate_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()

In [None]:
train["Label_no_moralization"] = np.where(train["Label_moralization"] == 1, 0, 1)
train[["Label_moralization", "Label_no_moralization"]].sum().plot.bar()

In [None]:
validate["Label_no_moralization"] = np.where(validate["Label_moralization"] == 1, 0, 1)
validate[["Label_moralization", "Label_no_moralization"]].sum().plot.bar()

In [None]:
test["Label_no_moralization"] = np.where(test["Label_moralization"] == 1, 0, 1)
test[["Label_moralization", "Label_no_moralization"]].sum().plot.bar()

In [None]:
# flake8-noqa-cell

from torch.utils.data import Dataset
import torch
from transformers import AutoTokenizer

model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class M_Dataset(Dataset):
    """Dataset that tokenizes the ``Sentences`` column and returns label vectors.

    Args:
        data: DataFrame with at least the columns ``Sentences`` and
            ``Label_moralization``. The frame is copied; the caller's object is
            never modified.
        tokenizer: callable tokenizer; it is called with the sentence text and
            must return an object exposing ``input_ids`` and ``attention_mask``
            tensors.
        attributes: list of label column names that make up the label vector.
        max_token_len: length every sentence is padded/truncated to.
        sample: number of "no moralization" rows to keep (all moralization rows
            are always kept). If fewer rows are available, all of them are
            used. ``None`` disables the down-sampling.
    """

    def __init__(
        self, data, tokenizer, attributes, max_token_len: int = 128, sample=1000
    ):
        # private copy: _prepare_data adds a column and must not touch the input
        self.data = data.copy()
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self):
        """Add the complementary label column and down-sample the negatives."""
        self.data["Label_no_moralization"] = np.where(
            self.data["Label_moralization"] == 1, 0, 1
        )
        if self.sample is not None:
            is_no_moralization = self.data["Label_no_moralization"] > 0
            no_moralization = self.data.loc[is_no_moralization]
            moralization = self.data.loc[~is_no_moralization]
            # never ask for more rows than exist, which would raise in .sample()
            n_keep = min(self.sample, len(no_moralization))
            self.data = pd.concat(
                [moralization, no_moralization.sample(n_keep, random_state=7)]
            )

    def __len__(self):
        """Number of rows left after the optional down-sampling."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row."""
        item = self.data.iloc[index]
        comment = str(item.Sentences)
        # a row of a mixed-type frame has object dtype, so convert explicitly
        # before handing the values to torch
        attributes = torch.tensor(item[self.attributes].to_numpy(dtype="float32"))
        tokens = self.tokenizer(
            comment,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_token_len,
            return_attention_mask=True,
        )
        return {
            # the tokenizer returns a batch of one; flatten to 1-D tensors
            "input_ids": tokens.input_ids.flatten(),
            "attention_mask": tokens.attention_mask.flatten(),
            "labels": attributes,
        }

In [None]:
# Build the datasets directly (without the Lightning data module) for inspection.
# The training set uses M_Dataset's default sample size for the "no moralization"
# rows; the validation set keeps every row (sample=None).
m_data_train = M_Dataset(
    train, tokenizer, ["Label_moralization", "Label_no_moralization"]
)
m_data_validate = M_Dataset(
    validate, tokenizer, ["Label_moralization", "Label_no_moralization"], sample=None
)
m_data_train.data.head(10)

In [None]:
# shapes of the label vector, the token ids and the attention mask of the first item
first_item = m_data_train[0]
(
    first_item["labels"].shape,
    first_item["input_ids"].shape,
    first_item["attention_mask"].shape,
)

In [None]:
len(m_data_train)

In [None]:
# flake8-noqa-cell
! pip install pytorch-lightning
import pytorch_lightning as pl
from torch.utils.data import DataLoader

In [None]:
class M_Data_Module(pl.LightningDataModule):
    """Lightning data module that wraps the train/validation frames in M_Dataset.

    Args:
        train: DataFrame used for the training dataset.
        val: DataFrame used for the validation and prediction dataset.
        attributes: list of label column names passed on to M_Dataset.
        batch_size: batch size of all data loaders.
        max_token_length: token length passed on to M_Dataset.
        model_name: name of the pretrained model whose tokenizer is loaded.
        num_workers: number of worker processes of all data loaders.
    """

    def __init__(
        self,
        train,
        val,
        attributes,
        batch_size: int = 16,
        max_token_length: int = 128,
        model_name="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
        num_workers: int = 4,
    ):
        super().__init__()
        self.train = train
        self.val = val
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_length = max_token_length
        self.model_name = model_name
        self.num_workers = num_workers
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _make_dataset(self, frame, **dataset_kwargs):
        """Wrap ``frame`` in an M_Dataset that uses this module's settings."""
        return M_Dataset(
            frame,
            attributes=self.attributes,
            tokenizer=self.tokenizer,
            # forward the configured length instead of silently using the default
            max_token_len=self.max_token_length,
            **dataset_kwargs,
        )

    def _make_loader(self, dataset, shuffle):
        """Create a DataLoader with this module's batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
        )

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` (``None`` builds both)."""
        if stage in (None, "fit"):
            # training data keeps M_Dataset's default down-sampling
            self.train_dataset = self._make_dataset(self.train)
        if stage in (None, "fit", "validate", "predict"):
            # validation/prediction data is never down-sampled
            self.val_dataset = self._make_dataset(self.val, sample=None)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def predict_dataloader(self):
        """Unshuffled loader over the validation dataset, used for prediction."""
        return self._make_loader(self.val_dataset, shuffle=False)

In [None]:
M_data_module = M_Data_Module(
    train, validate, attributes=["Label_moralization", "Label_no_moralization"]
)

In [None]:
M_data_module.setup()

In [None]:
M_data_module.train_dataloader()

In [None]:
len(M_data_module.train_dataloader())

## Model

In [None]:
# flake8-noqa-cell

from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

In [None]:
class M_Comment_Classifier(pl.LightningModule):
    """Pretrained encoder followed by a two-layer classification head.

    The loss is ``BCEWithLogitsLoss`` over ``config["n_labels"]`` outputs.
    ``config`` keys read by this class: ``model_name``, ``n_labels``, ``lr``,
    ``weight_decay``, ``train_size``, ``batch_size`` and ``warmup``.
    """

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(
            config["model_name"], return_dict=True
        )
        hidden_size = self.pretrained_model.config.hidden_size
        self.hidden = torch.nn.Linear(hidden_size, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, self.config["n_labels"])
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction="mean")
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head.

        Returns:
            Tuple ``(loss, logits)``. ``loss`` is ``0`` when ``labels`` is None.
        """
        # pretrained encoder
        output = self.pretrained_model(
            input_ids=input_ids, attention_mask=attention_mask
        )
        hidden_states = output.last_hidden_state
        # Mean over the real tokens only: padded positions are zeroed out through
        # the attention mask and excluded from the divisor, so the amount of
        # padding does not change the sentence representation.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        pooled_output = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(
            min=1
        )
        # final logits
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # calculate loss
        loss = 0
        if labels is not None:
            loss = self.loss_func(
                logits.view(-1, self.config["n_labels"]),
                labels.view(-1, self.config["n_labels"]),
            )
        return loss, logits

    def training_step(self, batch, batch_index):
        """Compute and log the training loss for one batch."""
        loss, outputs = self(**batch)
        self.log("train loss ", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        """Compute and log the validation loss for one batch."""
        loss, outputs = self(**batch)
        self.log("validation loss ", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        """Return the logits for one batch; the loss is discarded."""
        _, outputs = self(**batch)
        return outputs

    def configure_optimizers(self):
        """Create AdamW with a cosine schedule that starts with a warm-up phase."""
        optimizer = AdamW(
            self.parameters(),
            lr=self.config["lr"],
            weight_decay=self.config["weight_decay"],
        )
        # NOTE(review): the schedule length is train_size / batch_size and does
        # not take the number of epochs into account - confirm that this is the
        # intended horizon and that it matches how often the scheduler is stepped.
        total_steps = self.config["train_size"] / self.config["batch_size"]
        warmup_steps = math.floor(total_steps * self.config["warmup"])
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, total_steps
        )
        return [optimizer], [scheduler]


# def validation_epoch_end(self, outputs):
#   losses = []
#   for output in outputs:
#     loss = output['val_loss'].detach().cpu()
#     losses.append(loss)
#   avg_loss = torch.mean(torch.stack(losses))
#   self.log("avg_val_loss", avg_loss)

In [None]:
# Hyper-parameters read by M_Comment_Classifier; "batch_size" is also passed to
# the data module that is created for training further down.
config = {
    "model_name": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    # one output per label column
    "n_labels": len(["Label_moralization", "Label_no_moralization"]),
    "batch_size": 6,
    "lr": 1.5e-6,
    # fraction of the scheduler's total steps used for warm-up
    "warmup": 0.2,
    # NOTE(review): this is the length of the train data loader of M_data_module,
    # which was created with the default batch size (16) rather than the
    # "batch_size" above - confirm this is the intended value.
    "train_size": len(M_data_module.train_dataloader()),
    "weight_decay": 0.001,
    "n_epochs": 100,
}

model = M_Comment_Classifier(config)

In [None]:
# Smoke test: push one training item through the model on the CPU.
idx = 0
sample_item = m_data_train[idx]
input_ids = sample_item["input_ids"]
attention_mask = sample_item["attention_mask"]
labels = sample_item["labels"]
model.cpu()
# the model expects a batch dimension, so add one of size 1 to every tensor
loss, output = model(
    input_ids.unsqueeze(0), attention_mask.unsqueeze(0), labels.unsqueeze(0)
)
print(labels.shape, output.shape, output)

### Train model

In [None]:
# datamodule
# A second data module (lower-case name) that, unlike M_data_module above, uses
# the batch size from config.
m_data_module = M_Data_Module(
    train,
    validate,
    attributes=["Label_moralization", "Label_no_moralization"],
    batch_size=config["batch_size"],
)
m_data_module.setup()

# model
# re-created here, so training starts from a fresh instance
model = M_Comment_Classifier(config)

In [None]:
# trainer and fit
trainer = pl.Trainer(max_epochs=config["n_epochs"], accelerator='gpu', devices=1, num_sanity_val_steps=50)

In [None]:
trainer.fit(model, m_data_module)

In [None]:
torch.cuda.is_available()

In [None]:
foo = torch.tensor([1,2,3])
foo = foo.to('cuda')

In [None]:
torch.cuda.empty_cache()

In [None]:
# pl.Trainer has no save_model(); checkpoints are written with save_checkpoint()
trainer.save_checkpoint("random_try")

- Either use a pipeline (to simplify things) or load the components manually: the tokenizer (converts text to numbers) and an AutoModel with the correct head (i.e. classification), which provides the model architecture and the pre-trained weights  
- UNK (unknown) token for words not in vocab  
- tokenizer is model-specific and contains certain algorithm and vocabulary for each model  
- tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
- tokenizer("Using a Transformer network is simple")  
- tokenizer.save_pretrained("directory_on_my_computer")
- tokenization is followed by encoding
- batches of text need to be padded, and attention mask indicates which tokens are padded

# Second try

## Data
Here, each sentence has one label. If I understand it correctly, each TOKEN needs its own label during training. So, if a sequence is classified as "moralization", all the tokens in that sequence need to be assigned the label "1", and all other tokens the label "0".

In [None]:
# flake8-noqa-cell
# import data as spacy doc and take it from there
from moralization import spacy_model
# Same location as used for the first part of the notebook (and as created by the
# Colab setup cell): ./data relative to the working directory.
# NOTE(review): this used to point to "../data/..." - confirm which layout applies
# when the notebook is run locally.
data_dir = "data/All_Data/XMI_11"
test_setup = spacy_model.SpacySetup(data_dir, working_dir="./test")
data_doc = test_setup.convert_data_to_spacy_doc()

In [None]:
# Print every "task1" span of the first document's "train" entry: the span text,
# its label, its start token index (span.start is the token id within the doc)
# and finally its first and last token.
example_name = list(data_doc.keys())[0]
train_doc = data_doc[example_name]["train"]
for span in train_doc.spans["task1"]:
    print("**********", span, span.label_, span.start, sep="\n")
    print(train_doc[span.start], train_doc[span.end - 1], "mmm")

In [None]:
# tokenize and label
# The data can be organised either as a list of sentences with a list of tokens
# each (spacy would need to be initialised with a sentencizer for that) or as a
# list of instances with a list of tokens each. The instances must not be too
# long for this!
# With sentences it would be:
# tokenlist = [[token.text for token in sent] for sent in data_doc[example_name]["train"].sents]
#
# Instances are separated by three "#" tokens. A single "#" marks a word that is
# generally associated with moralization and is not a break between instances.
# The strings cannot be split before the tokenization because that would break
# the span alignment. spacy's matcher could do this, but then the pipeline would
# have to run again - maybe something to add to the spacy module initially?
# For now generate the list of tokens and set all labels to zero.
tokenlist = list(data_doc[example_name]["train"])
labellist = [0] * len(tokenlist)

# Find the instance breaks in a single pass: a break is a run of at least three
# consecutive "#" tokens, and each run is recorded exactly once as the tuple of
# its first three token positions - however long the run is and wherever in the
# document it occurs (including the very end). The hashtags themselves get
# removed later.
split_instances = []
run_start = None
for i, token in enumerate(tokenlist):
    if token.text == "#":
        if run_start is None:
            run_start = i
        # the run has just reached its third "#": register the break once
        if i - run_start == 2:
            split_instances.append((run_start, run_start + 1, run_start + 2))
    else:
        run_start = None

# check the list of instance splits again: no two breaks may share a position
for previous_split, next_split in zip(split_instances, split_instances[1:]):
    assert previous_split[2] < next_split[0], (
        "Found duplicate!",
        previous_split,
        next_split,
    )

# generate the labels based on the current list of tokens
# now set all Moralisierung, Moralisierung Kontext,
# Moralisierung explizit, Moralisierung interpretativ, Moralisierung Weltwissen to 1
selected_labels = ["Moralisierung", "Moralisierung Kontext", "Moralisierung Weltwissen",
                  "Moralisierung explizit", "Moralisierung interpretativ"]
for span in data_doc[example_name]["train"].spans["task1"]:
    if span.label_ in selected_labels:
        # Tokens inside the span get 1. Plain index assignment is used so that
        # labellist keeps exactly one entry per token; assigning a longer list to
        # a slice would insert an element and shift all following labels.
        for position in range(span.start + 1, span.end):
            labellist[position] = 1
        # mark the beginning of a span with 2
        labellist[span.start] = 2
# Punctuation is not given a special label (-100) here; that is left for after
# the transformers tokenizer.

# convert token into token.text
wordlist = [token.text for token in tokenlist]
templist = list(zip(wordlist, labellist))
# Cut the (word, label) pairs at the first "#" of every instance break. The end
# of the document is a boundary as well, so the text after the last break is not
# lost. Apart from the first one, a segment starts with the "#" tokens of the
# break that precedes it.
boundaries = [0] + [item[0] for item in split_instances] + [len(templist)]
instance_list = []
for begin, end in zip(boundaries, boundaries[1:]):
    segment = templist[begin:end]
    # skip segments without any real word (empty or only "#"): they are not
    # instances, and an empty one cannot be unpacked below
    if any(word != "#" for word, _ in segment):
        instance_list.append(segment)

words = []
labels = []
# unpack the tuples into two lists of lists
for instance in instance_list:
    instance_words, instance_labels = zip(*instance)
    words.append(instance_words)
    labels.append(instance_labels)

print(words[0])
print(labels[0])

In [None]:
# now make this a dict so it can be written to json and submitted to datasets
# data_set = {}
# for i, (token, label) in enumerate(zip(tokenlist,labellist)):    

In [None]:
# flake8-noqa-cell
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The labels prepared in this part of the notebook are per token (0 = outside a
# span, 1 = inside, 2 = first token), so the model needs a token-classification
# head with three classes rather than a sequence-classification head.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3)

In [None]:
tokenizer.is_fast

In [None]:
templist = list(zip(words[0], labels[0]))
print(templist)

In [None]:
# Clean up unnecessary tokens: remove "#"
# list.remove() does not work here as it only removes the first occurrence, so
# filter on the word itself; this drops "#" whatever label it carries.
clean_list = [(word, label) for word, label in templist if word != "#"]
print(clean_list)
# disentangle the tuples again
tokenlist, labellist = zip(*clean_list)

In [None]:
print(labels[15:20])

In [None]:
# now we can feed this into the tokenizer
inputs = tokenizer(words[16], is_split_into_words=True)
inputs.tokens()

In [None]:
inputs.word_ids()

In [None]:
print(list(zip(inputs.word_ids(), labels[16])))

In [None]:
# labellist needs to be expanded to cover the new tokens
# beginning of a span needs a different label than inside of a span
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer token.

    Args:
        labels: one label per word (2 marks the first word of a span).
        word_ids: for every token the index of the word it belongs to, or
            ``None`` for a special token.

    Returns:
        List with one label per token: ``-100`` for special tokens, the word's
        label for the first token of a word, and for the remaining tokens of a
        word the same label with 2 turned into 1.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # special token
            aligned.append(-100)
        elif word_id != previous_word:
            # first token of a new word keeps the word's label
            aligned.append(labels[word_id])
        else:
            # further token of the same word: "begin" (2) becomes "inside" (1)
            word_label = labels[word_id]
            aligned.append(1 if word_label == 2 else word_label)
        previous_word = word_id
    return aligned

In [None]:
word_ids = inputs.word_ids()
print(align_labels_with_tokens(labels[16], word_ids))

In [None]:
def tokenize_and_align_labels(words, labels):
    """Tokenize a batch of pre-split instances and attach token-level labels.

    Args:
        words: batch of instances, each a sequence of words.
        labels: batch of label sequences, one label per word.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry that holds
        one aligned label list per instance.
    """
    encoded = tokenizer(words, truncation=True, is_split_into_words=True)
    encoded["labels"] = [
        align_labels_with_tokens(instance_labels, encoded.word_ids(batch_index))
        for batch_index, instance_labels in enumerate(labels)
    ]
    return encoded

In [None]:
# Tokenize the instances one at a time. tokenize_and_align_labels works on a
# batch, so every instance is wrapped in a list of length one. The loop variables
# are named so that they do not overwrite wordlist/labellist from above.
tokenized_dataset = []
for instance_words, instance_labels in zip(words, labels):
    tokenized_dataset.append(
        tokenize_and_align_labels([list(instance_words)], [instance_labels])
    )

In [None]:
tokenized_dataset = tokenize_and_align_labels(words, labels)

In [None]:
print(tokenized_dataset["labels"])

In [None]:
print(tokenized_dataset["attention_mask"])