# Notebook for transformers exploration

In [None]:
# flake8-noqa-cell
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from moralization import input as inp
from moralization import analyse as ae

In [None]:
!nvidia-smi

In [None]:
# flake8-noqa-cell
# Path of the CSV that is read into df_raw_no_moralization.
raw_data_no_moralization = "../../moralization-notes/Alle_bearbeiteten_Annotationen-0_label.csv"
df_raw_no_moralization = pd.read_csv(raw_data_no_moralization)

# Read the XMI data directory and build the span-occurrence table from it.
data_dict = inp.InputOutput.read_data("../data/All_Data/XMI_11")
span_analysis = ae.AnalyseOccurrence(data_dict, mode="spans")
df_spans = span_analysis.df

In [None]:
df_spans.head(10)

In [None]:
# Give the CSV label column the name "Label_moralization".
_label_rename = {"Label": "Label_moralization"}
df_raw_no_moralization = df_raw_no_moralization.rename(columns=_label_rename)

In [None]:
df_raw_no_moralization.head(10)

In [None]:
# Show the rows of the "KAT1-Moralisierendes Segment" category.
df_spans.loc["KAT1-Moralisierendes Segment"]
# Labelling rule: every entry that is not "Keine Moralisierung" shall be
# treated as "Moralisierung".

In [None]:
# flake8-noqa-cell
def _labelled_sentences(all_sources, category, label, separator="###"):
    """Build a labelled sentence frame for one annotation category.

    Args:
        all_sources: Series indexed by category name; each value is the
            ``separator``-joined text collected for that category.
        category: Index label to look up in ``all_sources``.
        label: Constant written to the ``Label_moralization`` column.
        separator: Delimiter on which the concatenated text is split.

    Returns:
        DataFrame with the non-empty pieces in ``Sentences`` and ``label`` in
        ``Label_moralization``. Row labels are the positions of the pieces in
        the split list, so they can have gaps where empty pieces were dropped.
    """
    pieces = all_sources.loc[category].split(separator)
    frame = pd.DataFrame(pieces, columns=["Sentences"])
    # Drop empty strings. Copy so that the column assignment below writes to
    # an independent frame rather than to a filtered selection, which would
    # trigger pandas' SettingWithCopyWarning.
    frame = frame[frame["Sentences"].astype(bool)].copy()
    frame["Label_moralization"] = label
    return frame


df_new = df_spans.loc[["KAT1-Moralisierendes Segment"]]
# drop the multiindex
df_new = df_new.droplevel(0)
# Remember the per-source columns before the summary column is appended, so
# the loop does not depend on "All sources" being the last column.
source_columns = list(df_new.columns)
# sum strings over all sources
df_new["All sources"] = ""
for file in source_columns:
    print(file)
    df_new["All sources"] += df_new[file] + "###"

# extract row content into new dataframes
all_sources = df_new["All sources"]
# no moralization -> label 0
df_no_moralization = _labelled_sentences(all_sources, "Keine Moralisierung", 0)
# every moralization variant -> label 1
df_moralization = _labelled_sentences(all_sources, "Moralisierung", 1)
df_moralization_exp = _labelled_sentences(all_sources, "Moralisierung explizit", 1)
df_moralization_int = _labelled_sentences(
    all_sources, "Moralisierung interpretativ", 1
)
df_new.to_csv("df_new.csv")

In [None]:
df_new.head()

In [None]:
df_raw_no_moralization.head(10)

In [None]:
# Stack the CSV-based frame and the four span-derived frames row-wise.
frames = [df_raw_no_moralization]
frames += [
    df_no_moralization,
    df_moralization,
    df_moralization_exp,
    df_moralization_int,
]
all_data = pd.concat(frames)

In [None]:
all_data.head(100)

### Inspect the data

In [None]:
# Complementary indicator: 0 where Label_moralization equals 1, otherwise 1.
_is_moralization = all_data["Label_moralization"] == 1
all_data["Label_no_moralization"] = np.where(_is_moralization, 0, 1)
# Bar chart of the column sums of both indicator columns.
_label_totals = all_data[["Label_moralization", "Label_no_moralization"]].sum()
_label_totals.plot.bar()

In [None]:
# Same bar chart of the column sums, with the y-axis limited to 0..2000.
_label_totals_zoom = all_data[["Label_moralization", "Label_no_moralization"]].sum()
_label_totals_zoom.plot.bar(ylim=[0, 2000])

## Now that we have the data in one frame, let's reshuffle it and split it into train, validation and test sets

In [None]:
# Shuffle all rows and renumber them 0..n-1. The fixed seed makes the row
# order reproducible between runs; without it, any later seeded sampling of
# this frame still starts from a different order every time.
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
all_data.head(100)

In [None]:
# split into train, test, validate with 60% train, 20% validation, 20% test
_shuffled = all_data.sample(frac=1, random_state=42)
_train_end = int(0.6 * len(_shuffled))
_validate_end = int(0.8 * len(_shuffled))
# Positional slicing replaces np.split, which relies on DataFrame.swapaxes
# (deprecated in recent pandas releases). Each part is copied so that adding
# columns to a split later modifies an independent frame instead of a slice.
train = _shuffled.iloc[:_train_end].copy()
validate = _shuffled.iloc[_train_end:_validate_end].copy()
test = _shuffled.iloc[_validate_end:].copy()

In [None]:
# Complementary label on the training split, then a bar chart of both sums.
_train_is_moralization = train["Label_moralization"] == 1
train["Label_no_moralization"] = np.where(_train_is_moralization, 0, 1)
_train_totals = train[["Label_moralization", "Label_no_moralization"]].sum()
_train_totals.plot.bar()

In [None]:
# Complementary label on the validation split, then a bar chart of both sums.
_validate_is_moralization = validate["Label_moralization"] == 1
validate["Label_no_moralization"] = np.where(_validate_is_moralization, 0, 1)
_validate_totals = validate[["Label_moralization", "Label_no_moralization"]].sum()
_validate_totals.plot.bar()

In [None]:
# Complementary label on the test split, then a bar chart of both sums.
_test_is_moralization = test["Label_moralization"] == 1
test["Label_no_moralization"] = np.where(_test_is_moralization, 0, 1)
_test_totals = test[["Label_moralization", "Label_no_moralization"]].sum()
_test_totals.plot.bar()

In [None]:
# flake8-noqa-cell

from torch.utils.data import Dataset
import torch
from transformers import AutoTokenizer

# Checkpoint name whose tokenizer is loaded for the exploratory datasets.
# NOTE(review): the training config further down uses "distilroberta-base" as
# model name — confirm that loading the "roberta-base" tokenizer here is
# intended.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class M_Dataset(Dataset):
    """Torch dataset yielding tokenized sentences with float label vectors.

    The input frame must provide a ``Sentences`` column and a
    ``Label_moralization`` column. A complementary ``Label_no_moralization``
    column (0 where ``Label_moralization == 1``, otherwise 1) is derived on a
    private copy of the frame, so the caller's frame is left untouched.

    Args:
        data: DataFrame with ``Sentences`` and ``Label_moralization``.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``input_ids`` and ``attention_mask`` tensors.
        attributes: Names of the label columns returned as ``labels``.
        max_token_len: Length every sentence is padded/truncated to.
        sample: Maximum number of rows with ``Label_no_moralization > 0`` to
            keep (drawn with ``random_state=7``). If fewer such rows exist,
            all of them are kept. ``None`` disables the down-sampling.
    """

    def __init__(
        self, data, tokenizer, attributes, max_token_len: int = 128, sample=1000
    ):
        # Copy: _prepare_data adds a column and must not alter the caller's
        # frame (which may itself be a slice of a larger frame).
        self.data = data.copy()
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
        self._prepare_data()

    def _prepare_data(self) -> None:
        """Derive the complementary label and optionally down-sample it."""
        self.data["Label_no_moralization"] = np.where(
            self.data["Label_moralization"] == 1, 0, 1
        )
        if self.sample is None:
            return
        is_no_moralization = self.data["Label_no_moralization"] > 0
        no_moralization = self.data.loc[is_no_moralization]
        moralization = self.data.loc[~is_no_moralization]
        # Never request more rows than exist; DataFrame.sample would raise.
        n_keep = min(self.sample, len(no_moralization))
        self.data = pd.concat(
            [moralization, no_moralization.sample(n_keep, random_state=7)]
        )

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for a row.

        ``index`` is positional (``iloc``). ``input_ids`` and
        ``attention_mask`` are flattened to 1-D; ``labels`` is a float32
        tensor with one entry per name in ``attributes``.
        """
        row = self.data.iloc[index]
        comment = str(row.Sentences)
        # A row of mixed text/number columns has object dtype; convert the
        # label cells explicitly to float32 before building the tensor.
        labels = torch.tensor(row[self.attributes].to_numpy(dtype="float32"))
        tokens = self.tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_token_len,
            return_attention_mask=True,
        )
        return {
            "input_ids": tokens.input_ids.flatten(),
            "attention_mask": tokens.attention_mask.flatten(),
            "labels": labels,
        }

In [None]:
# Training dataset with the default `sample` setting, validation dataset
# with sampling switched off; then preview the training frame.
_label_columns = ["Label_moralization", "Label_no_moralization"]
m_data_train = M_Dataset(train, tokenizer, list(_label_columns))
m_data_validate = M_Dataset(validate, tokenizer, list(_label_columns), sample=None)
m_data_train.data.head(10)

In [None]:
# Shapes of the tensors of the first training example. The item is fetched
# once via indexing instead of calling __getitem__ three times, which
# tokenized the same sentence repeatedly.
_first_item = m_data_train[0]
(
    _first_item["labels"].shape,
    _first_item["input_ids"].shape,
    _first_item["attention_mask"].shape,
)

In [None]:
len(m_data_train)

In [None]:
# flake8-noqa-cell

import pytorch_lightning as pl
from torch.utils.data import DataLoader

In [None]:
class M_Data_Module(pl.LightningDataModule):
    """Lightning data module wrapping the train/validation ``M_Dataset``s.

    Args:
        train: Frame used for the training dataset.
        val: Frame used for the validation and prediction dataset.
        attributes: Label column names handed to ``M_Dataset``.
        batch_size: Batch size of every dataloader.
        max_token_length: Token length forwarded to ``M_Dataset``.
        model_name: Checkpoint name from which the tokenizer is loaded.
        num_workers: Worker processes used by every dataloader.
    """

    def __init__(
        self,
        train,
        val,
        attributes,
        batch_size: int = 16,
        max_token_length: int = 128,
        model_name="roberta-base",
        num_workers: int = 4,
    ):
        super().__init__()
        self.train = train
        self.val = val
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_length = max_token_length
        self.model_name = model_name
        self.num_workers = num_workers
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _build_dataset(self, frame, **dataset_kwargs):
        """Create an ``M_Dataset`` that shares tokenizer and token length."""
        return M_Dataset(
            frame,
            attributes=self.attributes,
            tokenizer=self.tokenizer,
            # Previously max_token_length was stored but never forwarded.
            max_token_len=self.max_token_length,
            **dataset_kwargs,
        )

    def _build_loader(self, dataset, shuffle: bool):
        """Create a dataloader with the module's batch size and workers."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
        )

    def setup(self, stage=None):
        """Instantiate the datasets needed for ``stage``.

        The training dataset (with ``M_Dataset``'s default down-sampling) is
        built for ``None``/"fit"; the unsampled validation dataset is built
        for ``None``, "fit", "validate" and "predict".
        """
        if stage in (None, "fit"):
            self.train_dataset = self._build_dataset(self.train)
        if stage in (None, "fit", "validate", "predict"):
            self.val_dataset = self._build_dataset(self.val, sample=None)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._build_loader(self.val_dataset, shuffle=False)

    def predict_dataloader(self):
        """Unshuffled loader over the validation dataset, for prediction."""
        return self._build_loader(self.val_dataset, shuffle=False)

In [None]:
# Data module built with the default batch size.
M_data_module = M_Data_Module(
    train=train,
    val=validate,
    attributes=["Label_moralization", "Label_no_moralization"],
)

In [None]:
M_data_module.setup()

In [None]:
M_data_module.train_dataloader()

In [None]:
len(M_data_module.train_dataloader())

## Model

In [None]:
# flake8-noqa-cell

from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

In [None]:
class M_Comment_Classifier(pl.LightningModule):
    """Pretrained encoder with a two-layer classification head.

    ``config`` keys read by this class: ``model_name``, ``n_labels``, ``lr``,
    ``weight_decay``, ``train_size``, ``batch_size``, ``warmup`` and
    (optionally) ``n_epochs``.
    """

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(
            config["model_name"], return_dict=True
        )
        hidden_size = self.pretrained_model.config.hidden_size
        self.hidden = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, self.config["n_labels"])
        nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction="mean")
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, logits)``; ``loss`` is 0 when no labels are given."""
        # encoder layer
        output = self.pretrained_model(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # Mean over dimension 1 of the last hidden state.
        # NOTE(review): this mean is not weighted by attention_mask, so padded
        # positions contribute as well — confirm that this is intended.
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # final logits
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # calculate loss
        loss = 0
        if labels is not None:
            n_labels = self.config["n_labels"]
            loss = self.loss_func(
                logits.view(-1, n_labels), labels.view(-1, n_labels)
            )
        return loss, logits

    def training_step(self, batch, batch_index):
        """Run a training batch and log its loss under "train loss "."""
        loss, outputs = self(**batch)
        self.log("train loss ", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        """Run a validation batch and log its loss under "validation loss "."""
        loss, outputs = self(**batch)
        self.log("validation loss ", loss, prog_bar=True, logger=True)
        return {"val_loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        """Return only the logits for a batch."""
        _, logits = self(**batch)
        return logits

    def configure_optimizers(self):
        """Create AdamW and a per-step cosine schedule with linear warmup.

        ``train_size / batch_size`` (rounded up) is taken as the number of
        optimizer steps per epoch and multiplied by ``n_epochs`` (default 1)
        to get the schedule length; ``warmup`` is the fraction of that length
        spent warming up. The scheduler is registered with
        ``interval="step"`` so that it advances in the same unit the step
        counts are expressed in (Lightning otherwise steps it once per epoch).

        NOTE(review): assumes ``train_size`` is the number of training
        samples; with multi-device training each process performs fewer
        steps per epoch than computed here — confirm.
        """
        optimizer = AdamW(
            self.parameters(),
            lr=self.config["lr"],
            weight_decay=self.config["weight_decay"],
        )
        steps_per_epoch = max(
            1, math.ceil(self.config["train_size"] / self.config["batch_size"])
        )
        total_steps = steps_per_epoch * self.config.get("n_epochs", 1)
        warmup_steps = math.floor(total_steps * self.config["warmup"])
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, total_steps
        )
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]


# def validation_epoch_end(self, outputs):
#   losses = []
#   for output in outputs:
#     loss = output['val_loss'].detach().cpu()
#     losses.append(loss)
#   avg_loss = torch.mean(torch.stack(losses))
#   self.log("avg_val_loss", avg_loss)

In [None]:
# Hyper-parameters read by M_Comment_Classifier and by the training cells.
_config_label_columns = ["Label_moralization", "Label_no_moralization"]
config = {
    "model_name": "distilroberta-base",
    "n_labels": len(_config_label_columns),
    "batch_size": 128,
    "lr": 1.5e-6,
    "warmup": 0.2,
    # Number of training samples. The classifier divides this value by
    # "batch_size", so it must not already be a batch count — and in
    # particular not the batch count of a loader that was built with a
    # different batch size than the one configured here.
    "train_size": len(M_data_module.train_dataset),
    "weight_decay": 0.001,
    "n_epochs": 100,
}

model = M_Comment_Classifier(config)

In [None]:
# Smoke test: push one training example through the model on the CPU and
# print the label shape, the output shape and the raw output.
idx = 0
_example = m_data_train[idx]
input_ids = _example["input_ids"]
attention_mask = _example["attention_mask"]
labels = _example["labels"]
model.cpu()
# Add a leading batch dimension of size 1 to every tensor.
_batched = [t.unsqueeze(dim=0) for t in (input_ids, attention_mask, labels)]
loss, output = model(*_batched)
print(labels.shape, output.shape, output)

### Train model

In [None]:
# datamodule, using the batch size from the config
m_data_module = M_Data_Module(
    train=train,
    val=validate,
    attributes=["Label_moralization", "Label_no_moralization"],
    batch_size=config["batch_size"],
)
m_data_module.setup()

# fresh model instance for the training run
model = M_Comment_Classifier(config)

In [None]:
# trainer and fit
# The trainer is limited to config["n_epochs"] epochs and requests 4 GPUs.
# NOTE(review): newer PyTorch Lightning releases replace the `gpus=` argument
# with `accelerator=`/`devices=` — confirm against the installed version.
# NOTE(review): num_sanity_val_steps=50 presumably runs up to 50 validation
# batches before training starts — confirm this is intended.
trainer = pl.Trainer(max_epochs=config["n_epochs"], gpus=4, num_sanity_val_steps=50)

In [None]:
trainer.fit(model, m_data_module)

In [None]:
torch.cuda.is_available()