# Huggingface Offset based Preprocess
Contains following methods:

`parse_text`: accepts a raw text and span objects to return sub-word tokens and labels per tokens.

`parse_output`: accepts a tensor of labels of shape (L, C), where L is the sequence length and C is the number of classes. Returns span objects.

In [1]:
# !pip install -q jupyter-black
%load_ext jupyter_black
# !python -m spacy download xx_ent_wiki_sm

In [2]:
import torch
from statistics import mean

from tqdm import tqdm
from pathlib import Path
import jsonlines



# Data Setup

In [24]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [25]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import BertPreTrainedModel, BertModel
from transformers.utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)

from typing import Optional, Union, Tuple
from transformers.modeling_outputs import TokenClassifierOutput

In [26]:
class CustomBertForTokenClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.transpose(1, 2).float())

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [35]:
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

n_labels = len(id2label)


def divide(a: int, b: int):
    return a / b if b > 0 else 0


def compute_metrics(p):
    """
    Customize the `compute_metrics` of `transformers`
    Args:
        - p (tuple):      2 numpy arrays: predictions and true_labels
    Returns:
        - metrics (dict): f1 score on
    """
    # (1)
    predictions, true_labels = p
    print(predictions.shape, true_labels.shape, type(predictions))

    # (2)
    predicted_labels = np.where(
        predictions > 0, np.ones(predictions.shape), np.zeros(predictions.shape)
    )
    metrics = {}

    # (3)
    cm = multilabel_confusion_matrix(
        true_labels.reshape(-1, n_labels), predicted_labels.reshape(-1, n_labels)
    )

    # (4)
    for label_idx, matrix in enumerate(cm):
        if label_idx == 0:
            continue  # We don't care about the label "O"
        tp, fp, fn = matrix[1, 1], matrix[0, 1], matrix[1, 0]
        precision = divide(tp, tp + fp)
        recall = divide(tp, tp + fn)
        f1 = divide(2 * precision * recall, precision + recall)
        metrics[f"f1_{id2label[label_idx]}"] = f1

    # (5)
    macro_f1 = sum(list(metrics.values())) / (n_labels - 1)
    metrics["macro_f1"] = macro_f1

    return metrics


# def compute_metric(data):
#     hypotheses, reference = data
#     hypotheses = np.where(
#         hypotheses > 0, np.ones(hypotheses.shape), np.zeros(hypotheses.shape)
#     )
#     parse_label_encoding()

#     metrics = {}

In [43]:
from datasets import Dataset

train_ds = Dataset.from_json(str(ROOT / "araieval24_task1_train.jsonl"))
val_ds = Dataset.from_json(str(ROOT / "araieval24_task1_dev.jsonl"))

In [44]:
train_ds = train_ds.select(range(0, 10))
val_ds = val_ds.select(range(0, 5))

In [30]:
def parse_sample(sample):
    encoding, label_encoding = parse_text(
        sample["text"], sample["labels"], tokenizer, LABELS
    )
    return {**encoding, "labels": label_encoding}

In [45]:
tokenized_train_ds = train_ds.map(parse_sample, remove_columns=train_ds.column_names)
tokenized_val_ds = val_ds.map(parse_sample, remove_columns=val_ds.column_names)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [56]:
len(tokenized_train_ds[0]["offset_mapping"])

256

In [1]:
training_args = TrainingArguments(
    output_dir="./models/fine_tune_bert_output_span_cat",
    use_cpu=True,
    evaluation_strategy="epoch",
    learning_rate=2.5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    log_level="critical",
    seed=12345,
)

NameError: name 'TrainingArguments' is not defined

In [46]:
def model_init():
    # For reproducibility
    return CustomBertForTokenClassification.from_pretrained(MODEL_NAME, id2label=id2label, label2id=label2id)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,F1 Loaded Language,F1 Consequential Oversimplification,F1 Causal Oversimplification,F1 Questioning The Reputation,F1 Straw Man,F1 Repetition,F1 Guilt By Association,F1 Appeal To Hypocrisy,F1 Conversation Killer,F1 False Dilemma-no Choice,F1 Whataboutism,F1 Slogans,F1 Obfuscation-vagueness-confusion,F1 Name Calling-labeling,F1 Flag Waving,F1 Doubt,F1 Appeal To Fear-prejudice,F1 Exaggeration-minimisation,F1 Red Herring,F1 Appeal To Popularity,F1 Appeal To Authority,F1 Appeal To Time,Macro F1
1,No log,0.359303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
2,No log,0.311193,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
3,No log,0.256412,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
4,No log,0.191045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
5,No log,0.17464,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
6,No log,0.163367,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
7,No log,0.15506,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
8,No log,0.149528,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
9,No log,0.14555,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0


(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>
(5, 256, 23) (5, 23, 256) <class 'numpy.ndarray'>


KeyboardInterrupt: 

In [None]:
input_ids: torch.Size([16, 256]) labels: torch.Size([16, 23, 256])

In [None]:
a = torch.randn((16, 23, 256))
a.view(-1).shape