# Baseline

In [31]:
%load_ext jupyter_black
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
import torch
import jsonlines
from pathlib import Path

from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

import datautil as dutil
from datautil import tokenizer
import evalutil as eutil
from model import *

import config

In [3]:
ROOT = Path(config.root)
TRAIN_FILE = ROOT / "araieval24_task1_train.jsonl"
DEV_FILE = ROOT / "araieval24_task1_dev.jsonl"

print(f"Training on {TRAIN_FILE.absolute()}\nValidating on {DEV_FILE.absolute()}\n")

with jsonlines.open(TRAIN_FILE) as jsonfile:
    for obj in jsonfile:
        print(obj)

        parsed = dutil.parse_sample(obj)
        print(parsed.keys())
        break

Training on /home/riyadh/codes/nlp/araieval_arabicnlp24/task1/src/../data/araieval24_task1_train.jsonl
Validating on /home/riyadh/codes/nlp/araieval_arabicnlp24/task1/src/../data/araieval24_task1_dev.jsonl

{'id': '7365', 'text': 'تحذيرات من حرب جديدة في حال فشل الانتخابات القادمة', 'labels': [{'start': 0, 'end': 50, 'technique': 'Appeal_to_Fear-Prejudice', 'text': 'تحذيرات من حرب جديدة في حال فشل الانتخابات القادمة'}, {'start': 11, 'end': 14, 'technique': 'Loaded_Language', 'text': 'حرب'}], 'type': 'tweet'}
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'encoding'])


# Load Dataset

In [120]:
from datasets import Dataset

train_ds = Dataset.from_json(str(TRAIN_FILE))
val_ds = Dataset.from_json(str(DEV_FILE))

In [121]:
train_ds = train_ds.select(range(0, 10))
val_ds = val_ds.select(range(0, 5))

In [125]:
tokenized_train_ds = train_ds.map(
    dutil.parse_sample, remove_columns=train_ds.column_names
)

tokenized_val_ds = val_ds.map(dutil.parse_sample, remove_columns=val_ds.column_names)

In [126]:
tokenized_train_ds[0].keys()

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [127]:
type(tokenized_train_ds[0]["encoding"])

KeyError: 'encoding'

In [7]:
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

# Evaluation Utility

In [8]:
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

n_labels = len(id2label)


def divide(a: int, b: int):
    return a / b if b > 0 else 0


def compute_metrics(p):
    """
    Customize the `compute_metrics` of `transformers`
    Args:
        - p (tuple):      2 numpy arrays: predictions and true_labels
    Returns:
        - metrics (dict): f1 score on
    """
    # (1)
    predictions, true_labels = p
    # print(predictions.shape, true_labels.shape, type(predictions))

    # (2)
    predicted_labels = np.where(
        predictions > 0, np.ones(predictions.shape), np.zeros(predictions.shape)
    )
    metrics = {}

    # (3)
    cm = multilabel_confusion_matrix(
        true_labels.reshape(-1, n_labels), predicted_labels.reshape(-1, n_labels)
    )

    # (4)
    for label_idx, matrix in enumerate(cm):
        if label_idx == 0:
            continue  # We don't care about the label "O"
        tp, fp, fn = matrix[1, 1], matrix[0, 1], matrix[1, 0]
        precision = divide(tp, tp + fp)
        recall = divide(tp, tp + fn)
        f1 = divide(2 * precision * recall, precision + recall)
        metrics[f"f1_{id2label[label_idx]}"] = f1

    # (5)
    macro_f1 = sum(list(metrics.values())) / (n_labels - 1)
    metrics["macro_f1"] = macro_f1

    return metrics


# def compute_metric(data):
#     hypotheses, reference = data
#     hypotheses = np.where(
#         hypotheses > 0, np.ones(hypotheses.shape), np.zeros(hypotheses.shape)
#     )
#     parse_label_encoding()

#     metrics = {}

# Training Setup

## Train

In [16]:
def model_init():
    # For reproducibility
    return CustomBertForTokenClassification.from_pretrained(dutil.MODEL_NAME, id2label=id2label, label2id=label2id)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# trainer.train()

NameError: name 'training_args' is not defined

## Todo
1. Write a method to transform tensor labels to tags in evaluation step.
2.    write a collate fn that also returns a list of encoding object, instead of just the batchencoding

In [94]:
obj = {
    "id": "AFP_458-eurl_02_004",
    "text": "كان بطل فقرة بروباغندا في الحلقة الأولى هو صلاح قوش ، المعادل الموضوعي لعُمر سليمان الرئيس الأسبق للمخابرات المصرية. وكما عمر سليمان، نُسجت حول قوش وأنشطته العديد من الأساطير الغامضة.",
    "labels": [
        {"technique": "Name_Calling-Labeling", "text": "بطل", "start": 4, "end": 7},
        {
            "technique": "Obfuscation-Vagueness-Confusion",
            "text": "نُسجت حول قوش وأنشطته العديد من الأساطير الغامضة",
            "start": 134,
            "end": 182,
        },
        {
            "technique": "Loaded_Language",
            "text": "الأساطير الغامضة",
            "start": 166,
            "end": 182,
        },
    ],
    "type": "paragraph",
}

obj2 = dutil.parse_sample(train_ds[0])
dutil.parse_sample(obj)["encoding"].word_to_chars(2)

<class 'dict'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'dict'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>


CharSpan(start=8, end=12)

In [93]:
obj2["encoding"].word_to_chars(2)

CharSpan(start=8, end=12)

## Utility for Datalaoder

In [36]:
print(tokenized_train_ds[0].keys())

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])


In [32]:
class CollateFn:
    def __init__(self, tokenizer):
        self.data_collator = DataCollatorWithPadding(tokenizer, padding=True)
        self.tokenizer = tokenizer

    def __call__(self, data):
        data = collate_fn(data)
        fast_encodings = 

In [27]:
collate_fn = CollateFn(tokenizer)

In [29]:
train_dl = DataLoader(tokenized_train_ds, batch_size=3, shuffle=False, collate_fn=collate_fn)

In [30]:
next(iter(train_dl))

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])


In [None]:
tokeni