In [1]:
%load_ext jupyter_black

In [2]:
import torch
from transformers import (
    BertTokenizer,
    BertForTokenClassification,
    AdamW,
    BertTokenizerFast,
)
from torch.utils.data import DataLoader, Dataset
import json
import numpy as np
import argparse
import random
import sys

In [3]:
random.seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
propaganda_techniques = [
    "none",
    "Appeal_to_Values",
    "Loaded_Language",
    "Consequential_Oversimplification",
    "Causal_Oversimplification",
    "Questioning_the_Reputation",
    "Straw_Man",
    "Repetition",
    "Guilt_by_Association",
    "Appeal_to_Hypocrisy",
    "Conversation_Killer",
    "False_Dilemma-No_Choice",
    "Whataboutism",
    "Slogans",
    "Obfuscation-Vagueness-Confusion",
    "Name_Calling-Labeling",
    "Flag_Waving",
    "Doubt",
    "Appeal_to_Fear-Prejudice",
    "Exaggeration-Minimisation",
    "Red_Herring",
    "Appeal_to_Popularity",
    "Appeal_to_Authority",
    "Appeal_to_Time",
]
id_to_label = {i: label for i, label in enumerate(propaganda_techniques)}
label_to_id = {label: i for i, label in enumerate(propaganda_techniques)}

In [5]:
id_to_label

{0: 'none',
 1: 'Appeal_to_Values',
 2: 'Loaded_Language',
 3: 'Consequential_Oversimplification',
 4: 'Causal_Oversimplification',
 5: 'Questioning_the_Reputation',
 6: 'Straw_Man',
 7: 'Repetition',
 8: 'Guilt_by_Association',
 9: 'Appeal_to_Hypocrisy',
 10: 'Conversation_Killer',
 11: 'False_Dilemma-No_Choice',
 12: 'Whataboutism',
 13: 'Slogans',
 14: 'Obfuscation-Vagueness-Confusion',
 15: 'Name_Calling-Labeling',
 16: 'Flag_Waving',
 17: 'Doubt',
 18: 'Appeal_to_Fear-Prejudice',
 19: 'Exaggeration-Minimisation',
 20: 'Red_Herring',
 21: 'Appeal_to_Popularity',
 22: 'Appeal_to_Authority',
 23: 'Appeal_to_Time'}

In [6]:
label_to_id

{'none': 0,
 'Appeal_to_Values': 1,
 'Loaded_Language': 2,
 'Consequential_Oversimplification': 3,
 'Causal_Oversimplification': 4,
 'Questioning_the_Reputation': 5,
 'Straw_Man': 6,
 'Repetition': 7,
 'Guilt_by_Association': 8,
 'Appeal_to_Hypocrisy': 9,
 'Conversation_Killer': 10,
 'False_Dilemma-No_Choice': 11,
 'Whataboutism': 12,
 'Slogans': 13,
 'Obfuscation-Vagueness-Confusion': 14,
 'Name_Calling-Labeling': 15,
 'Flag_Waving': 16,
 'Doubt': 17,
 'Appeal_to_Fear-Prejudice': 18,
 'Exaggeration-Minimisation': 19,
 'Red_Herring': 20,
 'Appeal_to_Popularity': 21,
 'Appeal_to_Authority': 22,
 'Appeal_to_Time': 23}

In [None]:
class PropagandaDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        encoding = self.tokenizer(text, is_split_into_words=False, return_offsets_mapping=True, padding='max_length',
                                  truncation=True, max_length=128)
        labels = [labels[i] if i < len(labels) else label_to_id['none'] for i in range(len(encoding['input_ids']))]

        item = {key: torch.tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.tensor(labels)
        return item

In [None]:
def load_data(file_path):
    data = []
    with open(file_path, "r", encoding='utf-8') as f:
      for line in f:
        dat = json.loads(line)
        data.append(dat)
    return data

In [None]:
def preprocess_data(data, tokenizer):
    sentences, labels = [], []
    for item in data:
        text = item['text']
        label_list = ['none'] * len(text)  # Default label
        # print(f"len of label_list: {len(label_list)}")
        for annotation in item['labels']:
            start, end = annotation['start'], annotation['end']
            label = annotation['technique']
            # Ensure label consistency in case of overlapping or nested labels
            if end > len(label_list):
              end = len(label_list)
            for i in range(start, end):
                # print(i)
                label_list[i] = label
        sentences.append(text)
        labels.append([label_to_id[label] for label in label_list])
    return sentences, labels

In [None]:
def compute_span_score(gold_annots, pred_annots):
    # count total no of annotations
    prec_denominator = sum([len(pred_annots[x]) for x in pred_annots])
    rec_denominator = sum([len(gold_annots[x]) for x in gold_annots])

    technique_Spr_prec = {propaganda_technique: 0 for propaganda_technique in propaganda_techniques}
    technique_Spr_rec = {propaganda_technique: 0 for propaganda_technique in propaganda_techniques}
    cumulative_Spr_prec, cumulative_Spr_rec = (0, 0)
    f1_articles = []

    for example_id, pred_annot_obj in pred_annots.items():
        gold_annot_obj = gold_annots[example_id]
        # print("%s\t%d\t%d" % (example_id, len(gold_annot_obj), len(pred_annot_obj)))

        document_cumulative_Spr_prec, document_cumulative_Spr_rec = (0, 0)
        for j, pred_ann in enumerate(pred_annot_obj):
            s = ""
            ann_length = pred_ann[1][1] - pred_ann[1][0]

            for i, gold_ann in enumerate(gold_annot_obj):
                if pred_ann[0] == gold_ann[0]:
                    # print(pred_ann, gold_ann)

                    # s += "\tmatch %s %s-%s - %s %s-%s"%(sd[0],sd[1], sd[2], gd[0], gd[1], gd[2])
                    intersection = span_intersection(gold_ann[1], pred_ann[1])
                    # print(intersection)
                    # print(intersection)
                    s_ann_length = gold_ann[1][1] - gold_ann[1][0]
                    Spr_prec = intersection / ann_length
                    document_cumulative_Spr_prec += Spr_prec
                    cumulative_Spr_prec += Spr_prec
                    s += "\tmatch %s %s-%s - %s %s-%s: S(p,r)=|intersect(r, p)|/|p| = %d/%d = %f (cumulative S(p,r)=%f)\n" \
                         % (pred_ann[0], pred_ann[1][0], pred_ann[1][1], gold_ann[0],
                            gold_ann[1][0], gold_ann[1][1], intersection, ann_length, Spr_prec,
                            cumulative_Spr_prec)
                    technique_Spr_prec[gold_ann[0]] += Spr_prec

                    Spr_rec = intersection / s_ann_length
                    document_cumulative_Spr_rec += Spr_rec
                    cumulative_Spr_rec += Spr_rec
                    s += "\tmatch %s %s-%s - %s %s-%s: S(p,r)=|intersect(r, p)|/|r| = %d/%d = %f (cumulative S(p,r)=%f)\n" \
                         % (pred_ann[0], pred_ann[1][0], pred_ann[1][1], gold_ann[0],
                            gold_ann[1][0], gold_ann[1][1], intersection, s_ann_length, Spr_rec,
                            cumulative_Spr_rec)
                    technique_Spr_rec[gold_ann[0]] += Spr_rec

        p_article, r_article, f1_article = compute_prec_rec_f1(document_cumulative_Spr_prec,
                                                               len(pred_annot_obj),
                                                               document_cumulative_Spr_rec,
                                                               len(gold_annot_obj))
        f1_articles.append(f1_article)

    p, r, f1 = compute_prec_rec_f1(cumulative_Spr_prec, prec_denominator, cumulative_Spr_rec, rec_denominator)

    f1_per_technique = []

    for technique_name in technique_Spr_prec.keys():
        prec_tech, rec_tech, f1_tech = compute_prec_rec_f1(technique_Spr_prec[technique_name],
                                                           compute_technique_frequency(pred_annots,
                                                                                       technique_name),
                                                           technique_Spr_prec[technique_name],
                                                           compute_technique_frequency(gold_annots,
                                                                                       technique_name))
        f1_per_technique.append(f1_tech)

    return p, r, f1, f1_per_technique


# if per_label is true, the scorer returns F1 score per technique
def FLC_score_to_string(gold_annotations, user_annotations, per_label):
    precision, recall, f1, f1_per_class = compute_span_score(gold_annotations, user_annotations)

    if per_label:
        res_for_screen = f"\nF1=%f\nPrecision=%f\nRecall=%f\n%s\n" % (f1, precision, recall, "\n".join(
            ["F1_" + pr + "=" + str(f) for pr, f in
             zip(propaganda_techniques, f1_per_class)]))
    else:
        average = sum(f1_per_class) / len(f1_per_class)
        res_for_screen = f"Micro-F1\tMacro-F1\tPrecision\tRecall\n%f\t%f\t%f\t%f" % (f1, average, precision, recall)

    res_for_script = "%f\t%f\t%f\t" % (f1, precision, recall)
    res_for_script += "\t".join([str(x) for x in f1_per_class])

    return res_for_screen

In [None]:

# parser = argparse.ArgumentParser(description="Fine-tune and evaluate Bert model.")
# parser.add_argument("--model_name_or_path", type=str, required=True,
#                     help="Path to pretrained model or model identifier from Huggingface.co models.")
# parser.add_argument("--train_file", type=str, required=True, help="A file containing the training data.")
# parser.add_argument("--validation_file", type=str, required=True, help="A file containing the validation data.")
# parser.add_argument("--input_column", type=str,
#                     help="Name of the source text column in the file.")
# parser.add_argument("--label_column", type=str,
#                     help="Name of the target label column in the file.")
# parser.add_argument("--output_dir", type=str, default="./results", help="Where to store the fine-tuned model.")
# parser.add_argument("--output_file", type=str, help="Where to store the predictions.")
# parser.add_argument("--max_input_length", type=int, default=512,
#                     help="The maximum total input sequence length after tokenization.")
# parser.add_argument("--max_target_length", type=int, default=512,
#                     help="The maximum total target sequence length after tokenization.")
# parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
# parser.add_argument("--per_device_train_batch_size", type=int, default=4,
#                     help="Batch size per device during training.")
# parser.add_argument("--per_device_eval_batch_size", type=int, default=4,
#                     help="Batch size per device during evaluation.")
# parser.add_argument("--src_lang_code", type=str, default='ar', help="source language")
# parser.add_argument("--tgt_lang_code", type=str, default='en', help="target language")

# args = parser.parse_args()


tokenizer = BertTokenizerFast.from_pretrained("aubmindlab/bert-base-arabertv2")
train_data = load_data("/content/drive/MyDrive/Thesis/data/araieval24_task1_train.jsonl")
val_data = load_data("/content/drive/MyDrive/Thesis/data/araieval24_task1_dev.jsonl")
train_sentences, train_labels = preprocess_data(train_data, tokenizer)
val_sentences, val_labels = preprocess_data(val_data, tokenizer)

train_dataset = PropagandaDataset(train_sentences, train_labels, tokenizer)
val_dataset = PropagandaDataset(val_sentences, val_labels, tokenizer)

model = BertForTokenClassification.from_pretrained("aubmindlab/bert-base-arabertv2", num_labels=len(propaganda_techniques))
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

model.train()
for epoch in range(3):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != 'offset_mapping'}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch} loss: {loss.item()}")

# model.save_pretrained("/content/drive/MyDrive/Thesis/test_model")



Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0 loss: 0.8803209066390991
Epoch 1 loss: 4.263891220092773
Epoch 2 loss: 0.7508571743965149


In [None]:
# model.eval()
# predictions, true_labels = [], []
# for batch in val_loader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)
#     predictions.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy().tolist())
#     true_labels.extend(batch['labels'].cpu().numpy().tolist())
# pred_labels = [[id_to_label[label_id] for label_id in pred] for pred in predictions]
# true_labels = [[id_to_label[label_id] for label_id in true] for true in true_labels]

predictions = []

# Iterate over batches in the test dataset
for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # No need for labels during prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=2)
    batch_predictions = batch_predictions.cpu().numpy()
    input_ids = input_ids.cpu().numpy()

    for idx, (input_id, prediction) in enumerate(zip(input_ids, batch_predictions)):
        tokens = tokenizer.convert_ids_to_tokens(input_id)
        new_entities = []
        current_entity = None
        for token_id, token, label_id in zip(input_id, tokens, prediction):
            if label_id != label_to_id['none']:  # Not an 'O' label
                label = id_to_label[label_id]
                if current_entity is None:
                    current_entity = {"technique": label, "text": token, "start": token_id, "end": token_id}
                elif current_entity["technique"] == label:
                    current_entity["text"] += " " + token
                    current_entity["end"] = token_id
                else:
                    new_entities.append(current_entity)
                    current_entity = {"technique": label, "text": token, "start": token_id, "end": token_id}
            else:
                if current_entity is not None:
                    new_entities.append(current_entity)
                    current_entity = None

        # Add last entity if any
        if current_entity is not None:
            new_entities.append(current_entity)

        # Translate token positions to character positions
        for entity in new_entities:
            start_token, end_token = entity["start"], entity["end"]
            entity["start"], entity["end"] = tokenizer.convert_tokens_to_string(tokens[:start_token + 1]).rfind(
                entity["text"].split()[0]), \
                tokenizer.convert_tokens_to_string(tokens[:end_token + 1]).rfind(entity["text"].split()[-1]) + len(
                    entity["text"].split()[-1])

        predictions.append({"id": str(idx), "labels": new_entities})

# Save predictions to file
with open('/content/drive/MyDrive/Thesis/test_model/predictions.json', 'w', encoding='utf-8') as f:
    json.dump(predictions, f, ensure_ascii=False, indent=4)

In [None]:
res_for_screen = FLC_score_to_string(gold_labels, pred_labels, False)
print(res_for_screen)

NameError: name 'gold_labels' is not defined

In [None]:
len("تحذيرات من حرب جديدة في حال فشل الانتخابات القادمة")

50

In [None]:
{"id": "1392484757930496000", "text": "🚨 #عاجل ▪️ قوات الاحتلال تطلق النار على شاب داخل حاجز شارع الشهداء بالخليل.", "labels": [{"start": 12, "end": 25, "technique": "Name_Calling-Labeling", "text": "قوات الاحتلال"}, {"start": 12, "end": 76, "technique": "Questioning_the_Reputation", "text": "قوات الاحتلال تطلق النار على شاب داخل حاجز شارع الشهداء بالخليل."}], "type": "tweet"}

In [None]:
txt = "🚨 #عاجل ▪️ قوات الاحتلال تطلق النار على شاب داخل حاجز شارع الشهداء بالخليل."

In [None]:
txt[1:76]

'قوات الاحتلال تطلق النار على شاب داخل حاجز شارع الشهداء بالخليل.'

In [None]:
txt[0]

'🚨'

In [None]:
txt[1]

' '

In [None]:
txt[2]

'#'

In [None]:
txt[3]

'ع'

In [None]:
len(txt)

75

In [None]:
txt[12:75]

'وات الاحتلال تطلق النار على شاب داخل حاجز شارع الشهداء بالخليل.'

In [None]:
txt[12:76]

'وات الاحتلال تطلق النار على شاب داخل حاجز شارع الشهداء بالخليل.'

In [None]:
txt[12:74]

'وات الاحتلال تطلق النار على شاب داخل حاجز شارع الشهداء بالخليل'

In [None]:
txt[76]

IndexError: string index out of range

In [None]:
txt[74]

'.'

In [None]:
d = [1,2,3]
{label: i for i, label in enumerate(d, start=1)}

{1: 1, 2: 2, 3: 3}

In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

In [None]:
train_ds = Dataset.from_json("/content/drive/MyDrive/Thesis/data/araieval24_task1_train.jsonl")
val_ds = Dataset.from_json("/content/drive/MyDrive/Thesis/data/araieval24_task1_dev.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
propaganda_techniques = ['Appeal_to_Values', 'Loaded_Language', 'Consequential_Oversimplification',
                             'Causal_Oversimplification', 'Questioning_the_Reputation', 'Straw_Man', 'Repetition',
                             'Guilt_by_Association', 'Appeal_to_Hypocrisy', 'Conversation_Killer',
                             'False_Dilemma-No_Choice', 'Whataboutism', 'Slogans',
                             'Obfuscation-Vagueness-Confusion',
                             'Name_Calling-Labeling', 'Flag_Waving', 'Doubt',
                             'Appeal_to_Fear-Prejudice', 'Exaggeration-Minimisation', 'Red_Herring',
                             'Appeal_to_Popularity', 'Appeal_to_Authority', 'Appeal_to_Time']

label_to_id = {label: i for i, label in enumerate(propaganda_techniques, start=1)}
id_to_label = {i:label for label, i in label_to_id.items()}
label_to_id = {
    'none': 0,
    **{f'B-{k}': 2*v - 1 for k, v in label_to_id.items()},
    **{f'I-{k}': 2*v for k, v in label_to_id.items()}
}

id_to_label = {i:label for label, i in label_to_id.items()}

In [None]:

def get_token_role_in_span(token_start: int, token_end: int, span_start: int, span_end: int):
    """
    Check if the token is inside a span.
    Args:
      - token_start, token_end: Start and end offset of the token
      - span_start, span_end: Start and end of the span
    Returns:
      - "B" if beginning
      - "I" if inner
      - "O" if outer
      - "N" if not valid token (like <SEP>, <CLS>, <UNK>)
    """
    if token_end <= token_start:
        return "N"
    if token_start < span_start or token_end > span_end:
        return "O"
    if token_start > span_start:
        return "I"
    else:
        return "B"

MAX_LENGTH = 256

def tokenize_and_adjust_labels(sample):
    """
    Args:
        - sample (dict): {"id": "...", "text": "...", "tags": [{"start": ..., "end": ..., "tag": ...}, ...]
    Returns:
        - The tokenized version of `sample` and the labels of each token.
    """
    # Tokenize the text, keep the start and end positions of tokens with `return_offsets_mapping` option
    # Use max_length and truncation to ajust the text length
    tokenized = tokenizer(sample["text"],
                          return_offsets_mapping=True,
                          padding="max_length",
                          max_length=MAX_LENGTH,
                          truncation=True)

    # We are doing a multilabel classification task at each token, we create a list of size len(label2id)=13
    # for the 13 labels
    labels = [[0 for _ in label_to_id.keys()] for _ in range(MAX_LENGTH)]

    # Scan all the tokens and spans, assign 1 to the corresponding label if the token lies at the beginning
    # or inside the spans
    for (token_start, token_end), token_labels in zip(tokenized["offset_mapping"], labels):
        for span in sample["labels"]:
            role = get_token_role_in_span(token_start, token_end, span["start"], span["end"])
            if role == "B":
                token_labels[label_to_id[f"B-{span['technique']}"]] = 1
            elif role == "I":
                token_labels[label_to_id[f"I-{span['technique']}"]] = 1

    return {**tokenized, "labels": labels}

In [None]:
model_name = "aubmindlab/bert-base-arabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/717k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [None]:
tokenized_train_ds = train_ds.map(tokenize_and_adjust_labels, remove_columns=train_ds.column_names)
tokenized_val_ds = val_ds.map(tokenize_and_adjust_labels, remove_columns=val_ds.column_names)

Map:   0%|          | 0/6997 [00:00<?, ? examples/s]

Map:   0%|          | 0/921 [00:00<?, ? examples/s]

In [None]:
sample = tokenized_train_ds[0]
for i in sample["labels"]:
  if sum(i)>0:
    print(i)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0,

In [None]:
print("--------Token---------|--------Labels----------")
for token_id, token_labels in zip(sample["input_ids"], sample["labels"]):
    # Decode the token_id into text
    token_text = tokenizer.decode(token_id)

    # Retrieve all the indices corresponding to the "1" at each token, decode them to label name
    labels = [id_to_label[label_index] for label_index, value in enumerate(token_labels) if value==1]

    # Decode those indices into label name
    print(f" {token_text:20} | {labels}")

    # Finish when we meet the end of sentence.
    if token_text == "</s>":
        break

--------Token---------|--------Labels----------
 [CLS]                | []
 تحذير                | ['B-Appeal_to_Fear-Prejudice']
 ##ات                 | ['I-Appeal_to_Fear-Prejudice']
 من                   | ['I-Appeal_to_Fear-Prejudice']
 حرب                  | ['B-Loaded_Language', 'I-Appeal_to_Fear-Prejudice']
 جديد                 | ['I-Appeal_to_Fear-Prejudice']
 ##ة                  | ['I-Appeal_to_Fear-Prejudice']
 في                   | ['I-Appeal_to_Fear-Prejudice']
 حال                  | ['I-Appeal_to_Fear-Prejudice']
 فشل                  | ['I-Appeal_to_Fear-Prejudice']
 ال                   | ['I-Appeal_to_Fear-Prejudice']
 ##انتخاب             | ['I-Appeal_to_Fear-Prejudice']
 ##ات                 | ['I-Appeal_to_Fear-Prejudice']
 ال                   | ['I-Appeal_to_Fear-Prejudice']
 ##قادم               | ['I-Appeal_to_Fear-Prejudice']
 ##ة                  | ['I-Appeal_to_Fear-Prejudice']
 [SEP]                | []
 [PAD]                | []
 [PAD]                | [

In [None]:
data_collator = DataCollatorWithPadding(tokenizer, padding=True)