Chunking--> find the tokens that belong to the same entity

LABELS:
    - O --> the word doesn't correspond to any entity
    - B-PER/I-PER --> the word corresponds to the beginning of/is inside a person entity
    - B-ORG/I-ORG --> the word corresponds to the beginning of/is inside an organization entity
    - B-LOC/I-LOC --> the word corresponds to the beginning of/is inside a location entity
    - B-MISC/I-MISC --> the word corresponds to the beginning of/is inside a miscellaneous entity

Entities spanning two words, like “European Union” and “Werner Zwingmann”, are assigned a B- label for the first word and an I- label for the second.

Texts need to be converted to token IDs before the model can make sense of them

In [None]:
# DATA PROCESSING FOR TOKEN CLASSIFICATION

# Create the tokenizer object from the pretrained checkpoint named below.

from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# To tokenize a pre-tokenized input, pass is_split_into_words=True.
# NOTE: raw_datasets is not defined in this cell; it must already exist when this runs.

inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

# We can then expand our label list to match the tokens. The first rule we'll apply is that special tokens get a label of -100,
# because -100 is the index that is ignored by default in the loss function we will use (cross entropy). Then, each token
# gets the same label as the token that started the word it's inside, since they are part of the same entity. For tokens
# inside a word but not at its beginning, a B-XXX label is replaced with the matching I-XXX label.
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels so there is one label per token.

    Args:
        labels: Integer label ids, indexed by word position.
        word_ids: For each token, the index of the word it belongs to,
            or None for a special token.

    Returns:
        A list with one entry per element of ``word_ids``: -100 for special
        tokens, the word's own label for the first token of a word, and for
        the following tokens of the same word the word's label, incremented
        by one when it is odd (this treats odd ids as B-XXX and the next id
        as the matching I-XXX).
    """
    aligned = []
    previous_word = None
    for word_index in word_ids:
        if word_index is None:
            # Special token: ignored by the loss, and it ends the current word.
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_index]
        if word_index != previous_word:
            # First token of a new word keeps the word's label unchanged.
            previous_word = word_index
            aligned.append(word_label)
        else:
            # Continuation token: turn an odd (B-XXX) label into the next id (I-XXX).
            aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)

    return aligned

#To preprocess our whole dataset, we need to tokenize all the inputs and apply align_labels_with_tokens() on all the labels.

#we’ll write a function that processes a list of examples and use the Dataset.map() method with the option batched=True

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: A batch mapping with a "tokens" entry (passed to the
            tokenizer with ``is_split_into_words=True``) and a "ner_tags"
            entry holding one label sequence per example.

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        containing, for each example, the result of
        ``align_labels_with_tokens`` applied to its tags and word ids.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) gives the token-to-word mapping of the i-th example in the batch.
    encoded["labels"] = [
        align_labels_with_tokens(tags, encoded.word_ids(position))
        for position, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# We can now apply all of that preprocessing in one go with map(): batched=True makes
# tokenize_and_align_labels receive a batch of examples per call, and the columns listed
# for the "train" split are removed from the result.
# NOTE(review): presumably raw_datasets holds several splits and map() runs on each of them - confirm.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# Read again --> Token classification
# COMPLETE CODE FOR THE TRAINING LOOP
# NOTE: this cell relies on names defined elsewhere: num_training_steps, num_train_epochs,
# model, train_dataloader, eval_dataloader, optimizer, lr_scheduler, accelerator, metric,
# postprocess, output_dir, repo and tokenizer.
from tqdm.auto import tqdm
import torch

# One progress-bar tick per optimizer step (see progress_bar.update(1) below).
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # The backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)

        # Predicted class id = index of the largest logit along the last dimension.
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered.
        # -100 is used as the padding value along dim 1.
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess is defined elsewhere.
        # NOTE(review): presumably it drops the -100 positions and maps ids to label names - confirm.
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    # Compute the metric over all batches added during this epoch's evaluation.
    results = metric.compute()
    # Report only the "overall_*" entries of the results.
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    # Save the unwrapped model, not the accelerator-wrapped one.
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process saves the tokenizer and pushes to the hub.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # NOTE(review): blocking=False presumably makes the push asynchronous - confirm.
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )