# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

You will need to setup git, adapt your email and name in the following cell.

In [None]:
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 3453
    })
})

In [None]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [None]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [None]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], names_file=None, id=None), length=-1, id=None)

In [None]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

'EU    rejects German call to boycott British lamb .'
'B-ORG O       B-MISC O    O  O       B-MISC  O    O'

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
tokenizer.is_fast

True

In [None]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']

In [None]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# My data

In [1]:
from moralization import input_data as inp
from moralization import analyse as ae
from moralization import spacy_model
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import datasets 
import evaluate
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorForTokenClassification
torch.cuda.is_available()
# import data as spacy doc and take it from there
data_dir = "../data/All_Data/XMI_11"
test_setup = spacy_model.SpacySetup(data_dir, working_dir="./test")
data_doc = test_setup.convert_data_to_spacy_doc()
example_name = list(data_doc.keys())[0]



In [2]:
sentence_list = [[token.text for token in sent] for sent in data_doc[example_name]["train"].sents]
token_list = [[token for token in sent] for sent in data_doc[example_name]["train"].sents]
# initialize nested label list to 0
label_list = [[0 for token in sent] for sent in data_doc[example_name]["train"].sents]
for i in range(0,5):
    print(sentence_list[i])
    print(label_list[i])
# generate the labels based on the current list of tokens
# now set all Moralisierung, Moralisierung Kontext, 
# Moralisierung explizit, Moralisierung interpretativ, Moralisierung Weltwissen to 1
selected_labels = ["Moralisierung", "Moralisierung Kontext", "Moralisierung Weltwissen",
                  "Moralisierung explizit", "Moralisierung interpretativ"]
# create a list as long as tokens
labels = [0 for token in data_doc[example_name]["train"]]
for span in data_doc[example_name]["train"].spans["task1"]:
    if span.label_ in selected_labels:
        labels[span.start+1:span.end] = [1] * (span.end-span.start)
        # mark the beginning of a span with 2
        labels[span.start] = 2
        
# labels now needs to be structured the same way as label_list
# set the label at beginning of sentence to 2 if it is 1
# also punctuation is included in the moralization label - we 
# definitely need to set those labels to -100
j = 0
for m in range(len(label_list)):
    for i in range(len(label_list[m])):
        label_list[m][i] = labels[j]
        if i==0 and labels[j]==1:
            label_list[m][i] = 2
        if token_list[m][i].is_punct:
            label_list[m][i] = -100
        j = j+1


for i in range(0,10):
    print(sentence_list[i])   
    print(label_list[i])

['                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                ']
[0]
['HMP05', '/', 'AUG.00228', 'Hamburger', 'Morgenpost', ',', '03.08.2005', ',', 'S.', '5', ';']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['ALG', 'II', 'ist', 'mit', 'der', 'Menschenwürde', 'vereinbar', '#', '#', '#']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['BERLIN']
[0]
['Das', 'Arbeitslosengeld', 'II', 'ist', 'nicht', 'so', 'niedrig', ',', 'dass', 

In [12]:
# at this point we can write the text into a csv to load into datasets
# later it can be published as such on huggingface datasets
# column heads are sentence, labels
df = pd.DataFrame(zip(sentence_list,label_list), columns=["Sentences", "Labels"])
print(df.head(10))
data_set = datasets.Dataset.from_pandas(df)
# split in train test
train_test_set = data_set.train_test_split(test_size=0.1)

                                           Sentences  \
0  [                                             ...   
1  [HMP05, /, AUG.00228, Hamburger, Morgenpost, ,...   
2  [ALG, II, ist, mit, der, Menschenwürde, verein...   
3                                           [BERLIN]   
4  [Das, Arbeitslosengeld, II, ist, nicht, so, ni...   
5  [Klägerin, war, eine, 55-jährige, Arbeitslose,...   
6  [Deswegen, bekommen, beide, pro, Kopf, nur, 31...   
7  [Arztkosten, seien, dadurch, ebenso, wenig, fi...   
8  [Dies, führe, zur, #, Ausgrenzung, ,, #, die, ...   
9                                          [#, #, #]   

                                              Labels  
0                                                [0]  
1      [0, -100, 0, 0, 0, -100, 0, -100, 0, 0, -100]  
2            [0, 0, 0, 0, 0, 0, 0, -100, -100, -100]  
3                                                [0]  
4  [2, 1, 1, 1, 1, 1, 1, -100, 1, 1, 1, 1, 1, 1, ...  
5  [0, 0, 0, 0, 0, -100, 0, 0, 0, 0, 0, -100, 0, ... 

In [13]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.is_fast

True

In [14]:
inputs = tokenizer(train_test_set["train"][9]["Sentences"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Die',
 'Fr',
 '##age',
 '„',
 'Dar',
 '##f',
 'die',
 'Part',
 '##ei',
 'das',
 '?',
 '”',
 'bean',
 '##t',
 '##wo',
 '##rte',
 '##te',
 'das',
 'V',
 '##er',
 '##fa',
 '##ss',
 '##ung',
 '##s',
 '##ger',
 '##ich',
 '##t',
 'mit',
 'J',
 '##a',
 '.',
 '[SEP]']

In [15]:

# label_list needs to be expanded to cover the new tokens
# beginning of a span needs a different label than inside of a span
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label == 2:
                label -= 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["Sentences"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["Labels"]
    new_labels = []
    tokens = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        tokens.append(tokenized_inputs.tokens(i))
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
#     tokenized_inputs["tokens"] = tokens
    return tokenized_inputs

In [16]:
tokenized_datasets = train_test_set.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_test_set["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [18]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         -100,    1,    1,    1,    1,    1,    1,    1,    1,    1, -100,    1,
            1,    1, -100,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1, -100,
            1,    1,    1, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100]])

In [19]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -100, 1, 1, 1, 1, 1, 1, 1, 1, 1, -100, 1, 1, 1, -100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -100, 1, 1, 1, -100, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100]


In [20]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: /home/iulusoy/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [21]:
import evaluate

metric = evaluate.load("seqeval")

In [27]:
# labels = raw_datasets["train"][0]["ner_tags"]
# labels = [label_names[i] for i in labels]
label_names = ["0", "M", "M-BEG"]
labels = train_test_set["train"][0]["Labels"]
labels = [label_names[i] for i in labels if i != -100]
labels


['M-BEG', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M']

In [28]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])



{'BEG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 0.9333333333333333}

In [29]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [30]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [31]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [32]:
model.config.num_labels

3

In [33]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
trainer.push_to_hub(commit_message="Training complete")

'https://huggingface.co/sgugger/bert-finetuned-ner/commit/26ab21e5b1568f9afeccdaed2d8715f571d786ed'

In [35]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], collate_fn=data_collator, batch_size=8
#     tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [36]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [37]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [38]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [39]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [40]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

OSError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

In [None]:
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [43]:
output_dir = "."

In [44]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [46]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
#         repo.push_to_hub(
#             commit_message=f"Training in progress epoch {epoch}", blocking=False
#         )

  0%|          | 0/153 [00:00<?, ?it/s]

epoch 0: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'accuracy': 0.9432114882506527}
epoch 1: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'accuracy': 0.9432114882506527}
epoch 2: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'accuracy': 0.9432114882506527}


In [47]:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [50]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "."
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Das Arbeitslosengeld ist nicht hoch genug da man ungleiche Standards propagiert.")

[{'entity_group': '0',
  'score': 0.8749568,
  'word': 'Das Arbeitslosengeld ist nicht hoch genug da man ungleiche Standards propagiert.',
  'start': 0,
  'end': 80}]