In the first exercise, we will deal with **Part-of-Speech (POS) Tagging** problem. We will fine-tune model from Hugging face and fine tune it using Penn Tree Bank dataset

# Import libs

In [1]:
from typing import List
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk

nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

# Data

## Load data

In [2]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print("Number of sentences in the corpus:", len(tagged_sentences))
print("First sentence with tags:", tagged_sentences[0])

Number of sentences in the corpus: 3914
First sentence with tags: [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [3]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    # Convert tags to BIO format
    sentences.append([word.lower() for word in sentence])
    sentence_tags.append([tag for tag in tags])

# Print the first sentence and its tags
print("First sentence:", sentences[0])
print("First sentence tags:", sentence_tags[0])

First sentence: ['pierre', 'vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.']
First sentence tags: ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']


In [4]:
label2id = {tag: i for i, tag in enumerate(set(tag for tags in sentence_tags for tag in tags))}
label2id['O'] = len(label2id)  # Add 'O' for outside tags
id2label = {i: tag for tag, i in label2id.items()}


print("Label to ID mapping:", label2id)
print("ID to label mapping:", id2label)

Label to ID mapping: {'NNPS': 0, 'NN': 1, 'WP': 2, 'VBG': 3, 'PDT': 4, 'MD': 5, 'WDT': 6, 'VBD': 7, 'UH': 8, 'SYM': 9, '$': 10, 'VBZ': 11, 'EX': 12, 'WRB': 13, 'LS': 14, 'RP': 15, 'CD': 16, 'POS': 17, 'RBS': 18, 'PRP$': 19, 'VBP': 20, 'IN': 21, 'VB': 22, 'PRP': 23, 'TO': 24, '#': 25, '-RRB-': 26, '.': 27, 'NNP': 28, '``': 29, 'JJ': 30, '-NONE-': 31, ':': 32, "''": 33, 'CC': 34, 'RB': 35, ',': 36, '-LRB-': 37, 'NNS': 38, 'FW': 39, 'JJS': 40, 'VBN': 41, 'DT': 42, 'RBR': 43, 'JJR': 44, 'WP$': 45, 'O': 46}
ID to label mapping: {0: 'NNPS', 1: 'NN', 2: 'WP', 3: 'VBG', 4: 'PDT', 5: 'MD', 6: 'WDT', 7: 'VBD', 8: 'UH', 9: 'SYM', 10: '$', 11: 'VBZ', 12: 'EX', 13: 'WRB', 14: 'LS', 15: 'RP', 16: 'CD', 17: 'POS', 18: 'RBS', 19: 'PRP$', 20: 'VBP', 21: 'IN', 22: 'VB', 23: 'PRP', 24: 'TO', 25: '#', 26: '-RRB-', 27: '.', 28: 'NNP', 29: '``', 30: 'JJ', 31: '-NONE-', 32: ':', 33: "''", 34: 'CC', 35: 'RB', 36: ',', 37: '-LRB-', 38: 'NNS', 39: 'FW', 40: 'JJS', 41: 'VBN', 42: 'DT', 43: 'RBR', 44: 'JJR', 45: 

## Split data

In [5]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3
)
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    test_sentences, test_tags, test_size=0.5
)

## Create Pytorch Dataset

In [6]:
model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)

MAX_LENGTH = 256

class PosTaggingDataset(Dataset):
    def __init__(self, 
                sentences: List[List[str]], 
                tags: List[List[str]], 
                tokenizer,
                label2id,
                max_length = MAX_LENGTH):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        words = self.sentences[idx]
        word_tags = self.tags[idx]

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            return_attention_mask=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Align labels with tokenized inputs
        word_ids = encoding.word_ids(batch_index=0)  # Map token position to word index
        labels = []
        for word_id in word_ids:
            if word_id is None:
                labels.append(-100)  # Ignored by loss
            else:
                labels.append(self.label2id[word_tags[word_id]])

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels),
        }

In [7]:
train_dataset = PosTaggingDataset(
    sentences=train_sentences,
    tags=train_tags,
    tokenizer=tokenizer,
    label2id=label2id,
    max_length=MAX_LENGTH
)
valid_dataset = PosTaggingDataset(
    sentences=valid_sentences,
    tags=valid_tags,
    tokenizer=tokenizer,
    label2id=label2id,
    max_length=MAX_LENGTH
)
test_dataset = PosTaggingDataset(
    sentences=test_sentences,
    tags=test_tags,
    tokenizer=tokenizer,
    label2id=label2id,
    max_length=MAX_LENGTH
)

# Modeling

In [8]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
)


Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import evaluate
import numpy as np

ignore_label = -100
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        id2label[p] for prediction, label in zip(predictions, labels)
        for p, l in zip(prediction, label) if l != ignore_label
    ]
    true_labels = [
        id2label[l] for prediction, label in zip(predictions, labels)
        for p, l in zip(prediction, label) if l != ignore_label
    ]

    results = accuracy.compute(predictions=true_predictions, references=true_labels)
    return {"accuracy": results["accuracy"]}


In [10]:
from transformers import TrainingArguments, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logging

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [['NNS', 'NNS', 'VBD', '-NONE-', 'NNS', 'RB', 'VBD', 'VBN', '-NONE-', '-NONE-', '-NONE-', 'IN', 'JJ', 'NNS', 'NNS', 'IN', 'DT', 'NNP', 'JJ', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'NN', 'IN', '-NONE-', 'VBG', 'JJ', 'JJ', 'JJ', 'NN', 'NN', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'NNP', '.'], ['JJR', 'NNS', 'NNS', 'MD', 'VB', 'VB', '-NONE-', 'NNS', 'NNS', 'IN', 'JJR', 'NNS', 'NNS', 'WP', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', 'VBP', 'JJR', 'NNS', 'NNS', 'VB', 'CC', 'VB', ',', 'PRP', 'VBP', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '.'], ['JJ', 'TO', 'RB', 'RB', 'JJ', 'NN', 'NNS', 'VBN', '-NONE-', 'TO', 'NN', 'NN', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'NNP', 'NN', ',', 'NN', 'NN', 'NN', 'VBD', 'VBN', '-NONE-', '-NONE-', '-NONE-', 'IN', 'NN', 'CC', 'NN', 'NNS', ',', 'WDT', 'NNS', 'NNS', 'VBD', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', 'DT', '``', '``', 'JJ', "''", "''", 'NN', 'RB', 'VBN', '-NONE-', 'WRB', 'DT', 'NN', 'VBZ', 'VBN', '-NONE-', '-NONE-', '-NONE-', 'TO', 'VB', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', 'CC', 'IN', 'NNS', 'IN', 'NN', '.'], ..., ['IN', 'NN', ',', 'VBZ', 'VBP', 'NNS', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'CD', 'NNS', 'NNS', 'NN', 'CC', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'CD', 'NN', '.'], ['IN', 'IN', ',', 'NNP', 'NNP', 'VBZ', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', ',', 'NN', 'NNS', 'NNS', 'VBP', 'JJ', '-NONE-', 'VBG', 'VBG', 'VBG', 'IN', 'JJ', 'JJ', 'NNS', ':', ':', 'CD', 'JJ', 'JJ', 'NN', 'VBD', 'PRP', 'TO', 'DT', 'NNS', 'NNS', 'NNS', 'IN', 'NNP', 'NNP', '.'], ['DT', 'JJ', 'NNS', 'RB', ',', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'POS', 'POS', 'NN', ',', 'DT', 'NNS', 'VBG', 'VBG', 'VBG', 'VBG', 'DT', 'NN', 'RB', 'VBD', 'VBN', 'VBN', 'VBN', 'PRP$', 'NNP', 'JJ', 'JJ', 'NNP', 'NNS', 'IN', 'DT', 'JJ', 'NN', 'IN', 'JJ', 'NNP', 'NNP', '.']],
Input references: [['NNS', 'NNS', 'VBD', '-NONE-', 'NNS', 'RB', 'VBD', 'VBN', '-NONE-', '-NONE-', '-NONE-', 'IN', 'JJ', 'NNS', 'NNS', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'VBZ', 'IN', 'DT', 'NN', 'NN', 'IN', '-NONE-', 'VBG', 'RBS', 'JJ', 'JJ', 'NN', 'NN', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'NNP', '.'], ['JJR', 'NNS', 'NNS', 'MD', 'VB', 'VB', 'RB', 'NNS', 'NNS', 'IN', 'JJR', 'NNS', 'NNS', 'WP', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', 'VBP', 'JJR', 'NNS', 'NNS', 'VB', 'CC', 'VB', ',', 'PRP', 'VBP', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '.'], ['JJ', 'TO', 'RB', 'RB', 'JJ', 'NN', 'NNS', 'VBN', '-NONE-', 'TO', 'NN', 'NN', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'NNP', 'NN', ',', 'NN', 'NN', 'NN', 'VBD', 'VBN', '-NONE-', '-NONE-', '-NONE-', 'IN', 'NN', 'CC', 'NN', 'NNS', ',', 'WDT', 'NNS', 'NNS', 'VBD', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', 'DT', '``', '``', 'JJ', "''", "''", 'NN', 'RB', 'VBN', '-NONE-', 'WRB', 'DT', 'NN', 'VBZ', 'VBN', '-NONE-', '-NONE-', '-NONE-', 'TO', 'VB', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', 'CC', 'IN', 'NNS', 'IN', 'NN', '.'], ..., ['IN', 'NN', ',', 'EX', 'VBP', 'NNS', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'CD', 'NNS', 'NNS', 'NN', 'CC', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'CD', 'NN', '.'], ['IN', 'IN', ',', 'NNP', 'NNP', 'VBZ', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', '-NONE-', ',', 'NN', 'NNS', 'NNS', 'VBP', 'IN', '-NONE-', 'VBG', 'VBG', 'VBG', 'IN', 'JJ', 'JJ', 'NNS', ':', ':', 'CD', 'JJ', 'JJ', 'NN', 'VBD', 'PRP', 'TO', 'DT', 'NNS', 'NNS', 'NNS', 'IN', 'NNP', 'NNP', '.'], ['DT', 'JJ', 'NNS', 'RB', ',', 'IN', 'DT', 'NNP', 'NNP', 'NNP', 'NNP', 'NN', 'POS', 'POS', 'NN', ',', 'DT', 'NNS', 'VBG', 'VBG', 'VBG', 'VBG', 'DT', 'NN', 'RB', 'VBD', 'VBN', 'VBN', 'VBN', 'PRP$', 'JJ', 'JJ', 'JJ', 'JJ', 'NNS', 'IN', 'DT', 'JJ', 'NN', 'IN', 'JJ', 'NNP', 'NNP', '.']]