In the first exercise, we will deal with **Part-of-Speech (POS) Tagging** problem. We will fine-tune model from Hugging face and fine tune it using Penn Tree Bank dataset

# Import libs

In [35]:
from typing import List
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk

nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

# Data

## Load data

In [36]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print("Number of sentences in the corpus:", len(tagged_sentences))
print("First sentence with tags:", tagged_sentences[0])

Number of sentences in the corpus: 3914
First sentence with tags: [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [37]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    # Convert tags to BIO format
    sentences.append([word.lower() for word in sentence])
    sentence_tags.append([tag for tag in tags])

# Print the first sentence and its tags
print("First sentence:", sentences[0])
print("First sentence tags:", sentence_tags[0])

First sentence: ['pierre', 'vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.']
First sentence tags: ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']


In [38]:

# Sort them (optional, for consistency)
label2id = {tag: i for i, tag in enumerate(set(tag for tags in sentence_tags for tag in tags))}
id2label = {i: tag for tag, i in label2id.items()}


## Split data

In [39]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3
)
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    test_sentences, test_tags, test_size=0.5
)

## Create Pytorch Dataset

In [40]:
model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)

MAX_LENGTH = 256

class PosTaggingDataset(Dataset):
    def __init__(self, 
                sentences: List[List[str]], 
                tags: List[List[str]], 
                tokenizer,
                label2id,
                max_length = MAX_LENGTH):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        words = self.sentences[idx]
        word_tags = self.tags[idx]

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            return_attention_mask=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Align labels with tokenized inputs
        word_ids = encoding.word_ids(batch_index=0)  # Map token position to word index
        labels = []
        for word_id in word_ids:
            if word_id is None:
                labels.append(-100)  # Ignored by loss
            else:
                labels.append(self.label2id[word_tags[word_id]])

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

In [41]:
train_dataset = PosTaggingDataset(
    sentences=train_sentences,
    tags=train_tags,
    tokenizer=tokenizer,
    label2id=label2id,
    max_length=MAX_LENGTH
)
valid_dataset = PosTaggingDataset(
    sentences=valid_sentences,
    tags=valid_tags,
    tokenizer=tokenizer,
    label2id=label2id,
    max_length=MAX_LENGTH
)
test_dataset = PosTaggingDataset(
    sentences=test_sentences,
    tags=test_tags,
    tokenizer=tokenizer,
    label2id=label2id,
    max_length=MAX_LENGTH
)

# Modeling

In [42]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
)


Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
from seqeval.metrics import accuracy_score

ignore_label = -100

def compute_metrics(p):
    predictions , labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = {
        "accuracy": accuracy_score(true_labels, true_predictions),
    }  
    return results


In [44]:
from transformers import TrainingArguments, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logging

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.137313,0.963415
2,No log,0.100538,0.973013
3,0.638800,0.093588,0.976101
4,0.638800,0.095172,0.976624
5,0.638800,0.09303,0.978192
6,0.045300,0.097347,0.977954
7,0.045300,0.101423,0.977622
8,0.045300,0.103546,0.977907
9,0.025400,0.10505,0.978239
10,0.025400,0.105121,0.977954


TrainOutput(global_step=1720, training_loss=0.20879248020260832, metrics={'train_runtime': 467.7171, 'train_samples_per_second': 58.561, 'train_steps_per_second': 3.677, 'total_flos': 3579882599208960.0, 'train_loss': 0.20879248020260832, 'epoch': 10.0})

In [47]:
for i in range(10):
    test_sentence = test_dataset[i]['input_ids']
    test_sentence = tokenizer.decode(test_sentence, skip_special_tokens=True)
    print("Test sentence:", test_sentence)
    test_sentence = test_sentence.lower()
    test_tokens =  torch.as_tensor ([ tokenizer . convert_tokens_to_ids ( test_sentence . split () ) ])
    test_tokens = test_tokens . to ( 'cuda' )
    outputs = model(test_tokens)
    _ , preds = torch . max( outputs . logits , -1)
    preds = preds [0]. cpu () . numpy ()
    pred_tags = ""
    for i in preds:
        pred_tags += id2label[i] + " "
    print("Predicted tags:", pred_tags)

Test sentence: the proposal comes as a surprise even to administration officials and temporarily throws into chaos the house's work on clean - air legislation.
Predicted tags: NN NN VBZ IN DT NN RB TO NN NNS CC RB `` IN NN DT `` NN IN JJ JJ JJ `` 
Test sentence: mr. stronach, founder and controlling shareholder of magna, resigned as chief executive officer last year * - 1 to seek, unsuccessfully, a seat in canada's parliament.
Predicted tags: . `` NN CC NN `` IN `` VBD IN NN NN NN JJ NN -NONE- -NONE- -NONE- TO `` `` DT NN IN `` `` 
Test sentence: the movie ends with sound, the sound of street people talking, and there isn't anything whimsical or enviable in those rough, beaten voices.
Predicted tags: NN NN VBZ IN `` DT NN IN NN NNS `` CC `` `` NN `` CC `` IN `` `` VBN `` 
Test sentence: - - in britain, the benchmark 11 3 \ / 4 % bond due 2003 \ / 2007 fell 14 \ / 32 to 111 2 \ / 32 * - 1 to yield 10. 19 %.
Predicted tags: . : IN `` DT `` CD CD CD CD CD NN NN JJ CD CD CD CD VBD CD CD CD