# Install the libraries this notebook imports: `datasets` and `tokenizers` are the
# Hugging Face packages (the singular names are unrelated PyPI projects), `evaluate`
# is imported below, and the Trainer requires `accelerate>=0.26.0` with PyTorch.
pip install transformers datasets tokenizers seqeval evaluate "accelerate>=0.26.0" -q

NER - Named Entity Recognition
POS - Part of Speech

# Use Hugging Face Transformers (with BERT) to build a pipeline covering data ingestion, preprocessing, training, and evaluation.

In [None]:
import datasets
import numpy as np
import torch
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification

dataset = datasets.load_dataset("conll2003",trust_remote_code=True) # Load the CoNLL-2003 dataset

print(dataset['train']) # Print the training set info
##  Dataset({
#     features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
#     num_rows: 14041
# })

example_text = dataset['train'][0] # Get the first example from the training set
print(example_text) # Print that first training example
## {'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

print(dataset['train'].features['ner_tags']) # Print the ner_tags feature, which lists the label names
## Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

# NOTE(review): trust_remote_code is presumably unnecessary for the built-in BERT tokenizer — confirm before removing.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",trust_remote_code=True) # Load the tokenizer of the pretrained lower-cased BERT model
## example_text['tokens'] = ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

# The example is already split into words, hence is_split_into_words=True
tokenised_input = tokenizer(example_text['tokens'], truncation=True, is_split_into_words=True) # Tokenize the input text
print(tokenised_input) # Print the tokenized input
## tokenised_input = {'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

tokens = tokenizer.convert_ids_to_tokens(tokenised_input['input_ids']) # Convert the input ids back to token strings
print(tokens) # Print the tokens (note the added [CLS] and [SEP] special tokens)
## tokens = ['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']

word_ids = tokenised_input.word_ids() # For each token, the index of the word it came from (None for special tokens)
print(word_ids) # Print the word ids
##[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]

def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align NER tags to the tokens.

    Uses the module-level ``tokenizer``. Each sentence in ``examples["tokens"]``
    is tokenized, and the word-level tags in ``examples["ner_tags"]`` are spread
    over the resulting tokens via the tokenizer's word-id mapping.

    Args:
        examples: Batch mapping with a "tokens" entry (list of word lists) and
            an "ner_tags" entry (list of per-word tag-id lists).
        label_all_tokens: When truthy, every sub-word token of a word receives
            that word's tag; otherwise only the first sub-word token does and
            the remaining ones are labelled -100.

    Returns:
        The tokenizer output with an extra "labels" entry holding one list of
        label ids per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        sentence_labels = []
        last_word = None
        # word_ids() gives, for each token, the index of the word it belongs to;
        # special tokens (e.g. [CLS] / [SEP]) map to None.
        for current_word in encoded.word_ids(batch_index=batch_idx):
            if current_word is None:
                # -100 so that special tokens are ignored by the loss function.
                token_label = -100
            elif current_word != last_word or label_all_tokens:
                # First token of a word, or a later sub-word token that should
                # inherit the word's tag.
                token_label = word_tags[current_word]
            else:
                # Later sub-word token of the same word: mask it out.
                token_label = -100
            sentence_labels.append(token_label)
            last_word = current_word
        aligned_labels.append(sentence_labels)

    encoded["labels"] = aligned_labels
    return encoded

# The slice [3:4] selects only the training example at index 3, as a batch of one
res_tokenized_inputs = tokenize_and_align_labels(dataset['train'][3:4]) # Tokenize and align the labels for that single example
print(res_tokenized_inputs) # Print the tokenized inputs together with the aligned labels
##{'input_ids': [[101, 1996, 2647, 3222, 2056, 2006, 9432, 2009, 18335, 2007, 2446, 6040, 2000, 10390, 2000, 18454, 2078, 2329, 12559, 2127, 6529, 5646, 3251, 5506, 11190, 4295, 2064, 2022, 11860, 2000, 8351, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}

# Show each token of that example next to its aligned label, padded with '_' to 40 columns
for token, label in zip(tokenizer.convert_ids_to_tokens(res_tokenized_inputs["input_ids"][0]),res_tokenized_inputs["labels"][0]):
    print(f"{token:_<40} {label}") # Print one token/label pair per line

# Apply the tokenization and label alignment to the entire dataset, in batches
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
# Inspect the result on the first training example
print(tokenized_datasets["train"][0]) # Print the first tokenized training example

from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
import evaluate

# Derive the label set from the dataset instead of hard-coding its size, so the
# classification head always matches the tag vocabulary of `ner_tags`, and store
# the id <-> name mapping in the model config so predictions are human-readable.
label_names = dataset["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained BERT encoder with a freshly initialised token-classification head.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# Training configuration: evaluate once per epoch, single epoch run.
args = TrainingArguments(
    output_dir="test-ner",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

# print('dataset info :/n',dataset) # Print the dataset info
# print('dataset keys :/n',dataset.keys()) # Print the keys of the dataset

# Print the training set info
# print('train dataset info:',dataset['train'].features) # Print the features of the training set
# print('dataset train features keys',dataset['train'].features.keys()) # Print the keys of the features of the training set
# print('dataset train features ner_tags',dataset['train'].features['ner_tags']) # Print the ner_tags feature info of the training set
# print('dataset train features pos_tags',dataset['train'].features['pos_tags']) # Print the pos_tags feature info of the training set
# print('First document/sentence: ',dataset['train'][0]) # Print the tokens feature info of the training set

# # Print the test set info
# print('test dataset info',dataset['test'].features) # Print the features of the test set
# print('dataset test keys',dataset['test'].keys()) # Print the keys of the test set
# print('dataset test features',dataset['test'].features) # Print the features of the test set
# print('dataset test features keys',dataset['test'].features.keys()) # Print the keys of the features of the test set

# # Print the validation set info
# print('validation dataset info',dataset['validation'].features) # Print the features of the validation set
# print('dataset validation keys',dataset['validation'].keys()) # Print the keys of the validation set
# print('dataset validation features',dataset['validation'].features) # Print the features of the validation set
# print('dataset validation features keys',dataset['validation'].features.keys()) # Print the keys of the features of the validation set

# print(dataset['train'].features['ner_tags']) # Print the ner_tags feature info
# print(dataset['train'][0]) # Check the first example in the training set



Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})
{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}
Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)
{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]
{'input_ids': [[101, 1996, 2647, 3222, 2056, 2006, 9432, 2009, 18335, 2007, 2446, 6040, 2000, 10390, 2000, 18454, 2078, 2329, 12559, 2127, 6529, 5646, 3251, 5506, 11190, 4295, 2064, 2022,

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
# Install torch if not already installed
# NOTE(review): the ImportError recorded in the earlier cell output asks for
# `accelerate>=0.26.0`; installing torch alone may not resolve it — confirm.
%pip install torch

import torch
# Sanity check: build a 5x3 tensor and print it to confirm torch is importable
x = torch.rand(5, 3)
print(x)


Collecting torch
  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.6.0-cp312-cp312-win_amd64.whl (204.1 MB)
   ---------------------------------------- 0.0/204.1 MB ? eta -:--:--
   ---------------------------------------- 2.1/204.1 MB 10.7 MB/s eta 0:00:19
    --------------------------------------- 4.7/204.1 MB 11.9 MB/s eta 0:00:17
   - -------------------------------------- 6.3/204.1 MB 10.2 MB/s eta 0:00:20
   - -------------------------------------- 8.7/204.1 MB 10.5 MB/s eta 0:00:19
   -- ------------------------------------- 11.0/204.1 MB 10.7 MB/s eta 0:00:18
   -- ------------------------------------- 13.6/204.1 MB 10.8 MB/s eta 0:00:18
   --- ------------------------------------ 16.0/204.1 MB 11.1 MB/s eta 0:00:18
   --- ------------------------------------ 18.6/204.1 MB 11.1 MB/s eta 0:00:17
   ---- ----------------------------------- 21.