In [31]:
#basic imports 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import transformers 
import datasets
from seqeval.metrics import f1_score,classification_report
from collections import defaultdict,Counter
# Pick the best available accelerator instead of hard-coding Apple's MPS
# backend, so the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### load dataset

In [2]:
#CoNLL 2003 dataset
from datasets import load_dataset
# This dataset repo ships a loading script. Passing trust_remote_code=True
# accepts it up front so "Restart & Run All" is not blocked by the
# interactive [y/N] prompt.
# NOTE(review): this executes code from the Hub repo -- inspect it before trusting.
en_ner = load_dataset("eriktks/conll2003", trust_remote_code=True)

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for eriktks/conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/eriktks/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [3]:
en_ner 

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
en_ner['train'][100]

{'id': '100',
 'tokens': ['Rabinovich',
  'is',
  'winding',
  'up',
  'his',
  'term',
  'as',
  'ambassador',
  '.'],
 'pos_tags': [21, 42, 39, 33, 29, 21, 15, 21, 7],
 'chunk_tags': [11, 21, 22, 15, 11, 12, 13, 11, 0],
 'ner_tags': [1, 0, 0, 0, 0, 0, 0, 0, 0]}

In [8]:
#removing pos and chunk tags
# the POS and chunk annotations are dropped from every split
columns_to_drop = ['pos_tags', 'chunk_tags']
en_ner = en_ner.remove_columns(columns_to_drop)

In [9]:
#tags to str
# Feature object describing the per-token NER tags of the train split;
# its `names` list is used below to translate tag ids into tag strings
tags = en_ner['train'].features['ner_tags'].feature

In [10]:
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [11]:
def ner_str(batch):
    """Translate the integer NER tag ids of `batch` into their string names.

    Reads `batch['ner_tags']` and looks every id up in the notebook-level
    `tags.names` list.

    Returns:
        dict with a single key 'ner_tags_str' holding the list of tag names,
        in the same order as the input ids.
    """
    tag_names = []
    for tag_id in batch['ner_tags']:
        tag_names.append(tags.names[tag_id])
    return {'ner_tags_str': tag_names}

In [12]:
# map() is called without batched=True, so ner_str is applied to one example
# at a time and adds the 'ner_tags_str' column to every split
en_ner = en_ner.map(ner_str)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [14]:
en_ner['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'ner_tags_str': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']}

### tag frequency per split

In [24]:
# Count the B-* tags of every split, keyed first by split name and then by tag
split_freq = defaultdict(Counter)
for split_name, split_ds in en_ner.items():
    begin_tags = (
        tag
        for sentence_tags in split_ds['ner_tags_str']
        for tag in sentence_tags
        if tag.startswith('B')
    )
    counts = Counter(begin_tags)
    if counts:
        # only splits that contain at least one B-* tag get an entry
        split_freq[split_name].update(counts)
    

In [26]:
pd.DataFrame(split_freq)

Unnamed: 0,train,validation,test
B-ORG,6321,1341,1661
B-MISC,3438,922,702
B-PER,6600,1842,1617
B-LOC,7140,1837,1668


### load model tokenizer

In [35]:
from transformers import AutoTokenizer
# Checkpoint name reused for the tokenizer, the config and the model
# NOTE(review): the uncased checkpoint lowercases the text (see the lowercase
# tokens printed further down), discarding capitalization that NER often
# relies on -- consider a cased checkpoint; confirm
bert_path = 'bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)

In [28]:
from transformers import BertForTokenClassification,AutoConfig

In [29]:
#config
# id <-> tag lookup tables derived from the dataset's tag names
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}
# model configuration carrying the label set, so the classification head
# gets one output per tag
bert_config = AutoConfig.from_pretrained(
    bert_path,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

In [30]:
bert_config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [32]:
# Load the pretrained checkpoint with a token-classification head configured by
# bert_config and move it to `device`. The classifier weights are newly
# initialized (see the warning printed below), so the model must be fine-tuned
# before its predictions are meaningful.
bert_model = (BertForTokenClassification.from_pretrained(bert_path,config=bert_config).to(device))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
bert_model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

### helper functions

In [36]:
def tag_text(text,tags,model,tokenizer):
    """Tag every token of `text` with the label predicted by `model`.

    Args:
        text: raw input string.
        tags: object whose `names` list maps a predicted class id to a tag string.
        model: token-classification model; calling it on a batch of input ids
            must return a sequence whose first element is the logits tensor.
        tokenizer: tokenizer used for BOTH the displayed tokens and the model
            input, so the two rows of the result are guaranteed to line up.

    Returns:
        pandas DataFrame with two rows, 'Token' and 'Label', one column per token.

    Note:
        Gradients are disabled, but the model's train/eval mode is left
        untouched; call `model.eval()` beforehand for deterministic output.
    """
    # Encode once with the tokenizer that was passed in (not a notebook global)
    encoded = tokenizer(text,return_tensors="pt")
    tokens = encoded.tokens()
    # Run on whatever device the model lives on instead of relying on a global
    model_device = next(model.parameters()).device
    input_ids = encoded.input_ids.to(model_device)
    with torch.no_grad():
        out = model(input_ids)[0]
    # torch.argmax (not np.argmax) handles the tensor; the last dim holds the class scores
    predictions = torch.argmax(out,dim=2)
    # the batch contains a single sentence: take row 0 to get one class id per token
    pred = [tags.names[p] for p in predictions[0].tolist()]
    return pd.DataFrame([tokens,pred],index=['Token','Label'])

In [39]:
def tokenize_align(batch):
    """Tokenize pre-split sentences and align the word-level NER ids to the sub-tokens.

    `batch['tokens']` is tokenized with the notebook-level `bert_tokenizer`.
    For every sentence, the first sub-token of each word receives the word's
    id from `batch['ner_tags']`; positions without a word (word id None) and
    further sub-tokens of the same word receive -100.

    Returns:
        the tokenizer output with an added 'labels' entry (one list per sentence).
    """
    tokenized_inputs = bert_tokenizer(batch['tokens'],truncation=True,is_split_into_words=True)
    aligned_labels = []
    for row, word_labels in enumerate(batch['ner_tags']):
        row_labels = []
        last_word = None
        for word_id in tokenized_inputs.word_ids(batch_index=row):
            # a position is labelled only when it starts a new word
            starts_word = word_id is not None and word_id != last_word
            row_labels.append(word_labels[word_id] if starts_word else -100)
            last_word = word_id
        aligned_labels.append(row_labels)
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

In [40]:
def align_pred(predictions,labels,id2tag=None):
    """Convert model scores and padded label ids into per-sentence tag-string lists.

    Args:
        predictions: array of class scores indexed as [sentence, position, class].
        labels: array of gold label ids indexed as [sentence, position];
            positions holding -100 are skipped in both outputs.
        id2tag: optional mapping from class id to tag string; defaults to the
            notebook-level `index2tag`.

    Returns:
        (pred_list, label_list): two lists with one inner list of tag strings
        per sentence, containing only the positions whose label is not -100.
    """
    if id2tag is None:
        id2tag = index2tag
    labels = np.asarray(labels)
    # NumPy's keyword is `axis` (not `dim`); the last axis holds the class scores
    preds = np.argmax(predictions,axis=2)
    batch_size,seq_len = preds.shape
    pred_list,label_list = [],[]
    for batch_idx in range(batch_size):
        ex_pred,ex_label = [],[]
        for seq_idx in range(seq_len):
            if labels[batch_idx,seq_idx] != -100:
                ex_pred.append(id2tag[int(preds[batch_idx,seq_idx])])
                ex_label.append(id2tag[int(labels[batch_idx,seq_idx])])
        pred_list.append(ex_pred)
        label_list.append(ex_label)
    return pred_list,label_list

In [42]:
#tokenize
# Batched map: tokenize_align receives many sentences per call. The raw
# 'tokens', 'ner_tags' and 'id' columns are dropped from the result.
# NOTE(review): 'ner_tags_str' is not listed, so it presumably remains in
# en_ner_enc -- confirm this is intended
en_ner_enc = en_ner.map(tokenize_align,batched=True,remove_columns=['tokens','ner_tags','id'])

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [55]:
# Look at one encoded training example: its word pieces and aligned labels.
# `bert_tokenizer` is the tokenizer defined in this notebook; the bare name
# `tokenizer` is never assigned and fails on a fresh kernel.
ids = en_ner_enc['train'][10]['input_ids']
ids = bert_tokenizer.convert_ids_to_tokens(ids)
lbls = en_ner_enc['train'][10]['labels']

In [56]:
# Continuation word pieces (e.g. ##acio, ##ch) and the special [CLS]/[SEP]
# tokens carry the label -100; only the first piece of each word keeps its tag id
pd.DataFrame([ids,lbls],index=['token','label'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
token,[CLS],spanish,farm,minister,loyola,de,pal,##acio,had,earlier,accused,fis,##ch,##ler,at,an,eu,farm,ministers,',meeting,of,causing,un,##just,##ified,alarm,through,"""",dangerous,general,##isation,.,"""",[SEP]
label,-100,7,0,0,1,2,2,-100,0,0,0,1,-100,-100,0,0,3,0,0,0,0,0,0,0,-100,-100,0,0,0,0,0,-100,0,0,-100


### Continuation sub-tokens of the same word are assigned the label -100

In [50]:
#example
# Use the notebook's `bert_tokenizer`; the bare name `tokenizer` is never
# assigned and fails on a fresh kernel.
text = "this is a sample text to test transformer tokenizer"
ids = bert_tokenizer(text)['input_ids']
bert_tokenizer.convert_ids_to_tokens(ids)

['[CLS]',
 'this',
 'is',
 'a',
 'sample',
 'text',
 'to',
 'test',
 'transform',
 '##er',
 'token',
 '##izer',
 '[SEP]']

Here the word `transformer` is split into two sub-tokens, `transform` and `##er`. We want to judge the model only on its prediction for the first sub-token (`transform`), which stands in for the prediction on the whole word `transformer`. We therefore assign the label -100 to the `##er` piece, so that it is ignored by the cross-entropy loss.

### compute metrics

In [65]:
from transformers import DataCollatorForTokenClassification
# Collator handed to the Trainer to assemble batches; presumably pads the input
# ids and the labels of each batch to a common length -- see the transformers
# documentation to confirm
data_collator = DataCollatorForTokenClassification(tokenizer=bert_tokenizer)

In [57]:
def compute_metrics(pred):
    """Compute the seqeval F1 score for one evaluation pass.

    `pred.predictions` and `pred.label_ids` are converted to tag-string lists
    by `align_pred`, then scored with `f1_score`.

    Returns:
        dict with the single key 'f1'.
    """
    predicted_tags, gold_tags = align_pred(pred.predictions, pred.label_ids)
    return {'f1': f1_score(gold_tags, predicted_tags)}

### training args

In [62]:
from transformers import TrainingArguments

# hyper-parameters
model_name = 'BERT-NER-CoNLL'
batch_size = 4
num_train_epochs = 3
learning_rate = 2e-5
weight_decay = 0.01
# number of batches in one pass over the training split, i.e. log once per epoch
logging_steps = len(en_ner_enc['train']) // batch_size

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    # NOTE(review): far more steps than this run performs, so presumably meant
    # to suppress intermediate checkpoints -- confirm
    save_steps=1e6,
    disable_tqdm=False,
    # NOTE(review): pushing to the Hub presumably needs a logged-in account -- confirm
    push_to_hub=True,
)

### Trainer

In [66]:
from transformers import Trainer
# Evaluate on the validation split during training. The test split stays held
# out so that it can give an unbiased score once training is finished.
trainer = Trainer(model=bert_model,
                 compute_metrics=compute_metrics,
                 args=training_args,
                 train_dataset=en_ner_enc['train'],
                 eval_dataset=en_ner_enc['validation'],
                 data_collator=data_collator,
                 processing_class=bert_tokenizer)

In [67]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 