In [84]:
#basic imports 
!pip install seqeval
import datasets
import transformers
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import warnings
warnings.filterwarnings('ignore')
# Prefer the GPU, but fall back to the CPU so later `.to(device)` calls
# do not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from collections import defaultdict,Counter



In [85]:
import wandb
import os
from getpass import getpass

# Close any run left open by a previous execution of this notebook.
# Only ordinary errors are tolerated here; KeyboardInterrupt/SystemExit still propagate.
try:
    wandb.finish()
except Exception as exc:
    print(f"wandb.finish() failed, continuing anyway: {exc}")

# NOTE(review): the original comment says this clears wandb processes, but it is
# not evident that `wandb docker-run --clean` does that -- confirm or remove.
os.system('wandb docker-run --clean')

# Never store the API key in the notebook. Take it from the environment if it is
# already set, otherwise prompt for it without echoing (key: wandb.ai/settings).
# The key that was previously hard-coded in this cell must be treated as leaked
# and revoked.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")

0,1
train/epoch,▁
train/global_step,▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
train/epoch,0.99972
train/global_step,3510.0
train/grad_norm,0.51558
train/learning_rate,1e-05
train/loss,0.0999


In [86]:
#load dataset
from datasets import load_dataset
en_ner = load_dataset('eriktks/conll2003')

In [87]:
en_ner

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [88]:
#get tags
tags = en_ner['train'].features['ner_tags'].feature

In [89]:
tags 

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [90]:
#encode
def tag2str(batch):
    """Return a ``ner_tags_str`` field with the string name of each integer NER tag.

    Despite the parameter name, this is mapped without ``batched=True``, so
    ``batch`` is a single example whose ``ner_tags`` is a list of tag ids.
    """
    tag_names = []
    for tag_id in batch['ner_tags']:
        tag_names.append(tags.int2str(tag_id))
    return {'ner_tags_str': tag_names}
    
en_ner = en_ner.map(tag2str)

In [91]:
en_ner['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'ner_tags_str': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']}

In [92]:
#check split freq of tags
# Count entity mentions per split: every entity starts with exactly one
# 'B-*' tag, so tallying the tags that start with 'B' counts entities by type.
splitfreq = defaultdict(Counter)
for split_name, split_ds in en_ner.items():
    entity_starts = [
        tag_name
        for sentence_tags in split_ds['ner_tags_str']
        for tag_name in sentence_tags
        if tag_name.startswith('B')
    ]
    # Only touch the defaultdict when there is something to count, so a split
    # without entities does not get an (empty) entry.
    if entity_starts:
        splitfreq[split_name].update(entity_starts)

In [93]:
pd.DataFrame(splitfreq)

Unnamed: 0,train,validation,test
B-ORG,6321,1341,1661
B-MISC,3438,922,702
B-PER,6600,1842,1617
B-LOC,7140,1837,1668


### load tokenizer

In [94]:
from transformers import AutoTokenizer
bert_path = 'bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)

In [95]:
text = "sample text to test transformer BERT tokenizer"
out = bert_tokenizer(text)['input_ids']
bert_tokenizer.convert_ids_to_tokens(out)

['[CLS]',
 'sample',
 'text',
 'to',
 'test',
 'transform',
 '##er',
 'bert',
 'token',
 '##izer',
 '[SEP]']

### load config

In [96]:
from transformers import AutoConfig
# Two-way mapping between label ids and label names, stored in the model config
# so the config carries readable label names.
index2tag = dict(enumerate(tags.names))
tag2index = {name: idx for idx, name in index2tag.items()}
bert_config = AutoConfig.from_pretrained(
    bert_path,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

In [97]:
bert_config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

### create custom model

In [98]:
from transformers import BertPreTrainedModel,BertModel,BertConfig
from transformers.modeling_outputs import TokenClassifierOutput

class BertForTokenClassification(BertPreTrainedModel):
    """BERT encoder followed by a per-token linear classification head.

    The encoder is created without its pooling layer; every token's final
    hidden state goes through dropout and a linear layer with
    ``config.num_labels`` outputs.
    """

    config_class = BertConfig

    def __init__(self,config):
        """Build the encoder, dropout and classifier from ``config`` and initialise weights."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body (no pooler: only per-token states are needed).
        self.bert = BertModel(config=config,add_pooling_layer=False)
        # Classification head.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size,config.num_labels)
        self.init_weights()

    def forward(self,input_ids = None,attention_mask = None,token_type_ids = None,labels=None,**kwagrs):
        """Run the encoder and head; also compute the loss when ``labels`` is given.

        Extra keyword arguments are forwarded unchanged to the encoder.
        Returns a ``TokenClassifierOutput`` with ``loss`` (``None`` without
        labels), ``logits`` and the encoder's ``hidden_states`` / ``attentions``.
        """
        encoder_out = self.bert(input_ids,attention_mask,token_type_ids,**kwagrs)
        token_states = self.dropout(encoder_out[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten to (tokens, num_labels) vs (tokens,) for the token-level loss.
            # Positions labelled -100 are skipped via CrossEntropyLoss's default ignore_index.
            flat_logits = logits.view(-1,self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits,flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

In [99]:
bert_model = (BertForTokenClassification.from_pretrained(config=bert_config,pretrained_model_name_or_path=bert_path).to(device))

In [100]:
bert_model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

### helper functions

In [101]:
#tag text
def tag_text(text,tags,model,tokenizer):
    """Tag a single piece of text and return tokens alongside predicted labels.

    Args:
        text: Raw input string.
        tags: Object exposing ``names``, the list of label names indexed by class id.
        model: Token-classification model; called with the input ids and expected
            to return the logits as the first element of its output.
        tokenizer: Tokenizer used for BOTH the displayed tokens and the model
            input ids, so the two always line up.

    Returns:
        A DataFrame with two rows, ``Token`` and ``Label``, one column per token.
    """
    encoding = tokenizer(text,return_tensors="pt")
    tokens = encoding.tokens()

    # Run on whatever device the model lives on instead of a notebook global.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")
    input_ids = encoding.input_ids.to(model_device)

    # Inference only: disable dropout and autograd, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_ids)[0]
    finally:
        model.train(was_training)

    # logits has a leading batch dimension of size 1; take that single row so
    # each prediction is one integer class id per token.
    pred_ids = torch.argmax(logits,dim=2)[0].cpu().tolist()
    predictions = [tags.names[p] for p in pred_ids]
    return pd.DataFrame([tokens,predictions],index=['Token','Label'])

In [102]:
#align label (to remove -100 label)
def align_label(predictions,labels,id2tag=None,ignore_index=-100):
    """Convert logits and label ids to per-sentence tag-name lists, dropping ignored positions.

    Args:
        predictions: Array of shape (batch, seq_len, num_labels) with class scores.
        labels: Array-like of shape (batch, seq_len) with gold label ids;
            positions equal to ``ignore_index`` are skipped.
        id2tag: Mapping from label id to tag name. Defaults to the notebook-level
            ``index2tag``.
        ignore_index: Label id marking positions to skip (special tokens and
            non-first sub-word pieces). Defaults to -100.

    Returns:
        ``(pred_list, label_list)``: two lists of lists of tag names, one inner
        list per example, containing only the non-ignored positions.
    """
    if id2tag is None:
        id2tag = index2tag
    pred_ids = np.argmax(predictions,axis=2)
    # Accept plain nested lists as well as arrays.
    label_ids = np.asarray(labels)

    pred_list,label_list = [],[]
    for pred_row,label_row in zip(pred_ids,label_ids):
        keep = label_row != ignore_index
        pred_list.append([id2tag[int(i)] for i in pred_row[keep]])
        label_list.append([id2tag[int(i)] for i in label_row[keep]])
    return pred_list,label_list

In [103]:
#tokenize and align
def tokenize_align(batch,tokenizer=None):
    """Tokenize pre-split words and align word-level NER labels to sub-word tokens.

    Only the first sub-word piece of each word keeps the word's label; special
    tokens and continuation pieces get -100.

    Args:
        batch: Batched examples with ``tokens`` (list of word lists) and
            ``ner_tags`` (list of label-id lists).
        tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``bert_tokenizer``.

    Returns:
        The tokenizer output with an added ``labels`` entry (one list per example).
    """
    if tokenizer is None:
        tokenizer = bert_tokenizer
    # Truncate so an over-long sentence cannot exceed the model's maximum length.
    tokenized_inputs = tokenizer(batch['tokens'],is_split_into_words=True,truncation=True)
    labels = []
    for idx,word_labels in enumerate(batch['ner_tags']):
        label_ids = []
        prev_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=idx):
            # None -> special token; same index as before -> continuation piece.
            if word_idx is None or word_idx == prev_word:
                label_ids.append(-100)
            else:
                label_ids.append(word_labels[word_idx])
            prev_word = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [104]:
en_ner = en_ner.map(tokenize_align,batched=True,remove_columns=['id', 'tokens', 'pos_tags', 'chunk_tags','ner_tags'])

In [105]:
en_ner['train'][0]

{'ner_tags_str': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

### compute metrics

In [106]:
from seqeval.metrics import f1_score
def compute_metrics(pred):
    """Return the seqeval entity-level F1 for one evaluation pass as ``{'f1': value}``.

    ``pred`` must expose ``predictions`` (logits) and ``label_ids``; both are
    converted to tag-name sequences with ignored positions removed before scoring.
    """
    predicted_tags,gold_tags = align_label(pred.predictions,pred.label_ids)
    return {'f1': f1_score(gold_tags,predicted_tags)}

In [107]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(bert_tokenizer)

In [108]:
from transformers import TrainingArguments
# --- fine-tuning hyperparameters ---
model_name = 'bert-base-custom-ner'
batch_size = 4
num_train_epochs = 3
learning_rate = 2e-5
weight_decay = 0.01
# Log once every (train examples // batch size) optimizer steps.
logging_steps = len(en_ner['train']) // batch_size

training_agrs = TrainingArguments(
    output_dir=model_name,
    # optimisation
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation and logging
    eval_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    report_to='wandb',
    # checkpointing / publishing
    save_steps=1e6,
    push_to_hub=False,
)

In [109]:
from transformers import Trainer
trainer = Trainer(model=bert_model,
                 args = training_agrs,
                 compute_metrics=compute_metrics,
                 train_dataset=en_ner['train'],
                 eval_dataset=en_ner['validation'],
                 processing_class=bert_tokenizer,
                 data_collator=data_collator)

In [110]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0999,0.049131,0.93947
2,0.031,0.053546,0.948455
3,0.0146,0.058206,0.950902


TrainOutput(global_step=10533, training_loss=0.04846748370466087, metrics={'train_runtime': 532.5836, 'train_samples_per_second': 79.092, 'train_steps_per_second': 19.777, 'total_flos': 741316211229636.0, 'train_loss': 0.04846748370466087, 'epoch': 3.0})