In [99]:
import wandb
import os

# Force terminate any existing runs
try:
    wandb.finish()
except:
    pass

# Clear any wandb processes
os.system('wandb docker-run --clean')

# Now login with your API key
os.environ["WANDB_API_KEY"] = "d26dd1d7dea66bdf56b52f3745f3f3c747c32fa5"  # Replace with your key from wandb.ai/settings

0,1
train/epoch,▁
train/global_step,▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
train/epoch,0.99886
train/global_step,877.0
train/grad_norm,1.86567
train/learning_rate,1e-05
train/loss,0.1308


In [100]:
#basic imports 
!pip install seqeval
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import transformers 
import datasets
from seqeval.metrics import f1_score,classification_report
from collections import defaultdict,Counter
device = torch.device("cuda")



### load dataset

In [101]:
#CoNLL 2003 dataset
from datasets import load_dataset
en_ner = load_dataset("eriktks/conll2003")

In [102]:
en_ner 

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [103]:
en_ner['train'][100]

{'id': '100',
 'tokens': ['Rabinovich',
  'is',
  'winding',
  'up',
  'his',
  'term',
  'as',
  'ambassador',
  '.'],
 'pos_tags': [21, 42, 39, 33, 29, 21, 15, 21, 7],
 'chunk_tags': [11, 21, 22, 15, 11, 12, 13, 11, 0],
 'ner_tags': [1, 0, 0, 0, 0, 0, 0, 0, 0]}

In [104]:
#removing pos and chunk tags
en_ner = en_ner.remove_columns(['pos_tags','chunk_tags'])

In [105]:
#tags to str
tags = en_ner['train'].features['ner_tags'].feature

In [106]:
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [107]:
def ner_str(batch):
    return {'ner_tags_str':[tags.names[idx] for idx in batch['ner_tags']]}

In [108]:
en_ner = en_ner.map(ner_str)

In [109]:
en_ner['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'ner_tags_str': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']}

### tag frequency per split

In [110]:
split_freq = defaultdict(Counter)
for split,ds in en_ner.items():
    for row in ds['ner_tags_str']:
        for tag in row:
            if tag.startswith('B'):
                split_freq[split][tag] += 1
    

In [111]:
pd.DataFrame(split_freq)

Unnamed: 0,train,validation,test
B-ORG,6321,1341,1661
B-MISC,3438,922,702
B-PER,6600,1842,1617
B-LOC,7140,1837,1668


### load model tokenizer

In [112]:
from transformers import AutoTokenizer
bert_path = 'bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)

In [113]:
from transformers import BertForTokenClassification,AutoConfig

In [114]:
#config
tag2index = {tag:idx for idx,tag in enumerate(tags.names)}
index2tag = {idx:tag for idx,tag in enumerate(tags.names)}
bert_config = AutoConfig.from_pretrained(bert_path,num_labels=tags.num_classes,id2label=index2tag,
                                        label2id = tag2index)

In [115]:
bert_config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [116]:
bert_model = (BertForTokenClassification.from_pretrained(bert_path,config=bert_config).to(device))

In [117]:
bert_model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

### helper functions

In [143]:
def tag_text(text,tags,model,tokenizer):
    tokens = tokenizer(text).tokens()
    input_ids = bert_tokenizer(text,return_tensors="pt").input_ids.to(device)
    out = model(input_ids)[0]
    predictions = torch.argmax(out,dim=2)
    pred = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens,pred],index=['Token','Label']),tokens,pred

In [119]:
def tokenize_align(batch):
    tokenized_inputs = bert_tokenizer(batch['tokens'],truncation=True,is_split_into_words=True)
    labels = []
    for idx,label in enumerate(batch['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        label_ids = []
        prev_word = None
        for word_id in word_ids:
            if word_id is None or word_id == prev_word:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_id])
            prev_word = word_id
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [120]:
def align_pred(predictions,labels):
    preds = np.argmax(predictions,axis=2)
    batch_size,seq_len = preds.shape
    pred_list,label_list = [],[]
    for batch_idx in range(batch_size):
        ex_pred,ex_label = [],[]
        for seq_idx in range(seq_len):
            if labels[batch_idx,seq_idx] != -100:
                ex_pred.append(index2tag[preds[batch_idx][seq_idx]])
                ex_label.append(index2tag[labels[batch_idx][seq_idx]])
        pred_list.append(ex_pred)
        label_list.append(ex_label)
    return pred_list,label_list

In [121]:
#tokenize
en_ner_enc = en_ner.map(tokenize_align,batched=True,remove_columns=['tokens','ner_tags','id'])

In [122]:
ids = en_ner_enc['train'][10]['input_ids']
ids = bert_tokenizer.convert_ids_to_tokens(ids)
lbls = en_ner_enc['train'][10]['labels']

In [123]:
#the sub parts of words are assigned -100 label ##acio,##ch etc
pd.DataFrame([ids,lbls],index=['token','label'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
token,[CLS],spanish,farm,minister,loyola,de,pal,##acio,had,earlier,...,##ified,alarm,through,"""",dangerous,general,##isation,.,"""",[SEP]
label,-100,7,0,0,1,2,2,-100,0,0,...,-100,0,0,0,0,0,-100,0,0,-100


### it assigns -100 label to parts of the same word 

In [124]:
#example
text = "this is a sample text to test transformer tokenizer"
ids = bert_tokenizer(text)['input_ids']
bert_tokenizer.convert_ids_to_tokens(ids)

['[CLS]',
 'this',
 'is',
 'a',
 'sample',
 'text',
 'to',
 'test',
 'transform',
 '##er',
 'token',
 '##izer',
 '[SEP]']

here word transformer is broken into 2 words transform , ##er we want judge the model only based on prediction on transform as the prediction on the word transformer thus we assign label -100 to ##er part and its ignored by crossentropy loss

### compute metrics

In [125]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=bert_tokenizer)

In [126]:
def compute_metrics(pred):
    y_pred,y_true = align_pred(pred.predictions,pred.label_ids)
    f1 = f1_score(y_true,y_pred)
    return {'f1':f1}

In [127]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=bert_tokenizer)

### training args

In [128]:
from transformers import TrainingArguments
model_name = 'BERT-NER-CoNLL'
learning_rate = 2e-5
num_train_epochs = 3
weight_decay = 0.01
batch_size = 16
logging_steps = len(en_ner_enc['train'])//batch_size
training_args = TrainingArguments(output_dir=model_name,
                                 per_device_eval_batch_size=batch_size,
                                 per_device_train_batch_size=batch_size,
                                 learning_rate=learning_rate,
                                 num_train_epochs=num_train_epochs,
                                 weight_decay=weight_decay,
                                 logging_steps=logging_steps,
                                 eval_strategy="epoch",
                                 log_level="error",
                                 save_steps=1e6,
                                 disable_tqdm=False,
                                 push_to_hub=True,
                                 report_to='wandb')

### Trainer

In [129]:
from transformers import Trainer
trainer = Trainer(model=bert_model,
                 compute_metrics=compute_metrics,
                 args=training_args,
                 train_dataset=en_ner_enc['train'],
                 eval_dataset=en_ner_enc['test'],
                 data_collator=data_collator,
                 processing_class=bert_tokenizer)

In [130]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.1298,0.109788,0.884936
2,0.0355,0.113901,0.901231
3,0.0202,0.119875,0.903735


TrainOutput(global_step=2634, training_loss=0.06179322343356426, metrics={'train_runtime': 285.0233, 'train_samples_per_second': 147.788, 'train_steps_per_second': 9.241, 'total_flos': 1020143109346326.0, 'train_loss': 0.06179322343356426, 'epoch': 3.0})

## BERT LARGE

In [131]:
bert_lg_path = 'bert-large-uncased'
bert_lg_tokenizer = AutoTokenizer.from_pretrained(bert_lg_path)
tag2index = {tag:idx for idx,tag in enumerate(tags.names)}
index2tag = {idx:tag for idx,tag in enumerate(tags.names)}
bert_lg_config = AutoConfig.from_pretrained(bert_lg_path,num_labels=tags.num_classes,id2label=index2tag,
                                        label2id = tag2index)
bert_lg_model = (BertForTokenClassification.from_pretrained(bert_lg_path,config=bert_lg_config).to(device))
data_collator_lg = DataCollatorForTokenClassification(tokenizer=bert_lg_tokenizer)

In [132]:
bert_lg_model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [133]:
trainer_lg = Trainer(model=bert_lg_model,
                 compute_metrics=compute_metrics,
                 args=training_args,
                 train_dataset=en_ner_enc['train'],
                 eval_dataset=en_ner_enc['test'],
                 data_collator=data_collator_lg,
                 processing_class=bert_lg_tokenizer)

In [134]:
trainer_lg.train()

Epoch,Training Loss,Validation Loss,F1
1,0.115,0.100262,0.898263
2,0.0276,0.115735,0.908101
3,0.0128,0.124297,0.910578


TrainOutput(global_step=2634, training_loss=0.05172164177828968, metrics={'train_runtime': 839.9761, 'train_samples_per_second': 50.148, 'train_steps_per_second': 3.136, 'total_flos': 3625672893251094.0, 'train_loss': 0.05172164177828968, 'epoch': 3.0})

In [135]:
trainer.push_to_hub()
trainer_lg.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/Sharpaxis/BERT-NER-CoNLL/commit/01c65f7796e177330ab602e7f9c11acf25e2b7b8', commit_message='End of training', commit_description='', oid='01c65f7796e177330ab602e7f9c11acf25e2b7b8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sharpaxis/BERT-NER-CoNLL', endpoint='https://huggingface.co', repo_type='model', repo_id='Sharpaxis/BERT-NER-CoNLL'), pr_revision=None, pr_num=None)

In [146]:
text = "apple is opening a new branch in Hong-kong"
df,tokens,predictions = tag_text(text,tags,bert_model,bert_tokenizer)

In [145]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Token,[CLS],apple,is,opening,a,new,branch,in,hong,-,kong,[SEP]
Label,O,B-ORG,O,O,O,O,O,O,B-LOC,I-LOC,I-LOC,O


In [149]:
from spacy.tokens import Doc, Span
from spacy.displacy import render
from spacy.vocab import Vocab
vocab = Vocab()
doc = Doc(vocab, words=tokens)
text = "On July 20, 2023, Elon Musk, the CEO of Tesla and SpaceX, announced during a conference in San Francisco that his company, Neuralink, had successfully implanted its first brain-computer interface in a human volunteer, a breakthrough that could revolutionize neurotechnology, while the U.S. Food and Drug Administration (FDA) closely monitored the trials; meanwhile, Microsoft and OpenAI continued their partnership to develop advanced artificial intelligence systems, despite increasing regulatory scrutiny from the European Union and concerns raised by privacy advocates at the Massachusetts Institute of Technology (MIT)"
df,tokens,predictions = tag_text(text,tags,bert_model,bert_tokenizer)
# Create spans for entities
entities = []
start = None
label = None

for i, pred in enumerate(predictions):
    if pred.startswith("B-"):
        if start is not None:  # Close the previous entity if any
            entities.append(Span(doc, start, i, label=label))
        start = i
        label = pred[2:]
    elif pred.startswith("I-"):
        continue
    else:  # "O"
        if start is not None:
            entities.append(Span(doc, start, i, label=label))
            start = None

# Add the last entity if needed
if start is not None:
    entities.append(Span(doc, start, len(tokens), label=label))

# Assign the entities to the Doc
doc.ents = entities

# Visualize with displaCy
render(doc, style="ent",page=True,)