<a href="https://www.kaggle.com/code/skshmjn/bert-ner?scriptVersionId=213694117" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install transformers datasets tokenizers seqeval evaluate -q 

In [None]:
import datasets 
import numpy as np 
import torch 
import json
from transformers import pipeline
from evaluate import load
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification , DataCollatorWithPadding
from transformers import AutoModelForTokenClassification 
from transformers import TrainingArguments, Trainer 

In [None]:
import os

# Set the WANDB_DISABLED flag in the process environment
# (presumably to stop the Trainer reporting to Weights & Biases — confirm
# against the transformers documentation).
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Prefer the GPU when torch reports CUDA as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Load the "conll2003" dataset through the `datasets` library.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script execute — confirm this is acceptable for the environment.
conll2003 = datasets.load_dataset("conll2003", trust_remote_code=True)
# Display the loaded dataset object.
conll2003

In [None]:
# Show the first record of the training split.
conll2003["train"][0]

In [None]:
# Tag names for the "ner_tags" feature; the position in this list is the integer tag id
# (it is indexed by tag id further down the notebook).
ner_classes = conll2003["train"].features["ner_tags"].feature.names
ner_classes

In [None]:
# Pick one training record and show its tokens next to the tag names
# obtained by looking each integer tag up in ner_classes.
example = conll2003["train"][345]
example['tokens'] ,[ner_classes[i] for i in example['ner_tags']]

In [None]:
# Fast BERT tokenizer; the function below relies on its `word_ids()` output to align labels.
# NOTE(review): this is an uncased checkpoint, so casing is presumably discarded before
# tagging — confirm that is intended for NER, where capitalisation is often informative.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenise pre-split sentences and spread word-level tags over the tokens.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per
            sentence) and an "ner_tags" entry (one list of integer tags per
            sentence, parallel to the words).
        label_all_tokens: when truthy, every token of a word receives the
            word's tag; otherwise only the first token of each word does and
            the remaining ones receive -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sentence, aligned with the produced tokens.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        sentence_labels = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            if current_word is None:
                # Token that maps to no input word (presumably special tokens
                # such as [CLS]/[SEP]); -100 is filtered out in compute_metrics.
                tag = -100
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of the previous word, labelling disabled.
                tag = -100
            else:
                # First piece of a word, or a continuation piece with
                # label_all_tokens enabled: reuse the word's tag.
                tag = word_tags[current_word]
            sentence_labels.append(tag)
            last_word = current_word
        aligned_labels.append(sentence_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Run the alignment on a one-record slice (a slice keeps the batched, dict-of-lists layout
# that tokenize_and_align_labels indexes into) and print the raw output.
result = tokenize_and_align_labels(conll2003['train'][345:346])
print(result)

In [None]:
# Print each produced token next to its aligned label id; the token is left-aligned and
# padded with underscores to 40 characters.
for token, label in zip(tokenizer.convert_ids_to_tokens(result["input_ids"][0]),result["labels"][0]): 
    print(f"{token:_<40} {label}") 

In [None]:
# Apply the tokenise-and-align step to the whole dataset, in batches.
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)


In [None]:
# Drop the raw word-level columns now that aligned "labels" exist.
# NOTE(review): this rebinds tokenized_datasets, so re-running the cell on its own
# will presumably fail because the columns are already gone — re-run the map cell first.
tokenized_datasets = tokenized_datasets.remove_columns(["tokens","pos_tags", "chunk_tags", "ner_tags"])

In [None]:
# Inspect the first three tokenised training records.
tokenized_datasets['train'][0:3]

In [None]:
# Load the pretrained encoder with a token-classification head.
# The number of labels and the id<->name maps are derived from the dataset's tag list
# instead of a hard-coded 9, so the head always matches ner_classes and the saved
# config carries real tag names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(ner_classes),
    id2label=dict(enumerate(ner_classes)),
    label2id={name: idx for idx, name in enumerate(ner_classes)},
).to(device)

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer) 
# "seqeval" metric loaded through `evaluate`; used by compute_metrics.
metric = load("seqeval") 

In [None]:
def compute_metrics(eval_preds):
    """Turn raw model outputs into seqeval scores.

    Args:
        eval_preds: pair ``(logits, label_ids)``. The logits are reduced with
            an argmax over axis 2; label ids equal to -100 mark positions that
            are excluded from scoring.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's "overall_*" entries.
    """
    logits, gold_ids = eval_preds
    pred_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real label, then map ids to tag names.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        predictions.append([ner_classes[p] for p, _ in kept])
        true_labels.append([ner_classes[g] for _, g in kept])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Hyperparameters handed to the Trainer below.
# "test-ner" is the first positional argument (presumably the output directory — confirm).
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm which spelling the installed version accepts.
args = TrainingArguments( 
"test-ner",
evaluation_strategy = "epoch", 
learning_rate=2e-5, 
per_device_train_batch_size=64, 
per_device_eval_batch_size=64, 
num_train_epochs=5, 
weight_decay=0.01, 
) 

In [None]:
# Wire model, hyperparameters, data splits, collator and metric function together.
# The validation split is the evaluation set during training; the test split is
# only scored afterwards.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out test split.
trainer.evaluate(tokenized_datasets["test"])

In [None]:
# Persist the model to "ner_model" and the tokenizer to a separate "tokenizer" directory.
model.save_pretrained("ner_model")
tokenizer.save_pretrained("tokenizer")

In [None]:
# Maps between label ids and tag names, derived from the dataset's tag list.
# id2label keeps string keys (JSON object keys are strings anyway); label2id
# values are integers, which is the type the model config documents for them.
id2label = {str(i): label for i, label in enumerate(ner_classes)}
label2id = {label: i for i, label in enumerate(ner_classes)}

# Rewrite the saved config so it carries the tag names. Context managers make
# sure both file handles are closed (and the write flushed) before the next
# cell reloads the model from this directory.
with open("ner_model/config.json") as config_file:
    config = json.load(config_file)
config["id2label"] = id2label
config["label2id"] = label2id
with open("ner_model/config.json", "w") as config_file:
    json.dump(config, config_file)

In [None]:
# Reload the saved model from "ner_model" and wrap it, together with the in-memory
# tokenizer, in an "ner" pipeline.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model").to(device)
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

# Free-text sample to tag. Note that `example` previously held a dataset record.
example = "Narendra Damodardas Modi[a] (born 17 September 1950)[b] is an Indian politician who has served as Prime Minister of India since 2014. Modi was the chief minister of Gujarat from 2001 to 2014 and is the member of parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister outside the Indian National Congress."

ner_results = nlp(example)

print(ner_results)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; needed before pushing the model below.
notebook_login()

In [None]:
!sudo apt-get install git-lfs

In [None]:
# Upload the trained model to the Hugging Face Hub via the Trainer (requires the login above).
trainer.push_to_hub()