In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch import mps
from transformers import BertTokenizer, BertForTokenClassification, BertConfig


device = 'mps' if mps.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/data/ner_data.csv'
data = pd.read_csv(path, encoding = 'unicode_escape')

In [3]:
data.isna().sum()

Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64

In [4]:
data = data.ffill()

In [5]:
#convert iob tags to base tag
data['base_tag'] = data['Tag'].apply(lambda x: x.split('-')[-1])

freqs = data['Tag'].value_counts()
print("IOB tag count")
freqs

IOB tag count


Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [6]:
iob_tags = []
for t, f in zip(freqs.index, freqs):
    iob_tags.append(t)

unique_tag =  []
for tag in iob_tags:
    s = tag.split('-')
    unique_tag.append(s[-1])

unique_tag = list(set(unique_tag))
print(f'Unique base tag: {unique_tag}')

Unique base tag: ['org', 'per', 'O', 'tim', 'gpe', 'nat', 'eve', 'art', 'geo']


In [7]:
# frequency of unique tags
data['base_tag'].value_counts()

base_tag
O      887908
geo     45058
org     36927
per     34241
tim     26861
gpe     16068
art       699
eve       561
nat       252
Name: count, dtype: int64

## Data Processing

In [8]:
# art eve nat is not defined properly, removing them
to_remove = ['art','nat','eve']

data = data[~data.base_tag.isin(to_remove)]

In [9]:
#id lable lookup dict for model

labels = data['Tag'].value_counts().index

label2id = {}
id2label = {}
for idx, label in enumerate(labels):
    label2id[label] = idx
    id2label[idx] = label

In [10]:
# forming sentence from tokens
data['sentence'] = data[['Sentence #','Word', "Tag"]].groupby(['Sentence #'])["Word"].transform(lambda x: ' '.join(x))
data['word_labels'] = data[['Sentence #','Word', "Tag"]].groupby(['Sentence #'])["Tag"].transform(lambda x: ','.join(x))

data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head(5)

Unnamed: 0,sentence,word_labels
0,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
1,Families of soldiers killed in the conflict jo...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-per,O,O,..."
2,They marched from the Houses of Parliament to ...,"O,O,O,O,O,O,O,O,O,O,O,B-geo,I-geo,O"
3,"Police put the number of marchers at 10,000 wh...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,The protest comes on the eve of the annual con...,"O,O,O,O,O,O,O,O,O,O,O,B-geo,O,O,B-org,I-org,O,..."


## Dataset and Dataloader

In [11]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [12]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]  
        word_labels = self.data.word_labels[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [13]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
MODEL_PATH = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

In [14]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (47610, 2)
TRAIN Dataset: (38088, 2)
TEST Dataset: (9522, 2)


In [15]:
# print the first 30 tokens and corresponding labels
idx = 67
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[idx]["ids"][:50]), training_set[idx]["targets"][:50]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

[CLS]       O
and         O
the         O
u           B-geo
.           B-geo
s           B-geo
.           B-geo
military    O
said        O
a           O
roadside    O
bomb        O
blast       O
in          O
east        B-geo
baghdad     I-geo
killed      O
an          O
american    B-gpe
soldier     O
.           O
[SEP]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O
[PAD]       O


In [17]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [18]:
model = BertForTokenClassification.from_pretrained(MODEL_PATH,
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [19]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(2.5228, device='mps:0', grad_fn=<NllLossBackward0>)

In [20]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 11])

In [21]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

## Training and Evaluation

In [22]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_preds.extend(predictions)
        tr_labels.extend(targets)
        
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [23]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.6173088550567627
Training loss per 100 training steps: 0.4547475338867395
Training loss per 100 training steps: 0.29590264963569923
Training loss per 100 training steps: 0.2319516947746673
Training loss per 100 training steps: 0.19598995233674596
Training loss per 100 training steps: 0.17142078681419531
Training loss per 100 training steps: 0.1542610641663426
Training loss per 100 training steps: 0.1406472508660046
Training loss per 100 training steps: 0.12975703217682152
Training loss per 100 training steps: 0.12070289586353365
Training loss per 100 training steps: 0.1130760846958197
Training loss per 100 training steps: 0.10708191913242142
Training loss per 100 training steps: 0.10158915661173026
Training loss per 100 training steps: 0.09728717789351711
Training loss per 100 training steps: 0.09326924405909602
Training loss per 100 training steps: 0.0896497611763039
Training loss per 100 training steps: 0.08659891296715605
Tra

In [24]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(targets)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
    
    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [25]:
labels, predictions = valid(model, testing_loader)


Validation loss per 100 evaluation steps: 0.001311454107053578
Validation loss per 100 evaluation steps: 0.02690354802943337
Validation loss per 100 evaluation steps: 0.026023789825893037
Validation loss per 100 evaluation steps: 0.024191063664700755
Validation loss per 100 evaluation steps: 0.024093230907248627
Validation loss per 100 evaluation steps: 0.025557606428323082
Validation loss per 100 evaluation steps: 0.026847256001378213
Validation loss per 100 evaluation steps: 0.026777546216097747
Validation loss per 100 evaluation steps: 0.027215735167654534
Validation loss per 100 evaluation steps: 0.02690986261961964
Validation loss per 100 evaluation steps: 0.026261705657505754
Validation loss per 100 evaluation steps: 0.026586227521983332
Validation loss per 100 evaluation steps: 0.02620035082363423
Validation loss per 100 evaluation steps: 0.025977186875779927
Validation loss per 100 evaluation steps: 0.025470469970036918
Validation loss per 100 evaluation steps: 0.02512105940720

In [26]:
# save model & tokenizer

path = '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/v_3/'

model.save_pretrained(path+'bert_trained')
tokenizer.save_pretrained(path+'tokenizer')

('/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/v_3/tokenizer/tokenizer_config.json',
 '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/v_3/tokenizer/special_tokens_map.json',
 '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/v_3/tokenizer/vocab.txt',
 '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/v_3/tokenizer/added_tokens.json')

In [27]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         geo       0.86      0.87      0.86     11585
         gpe       0.97      0.91      0.94      3467
         org       0.72      0.69      0.71      6785
         per       0.79      0.80      0.79      5270
         tim       0.84      0.84      0.84      4457

   micro avg       0.83      0.82      0.82     31564
   macro avg       0.83      0.82      0.83     31564
weighted avg       0.83      0.82      0.82     31564



## Inference

In [48]:
sentence = "India has a capital called Mumbai. On wednesday, the president will give a presentation"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to('cpu')
mask = inputs["attention_mask"].to('cpu')
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

india has a capital called mumbai . on wednesday , the president will give a presentation
['B-geo', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [50]:
from transformers import pipeline

pipe = pipeline(task="token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple", device='mps')
pipe("My name is Niels and New York is a city")


[{'entity_group': 'org',
  'score': np.float32(0.5601732),
  'word': 'ni',
  'start': 11,
  'end': 13},
 {'entity_group': 'per',
  'score': np.float32(0.7349655),
  'word': '##els',
  'start': 13,
  'end': 16},
 {'entity_group': 'geo',
  'score': np.float32(0.9530471),
  'word': 'new york',
  'start': 21,
  'end': 29}]

## Visualize

In [45]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import spacy
from spacy.tokens import Doc, Span
from spacy import displacy

# Load your BERT model and tokenizer
model_path = "/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/v_3/bert_trained/"
tokenizer_path = "/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/v_3/tokenizer/"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create a Hugging Face pipeline for NER
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Input text
#text = "John Doe works at Google in Washington. Do you want to go to London? How every today is 1st of December 2024"
#text = "On January 8, 2025, Alice Johnson from Microsoft traveled to Berlin, Germany, for a conference."
#text = "Mark Smith attended the IBM event in Tokyo on July 22, 2023, discussing the latest AI advancements."
#text = "The meeting with Sarah Lee at Apple in London will take place on February 5, 2025."
text = f"""
        The annual summit was held in the breathtaking  Himalayas in Nepal and organized by the United Nations. 
        Distinguished speakers, including Professor Williams, discussed climate change impacts during the 
        session on January 8th, with attendees from Tokyo and New York.
    """
# Run the NER pipeline
entities = ner_pipeline(text)

# Convert the output to SpaCy format
nlp = spacy.blank("en")  # Create a blank SpaCy model
doc = nlp(text)

# Create entities
ents = []
for entity in entities:
    start = entity['start']
    end = entity['end']
    label = entity['entity_group']
    span = doc.char_span(start, end, label=label)
    if span:
        ents.append(span)

# Assign entities to the SpaCy Doc
doc.ents = ents

# Define custom colors for each entity type
colors = {
    "PER": "linear-gradient(90deg, #e3d8fd, #fbd8f5)",  
    "ORG": "linear-gradient(90deg, #fff4cc, #ffe6b3)",  
    "GEO": "linear-gradient(90deg, #ccf2ff, #b3e6ff)",  
    "GPE": "linear-gradient(90deg, #ccffee, #b3ffdd)",  
    "TIM": "linear-gradient(90deg, #ffcccc, #ffb3b3)",  
    "O": "linear-gradient(90deg, #f2f2f2, #d9d9d9)"     
}


# Set the options for visualization
options = {"ents": list(colors.keys()), "colors": colors}

# Visualize the entities with SpaCy
displacy.render(doc, style="ent", jupyter=True, options=options)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


![Output Image](/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/output.png)