Direct Application of Ner with bert as explained in the blog
- https://towardsdatascience.com/named-entity-recognition-with-bert-in-pytorch-a454405e0b6a
- https://github.com/marcellusruben/medium-resources/blob/main/NER_BERT/NER_with_BERT.ipynb

In [1]:
%autosave 300
%load_ext autoreload
%autoreload 2 
%reload_ext autoreload
%config Completer.use_jedi = False

Autosaving every 300 seconds


In [2]:
import os

os.chdir(
    "/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilot-model-run/code/Users/Soutrik.Chowdhury/abi_genai_bert_ner/"
)
print(os.getcwd())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilot-model-run/code/Users/Soutrik.Chowdhury/abi_genai_bert_ner


In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TOKENIZERS_PARALLELISM']="true"

In [4]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gc

In [5]:
from transformers import BertTokenizer, BertTokenizerFast, BertForTokenClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [6]:
LEARNING_RATE = 1e-05
EPOCHS = 7
BATCH_SIZE = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
MAX_LEN=128

In [7]:
ner_df = pd.read_csv("data/ner.csv")
ner_df_sample = ner_df.sample(frac=0.5).reset_index(drop=True)

In [8]:
print(ner_df_sample.head())
ner_df_sample.shape

                                                                                                                                                                                                                                           text  \
0                          Asia-Pacific Economic Cooperation leaders , meeting in Singapore , have called for more cooperation on global economic recovery efforts , and have warned against withdrawing economic stimulus measures too early .   
1                                                                                                                                                             The United States has offered a $ 25-million reward for the capture of each man .   
2                                                                                                                        Muhammad Arif was handed the death sentence Monday for his role in the December 2000 attack that killed three people .   
3                           

(23980, 2)

In total, there are 9 entity categories with B/I varitaions, which are:

* geo for geographical entity
* org for organization entity
* per for person entity
* gpe for geopolitical entity
* tim for time indicator entity
* art for artifact entity
* eve for event entity
* nat for natural phenomenon entity
* O is assigned if a word doesn’t belong to any entity.

**Basic Data Preprocessing**

In [9]:
def basic_data_preprocessing(df):
    """
    This function will take the dataframe and return the text and labels list
    """
    all_text_list = df["text"].tolist()
    if "labels" in df.columns.tolist():
        all_labels_list = [i.split() for i in df["labels"].tolist()]
    else:
        all_labels_list = None
    return all_text_list, all_labels_list


def create_label_mapping(all_labels_list):
    """
    This function will take the labels list and return the label mapping
    """
    unique_labels = set()
    for lb in all_labels_list:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
    # creating label mapping for keys
    label_key_map = {v: k for k, v in enumerate(unique_labels)}
    key_label_map = {k: v for k, v in enumerate(unique_labels)}

    return label_key_map, key_label_map

In [10]:
all_text_list, all_labels_list = basic_data_preprocessing(ner_df_sample)
label_key_map, key_label_map = create_label_mapping(all_labels_list)

In [11]:
print(f"The length of all text list is: {len(all_text_list)}")
print(f"The length of all labels list is: {len(all_labels_list)}")
print(f"The length of label key map is: {len(label_key_map)}")
print(f"The length of key label map is: {len(key_label_map)}")
print(f"The label key map is: {label_key_map}")
print(f"The key label map is: {key_label_map}")

The length of all text list is: 23980
The length of all labels list is: 23980
The length of label key map is: 17
The length of key label map is: 17
The label key map is: {'I-org': 0, 'I-gpe': 1, 'B-art': 2, 'I-per': 3, 'B-gpe': 4, 'B-geo': 5, 'B-org': 6, 'I-geo': 7, 'B-eve': 8, 'I-eve': 9, 'O': 10, 'B-nat': 11, 'I-art': 12, 'I-nat': 13, 'B-tim': 14, 'I-tim': 15, 'B-per': 16}
The key label map is: {0: 'I-org', 1: 'I-gpe', 2: 'B-art', 3: 'I-per', 4: 'B-gpe', 5: 'B-geo', 6: 'B-org', 7: 'I-geo', 8: 'B-eve', 9: 'I-eve', 10: 'O', 11: 'B-nat', 12: 'I-art', 13: 'I-nat', 14: 'B-tim', 15: 'I-tim', 16: 'B-per'}


**Label Alignment**

In [12]:
def align_label_example(tokenized_input, labels, label_key_map, label_all_tokens=True):
    """
    Align the labels to the tokenized inputs. This can be used for NER or token classification tasks.
    :param tokenized_input: Tokenized input from the tokenizer
    :param labels: Labels to align
    :param label_key_map: Mapping between the labels and the label ids
    :param label_all_tokens: If True, all tokens are given a label. If False, only the first token of a word is given a label.

    """
    # print(f"label_key_map: {label_key_map}")
    word_ids = (
        tokenized_input.word_ids()
    )  # Return a list mapping the tokens to their actual word in the initial sentence
    labels_ids = []  # list of labels for each token
    previous_word_idx = None  # keep track of the previous word index

    for word_idx in word_ids:
        if word_idx is None:
            # print(f"Word index is None: {word_idx}")
            labels_ids.append(-100)

        elif word_idx != previous_word_idx:
            # print("current word index is not equal to previous word index")
            try:
                labels_ids.append(label_key_map[labels[word_idx]])
            except:
                labels_ids.append(-100)

        else:
            try:
                labels_ids.append(
                    label_key_map[labels[word_idx]] if label_all_tokens else -100
                )
            except:
                labels_ids.append(-100)

        # set the previous word index
        previous_word_idx = word_idx

    return labels_ids

In [13]:
# test the align label example function

tokenizer_1 = BertTokenizerFast.from_pretrained("bert-base-cased")
sample_text = all_text_list[171]
sample_labels = all_labels_list[171]
print(sample_text)
print(sample_labels)

Last month , farmers , miners and labor groups held huge protests demanding nationalization of Bolivia 's oil industry and new elections .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [14]:
op1 = tokenizer_1.encode_plus(
    sample_text,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    return_token_type_ids=True,
    return_attention_mask=True,
    return_offsets_mapping=True,
    return_special_tokens_mask=True,
    return_overflowing_tokens=True,
    
)
print(op1.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'offset_mapping', 'overflow_to_sample_mapping'])


In [15]:
sub_word_tokens = tokenizer_1.convert_ids_to_tokens(op1["input_ids"][0])
print(sub_word_tokens)
print(len([id for id in sub_word_tokens if id != "[PAD]"]))
print(len(sample_text.split()))

['[CLS]', 'Last', 'month', ',', 'farmers', ',', 'miners', 'and', 'labor', 'groups', 'held', 'huge', 'protests', 'demanding', 'national', '##ization', 'of', 'Bolivia', "'", 's', 'oil', 'industry', 'and', 'new', 'elections', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [16]:
new_labels = align_label_example(
    op1, sample_labels, label_key_map, label_all_tokens=True
)

In [17]:
print(new_labels)
print(sub_word_tokens)

[-100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 6, 10, 10, 10, 10, 10, 10, 10, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100

In [18]:
print([key_label_map[i] if i != -100 else "SPL" for i in new_labels])

['SPL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 'SPL', 

**Custom Dataset**

In [19]:
class NerDataset(Dataset):
    def __init__(
        self,
        dataset,
        label_key_map,
        label_all_tokens,
        tokenizer,
    ):
        super(NerDataset, self).__init__()
        self.dataset = dataset
        self.all_text_list, self.all_labels_list = basic_data_preprocessing(dataset)
        self.tokenizer = tokenizer

        self.all_text_list = [
            self.tokenizer.encode_plus(
                text,
                max_length=MAX_LEN,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            )
            for text in self.all_text_list
        ]

        self.label_key_map = label_key_map
        self.label_all_tokens = label_all_tokens

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        text = self.all_text_list[index]
        labels = self.all_labels_list[index]

        labels = align_label_example(
            text, labels, self.label_key_map, self.label_all_tokens
        )

        return {
            "input": text,
            "labels": torch.tensor(labels, dtype=torch.long),
        }

In [20]:
class NerDatasetNew(Dataset):
    def __init__(
        self,
        dataset,
        label_key_map,
        label_all_tokens,
        tokenizer,
    ):
        super(NerDatasetNew, self).__init__()
        self.dataset = dataset
        self.all_text_list, self.all_labels_list = basic_data_preprocessing(dataset)
        self.tokenizer = tokenizer
        self.label_key_map = label_key_map
        self.label_all_tokens = label_all_tokens

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        text = self.all_text_list[index]
        labels = self.all_labels_list[index]

        tokenized_input = self.tokenizer.encode_plus(
            text,
            max_length=MAX_LEN,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        labels = align_label_example(
            tokenized_input, labels, self.label_key_map, self.label_all_tokens
        )

        return {
            "input": tokenized_input,
            "labels": torch.tensor(labels, dtype=torch.long),
        }

In [21]:
bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased", force_download=True)
label_all_tokens = True

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

In [22]:
train_df, test_df = train_test_split(ner_df_sample, test_size=0.2, random_state=42)

print(train_df.shape, test_df.shape)

(19184, 2) (4796, 2)


In [23]:
# final tetsing purposes
real_df = ner_df.tail(1).reset_index(drop=True)
test_text = real_df["text"].tolist()[0]
print(test_text)

The United Nations is praising the use of military helicopters to drop food and rescue survivors in tsunami-ravaged Indonesia , saying the aircraft are " worth their weight in gold . "


In [24]:
train_dataset = NerDataset(train_df, label_key_map, label_all_tokens, bert_tokenizer)
test_dataset = NerDataset(test_df, label_key_map, label_all_tokens, bert_tokenizer)
# real_dataset = NerDataset(real_df, label_key_map, label_all_tokens, bert_tokenizer)

In [25]:
# train_dataset_new = NerDatasetNew(train_df, label_key_map,
#                                     label_all_tokens, bert_tokenizer)
# test_dataset_new = NerDatasetNew(test_df, label_key_map,
#                                     label_all_tokens, bert_tokenizer)

In [26]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers = 4

)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers = 4
)

# real_dataloader = DataLoader(
#     real_dataset,
#     batch_size=1,
#     shuffle=False,
#     num_workers = 4
# )


In [27]:
print(len(train_dataloader), len(test_dataloader)) # batches

2398 600


In [28]:
# train_dataloader_new = DataLoader(
#     train_dataset_new,
#     batch_size=BATCH_SIZE,
#     shuffle=True,
#     num_workers = 4
# )
# test_dataloader_new = DataLoader( 
#     test_dataset_new,
#     batch_size=BATCH_SIZE,
#     shuffle=False,
#     num_workers = 4
# )

In [29]:
for data in train_dataloader:
    print(data["input"]["input_ids"].shape)
    print(data["labels"].shape)
    print(data["labels"].sum())
    break

torch.Size([8, 1, 128])
torch.Size([8, 128])
tensor(-83347)


In [30]:
# for data in test_dataloader_new:
#     print(data["input"]["input_ids"].shape)
#     print(data["labels"].shape)
#     print(data["labels"].sum())
#     break

both the dataset produce the same output , only diff is in the first way all text are pre tokenized before indexing for dataloader and in the second way the text is tokenized in the dataloader itself

**Model Definition**

In [31]:
class BertNerModel(nn.Module):
    def __init__(self, model_type: str, label_key_map: dict) -> None:
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained(
            pretrained_model_name_or_path=model_type,
            num_labels=len(label_key_map),
            force_download=True,
        )

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=False,
        )
        return outputs

In [32]:
model = BertNerModel("bert-base-cased", label_key_map)
model.to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertNerModel(
  (bert): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=

In [33]:
model.state_dict().keys()

odict_keys(['bert.bert.embeddings.word_embeddings.weight', 'bert.bert.embeddings.position_embeddings.weight', 'bert.bert.embeddings.token_type_embeddings.weight', 'bert.bert.embeddings.LayerNorm.weight', 'bert.bert.embeddings.LayerNorm.bias', 'bert.bert.encoder.layer.0.attention.self.query.weight', 'bert.bert.encoder.layer.0.attention.self.query.bias', 'bert.bert.encoder.layer.0.attention.self.key.weight', 'bert.bert.encoder.layer.0.attention.self.key.bias', 'bert.bert.encoder.layer.0.attention.self.value.weight', 'bert.bert.encoder.layer.0.attention.self.value.bias', 'bert.bert.encoder.layer.0.attention.output.dense.weight', 'bert.bert.encoder.layer.0.attention.output.dense.bias', 'bert.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.bert.encoder.layer.0.intermediate.dense.weight', 'bert.bert.encoder.layer.0.intermediate.dense.bias', 'bert.bert.encoder.layer.0.output.dense.weight', 'bert.bert.encoder.layer.0.o

In [34]:
print(model.state_dict().get("bert.bert.embeddings.word_embeddings.weight").sum())
print(model.state_dict().get("bert.bert.embeddings.position_embeddings.weight").sum())

tensor(-308353.8750, device='cuda:0')
tensor(1.7204, device='cuda:0')


In [35]:
# input_ids = data["input"]["input_ids"].to(device)
# attention_mask = data["input"]["attention_mask"].to(device)
# labels = data["labels"].to(device)

# print(input_ids.shape, attention_mask.shape, labels.shape)
# input_ids = input_ids.squeeze(1)
# attention_mask = attention_mask.squeeze(1)
# print(input_ids.shape, attention_mask.shape)
# model.train()
# optimizer.zero_grad()

# outputs = model(input_ids, attention_mask, labels=labels)
# outputs

**pre training setup**

In [36]:
# Choosing to apply decay based on the layer type excluding bias and LayerNorm weights and include transformer layers

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

In [37]:
# total no of training steps : len(dataset)/batch_size * epochs = len(train_dataloader) * epochs

num_training_steps = len(train_dataloader) * EPOCHS
print(num_training_steps)

16786


In [38]:
optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=num_training_steps
)



In [39]:
def training_loop(
    epoch, model, optimizer, scheduler, dataloader, device, label_key_map
):
    """Function to run the training loop for each epoch"""
    tr_loss, tr_accuracy = 0.0, 0.0
    tr_examples, tr_steps = 0, 0
    tr_preds = []
    tr_labels = []

    # put the model in training mode:
    model.train()

    for idx, batch in enumerate(dataloader):
        input_ids = batch["input"]["input_ids"].to(device)
        attention_mask = batch["input"]["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # squeeze the input_ids and attention_mask
        input_ids = input_ids.squeeze(1)
        attention_mask = attention_mask.squeeze(1)

        # zero the gradients
        optimizer.zero_grad()
        # forward pass
        loss, logits = model(input_ids, attention_mask, labels=labels)

        # loss = output.loss
        # logits = output.logits

        tr_loss += loss.item()
        tr_steps += 1  # steps are the number of batches in each epoch
        tr_examples += labels.size(0)

        if idx % 100 == 0:
            loss_step = tr_loss / tr_examples
            print(f"For Epoch: {epoch}, Step: {idx}, Train Loss: {loss_step}")

        # flatten targets and predictions
        flattened_targets = labels.view(
            -1
        )  # from (batch_size, seq_len) to (batch_size*seq_len,)
        active_logits = logits.view(
            -1, len(label_key_map)
        )  # from (batch_size, seq_len, num_labels) to (batch_size*seq_len, num_labels)
        flattened_predictions = torch.argmax(
            active_logits, axis=1
        )  # from (batch_size*seq_len, num_labels) to (batch_size*seq_len,)

        # only consider labels and predictions to store and calc metric on valid ones
        active_accuracy = labels.view(-1) != -100  # shape (batch_size, seq_len)
        labels = torch.masked_select(
            flattened_targets, active_accuracy
        )  # shape (valid_labels,)
        predictions = torch.masked_select(
            flattened_predictions, active_accuracy
        )  # shape (valid_labels,)

        # store predictions and labels
        tr_preds.extend(predictions.cpu().numpy())
        tr_labels.extend(labels.cpu().numpy())

        # calc acc score
        tmp_tr_accuracy = accuracy_score(
            labels.cpu().numpy(), predictions.cpu().numpy()
        )
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # backpropagation
        loss.backward()
        optimizer.step()
        scheduler.step()

    epoch_loss = tr_loss / tr_examples
    epoch_accuracy = tr_accuracy / tr_steps
    print(
        f"For Epoch: {epoch}, Train Loss: {epoch_loss}, Train Accuracy: {epoch_accuracy}"
    )

In [40]:
def validation_loop(epoch, model, dataloader, device, label_key_map, key_label_map):
    val_loss, val_accuracy = 0.0, 0.0
    val_examples, val_steps = 0, 0
    val_preds = []
    val_labels = []

    # put the model in evaluation mode:
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            input_ids = batch["input"]["input_ids"].to(device)
            attention_mask = batch["input"]["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # squeeze the input_ids and attention_mask
            input_ids = input_ids.squeeze(1)
            attention_mask = attention_mask.squeeze(1)

            loss, logits = model(input_ids, attention_mask, labels=labels)

            # loss = outputs.loss
            # logits = outputs.logits

            val_loss += loss.item()
            val_steps += 1
            val_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = val_loss / val_examples
                print(f"For Epoch: {epoch}, Step: {idx}, Val Loss: {loss_step}")

            # flatten targets and predictions
            flattened_targets = labels.view(
                -1
            )  # from (batch_size, seq_len) to (batch_size*seq_len,)
            active_logits = logits.view(
                -1, len(label_key_map)
            )  # from (batch_size, seq_len, num_labels) to (batch_size*seq_len, num_labels)
            flattened_predictions = torch.argmax(
                active_logits, axis=1
            )  # from (batch_size*seq_len, num_labels) to (batch_size*seq_len,)

            # only consider labels and predictions to store and calc metric on valid ones
            active_accuracy = labels.view(-1) != -100
            labels = torch.masked_select(
                flattened_targets, active_accuracy
            )  # shape (valid_labels,)
            predictions = torch.masked_select(
                flattened_predictions, active_accuracy
            )  # shape (valid_labels,)

            # store predictions and labels
            val_preds.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

            tmp_val_accuracy = accuracy_score(
                labels.cpu().numpy(), predictions.cpu().numpy()
            )
            val_accuracy += tmp_val_accuracy

    # we change the predicted labels to actual labels
    val_labels = [key_label_map[id] for id in val_labels]
    val_preds = [key_label_map[id] for id in val_preds]

    epoch_loss = val_loss / val_examples
    epoch_accuracy = val_accuracy / val_steps

    print(f"For Epoch: {epoch}, Val Loss: {epoch_loss}, Val Accuracy: {epoch_accuracy}")

    return val_labels, val_preds

**Training**

In [42]:
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()


clear_gpu_memory()

In [43]:
for epoch in range(EPOCHS):
    clear_gpu_memory()
    print(f"Epoch: {epoch}")
    print("Training Loop")
    training_loop(
        epoch, model, optimizer, scheduler, train_dataloader, device, label_key_map
    )
    print("Validation Loop")
    val_labels, val_preds = validation_loop(
        epoch, model, test_dataloader, device, label_key_map, key_label_map
    )

Epoch: 0
Training Loop
For Epoch: 0, Step: 0, Train Loss: 0.3688517212867737
For Epoch: 0, Step: 100, Train Loss: 0.15192103275273106
For Epoch: 0, Step: 200, Train Loss: 0.13046029964415587
For Epoch: 0, Step: 300, Train Loss: 0.12165459928619506
For Epoch: 0, Step: 400, Train Loss: 0.11800032593031179
For Epoch: 0, Step: 500, Train Loss: 0.11537483553894741
For Epoch: 0, Step: 600, Train Loss: 0.11324406981145879
For Epoch: 0, Step: 700, Train Loss: 0.11240440684658995
For Epoch: 0, Step: 800, Train Loss: 0.11211973428726196
For Epoch: 0, Step: 900, Train Loss: 0.11140384230675893
For Epoch: 0, Step: 1000, Train Loss: 0.1104374039549749
For Epoch: 0, Step: 1100, Train Loss: 0.10990672386661646
For Epoch: 0, Step: 1200, Train Loss: 0.10976820738736438
For Epoch: 0, Step: 1300, Train Loss: 0.10950165763669156
For Epoch: 0, Step: 1400, Train Loss: 0.1094331565332319
For Epoch: 0, Step: 1500, Train Loss: 0.10912767358665304
For Epoch: 0, Step: 1600, Train Loss: 0.10902707379713869
For Ep

In [45]:
print(model.state_dict().get("bert.bert.embeddings.word_embeddings.weight").sum())
print(model.state_dict().get("bert.bert.embeddings.position_embeddings.weight").sum())

tensor(-283527.4375, device='cuda:0')
tensor(2.8671, device='cuda:0')


In [46]:
test_text = "On 11 March 1990 , Lithuania became the first of the Soviet republics to declare its independence , but Moscow did not recognize this proclamation until September of 1991 ( following the abortive coup in Moscow ) ."

In [47]:
def align_word_ids_test(texts, tokenizer, label_all_tokens):

    tokenized_inputs = tokenizer.encode_plus(
        texts,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    word_ids = tokenized_inputs.word_ids()
    sub_word_tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][0])
    sub_word_tokens = [
        i for i in sub_word_tokens if i not in ["[CLS]", "[SEP]", "[PAD]"]
    ]
    # print(len([id for id in word_ids if id != None]))
    # print(f"Word ids: {word_ids}")

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return tokenized_inputs, label_ids, sub_word_tokens

In [48]:
tokenized_inputs, test_label_ids, sub_word_tokens = align_word_ids_test(
    test_text, bert_tokenizer, label_all_tokens
)

In [49]:
mask = tokenized_inputs['attention_mask'].to(device)
input_id = tokenized_inputs['input_ids'].to(device)
label_ids = torch.Tensor(test_label_ids).unsqueeze(0).to(device)

In [50]:
print(input_id.shape, mask.shape, label_ids.shape)

torch.Size([1, 128]) torch.Size([1, 128]) torch.Size([1, 128])


In [51]:
model.eval()
with torch.no_grad():
    logits,  = model(input_id, mask, None)

In [52]:
# flatten targets and predictions
flattened_targets = label_ids.view(
    -1
)  # from (batch_size, seq_len) to (batch_size*seq_len,)
len([id for id in flattened_targets if id != -100])

41

In [53]:

active_logits = logits.view(
    -1, len(label_key_map)
)  # from (batch_size, seq_len, num_labels) to (batch_size*seq_len, num_labels)

active_logits.shape, logits.shape

(torch.Size([128, 17]), torch.Size([1, 128, 17]))

In [54]:

flattened_predictions = torch.argmax(
    active_logits, axis=1
)  # from (batch_size*seq_len, num_labels) to (batch_size*seq_len,)

flattened_predictions

tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10], device='cuda:0')

In [55]:
# only consider labels and predictions to store and calc metric on valid ones
active_accuracy = label_ids.view(-1) != -100
# print(active_accuracy)

labels = torch.masked_select(
    flattened_targets, active_accuracy
)  # shape (valid_labels,)

print(labels.shape)

torch.Size([41])


In [56]:
active_predictions = torch.masked_select(
    flattened_predictions, active_accuracy
)  # shape (valid_labels,)

In [57]:
prediction_label = [key_label_map[i] for i in active_predictions.tolist()]
print(prediction_label)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [58]:
len(prediction_label),len(sub_word_tokens)

(41, 41)

this is problematic , ignore this