Application of method-2 using the latest resources provided in huggingface transformers library

In [1]:
%autosave 300
%load_ext autoreload
%autoreload 2 
%reload_ext autoreload
%config Completer.use_jedi = False

Autosaving every 300 seconds


In [2]:
import os

os.chdir(
    "/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilot-model-run/code/Users/Soutrik.Chowdhury/abi_genai_bert_ner/"
)
print(os.getcwd())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/copilot-model-run/code/Users/Soutrik.Chowdhury/abi_genai_bert_ner


In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TOKENIZERS_PARALLELISM']="true"

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
import torch
import torch.nn as nn
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [5]:
LEARNING_RATE = 1e-05
EPOCHS = 7
BATCH_SIZE = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN=256
label_all_tokens = False

In [6]:
ner_data = pd.read_csv("data/NER dataset.csv", encoding="latin1")

In [7]:
print(ner_data.shape)
print(ner_data.isnull().sum())

(1048575, 4)
Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64


In [8]:
# highly skewed dataset
print("Number of tags: {}".format(len(ner_data.Tag.unique())))
frequencies = ner_data.Tag.value_counts()
frequencies

Number of tags: 17


Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [9]:
ner_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [10]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('geo', 45058), ('org', 36927), ('per', 34241), ('tim', 26861), ('gpe', 16068), ('art', 699), ('eve', 561), ('nat', 252)]


In [11]:
def data_preparation(df):
    data = df.copy()
    # fill na
    data = data.fillna(method="ffill")
    # group by sentence id and join words
    data["sentence"] = (
        data[["Sentence #", "Word", "Tag"]]
        .groupby(["Sentence #"])["Word"]
        .transform(lambda x: " ".join(x))
    )
    # group by sentence id and join tags
    data["word_labels"] = (
        data[["Sentence #", "Word", "Tag"]]
        .groupby(["Sentence #"])["Tag"]
        .transform(lambda x: ",".join(x))
    )
    # drop duplicates
    data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
    return data

In [12]:
ner_df_clean = data_preparation(ner_data)

  data = data.fillna(method="ffill")


#TODO:
1. Remove any sentences which only have 'O' in its labels , this would add more of a noise to the model

In [13]:
def basic_data_preprocessing(df):
    """
    This function will take the dataframe and return the text and labels list
    """
    all_text_list = df['sentence'].apply(lambda x: x.split()).tolist()
    if "word_labels" in df.columns.tolist():
        all_labels_list = [i.split(',') for i in df["word_labels"].tolist()]
    else:
        all_labels_list = None
    return all_text_list, all_labels_list


def create_label_mapping(all_labels_list):
    """
    This function will take the labels list and return the label mapping
    """
    unique_labels = set()
    for lb in all_labels_list:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
    # creating label mapping for keys
    label_key_map = {v: k for k, v in enumerate(unique_labels)}
    key_label_map = {k: v for k, v in enumerate(unique_labels)}

    return label_key_map, key_label_map

In [14]:
all_text_list, all_labels_list = basic_data_preprocessing(ner_df_clean)
label_key_map, key_label_map = create_label_mapping(all_labels_list)

In [15]:
# all_text_list

In [16]:
print(f"The length of all text list is: {len(all_text_list)}")
print(f"The length of all labels list is: {len(all_labels_list)}")
print(f"The length of label key map is: {len(label_key_map)}")
print(f"The length of key label map is: {len(key_label_map)}")

The length of all text list is: 47610
The length of all labels list is: 47610
The length of label key map is: 17
The length of key label map is: 17


In [17]:
print(f"The label key map is: {label_key_map}")
print(f"The key label map is: {key_label_map}")

The label key map is: {'I-gpe': 0, 'I-eve': 1, 'B-org': 2, 'B-per': 3, 'B-nat': 4, 'B-geo': 5, 'I-tim': 6, 'I-geo': 7, 'B-gpe': 8, 'I-art': 9, 'I-per': 10, 'B-art': 11, 'I-nat': 12, 'O': 13, 'I-org': 14, 'B-eve': 15, 'B-tim': 16}
The key label map is: {0: 'I-gpe', 1: 'I-eve', 2: 'B-org', 3: 'B-per', 4: 'B-nat', 5: 'B-geo', 6: 'I-tim', 7: 'I-geo', 8: 'B-gpe', 9: 'I-art', 10: 'I-per', 11: 'B-art', 12: 'I-nat', 13: 'O', 14: 'I-org', 15: 'B-eve', 16: 'B-tim'}


In [18]:
idx = 104
sample_text = all_text_list[idx]
sample_labels = all_labels_list[idx]

print(f"The sample text is: {sample_text}")
print(f"The sample labels are: {sample_labels}")

The sample text is: ['Elsewhere', 'in', 'the', 'northwest', ',', 'authorities', 'on', 'Saturday', 'found', 'the', 'bodies', 'of', 'six', 'people', 'who', 'had', 'been', 'shot', 'dead', 'in', 'the', 'Kurram', 'region', 'along', 'the', 'Afghan', 'border', '.']
The sample labels are: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'B-gpe', 'O', 'O']


In [19]:
# samplin the text for faster training
ner_data_sample = ner_df_clean.sample(frac=1.0, random_state=42).reset_index(drop=True)
print(ner_data_sample.shape)

(47610, 2)


In [20]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', force_download=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

In [21]:
# encoding = tokenizer(
#     sample_text,
#     return_offsets_mapping=True,
#     padding="max_length",
#     truncation=True,
#     max_length=MAX_LEN,
# )

In [22]:
def align_label_example(tokenized_input, labels, label_key_map, label_all_tokens=True):
    """
    Align the labels to the tokenized inputs. This can be used for NER or token classification tasks.
    :param tokenized_input: Tokenized input from the tokenizer
    :param labels: Labels to align
    :param label_key_map: Mapping between the labels and the label ids
    :param label_all_tokens: If True, all tokens are given a label. If False, only the first token of a word is given a label.

    """
    # print(f"label_key_map: {label_key_map}")
    word_ids = (
        tokenized_input.word_ids()
    )  # Return a list mapping the tokens to their actual word in the initial sentence
    labels_ids = []  # list of labels for each token
    previous_word_idx = None  # keep track of the previous word index

    for word_idx in word_ids:
        if word_idx is None:
            # print(f"Word index is None: {word_idx}")
            labels_ids.append(-100)

        elif word_idx != previous_word_idx:
            # print("current word index is not equal to previous word index")
            try:
                labels_ids.append(label_key_map[labels[word_idx]])
            except:
                labels_ids.append(-100)

        else:
            try:
                labels_ids.append(
                    label_key_map[labels[word_idx]] if label_all_tokens else -100
                )
            except:
                labels_ids.append(-100)

        # set the previous word index
        previous_word_idx = word_idx

    return labels_ids

In [23]:
class NerDataset(Dataset):
    def __init__(self, dataset, label_key_map, label_all_tokens, tokenizer, max_len):
        super(NerDataset, self).__init__()
        self.dataset = dataset
        self.all_text_list, self.all_labels_list = basic_data_preprocessing(dataset)

        self.label_key_map = label_key_map
        self.label_all_tokens = label_all_tokens
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        text = self.all_text_list[index]
        labels = self.all_labels_list[index]

        # print(f"Text: {text}")
        # print(f"Labels: {labels}")
        text = self.tokenizer(
            text,
            return_offsets_mapping=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            is_split_into_words=True,
        )

        labels_ids = align_label_example(
            text, labels, self.label_key_map, self.label_all_tokens
        )

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in text.items()}
        item["labels"] = torch.as_tensor(labels_ids)

        return item

In [24]:
train_size = 0.8
train_dataset = ner_data_sample.sample(frac=train_size, random_state=200)
test_dataset = ner_data_sample.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(ner_data_sample.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (47610, 2)
TRAIN Dataset: (38088, 2)
TEST Dataset: (9522, 2)


In [25]:
train_dataset_torch = NerDataset(train_dataset, label_key_map, label_all_tokens, tokenizer, MAX_LEN)
test_dataset_torch = NerDataset(test_dataset, label_key_map, label_all_tokens, tokenizer, MAX_LEN)

In [26]:
# next(iter(train_dataset_torch))

In [27]:
for token, label in zip(tokenizer.convert_ids_to_tokens(train_dataset_torch[1]["input_ids"]), train_dataset_torch[1]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
A           13
military    13
spokesman   13
in          13
Baghdad     5
said        13
officials   13
were        13
still       13
gathering   13
details     13
early       13
this        13
afternoon   16
.           13
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]      

In [28]:
# dict(zip(
#     train_dataset.iloc[1]["sentence"].split(),
#     train_dataset.iloc[1]["word_labels"].split(','),
# ))

In [28]:
train_params = {"batch_size": BATCH_SIZE, "shuffle": True, "num_workers": 0}

test_params = {"batch_size": BATCH_SIZE, "shuffle": True, "num_workers": 0}

train_dataloader = DataLoader(train_dataset_torch, **train_params)
testing_dataloader = DataLoader(test_dataset_torch, **test_params)

In [29]:
for data in train_dataloader:
    print(data["input_ids"].shape)
    print(data["labels"].shape)
    print(data["labels"].sum())
    break

torch.Size([8, 256])
torch.Size([8, 256])
tensor(-179685)


In [30]:
class BertNerModel(nn.Module):
    def __init__(self, model_type: str, label_key_map: dict) -> None:
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained(
            pretrained_model_name_or_path=model_type,
            num_labels=len(label_key_map),
            force_download=True,
        )

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=False,
        )
        return outputs

In [31]:
model = BertNerModel("bert-base-cased", label_key_map)
model.to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertNerModel(
  (bert): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=

In [32]:
inputs = train_dataset_torch[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

In [33]:
input_ids.shape, attention_mask.shape, labels.shape

(torch.Size([1, 256]), torch.Size([1, 256]), torch.Size([1, 256]))

In [34]:
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

In [35]:

loss,logits = model(input_ids, attention_mask=attention_mask, labels=labels)
print(loss)
print(logits.shape)

tensor(2.7333, device='cuda:0', grad_fn=<NllLossBackward0>)
torch.Size([1, 256, 17])


**pre training setup**

In [36]:
# Choosing to apply decay based on the layer type excluding bias and LayerNorm weights and include transformer layers

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

In [37]:
# total no of training steps : len(dataset)/batch_size * epochs = len(train_dataloader) * epochs

num_training_steps = len(train_dataloader) * EPOCHS
print(num_training_steps)

33327


In [38]:
optimizer = torch.optim.AdamW(optimizer_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=num_training_steps
)

In [39]:
def training_loop(
    epoch, model, optimizer, scheduler, dataloader, device, label_key_map
):
    """Function to run the training loop for each epoch"""
    tr_loss, tr_accuracy = 0.0, 0.0
    tr_steps = 0
    tr_preds = []
    tr_labels = []

    # put the model in training mode:
    model.train()

    for idx, batch in enumerate(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # zero the gradients
        optimizer.zero_grad()
        # forward pass
        loss, logits = model(input_ids, attention_mask, labels=labels)

        # loss = output.loss
        # logits = output.logits

        tr_loss += loss.item()
        tr_steps += 1  # steps are the number of batches in each epoch

        if idx % 100 == 0:
            loss_step = tr_loss / tr_steps
            print(f"For Epoch: {epoch}, Step: {idx}, Train Loss: {loss_step}")

        # flatten targets and predictions
        flattened_targets = labels.view(
            -1
        )  # from (batch_size, seq_len) to (batch_size*seq_len,)
        active_logits = logits.view(
            -1, len(label_key_map)
        )  # from (batch_size, seq_len, num_labels) to (batch_size*seq_len, num_labels)
        flattened_predictions = torch.argmax(
            active_logits, axis=1
        )  # from (batch_size*seq_len, num_labels) to (batch_size*seq_len,)

        # only consider labels and predictions to store and calc metric on valid ones
        active_accuracy = labels.view(-1) != -100  # shape (batch_size, seq_len)
        labels = torch.masked_select(
            flattened_targets, active_accuracy
        )  # shape (valid_labels,)
        predictions = torch.masked_select(
            flattened_predictions, active_accuracy
        )  # shape (valid_labels,)

        # store predictions and labels
        tr_preds.extend(predictions.cpu().numpy())
        tr_labels.extend(labels.cpu().numpy())

        # calc acc score
        tmp_tr_accuracy = accuracy_score(
            labels.cpu().numpy(), predictions.cpu().numpy()
        )
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        # backpropagation
        loss.backward()
        optimizer.step()
        scheduler.step()

    epoch_loss = tr_loss / tr_steps
    epoch_accuracy = tr_accuracy / tr_steps
    print(
        f"For Epoch: {epoch}, Train Loss: {epoch_loss}, Train Accuracy: {epoch_accuracy}"
    )

In [40]:
def validation_loop(epoch, model, dataloader, device, label_key_map, key_label_map):
    val_loss, val_accuracy = 0.0, 0.0
    val_steps = 0
    val_preds = []
    val_labels = []

    # put the model in evaluation mode:
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            loss, logits = model(input_ids, attention_mask, labels=labels)

            # loss = outputs.loss
            # logits = outputs.logits

            val_loss += loss.item()
            val_steps += 1

            if idx % 100 == 0:
                loss_step = val_loss / val_steps
                print(f"For Epoch: {epoch}, Step: {idx}, Val Loss: {loss_step}")

            # flatten targets and predictions
            flattened_targets = labels.view(
                -1
            )  # from (batch_size, seq_len) to (batch_size*seq_len,)
            active_logits = logits.view(
                -1, len(label_key_map)
            )  # from (batch_size, seq_len, num_labels) to (batch_size*seq_len, num_labels)
            flattened_predictions = torch.argmax(
                active_logits, axis=1
            )  # from (batch_size*seq_len, num_labels) to (batch_size*seq_len,)

            # only consider labels and predictions to store and calc metric on valid ones
            active_accuracy = labels.view(-1) != -100
            labels = torch.masked_select(
                flattened_targets, active_accuracy
            )  # shape (valid_labels,)
            predictions = torch.masked_select(
                flattened_predictions, active_accuracy
            )  # shape (valid_labels,)

            # store predictions and labels
            val_preds.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

            tmp_val_accuracy = accuracy_score(
                labels.cpu().numpy(), predictions.cpu().numpy()
            )
            val_accuracy += tmp_val_accuracy

    # we change the predicted labels to actual labels
    val_labels = [key_label_map[id] for id in val_labels]
    val_preds = [key_label_map[id] for id in val_preds]

    epoch_loss = val_loss / val_steps
    epoch_accuracy = val_accuracy / val_steps

    print(f"For Epoch: {epoch}, Val Loss: {epoch_loss}, Val Accuracy: {epoch_accuracy}")

    return val_labels, val_preds

**Training**

In [41]:
import gc

In [42]:
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()


clear_gpu_memory()

In [43]:
for epoch in range(EPOCHS):
    clear_gpu_memory()
    print(f"Epoch: {epoch}")
    print("Training Loop")
    training_loop(
        epoch, model, optimizer, scheduler, train_dataloader, device, label_key_map
    )
    print("Validation Loop")
    val_labels, val_preds = validation_loop(
        epoch, model, testing_dataloader, device, label_key_map, key_label_map
    )

Epoch: 0
Training Loop
For Epoch: 0, Step: 0, Train Loss: 2.6930460929870605
For Epoch: 0, Step: 100, Train Loss: 1.5567603801736738
For Epoch: 0, Step: 200, Train Loss: 0.9926022068778081
For Epoch: 0, Step: 300, Train Loss: 0.7412745997161169
For Epoch: 0, Step: 400, Train Loss: 0.6049271616666692
For Epoch: 0, Step: 500, Train Loss: 0.5170512413327208
For Epoch: 0, Step: 600, Train Loss: 0.4568215930980077
For Epoch: 0, Step: 700, Train Loss: 0.4117908514850107
For Epoch: 0, Step: 800, Train Loss: 0.3777573706556609
For Epoch: 0, Step: 900, Train Loss: 0.3521165180440732
For Epoch: 0, Step: 1000, Train Loss: 0.3317637346456056
For Epoch: 0, Step: 1100, Train Loss: 0.3146741366061254
For Epoch: 0, Step: 1200, Train Loss: 0.2996070775878829
For Epoch: 0, Step: 1300, Train Loss: 0.2859234845260019
For Epoch: 0, Step: 1400, Train Loss: 0.273812501897356
For Epoch: 0, Step: 1500, Train Loss: 0.26412529320946876
For Epoch: 0, Step: 1600, Train Loss: 0.2551281112976265
For Epoch: 0, Step: 

KeyboardInterrupt: 

In [44]:
print(model.state_dict().get("bert.bert.embeddings.word_embeddings.weight").sum())
print(model.state_dict().get("bert.bert.embeddings.position_embeddings.weight").sum())

tensor(-308370.5000, device='cuda:0')
tensor(1.7184, device='cuda:0')


In [45]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [46]:

conf_matrix = confusion_matrix(val_labels, val_preds)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[    11      0     10      6      0     11      6      1      0      0
       1      0      0      1      0      0     16]
 [     0     14      1      2      0     12      2      1      0      0
       0      0      0      3      0      1     18]
 [     1      0   6958     32      1    205     87      3      0      0
      57      0      0     27     39      6    151]
 [     1      0    114   2926      0     22      3      0      0      0
       7      0      0      1      0      0      7]
 [     0      0      2      0     15      3      3      0      0      0
       0      0      0      0      0      1     17]
 [     3      2    484     25      2   2897    156      8      0      0
       3      0      0     47     73      1    240]
 [     1      0    113      0      1    110   2908      0      0      0
       3      0      0     41    139      1     71]
 [     0      0     57      1      0     16      2   3600      0      0
       2      0      0      1      2     

In [47]:
precision = precision_score(val_labels, val_preds, average="weighted")
recall = recall_score(val_labels, val_preds, average="weighted")
f1 = f1_score(val_labels, val_preds, average="weighted")

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 0.9732779266095999
Recall: 0.9735539709531907
F1-Score: 0.9731784761744673


In [48]:
report = classification_report(val_labels, val_preds)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

       B-art       0.46      0.17      0.25        63
       B-eve       0.61      0.26      0.36        54
       B-geo       0.87      0.92      0.89      7567
       B-gpe       0.97      0.95      0.96      3081
       B-nat       0.79      0.37      0.50        41
       B-org       0.82      0.74      0.77      3941
       B-per       0.86      0.86      0.86      3388
       B-tim       0.92      0.90      0.91      4014
       I-art       0.00      0.00      0.00        34
       I-eve       0.64      0.15      0.24        48
       I-geo       0.81      0.84      0.82      1514
       I-gpe       1.00      0.48      0.65        48
       I-nat       1.00      0.30      0.46        10
       I-org       0.83      0.80      0.81      3286
       I-per       0.84      0.92      0.88      3350
       I-tim       0.84      0.77      0.81      1295
           O       0.99      0.99      0.99    178203

  

In [49]:
sentence = "HuggingFace is a company based in New York, but is also has employees working in Paris"

inputs = tokenizer(
    sentence.split(),
    return_offsets_mapping=True,
    padding="max_length",
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="pt",
    is_split_into_words=True,
)

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

In [50]:
model.eval()
with torch.no_grad():
  outputs = model(ids, attention_mask=mask, labels=None)
  logits = outputs[-1]

In [51]:
active_logits = logits.view(
    -1, len(label_key_map)
)  # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(
    active_logits, axis=1
)  # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [key_label_map[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(
    zip(tokens, token_predictions)
)  # list of tuples. Each tuple = (wordpiece, prediction)

In [52]:
wp_preds

[('[CLS]', 'O'),
 ('Hu', 'B-org'),
 ('##gging', 'O'),
 ('##F', 'O'),
 ('##ace', 'O'),
 ('is', 'O'),
 ('a', 'O'),
 ('company', 'O'),
 ('based', 'O'),
 ('in', 'O'),
 ('New', 'B-geo'),
 ('York', 'I-geo'),
 (',', 'O'),
 ('but', 'O'),
 ('is', 'O'),
 ('also', 'O'),
 ('has', 'O'),
 ('employees', 'O'),
 ('working', 'O'),
 ('in', 'O'),
 ('Paris', 'B-geo'),
 ('[SEP]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'B-geo'),
 ('[PAD]', 'B-geo'),
 ('[PAD]', 'I-geo'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'B-org'),
 ('[PAD]', 'B-geo'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 ('[PAD]', 'O'),
 

In [53]:
prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
    # only predictions on first word pieces are important
    if mapping[0] == 0 and mapping[1] != 0:
        prediction.append(token_pred[1])
    else:
        continue

print(sentence.split())
print(prediction)

['HuggingFace', 'is', 'a', 'company', 'based', 'in', 'New', 'York,', 'but', 'is', 'also', 'has', 'employees', 'working', 'in', 'Paris']
['B-org', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo']


Success in terms of raw process

######################################################--------------------------------------------------------###############################################