<a href="https://colab.research.google.com/github/steffiangel/Large-Language-Model-Projects/blob/main/2348510_LLM_LAB5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NAMED ENTITY RECOGNITION USING A FINE-TUNED MODEL**

In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
!pip install transformers



In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import pandas as pd

In [None]:

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# CSV location of the NER corpus inside the Kaggle runtime.
dataset_path = '/kaggle/input/named-entity-recognitionner-dataset/ner.csv'

# Load the whole corpus into a DataFrame; later cells read its
# 'Sentence' and 'Tag' columns.
ner_corpus_data = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Preview the first rows to check the columns the preprocessing relies on
# ('Sentence' and the stringified 'Tag' list).
ner_corpus_data.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [None]:
import pandas as pd
from transformers import BertTokenizerFast
import torch
from sklearn.model_selection import train_test_split

In [None]:

# A *fast* tokenizer is needed here: tokenize_and_align_labels calls
# `.word_ids()` on the encoding to map word pieces back to source words.
# NOTE(review): the model fine-tuned later is 'distilbert-base-uncased' while
# the tokenizer comes from 'bert-base-uncased'; presumably both checkpoints
# share the same uncased WordPiece vocabulary -- confirm.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:

# BIO tag -> integer class id. 'O' is class 0; every entity type then
# contributes a consecutive B-/I- pair, in the order listed below. The
# insertion order matters: the evaluation code uses the keys as report names.
tag_to_id = {
    tag: index
    for index, tag in enumerate(
        ['O'] + [
            f'{prefix}-{entity}'
            for entity in ('geo', 'gpe', 'per', 'org', 'tim', 'art', 'eve', 'nat')
            for prefix in ('B', 'I')
        ]
    )
}

In [None]:
def tokenize_and_align_labels(sentence, labels):
    """Tokenize a whitespace-split sentence and align word-level tags to tokens.

    Args:
        sentence: Raw sentence string; it is split on whitespace into words.
        labels: Sequence of tag strings, one per word of ``sentence``.

    Returns:
        A ``(encoding, aligned)`` pair: the tokenizer output (padded/truncated
        to 128 positions) and a list with one label id per token position.
        Only the first token of each word receives ``tag_to_id[tag]``; every
        other position gets ``-100``: positions with no word id, repeated
        positions of the same word, words beyond ``len(labels)``, and tags
        missing from ``tag_to_id``.
    """
    encoding = tokenizer(
        sentence.split(),
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

    aligned = []
    last_word = None
    for word in encoding.word_ids():
        # A position is labelled only when it opens a new word that has a tag.
        opens_new_word = word is not None and word != last_word
        if opens_new_word and word < len(labels):
            aligned.append(tag_to_id.get(labels[word], -100))
        else:
            aligned.append(-100)
        last_word = word

    return encoding, aligned

In [None]:
# Per-sentence features, accumulated as plain lists and converted to tensors
# in the next cell.
input_ids = []
attention_masks = []
label_ids = []

# Iterate over the two needed columns directly instead of DataFrame.iterrows(),
# which builds a Series per row and is much slower on a corpus of this size.
for sentence, raw_tags in zip(ner_corpus_data['Sentence'], ner_corpus_data['Tag']):
    # The 'Tag' column stores a Python list literal as text. ast.literal_eval
    # parses literals only, so a malformed or malicious cell cannot execute
    # code the way eval() would. The 'POS' column is not used for training and
    # is therefore no longer parsed.
    ner_tags = ast.literal_eval(raw_tags)

    tokenized_inputs, labels = tokenize_and_align_labels(sentence, ner_tags)
    input_ids.append(tokenized_inputs['input_ids'])
    attention_masks.append(tokenized_inputs['attention_mask'])
    label_ids.append(labels)

In [None]:

# Convert the three parallel feature lists into tensors in one pass.
input_ids, attention_masks, label_ids = (
    torch.tensor(column)
    for column in (input_ids, attention_masks, label_ids)
)

In [None]:

# Each dataset item is an (input_ids, attention_mask, label_ids) triple.
dataset = TensorDataset(input_ids, attention_masks, label_ids)

In [None]:

# Fixed seed so the train/validation split and the training shuffle order are
# the same on every Restart & Run All; without it the reported validation
# metrics refer to a different held-out set on each run.
split_seed = 42

# 80 % of the sentences for training, the remainder for validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

batch_size = 16
# A separate generator for the loader keeps the shuffle order independent of
# how many random numbers the split consumed.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(split_seed),
)
# Validation batches are served in dataset order (no shuffling).
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

In [None]:

# Sanity check: show the first encoded sentence next to its aligned label ids.
for caption, tensor in (
    ("Tokenized input example:", input_ids),
    ("Labels example:", label_ids),
):
    print(caption, tensor[0])

Tokenized input example: tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
         1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
         3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,    

In [None]:
from transformers import DistilBertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.metrics import classification_report
from tqdm import tqdm

# One output class per entry of the BIO tag vocabulary.
num_labels = len(tag_to_id)

# Pre-trained DistilBERT encoder with a token-classification head sized to
# `num_labels`.
model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

# The schedule is sized for 3 passes over the training loader; this constant
# must match the number of epochs used by the training loop.
total_steps = 3 * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
# NOTE(review): this criterion object is not referenced by the training loop
# in this notebook -- the loop calls the model with `labels=` and takes the
# loss from `outputs.loss`. Kept as-is; consider removing if nothing else
# depends on it.
loss = torch.nn.CrossEntropyLoss()

In [None]:
# Pick the accelerator if one is present, then move the model's weights there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [None]:

# Fine-tune for `epochs` passes over the training loader. After every pass the
# model is evaluated on the validation loader and a per-tag report is printed.
# Keep `epochs` in sync with the scheduler's total step count, which is
# computed as len(train_dataloader) * 3 in the optimizer/scheduler cell.
epochs = 3
for epoch in range(epochs):
    # ---------------- training pass ----------------
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

    for batch in progress_bar:
        # Batch-local names: the notebook-level `input_ids`, `attention_masks`
        # and `label_ids` hold the full dataset and must not be overwritten,
        # otherwise re-running earlier cells would see a single batch.
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

        optimizer.zero_grad()

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        batch_loss = outputs.loss
        total_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch

        progress_bar.set_postfix(loss=batch_loss.item())

    avg_train_loss = total_loss / len(train_dataloader)

    # ---------------- validation pass ----------------
    model.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

            outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            total_val_loss += outputs.loss.item()

            # Highest-scoring class per token position.
            all_preds.append(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
            all_labels.append(batch_labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Training loss: {avg_train_loss}")
    print(f"Validation loss: {avg_val_loss}")

    all_preds_flat = all_preds.flatten()
    all_labels_flat = all_labels.flatten()

    # Score only positions that carry a real label: -100 marks special tokens,
    # padding and non-initial word pieces.
    mask = all_labels_flat != -100
    all_preds_flat = all_preds_flat[mask]
    all_labels_flat = all_labels_flat[mask]

    # Passing `labels` explicitly keeps the report aligned with `target_names`
    # even when a rare tag is absent from this validation fold (otherwise
    # scikit-learn rejects the mismatching number of classes).
    # `zero_division=0` reports 0.0 for tags that were never predicted without
    # emitting an UndefinedMetricWarning per metric.
    report = classification_report(
        all_labels_flat,
        all_preds_flat,
        labels=list(tag_to_id.values()),
        target_names=list(tag_to_id.keys()),
        zero_division=0,
    )
    print(report)

Epoch 1: 100%|██████████| 2398/2398 [04:04<00:00,  9.81it/s, loss=0.0927]


Epoch 1/3
Training loss: 0.14327682563809802
Validation loss: 0.10581349946868916


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O       0.99      0.99      0.99    178119
       B-geo       0.86      0.89      0.87      7674
       I-geo       0.88      0.64      0.74      1544
       B-gpe       0.95      0.94      0.95      3160
       I-gpe       0.86      0.51      0.64        37
       B-per       0.85      0.84      0.85      3459
       I-per       0.83      0.92      0.87      3437
       B-org       0.77      0.68      0.72      3964
       I-org       0.79      0.64      0.71      3276
       B-tim       0.93      0.88      0.90      3987
       I-tim       0.87      0.72      0.79      1285
       B-art       0.67      0.03      0.05        73
       I-art       0.00      0.00      0.00        53
       B-eve       0.64      0.22      0.33        73
       I-eve       0.42      0.08      0.14        61
       B-nat       0.43      0.07      0.12        42
       I-nat       0.00      0.00      0.00        13

    accuracy              

Epoch 2: 100%|██████████| 2398/2398 [04:03<00:00,  9.83it/s, loss=0.127]  


Epoch 2/3
Training loss: 0.08224792614694937
Validation loss: 0.09914999579700332
              precision    recall  f1-score   support

           O       0.99      0.99      0.99    178119
       B-geo       0.88      0.88      0.88      7674
       I-geo       0.85      0.74      0.79      1544
       B-gpe       0.96      0.94      0.95      3160
       I-gpe       0.90      0.51      0.66        37
       B-per       0.85      0.86      0.85      3459
       I-per       0.84      0.93      0.88      3437
       B-org       0.73      0.74      0.73      3964
       I-org       0.71      0.76      0.73      3276
       B-tim       0.92      0.89      0.91      3987
       I-tim       0.81      0.80      0.80      1285
       B-art       0.48      0.14      0.21        73
       I-art       0.56      0.19      0.28        53
       B-eve       0.54      0.29      0.38        73
       I-eve       0.57      0.20      0.29        61
       B-nat       0.62      0.24      0.34        42

Epoch 3: 100%|██████████| 2398/2398 [04:04<00:00,  9.83it/s, loss=0.0436] 


Epoch 3/3
Training loss: 0.056703043737106204
Validation loss: 0.10346151698225488
              precision    recall  f1-score   support

           O       0.99      0.99      0.99    178119
       B-geo       0.87      0.90      0.89      7674
       I-geo       0.80      0.81      0.80      1544
       B-gpe       0.96      0.94      0.95      3160
       I-gpe       0.81      0.59      0.69        37
       B-per       0.86      0.86      0.86      3459
       I-per       0.86      0.91      0.88      3437
       B-org       0.78      0.72      0.75      3964
       I-org       0.77      0.73      0.75      3276
       B-tim       0.92      0.90      0.91      3987
       I-tim       0.85      0.77      0.81      1285
       B-art       0.39      0.21      0.27        73
       I-art       0.48      0.19      0.27        53
       B-eve       0.52      0.38      0.44        73
       I-eve       0.47      0.30      0.36        61
       B-nat       0.56      0.24      0.33        4

In [None]:

# Write the fine-tuned weights and the tokenizer files into one directory so
# the checkpoint can be reloaded from a single path.
save_dir = '/kaggle/working/fine_tuned_distilbert_ner'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/kaggle/working/fine_tuned_distilbert_ner/tokenizer_config.json',
 '/kaggle/working/fine_tuned_distilbert_ner/special_tokens_map.json',
 '/kaggle/working/fine_tuned_distilbert_ner/vocab.txt',
 '/kaggle/working/fine_tuned_distilbert_ner/added_tokens.json',
 '/kaggle/working/fine_tuned_distilbert_ner/tokenizer.json')

## Predictions


In [None]:
from transformers import DistilBertForTokenClassification, BertTokenizer
import torch

# Reload the tokenizer files and the fine-tuned weights from the directory the
# save cell wrote to.
model_path = '/kaggle/working/fine_tuned_distilbert_ner'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = DistilBertForTokenClassification.from_pretrained(model_path)

# Class id -> BIO tag; the list position is the class id, mirroring the
# tag_to_id mapping used for training.
id_to_tag = dict(enumerate([
    'O',
    'B-geo', 'I-geo',
    'B-gpe', 'I-gpe',
    'B-per', 'I-per',
    'B-org', 'I-org',
    'B-tim', 'I-tim',
    'B-art', 'I-art',
    'B-eve', 'I-eve',
    'B-nat', 'I-nat',
]))

# Tokenize and predict named entities in a sentence
def predict_named_entities(sentence):
    """Tag the whitespace-separated words of ``sentence`` with the fine-tuned model.

    During training only the first word piece of every word carried a label;
    special tokens, padding and continuation pieces were set to ``-100`` (see
    ``tokenize_and_align_labels``). Predictions at those positions are
    therefore untrained, so this function reads the prediction at the first
    word piece of each word only and reports the whole word, not a fragment.

    Args:
        sentence: Raw input text; it is split on whitespace into words.

    Returns:
        A list of ``(word, tag)`` tuples, in sentence order, for every word
        whose predicted tag is not ``'O'``. Empty or whitespace-only input
        yields an empty list. Words cut off by the 128-token limit are not
        reported.
    """
    words = sentence.split()
    if not words:
        return []

    inputs = tokenizer(words, is_split_into_words=True, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Predicted class id for every position of the single sequence in the batch.
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Non-padding positions are laid out as [CLS] piece ... piece [SEP], so the
    # attention-mask sum locates [SEP] and bounds the word-piece positions.
    sep_position = int(attention_mask[0].sum().item()) - 1

    named_entities = []
    position = 1  # position 0 holds [CLS]
    for word in words:
        if position >= sep_position:
            break  # the remaining words were truncated away
        # Each word is tokenized independently of its neighbours, so its piece
        # count tells where the next word starts.
        piece_count = len(tokenizer.tokenize(word))
        if piece_count == 0:
            continue  # the word produced no tokens, nothing to read
        tag = id_to_tag[predictions[position]]
        if tag != 'O':
            named_entities.append((word, tag))
        position += piece_count

    return named_entities

# Smoke test of the prediction helper on a hand-written example sentence.
sentence = (
    "Barack Obama was born in Hawaii and became "
    "the 44th President of the United States."
)
named_entities = predict_named_entities(sentence)

print("Named Entities:", named_entities)


Named Entities: [('barack', 'B-per'), ('obama', 'I-per'), ('hawaii', 'B-geo'), ('44th', 'B-tim'), ('united', 'B-geo'), ('states', 'I-geo')]
