In [3]:
import torch
import nltk
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt 

import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
#from keras.preprocessing.sequence import pad_sequences

## Load Dataset

In [4]:
def coarse_tag(tag):
  LOC = ['Facility', 'OtherLOC', 'HumanSettlement', 'Station']
  CW = ['VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software']
  GRP = ['MusicalGRP', 'PublicCORP', 'PrivateCORP', 'AerospaceManufacturer', 'SportsGRP', 'CarManufacturer', 'ORG']
  PER = ['Scientist', 'Artist', 'Athlete', 'Politician', 'Cleric', 'SportsManager', 'OtherPER']
  PROD = ['Clothing', 'Vehicle', 'Food', 'Drink', 'OtherPROD']
  MED = ['Medication/Vaccine', 'MedicalProcedure', 'AnatomicalStructure', 'Symptom', 'Disease']
  if tag in LOC:
    return('Location')
  elif tag in CW:
    return('CreativeWork')
  elif tag in GRP:
    return('Group')
  elif tag in PER:
    return('Person')
  elif tag in PROD:
    return('Product')
  elif tag in MED:
    return('Medical')
  elif tag == 'O':
    return('O')
  else:
    return('<PAD>')

In [5]:
nlp = spacy.load('en_core_web_lg')

In [6]:
def make_dataset(file):
  unclean_txt = re.compile("^#|[\n]")
  data = []
  with open(file) as f:
      sent = []
      for i in f.read().splitlines():
          if unclean_txt.match(i) == None:
              s = i.split(' _ _ ')[0]
              t = i.split(' _ _ ')[-1]
              if s!='' and t != '':
                if t != 'O':
                  t = t[2:]
                z = coarse_tag(t)
                sent.append((s,t,z))
          if unclean_txt.match(i):
              data.append(sent)
              sent = []
      data.append(sent)
  data = data[1:] # first line had a new line character
  return(data)

In [7]:
train_dataset = make_dataset('/home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/multiconer2023/EN-English/en_train.conll')
dev_dataset = make_dataset('/home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/multiconer2023/EN-English/en_dev.conll')
test_dataset = make_dataset('/home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/multiconer2023/EN-English/en_test.conll')

In [8]:
train_dataset[0]

[('robert', 'OtherPER', 'Person'),
 ('gottschalk', 'OtherPER', 'Person'),
 ('1939', 'O', 'O'),
 ('academy', 'VisualWork', 'CreativeWork'),
 ('award', 'VisualWork', 'CreativeWork'),
 ('winner', 'O', 'O'),
 ('and', 'O', 'O'),
 ('founder', 'O', 'O'),
 ('of', 'O', 'O'),
 ('panavision', 'ORG', 'Group')]

In [9]:
s = [[token.lower() for token, fine_tag, coarse_tag  in sentence] for sentence in train_dataset]
l = np.array(list(map(len, s)))
print(np.percentile(l,99))

28.0


## Preprocess data

In [12]:
SEQ_LEN = 28

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
fine_tag_to_idx = {"<PAD>": 0}
coarse_tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, fine_tag, coarse_tag in sentence] for sentence in dataset]
    fine_tags = [[fine_tag for token, fine_tag, coarse_tag in sentence] for sentence in dataset]
    coarse_tags = [[coarse_tag for token, fine_tag, coarse_tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            fine_tags[i].append('<PAD>')
            coarse_tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            fine_tags[i] = fine_tags[i][:SEQ_LEN]
            coarse_tags[i] = coarse_tags[i][:SEQ_LEN]
    
    for sentence_tags in fine_tags:
        for fine_tag in sentence_tags:
            if fine_tag not in fine_tag_to_idx:
                word = nlp(fine_tag)
                fine_tag_to_idx[fine_tag] = torch.from_numpy(word.vector)

    for sentence_tags in coarse_tags:
        for coarse_tag in sentence_tags:
            if coarse_tag not in coarse_tag_to_idx:
                word = nlp(coarse_tag)
                coarse_tag_to_idx[coarse_tag] = torch.from_numpy(word.vector)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word = nlp(word)
                word_to_idx[word] = torch.from_numpy(word.vector)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.float32).type(torch.LongTensor)
    fine_Y = torch.tensor([[fine_tag_to_idx[fine_tag] for fine_tag in sentence] for sentence in fine_tags]).type(torch.LongTensor)
    coarse_Y = torch.tensor([[coarse_tag_to_idx[coarse_tag] for coarse_tag in sentence] for sentence in coarse_tags]).type(torch.LongTensor)
    
    return X, fine_Y, coarse_Y

In [17]:
train_X, train_fine_Y, train_coarse_Y = preprocess(train_dataset)
# len(train_X), len(train_fine_Y), len(train_coarse_Y)
dev_X, dev_fine_Y, dev_coarse_Y = preprocess(dev_dataset)
test_X, test_fine_Y, test_coarse_Y = preprocess(test_dataset)

ValueError: only one element tensors can be converted to Python scalars

## Model Definition

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

## Performance of the model on fine-tags

In [21]:
EMBEDDING_DIM = 100
HIDDEN_DIM    = 100
NUM_EPOCHS    = 10 
BATCH_SIZE    = 5

train_dataset = TensorDataset(train_X, train_fine_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_fine_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_fine_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [22]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(fine_tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, accelerator='gpu', callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
trainer.save_checkpoint("english.ckpt")

trainer.test(dataloaders=test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3050 Ti Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 24.2 M
1 | lstm      | LSTM             | 161 K 
2 | fc        | Linear           | 7.0 K 
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
24.4 M    Trainable params
0         Non-trainable params
24.4 M    Total params
97.547    Total estimated model params s

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
You are using a CUDA device ('NVIDIA GeForce RTX 3050 Ti Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/lightning_logs/version_5/checkpoints/epoch=5-step=20136.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/lightning_logs/version_5/checkpoints/epoch=5-step=20136.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.35923346877098083
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.35923346877098083}]

## Model Inference

In [23]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: fine_tag for fine_tag, idx in fine_tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

fine_y_true = []
fine_y_pred = []

with torch.no_grad():
    for x, fine_y in test_loader:
        # Move the data to the device
        x = x.to(device)
        fine_y = fine_y.to(device)

        # Forward pass
        fine_y_hat = model(x)

        # Compute the predicted tags
        fine_y_pred += [idx_to_tag[i] for i in fine_y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        fine_y_true += [idx_to_tag[i] for i in fine_y.cpu().numpy().flatten().tolist()]

print(classification_report(fine_y_true, fine_y_pred))

                       precision    recall  f1-score   support

                <PAD>       1.00      1.00      1.00   4476021
AerospaceManufacturer       0.28      0.38      0.32      1817
  AnatomicalStructure       0.37      0.23      0.28      7990
              ArtWork       0.27      0.16      0.20      4236
               Artist       0.62      0.46      0.53    116291
              Athlete       0.47      0.45      0.46     55643
      CarManufacturer       0.25      0.34      0.29      4176
               Cleric       0.36      0.23      0.28     11113
             Clothing       0.16      0.14      0.15      3075
              Disease       0.41      0.29      0.34      9516
                Drink       0.17      0.21      0.19      3052
             Facility       0.50      0.39      0.44     41049
                 Food       0.14      0.12      0.13      7247
      HumanSettlement       0.68      0.65      0.66     60425
     MedicalProcedure       0.34      0.16      0.22  

In [24]:
coarse_y_true = list(map(coarse_tag, fine_y_true))
coarse_y_pred = list(map(coarse_tag, fine_y_pred))
print(classification_report(coarse_y_true, coarse_y_pred))

              precision    recall  f1-score   support

       <PAD>       1.00      1.00      1.00   4489102
CreativeWork       0.67      0.42      0.52    169593
       Group       0.49      0.51      0.50    121087
    Location       0.69      0.61      0.65    130918
     Medical       0.39      0.28      0.32     33105
           O       0.92      0.96      0.94   2969007
      Person       0.79      0.70      0.74    291443
     Product       0.28      0.21      0.24     45085

    accuracy                           0.94   8249340
   macro avg       0.65      0.59      0.61   8249340
weighted avg       0.94      0.94      0.94   8249340



## Function to predict NERs

In [25]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

fine_y_true = []
fine_y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        fine_y = fine_y.to(device)

        # Forward pass
        fine_y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        fine_y_pred += [idx_to_tag[i] for i in fine_y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        fine_y_true += [idx_to_tag[i] for i in fine_y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(fine_y_pred)
        coarse_y_pred = list(map(coarse_tag, fine_y_pred))
        print(coarse_y_pred)
        break

Sentence
['the', 'species', 'was', 'described', 'by', 'dietrich', 'brandis', 'after', 'the', 'forester', 't.', 'f.', 'bourdillon', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'by', 'this', 'time', 'she', 'was', 'competing', 'against', 'a', 'new', 'generation', 'of', 'young', 'drivers', 'including', 'stirling', 'moss', 'and', 'peter', 'collins', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'their', 'son', 'was', 'the', 'opera', 'producer', 'knut', 'hendriksen', '(', '1944', '–', '2020', ')', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'he', 'is', 'the', 'younger', 'brother', 'of', 'adam', 'mosseri', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<P