In [1]:
import torch
import nltk
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt 

# import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
#from keras.preprocessing.sequence import pad_sequences

## Load Dataset

In [2]:
def coarse_tag(tag):
  LOC = ['Facility', 'OtherLOC', 'HumanSettlement', 'Station']
  CW = ['VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software']
  GRP = ['MusicalGRP', 'PublicCORP', 'PrivateCORP', 'AerospaceManufacturer', 'SportsGRP', 'CarManufacturer', 'ORG']
  PER = ['Scientist', 'Artist', 'Athlete', 'Politician', 'Cleric', 'SportsManager', 'OtherPER']
  PROD = ['Clothing', 'Vehicle', 'Food', 'Drink', 'OtherPROD']
  MED = ['Medication/Vaccine', 'MedicalProcedure', 'AnatomicalStructure', 'Symptom', 'Disease']
  if tag in LOC:
    return('Location')
  elif tag in CW:
    return('CreativeWork')
  elif tag in GRP:
    return('Group')
  elif tag in PER:
    return('Person')
  elif tag in PROD:
    return('Product')
  elif tag in MED:
    return('Medical')
  elif tag == 'O':
    return('O')
  else:
    return('<PAD>')

In [3]:
# nlp = spacy.load('en_core_web_lg')

In [4]:
def make_dataset(file):
  unclean_txt = re.compile("^#|[\n]")
  data = []
  with open(file) as f:
      sent = []
      for i in f.read().splitlines():
          if unclean_txt.match(i) == None:
              s = i.split(' _ _ ')[0]
              t = i.split(' _ _ ')[-1]
              if s!='' and t != '':
                if t != 'O':
                  t = t[2:]
                z = coarse_tag(t)
                sent.append((s,t,z))
          if unclean_txt.match(i):
              data.append(sent)
              sent = []
      data.append(sent)
  data = data[1:] # first line had a new line character
  return(data)

In [5]:
train_dataset = make_dataset('/home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/multiconer2023/HI-Hindi/hi_train.conll')
dev_dataset = make_dataset('/home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/multiconer2023/HI-Hindi/hi_dev.conll')
test_dataset = make_dataset('/home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/multiconer2023/HI-Hindi/hi_test.conll')

In [6]:
train_dataset[0]

[('यह', 'O', 'O'),
 ('झियान', 'HumanSettlement', 'Location'),
 ('चीन', 'HumanSettlement', 'Location'),
 ('के', 'O', 'O'),
 ('केंद्र', 'O', 'O'),
 ('भाग', 'O', 'O'),
 ('में', 'O', 'O'),
 ('स्थित', 'O', 'O'),
 ('है।', 'O', 'O')]

In [7]:
s = [[token.lower() for token, fine_tag, coarse_tag  in sentence] for sentence in train_dataset]
l = np.array(list(map(len, s)))
print(np.percentile(l,99))

33.0


## Preprocess data

In [8]:
SEQ_LEN = 33

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
fine_tag_to_idx = {"<PAD>": 0}
coarse_tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, fine_tag, coarse_tag in sentence] for sentence in dataset]
    fine_tags = [[fine_tag for token, fine_tag, coarse_tag in sentence] for sentence in dataset]
    coarse_tags = [[coarse_tag for token, fine_tag, coarse_tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            fine_tags[i].append('<PAD>')
            coarse_tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            fine_tags[i] = fine_tags[i][:SEQ_LEN]
            coarse_tags[i] = coarse_tags[i][:SEQ_LEN]
    
    for sentence_tags in fine_tags:
        for fine_tag in sentence_tags:
            if fine_tag not in fine_tag_to_idx:
                # word = nlp(fine_tag)
                fine_tag_to_idx[fine_tag] = len(fine_tag)

    for sentence_tags in coarse_tags:
        for coarse_tag in sentence_tags:
            if coarse_tag not in coarse_tag_to_idx:
                # word = nlp(coarse_tag)
                coarse_tag_to_idx[coarse_tag] = len(coarse_tag)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                # word = nlp(word)
                word_to_idx[word] = len(word)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.float32).type(torch.LongTensor)
    fine_Y = torch.tensor([[fine_tag_to_idx[fine_tag] for fine_tag in sentence] for sentence in fine_tags]).type(torch.LongTensor)
    coarse_Y = torch.tensor([[coarse_tag_to_idx[coarse_tag] for coarse_tag in sentence] for sentence in coarse_tags]).type(torch.LongTensor)
    
    return X, fine_Y, coarse_Y

In [9]:
train_X, train_fine_Y, train_coarse_Y = preprocess(train_dataset)
# len(train_X), len(train_fine_Y), len(train_coarse_Y)
dev_X, dev_fine_Y, dev_coarse_Y = preprocess(dev_dataset)
test_X, test_fine_Y, test_coarse_Y = preprocess(test_dataset)

## Model Definition

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

## Performance of the model on fine-tags

In [11]:
EMBEDDING_DIM = 100
HIDDEN_DIM    = 100
NUM_EPOCHS    = 10 
BATCH_SIZE    = 5

train_dataset = TensorDataset(train_X, train_fine_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_fine_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_fine_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [12]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(fine_tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, accelerator='gpu', callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
# trainer.save_checkpoint("english.ckpt")

trainer.test(dataloaders=test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3050 Ti Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 3.1 M 
1 | lstm      | LSTM             | 161 K 
2 | fc        | Linear           | 7.0 K 
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
3.2 M     Trainable params
0         Non-trainable params
3.2 M     Total params
12.921    Total estimated model params s

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
  rank_zero_warn(
You are using a CUDA device ('NVIDIA GeForce RTX 3050 Ti Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/lightning_logs/version_8/checkpoints/epoch=9-step=19270.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/friday/Documents/MS/Coursework/1st Sem/Deep Learning/DL23-CS60010/assignment-2/lightning_logs/version_8/checkpoints/epoch=9-step=19270.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.31005585193634033}]

## Model Inference

In [13]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: fine_tag for fine_tag, idx in fine_tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

fine_y_true = []
fine_y_pred = []

with torch.no_grad():
    for x, fine_y in test_loader:
        # Move the data to the device
        x = x.to(device)
        fine_y = fine_y.to(device)

        # Forward pass
        fine_y_hat = model(x)

        # Compute the predicted tags
        fine_y_pred += [idx_to_tag[i] for i in fine_y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        fine_y_true += [idx_to_tag[i] for i in fine_y.cpu().numpy().flatten().tolist()]

print(classification_report(fine_y_true, fine_y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

                <PAD>       1.00      1.00      1.00    312309
AerospaceManufacturer       0.13      0.01      0.02       179
  AnatomicalStructure       0.00      0.00      0.00       599
              ArtWork       0.41      0.13      0.20      6066
      CarManufacturer       0.55      0.08      0.14      7753
               Cleric       0.23      0.03      0.05      4172
                Drink       0.00      0.00      0.00       199
                 Food       0.50      0.01      0.02       579
     MedicalProcedure       0.18      0.04      0.07       560
   Medication/Vaccine       0.40      0.00      0.01       516
          MusicalWork       0.28      0.06      0.10      2238
                    O       0.88      0.99      0.93    250839
                  ORG       0.84      0.39      0.53      4895
           PublicCorp       0.25      0.04      0.08      5693
            Scientist       0.23      0.23      0.23  

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
coarse_y_true = list(map(coarse_tag, fine_y_true))
coarse_y_pred = list(map(coarse_tag, fine_y_pred))
print(classification_report(coarse_y_true, coarse_y_pred))

              precision    recall  f1-score   support

       <PAD>       1.00      0.98      0.99    318002
CreativeWork       0.41      0.10      0.16     13174
       Group       0.78      0.21      0.33     12827
     Medical       0.20      0.02      0.03      1675
           O       0.88      0.99      0.93    250839
      Person       0.30      0.15      0.20      9872
     Product       0.50      0.01      0.01       778

    accuracy                           0.93    607167
   macro avg       0.58      0.35      0.38    607167
weighted avg       0.92      0.93      0.92    607167



## Function to predict NERs

In [15]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

fine_y_true = []
fine_y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        fine_y = fine_y.to(device)

        # Forward pass
        fine_y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        fine_y_pred += [idx_to_tag[i] for i in fine_y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        fine_y_true += [idx_to_tag[i] for i in fine_y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(fine_y_pred)
        coarse_y_pred = list(map(coarse_tag, fine_y_pred))
        print(coarse_y_pred)
        break

Sentence
['शायर', 'प्राचीनता', 'victoria', 'ईव', '३१२', 'शायर', 'ईव', 'शायर', 'शायर', 'जन्म।', 'रॉकलिफ', 'रॉकलिफ', 'ईव', 'ईव', 'ईव', '३१२', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '३१२', 'रॉकलिफ', '♀', 'शायर', 'शायर', '♀', 'रॉकलिफ', 'रॉकलिफ', 'जन्म।', 'ईव', '३१२', 'रॉकलिफ', '३१२', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'ईव', 'victoria', '८१२/८१३', 'victoria', 'रॉकलिफ', 'जन्म।', 'रॉकलिफ', 'ईव', '३१२', '८१२/८१३', 'ईव', 'ईव', 'ईव', '३१२', 'ईव', '३१२', '८१२/८१३', 'रॉकलिफ', 'ईव', '३१२', 'शायर', '३१२', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'रॉकलिफ', 'ईव', '३१२', 'जन्म।', '३१२', '३१२', '३१२', 'रॉकलिफ', 'शायर', 'शायर', '८१२/८१३', 'शायर', 'victoria', 'ईव', '३१२', 'शायर', '८१२/८१३', 'ईव