Importing dependencies

In [25]:
import torch
from torch.utils.data import Dataset, DataLoader
%pip install pytorch-lightning -q -U
%pip install awscli
!aws s3 cp --no-sign-request s3://multiconer/multiconer2023/ multiconer2023/ --recursive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
download: s3://multiconer/multiconer2023/BN-Bangla/bn_dev.conll to multiconer2023/BN-Bangla/bn_dev.conll
download: s3://multiconer/multiconer2023/EN-English/en_dev.conll to multiconer2023/EN-English/en_dev.conll
download: s3://multiconer/multiconer2023/DE-German/de_dev.conll to multiconer2023/DE-German/de_dev.conll
download: s3://multiconer/multiconer2023/ES-Spanish/es_dev.conll to multiconer2023/ES-Spanish/es_dev.conll
download: s3://multiconer/multiconer2023/EN-English/en_train.conll to multiconer2023/EN-English/en_train.conll
download: s3://multiconer/multiconer2023/DE-German/de_train.conll to multiconer2023/DE-German/de_train.conll
download: s3://multiconer/multiconer2023/DE-German/de_test.conll to multiconer2023/DE-German/de_test.conll
download: s3://multiconer/multiconer2023/BN-Bangla/bn_train.conll to multiconer2023/BN-Bangla/bn_train.conll
download: s3://multiconer/multiconer2023/

Parsing the CONLL file

In [26]:

def parse_conll_file(file_path):
    """
    Parse a conll-formatted file and return a list of sentences, where each sentence is a list of tokens and their labels.

    Args:
        file_path (str): The path to the conll file.

    Returns:
        List[List[Tuple[str, str]]]: A list of sentences, where each sentence is a list of (token, label) tuples.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n\n')

        sentences = []
        for line in lines:
            sentence = []
            # print(line)
            flag = 0
            for token in line.strip().split('\n'):
                flag = flag + 1
                if(flag==1):
                  continue
                fields = token.split(' _ _ ')
                sentence.append((fields[0], fields[-1]))
            if sentence:
                sentences.append(sentence)
    return sentences

Parsing and loading training, validation and testing data sets into train_dataset, dev_dataset, test_dataset

In [27]:
train_dataset = parse_conll_file('./multiconer2023/EN-English/en_train.conll')
dev_dataset = parse_conll_file('./multiconer2023/EN-English/en_dev.conll')
test_dataset = parse_conll_file('./multiconer2023/EN-English/en_test.conll')

In [28]:
train_dataset[0]

[('robert', 'B-OtherPER'),
 ('gottschalk', 'I-OtherPER'),
 ('1939', 'O'),
 ('academy', 'B-VisualWork'),
 ('award', 'I-VisualWork'),
 ('winner', 'O'),
 ('and', 'O'),
 ('founder', 'O'),
 ('of', 'O'),
 ('panavision', 'B-ORG')]

Preprocessing the data

In [29]:
SEQ_LEN = 25

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]
    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y


In [30]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

In [31]:
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")

Number of training examples: 16778
Number of validation examples: 871
Number of testing examples: 249980


# NER MODEL

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [33]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 500
HIDDEN_DIM    = 500
NUM_EPOCHS    = 10 
BATCH_SIZE    = 20

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [34]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=2, mode="min")
trainer = pl.Trainer(max_epochs=NUM_EPOCHS,  callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
PATH = "./model_english"
torch.save(model.state_dict(), PATH)
trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 120 M 
1 | lstm      | LSTM             | 4.0 M 
2 | fc        | Linear           | 68.1 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
499.770   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_4/checkpoints/epoch=3-step=3356.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_4/checkpoints/epoch=3-step=3356.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.43237850069999695}]

In [35]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)
        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]
        
        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00   2499089
B-AerospaceManufacturer       0.47      0.48      0.47      1013
  B-AnatomicalStructure       0.55      0.28      0.37      5824
              B-ArtWork       0.30      0.21      0.25      1264
               B-Artist       0.56      0.58      0.57     56981
              B-Athlete       0.55      0.45      0.50     27554
      B-CarManufacturer       0.49      0.35      0.41      2977
               B-Cleric       0.33      0.25      0.29      4725
             B-Clothing       0.39      0.17      0.24      2229
              B-Disease       0.48      0.33      0.39      5600
                B-Drink       0.40      0.27      0.32      2235
             B-Facility       0.49      0.39      0.43     16134
                 B-Food       0.17      0.18      0.17      5288
      B-HumanSettlement       0.65      0.66      0.66     41013
     B-MedicalProcedure 

In [36]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break

Sentence
['the', 'species', 'was', 'described', 'by', 'dietrich', 'brandis', 'after', 'the', 'forester', 't.', 'f.', 'bourdillon', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'by', 'this', 'time', 'she', 'was', 'competing', 'against', 'a', 'new', 'generation', 'of', 'young', 'drivers', 'including', 'stirling', 'moss', 'and', 'peter', 'collins', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'their', 'son', 'was', 'the', 'opera', 'producer', 'knut', 'hendriksen', '(', '1944', '–', '2020', ')', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'he', 'is', 'the', 'younger', 'brother', 'of', 'adam', 'mosseri', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'smes', ':', 'laura', 'j.', 'van', "'t", 'veer', 'et', 'al', '.', '(', 'nl', ')', 'for', 'their', 'gene', 'based', 'brea

In [43]:
mapping = {
    "facility": "LOC",
    "otherloc": "LOC",
    "humansettlement": "LOC",
    "station": "LOC",
    "visualwork": "CW",
    "musicalwork": "CW",
    "writtenwork": "CW",
    "artwork": "CW",
    "software": "CW",
    "musicalgrp": "GRP",
    "publiccorp": "GRP",
    "privatecorp": "GRP",
    "aerospacemanufacturer": "GRP",
    "sportsgrp": "GRP",
    "carmanufacturer": "GRP",
    "org": "GRP",
    "scientist": "PER",
    "artist": "PER",
    "athlete": "PER",
    "politician": "PER",
    "cleric": "PER",
    "sportsmanager": "PER",
    "otherper": "PER",
    "clothing": "PROD",
    "vehicle": "PROD",
    "food": "PROD",
    "drink": "PROD",
    "otherprod": "PROD",
    "medication/vaccine": "MED",
    "medicalprocedure": "MED",
    "anatomicalstructure": "MED",
    "symptom": "MED",
    "disease": "MED"
}

# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        break
for i in range(len(y_pred)) :
  if y_pred[i] == "O" or y_pred[i] == "<PAD>" :
    continue
  y_pred[i] = mapping[y_pred[i][2:].lower()]
for i in range(len(y_true)) :
  if y_true[i] == "O" or y_true[i] == "<PAD>" :
    continue
  y_true[i] = mapping[y_true[i][2:].lower()]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       <PAD>       1.00      1.00      1.00       219
         GRP       0.00      0.00      0.00         0
         LOC       0.50      0.40      0.44         5
         MED       0.00      0.00      0.00         3
           O       0.95      0.98      0.97       207
         PER       0.97      0.88      0.92        66

    accuracy                           0.96       500
   macro avg       0.57      0.54      0.56       500
weighted avg       0.97      0.96      0.96       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
print(y_pred)
print(y_true)
print(len(y_pred))
print(len(y_true))

['O', 'O', 'O', 'O', 'O', 'PER', 'PER', 'O', 'O', 'O', 'PER', 'PER', 'PER', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PER', 'PER', 'O', 'PER', 'PER', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'O', 'O', 'O', 'O', 'O', 'O', 'PER', 'PER', 'O', 'O', 'O', 'O', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'O', 'O', 'O', 'O', 'O', 'O', 'PER', 'PER', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'O', 'O', 'PER', 'PER', 'PER', 'PER', 'PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'MED', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PER', 'PER', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<

In [45]:

PATH = "./model_english"
modelEn = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
modelEn.load_state_dict(torch.load(PATH))
import nltk



In [46]:
se='''Hello boys! This is Ben and we I am going to save the world'''
se=se.lower()
import nltk
nltk.download('punkt')
# Use the word_tokenize function to tokenize the sentence
tokens = nltk.word_tokenize(se)
# Print the tokens
print(tokens)
for i in range(len(tokens)):
        while len(tokens) < SEQ_LEN:
            tokens.append('<PAD>')
            

        if len(tokens) > SEQ_LEN:
            tokens = tokens[:SEQ_LEN]

        for word in tokens:
                if word not in word_to_idx:
                  word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
X = torch.tensor([[word_to_idx.get(word, 1) for word in tokens]], dtype=torch.int).type(torch.LongTensor)
print(X)
modelEn.eval()
y=modelEn(X)
x_sent = [idx_to_word[i] for i in X.cpu().numpy().flatten().tolist()]
# Compute the predicted tags
y_preds = [idx_to_tag[i] for i in y.argmax(-1).cpu().numpy().flatten().tolist()]
print("Sentence")
print(x_sent)
print("Predicted tags")
print(y_preds)

['hello', 'boys', '!', 'this', 'is', 'ben', 'and', 'we', 'i', 'am', 'going', 'to', 'save', 'the', 'world']
tensor([[18529,  2794,  3074,   200,    84,  2479,     8,  2648,  1426,  2785,
          4069,    30,  4289,    13,  1483,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]])
Sentence
['hello', 'boys', '!', 'this', 'is', 'ben', 'and', 'we', 'i', 'am', 'going', 'to', 'save', 'the', 'world', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Predicted tags
['B-Software', 'I-VisualWork', 'O', 'O', 'O', 'B-OtherPER', 'O', 'O', 'O', 'I-VisualWork', 'O', 'O', 'O', 'O', 'I-WrittenWork', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
def predict_tags_fun(sentns):
      se=sentns.lower()
      tokens = nltk.word_tokenize(se) 
      for i in range(len(tokens)):
              while len(tokens) < SEQ_LEN:
                  tokens.append('<PAD>')
                  
      
              if len(tokens) > SEQ_LEN:
                  tokens = tokens[:SEQ_LEN]
      
              for word in tokens:
                      if word not in word_to_idx:
                        word_to_idx[word] = len(word_to_idx)
      
          # Convert words and tags to indices
      X = torch.tensor([[word_to_idx.get(word, 1) for word in tokens]], dtype=torch.int).type(torch.LongTensor)
      print(X)
      modelEn.eval()
      y=modelEn(X)
      x_sent = [idx_to_word[i] for i in X.cpu().numpy().flatten().tolist()]
      # Compute the predicted tags
      y_preds = [idx_to_tag[i] for i in y.argmax(-1).cpu().numpy().flatten().tolist()]
      print("Sentence")
      print(x_sent)
      print("Predicted tags")
      print(y_preds)
