Importing dependencies

In [128]:
import torch
from torch.utils.data import Dataset, DataLoader
%pip install pytorch-lightning -q -U
%pip install awscli
!aws s3 cp --no-sign-request s3://multiconer/multiconer2023/ multiconer2023/ --recursive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
download: s3://multiconer/multiconer2023/BN-Bangla/bn_dev.conll to multiconer2023/BN-Bangla/bn_dev.conll
download: s3://multiconer/multiconer2023/EN-English/en_dev.conll to multiconer2023/EN-English/en_dev.conll
download: s3://multiconer/multiconer2023/DE-German/de_dev.conll to multiconer2023/DE-German/de_dev.conll
download: s3://multiconer/multiconer2023/ES-Spanish/es_dev.conll to multiconer2023/ES-Spanish/es_dev.conll
download: s3://multiconer/multiconer2023/BN-Bangla/bn_train.conll to multiconer2023/BN-Bangla/bn_train.conll
download: s3://multiconer/multiconer2023/DE-German/de_train.conll to multiconer2023/DE-German/de_train.conll
download: s3://multiconer/multiconer2023/DE-German/de_test.conll to multiconer2023/DE-German/de_test.conll
download: s3://multiconer/multiconer2023/ES-Spanish/es_train.conll to multiconer2023/ES-Spanish/es_train.conll
download: s3://multiconer/multiconer2023/

Parsing the CONLL file

In [129]:

def parse_conll_file(file_path):
    """
    Parse a conll-formatted file and return a list of sentences, where each sentence is a list of tokens and their labels.

    Args:
        file_path (str): The path to the conll file.

    Returns:
        List[List[Tuple[str, str]]]: A list of sentences, where each sentence is a list of (token, label) tuples.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n\n')

        sentences = []
        for line in lines:
            sentence = []
            # print(line)
            flag = 0
            for token in line.strip().split('\n'):
                flag = flag + 1
                if(flag==1):
                  continue
                fields = token.split(' _ _ ')
                sentence.append((fields[0], fields[-1]))
            if sentence:
                sentences.append(sentence)
    return sentences

Parsing and loading training, validation and testing data sets into train_dataset, dev_dataset, test_dataset

In [130]:
# train_dataset = parse_conll_file('hi_train.conll')
# dev_dataset = parse_conll_file('hi_dev.conll')
# test_dataset = parse_conll_file('hi_test.conll')
# train_dataset = parse_conll_file('./multiconer2023/EN-English/en_train.conll')
# dev_dataset = parse_conll_file('./multiconer2023/EN-English/en_dev.conll')
# test_dataset = parse_conll_file('./multiconer2023/EN-English/en_test.conll')
# train_dataset = parse_conll_file('./multiconer2023/BN-Bangla/bn_train.conll')
# dev_dataset = parse_conll_file('./multiconer2023/BN-Bangla/bn_dev.conll')
# test_dataset = parse_conll_file('./multiconer2023/BN-Bangla/bn_test.conll')
train_dataset = parse_conll_file('./multiconer2023/BN-Bangla/bn_train.conll')
dev_dataset = parse_conll_file('./multiconer2023/BN-Bangla/bn_dev.conll')
test_dataset = parse_conll_file('./multiconer2023/BN-Bangla/bn_test.conll')

Preprocessing the data

In [131]:
SEQ_LEN = 25

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]
    print(tags)
    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y


In [132]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

[['O', 'B-OtherPROD', 'I-OtherPROD', 'I-OtherPROD', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-OtherPROD', 'I-OtherPROD', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-OtherPROD', 'I-OtherPROD', 'I-OtherPROD', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-OtherPROD', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-OtherPROD', 'O', 'B-HumanSettlement', 'B-ORG', 'I-ORG', 'I-ORG', 'B-HumanSettlement', 'I-HumanSettlement', 'I-HumanSettlement', 'O'], ['O', 'B-OtherPROD', 'I-OtherPROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-OtherPROD', 'I-OtherPROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OtherPROD', 'I-OtherPROD', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OtherPROD', 'I-OtherPROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OtherPROD', 'O'], ['O

In [133]:
print(tag_to_idx["O"])
print(len(word_to_idx))
print(len(tag_to_idx))
print(len(dev_dataset))
print(dev_dataset)

1
42591
68
507
[[('ফ্যারাডে', 'O'), ('একটি', 'O'), ('যাদুকরী', 'O'), ('টেটা', 'B-OtherPROD'), ('দিয়ে', 'O'), ('সজ্জিত', 'O'), ('যা', 'O'), ('প্লেয়ারকে', 'O'), ('স্বল্প', 'O'), ('দূরত্বে', 'O'), ('টেলিপোর্ট', 'O'), ('করতে', 'O'), ('দেয়।', 'O')], [('সাধারণত', 'O'), ('একটি', 'O'), ('তোয়ালে', 'B-OtherPROD'), ('পরে', 'O'), ('শরীর', 'O'), ('থেকে', 'O'), ('শুকানোর', 'O'), ('জন্য', 'O'), ('ব্যবহৃত', 'O'), ('হয়।', 'O')], [('একটি', 'O'), ('ম্যাচস্টিক', 'O'), ("হ'ল", 'O'), ('দিয়াশলাই', 'B-OtherPROD'), ('হিসাবে', 'O'), ('ব্যবহৃত', 'O'), ('জ্বলনযোগ্য', 'O'), ('কাঠের', 'O'), ('একটি', 'O'), ('সরু', 'O'), ('টুকরো।', 'O')], [('একটি', 'O'), ('🚑', 'B-OtherPROD'), ('ড্রাইভিংয়ের', 'O'), ('প্রস্তুতিতে', 'O'), ('তাকে', 'O'), ('আরও', 'O'), ('একটি', 'O'), ('ড্রাইভিং', 'O'), ('পরীক্ষাও', 'O'), ('নিতে', 'O'), ('হয়েছিল।', 'O')], [('রাস্তায়', 'O'), ('রঙিন', 'O'), ('লাইট', 'O'), ('এবং', 'O'), ('বাদশাহী', 'B-Facility'), ('মসজিদ', 'I-Facility'), ('এর', 'O'), ('দৃশ্য', 'O'), ('রয়েছে', 'O'), ('এবং', 'O'), ('এ

In [134]:
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")

Number of training examples: 9708
Number of validation examples: 507
Number of testing examples: 19859


# NER MODEL

In [135]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [136]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 200
HIDDEN_DIM    = 400
NUM_EPOCHS    = 10 
BATCH_SIZE    = 30

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [137]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=2, mode="min")
# trainer = pl.Trainer(max_epochs=NUM_EPOCHS, gpus=1, callbacks=[early_stopping])
trainer = pl.Trainer(max_epochs=NUM_EPOCHS,  callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
PATH = "./model_bangla"
torch.save(model.state_dict(), PATH)
trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 8.5 M 
1 | lstm      | LSTM             | 1.9 M 
2 | fc        | Linear           | 54.5 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
10.5 M    Trainable params
0         Non-trainable params
10.5 M    Total params
41.996    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_5/checkpoints/epoch=5-step=1944.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_5/checkpoints/epoch=5-step=1944.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.2873665988445282}]

Fine Grained

In [138]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []
i=0

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)
        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]
        
        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00    240692
B-AerospaceManufacturer       0.19      0.05      0.08        97
  B-AnatomicalStructure       0.66      0.58      0.62       532
              B-ArtWork       0.18      0.01      0.02       455
               B-Artist       0.50      0.40      0.44      2744
              B-Athlete       0.40      0.33      0.36      1086
      B-CarManufacturer       0.73      0.88      0.80        84
               B-Cleric       0.47      0.55      0.51       240
             B-Clothing       0.21      0.59      0.31        17
              B-Disease       0.79      0.59      0.67       553
                B-Drink       0.72      0.81      0.76       120
             B-Facility       0.56      0.49      0.52       894
                 B-Food       0.54      0.36      0.43       453
      B-HumanSettlement       0.81      0.63      0.71      6011
     B-MedicalProcedure 

In [139]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break

Sentence
['প্রোপেলারটি', 'একটি', 'ডি', 'হ্যাভিল্যান্ড', 'এয়ারক্রাফ্ট', 'কোম্পানি', 'স্থির', 'পিচ', 'টাইপ', 'ছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'এটি', '১৯৫৫', 'সালে', '১৯৫৭', 'সালে', 'নর্ড', 'এভিয়েশন', 'পরে', 'snias', 'দ্বারা', 'ডিজাইন', 'করা', 'হয়েছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'পাঁচটি', 'f108', 'চালিত', 'উদাহরণগুলি', 'বোয়িং', 'কোম্পানি', 'থেকে', 'সরাসরি', 'অনুরোধ', 'করা', 'হয়েছিল', 'বাকী', 'অন্যান্য', 'পক্ষ', 'থেকে।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'গ্রিনহিল', '১৯৯৫', 'সালে', "l'auto-neige", 'bombardier', 'limitée', 'যোগদানের', 'আগে', 'mckinsey', '&', 'company,', 'inc.', 'এ', 'তাঁর', 'কেরিয়ার', 'শুরু', 'করেছিলেন।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'যদিও', 'ডকটি', 'অ্যাসোসিয়েটেড', 'ব্রিটিশ', 'পো

Coarse Grained

In [140]:
mapping = {
    "facility": "LOC",
    "otherloc": "LOC",
    "humansettlement": "LOC",
    "station": "LOC",
    "visualwork": "CW",
    "musicalwork": "CW",
    "writtenwork": "CW",
    "artwork": "CW",
    "software": "CW",
    "musicalgrp": "GRP",
    "publiccorp": "GRP",
    "privatecorp": "GRP",
    "aerospacemanufacturer": "GRP",
    "sportsgrp": "GRP",
    "carmanufacturer": "GRP",
    "org": "GRP",
    "scientist": "PER",
    "artist": "PER",
    "athlete": "PER",
    "politician": "PER",
    "cleric": "PER",
    "sportsmanager": "PER",
    "otherper": "PER",
    "clothing": "PROD",
    "vehicle": "PROD",
    "food": "PROD",
    "drink": "PROD",
    "otherprod": "PROD",
    "medication/vaccine": "MED",
    "medicalprocedure": "MED",
    "anatomicalstructure": "MED",
    "symptom": "MED",
    "disease": "MED"
}

# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        break
for i in range(len(y_pred)) :
  if y_pred[i] == "O" or y_pred[i] == "<PAD>" :
    continue
  y_pred[i] = mapping[y_pred[i][2:].lower()]
for i in range(len(y_true)) :
  if y_true[i] == "O" or y_true[i] == "<PAD>" :
    continue
  y_true[i] = mapping[y_true[i][2:].lower()]

print(classification_report(y_true, y_pred))





              precision    recall  f1-score   support

       <PAD>       1.00      1.00      1.00       308
          CW       0.00      0.00      0.00         0
         GRP       0.81      0.16      0.27        80
         LOC       0.43      0.75      0.55         4
         MED       0.00      0.00      0.00         0
           O       0.88      0.97      0.92       355
         PER       0.00      0.00      0.00         0
        PROD       0.00      0.00      0.00         3

    accuracy                           0.89       750
   macro avg       0.39      0.36      0.34       750
weighted avg       0.91      0.89      0.88       750



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [141]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        for i in range(len(y_pred)) :
          if y_pred[i] == "O" or y_pred[i] == "<PAD>" :
            continue
          y_pred[i] = mapping[y_pred[i][2:].lower()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break


Sentence
['প্রোপেলারটি', 'একটি', 'ডি', 'হ্যাভিল্যান্ড', 'এয়ারক্রাফ্ট', 'কোম্পানি', 'স্থির', 'পিচ', 'টাইপ', 'ছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'এটি', '১৯৫৫', 'সালে', '১৯৫৭', 'সালে', 'নর্ড', 'এভিয়েশন', 'পরে', 'snias', 'দ্বারা', 'ডিজাইন', 'করা', 'হয়েছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'পাঁচটি', 'f108', 'চালিত', 'উদাহরণগুলি', 'বোয়িং', 'কোম্পানি', 'থেকে', 'সরাসরি', 'অনুরোধ', 'করা', 'হয়েছিল', 'বাকী', 'অন্যান্য', 'পক্ষ', 'থেকে।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'গ্রিনহিল', '১৯৯৫', 'সালে', "l'auto-neige", 'bombardier', 'limitée', 'যোগদানের', 'আগে', 'mckinsey', '&', 'company,', 'inc.', 'এ', 'তাঁর', 'কেরিয়ার', 'শুরু', 'করেছিলেন।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'যদিও', 'ডকটি', 'অ্যাসোসিয়েটেড', 'ব্রিটিশ', 'পো

# Load Model


In [142]:
# Load model
PATH = "./model_bangla"
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
model.load_state_dict(torch.load(PATH))


<All keys matched successfully>

In [143]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break

Sentence
['প্রোপেলারটি', 'একটি', 'ডি', 'হ্যাভিল্যান্ড', 'এয়ারক্রাফ্ট', 'কোম্পানি', 'স্থির', 'পিচ', 'টাইপ', 'ছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'এটি', '১৯৫৫', 'সালে', '১৯৫৭', 'সালে', 'নর্ড', 'এভিয়েশন', 'পরে', 'snias', 'দ্বারা', 'ডিজাইন', 'করা', 'হয়েছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'পাঁচটি', 'f108', 'চালিত', 'উদাহরণগুলি', 'বোয়িং', 'কোম্পানি', 'থেকে', 'সরাসরি', 'অনুরোধ', 'করা', 'হয়েছিল', 'বাকী', 'অন্যান্য', 'পক্ষ', 'থেকে।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'গ্রিনহিল', '১৯৯৫', 'সালে', "l'auto-neige", 'bombardier', 'limitée', 'যোগদানের', 'আগে', 'mckinsey', '&', 'company,', 'inc.', 'এ', 'তাঁর', 'কেরিয়ার', 'শুরু', 'করেছিলেন।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'যদিও', 'ডকটি', 'অ্যাসোসিয়েটেড', 'ব্রিটিশ', 'পো

In [147]:
import nltk
def process_util(sentns):
      se=sentns.lower()
      tokens = nltk.word_tokenize(se) 
      for i in range(len(tokens)):
              while len(tokens) < SEQ_LEN:
                  tokens.append('<PAD>')
                  
      
              if len(tokens) > SEQ_LEN:
                  tokens = tokens[:SEQ_LEN]
      
              for word in tokens:
                      if word not in word_to_idx:
                        word_to_idx[word] = len(word_to_idx)
      
          # Convert words and tags to indices
      X = torch.tensor([[word_to_idx.get(word, 1) for word in tokens]], dtype=torch.int).type(torch.LongTensor)
      print(X)
      model.eval()
      y=model(X)
      x_sent = [idx_to_word[i] for i in X.cpu().numpy().flatten().tolist()]
      # Compute the predicted tags
      y_preds = [idx_to_tag[i] for i in y.argmax(-1).cpu().numpy().flatten().tolist()]
      print("Sentence")
      print(x_sent)
      print("Predicted tags")
      print(y_preds)

In [148]:
testing_a_sentence='''প্রোপেলারটি একটি ডি হ্যাভিল্যান্ড এয়ারক্রাফ্ট কোম্পানি স্থির পিচ'''

process_util(testing_a_sentence)

tensor([[ 1084,    44,  1228, 22910,  1271,  1854,   657, 13222,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]])
Sentence
['প্রোপেলারটি', 'একটি', 'ডি', 'হ্যাভিল্যান্ড', 'এয়ারক্রাফ্ট', 'কোম্পানি', 'স্থির', 'পিচ', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Predicted tags
['O', 'O', 'O', 'B-Artist', 'I-Artist', 'B-ORG', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [146]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
