# Named Entity Recognition

The Task has been divided into the following sections:
  a)English
  b)Hindi
  c)Bangla

Each language goes through the following steps, in both the fine-grain and the coarse-grain setting:
1. Dataset Preparation
2. Feature Extraction
3. Model Definition
4. Training the Model/Model importing
5. Model Evaluation

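Fine grain setting has 36 entity types, each tagged with a B- (begin) or I- (inside) prefix, plus a single O (outside) tag
Coarse grain setting has 6 entity types, each tagged with a B- (begin) or I- (inside) prefix, plus a single O (outside) tag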


In [None]:
%pip install pytorch-lightning -q -U
import torch
from sklearn.metrics import classification_report

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m715.6/715.6 KB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive (Colab only) so that the files under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## English 
Download the folder from "https://drive.google.com/drive/folders/14__E4qyd6gWhjTtmJMZb73gRauynmR-2?usp=sharing" 

In [None]:
# Locations of the English train/dev/test splits on Google Drive.
# Download the "Multiconer" folder from the link above and place it at the top level of
# "My Drive" so that the paths below resolve.
train_path = '/content/drive/MyDrive/Multiconer/EN/en_train.conll'
dev_path = '/content/drive/MyDrive/Multiconer/EN/en_dev.conll'
test_path = '/content/drive/MyDrive/Multiconer/EN/en_test.conll'

#### Tag mapping logic

In [None]:
# Lookup from each fine-grain entity type to the coarse-grain category it collapses into.
# The fine types are listed once per coarse category and flattened into a single dict.
tag_map = {
    fine_type: coarse_type
    for coarse_type, fine_types in (
        ('Medical', (
            'Disease',
            'Symptom',
            'AnatomicalStructure',
            'MedicalProcedure',
            'Medication/Vaccine',
        )),
        ('Product', (
            'Food',
            'Drink',
            'OtherPROD',
            'Vehicle',
            'Clothing',
        )),
        ('Person', (
            'OtherPER',
            'SportsManager',
            'Cleric',
            'Politician',
            'Athlete',
            'Artist',
            'Scientist',
        )),
        ('Group', (
            'ORG',
            'TechCORP',
            'CarManufacturer',
            'SportsGRP',
            'AerospaceManufacturer',
            'OtherCORP',
            'PrivateCorp',
            'PublicCorp',
            'MusicalGRP',
        )),
        ('CreativeWorks', (
            'OtherCW',
            'Software',
            'ArtWork',
            'WrittenWork',
            'MusicalWork',
            'VisualWork',
        )),
        ('Location', (
            'Facility',
            'HumanSettlement',
            'OtherLOC',
            'Station',
        )),
    )
    for fine_type in fine_types
}


### Parsing logic for coarse grain setting

In [None]:
def parse_conll_format_coarse(file_path):
    """Read a CoNLL-style file and return its sentences with coarse-grain tags.

    Lines starting with '#' are skipped and blank lines close the current
    sentence. Every other line is split on ' _ _ ': the first piece is the
    word and the last piece is the fine-grain tag. Tags of the form 'B-<type>',
    'I-<type>' or 'O-<type>' keep their prefix and have <type> replaced by
    tag_map[<type>]; any other tag (e.g. a bare 'O') is kept unchanged.

    Returns:
        A list of sentences, each a list of {'word': ..., 'tag': ...} dicts.

    Raises:
        KeyError: if a prefixed tag names a type that is missing from tag_map.
    """
    def to_coarse(fine_tag):
        # Only tags carrying an explicit two-character prefix are remapped.
        prefix = fine_tag[:2]
        if prefix in ('B-', 'I-', 'O-'):
            return prefix + tag_map[fine_tag[2:]]
        return fine_tag

    sentences = []
    current = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith('#'):
                # Comment / sentence-id line.
                continue
            if not line:
                # Sentence boundary: flush whatever has been collected so far.
                if current:
                    sentences.append(current)
                    current = []
                continue
            fields = line.split(' _ _ ')
            current.append({'word': fields[0], 'tag': to_coarse(fields[-1])})
    # The file may end without a trailing blank line.
    if current:
        sentences.append(current)
    return sentences


In [None]:
# Parse every English split into lists of sentences carrying coarse-grain tags.
train_dataset_coarse = parse_conll_format_coarse(train_path)
test_dataset_coarse = parse_conll_format_coarse(test_path)
dev_dataset_coarse = parse_conll_format_coarse(dev_path)

#### Parsing logic for fine grain setting

In [None]:
def parse_conll_format_fine(file_path):
    """Read a CoNLL-style file and return its sentences with the original fine-grain tags.

    Lines starting with '#' are skipped and blank lines close the current
    sentence. Every other line is split on ' _ _ ': the first piece is the
    word and the last piece is the tag, which is kept as-is.

    Returns:
        A list of sentences, each a list of {'word': ..., 'tag': ...} dicts.
    """
    sentences = []
    current = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith('#'):
                # Comment / sentence-id line.
                continue
            if not line:
                # Sentence boundary: flush whatever has been collected so far.
                if current:
                    sentences.append(current)
                    current = []
                continue
            fields = line.split(' _ _ ')
            current.append({'word': fields[0], 'tag': fields[-1]})
    # The file may end without a trailing blank line.
    if current:
        sentences.append(current)
    return sentences


#### Pre-process logic coarse grain setting

In [None]:
def preprocess_coarse(dataset, word_to_idx_coarse, tag_to_idx_coarse, seq_len=None, grow_vocab=True):
    """Turn parsed sentences into fixed-length index tensors (coarse-grain setting).

    Words are lower-cased; every sentence is clipped or padded with '<PAD>'
    to exactly ``seq_len`` positions before any index is assigned, so words
    that are clipped away never enter the vocabulary.

    Args:
        dataset: list of sentences, each a list of {'word': ..., 'tag': ...} dicts.
            The dataset itself is not modified.
        word_to_idx_coarse: word -> index dict, extended IN PLACE when
            ``grow_vocab`` is true.
        tag_to_idx_coarse: tag -> index dict, always extended IN PLACE with
            tags that are not present yet.
        seq_len: number of positions per sentence. Defaults to the
            module-level ``SEQ_LEN`` when omitted.
        grow_vocab: when true (default, the original behaviour) every unseen
            word receives a fresh index. Pass ``False`` for held-out data to
            leave the vocabulary untouched and map unseen words to '<UNK>'.

    Returns:
        (X, Y): two ``torch.long`` tensors of shape (len(dataset), seq_len)
        holding word indices and tag indices respectively.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    unk_idx = word_to_idx_coarse.get('<UNK>', 1)

    sent = []
    tags = []
    for sentence in dataset:
        # Clip first, then pad, so both lists end up exactly seq_len long.
        words = [token['word'].lower() for token in sentence][:seq_len]
        labels = [token['tag'] for token in sentence][:seq_len]
        padding = ['<PAD>'] * (seq_len - len(words))
        sent.append(words + padding)
        tags.append(labels + padding)

    # Register tags in first-seen order.
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx_coarse:
                tag_to_idx_coarse[tag] = len(tag_to_idx_coarse)

    # Register words in first-seen order (skipped for held-out data).
    if grow_vocab:
        for sentence in sent:
            for word in sentence:
                if word not in word_to_idx_coarse:
                    word_to_idx_coarse[word] = len(word_to_idx_coarse)

    # Convert words and tags to indices; the reshape keeps an empty dataset two-dimensional.
    X = torch.tensor(
        [[word_to_idx_coarse.get(word, unk_idx) for word in sentence] for sentence in sent],
        dtype=torch.long,
    ).reshape(len(sent), seq_len)
    Y = torch.tensor(
        [[tag_to_idx_coarse[tag] for tag in sentence] for sentence in tags],
        dtype=torch.long,
    ).reshape(len(tags), seq_len)

    return X, Y

#### Pre-process logic for fine grain setting

In [None]:
# Fixed number of tokens per sentence used by the preprocess_* functions:
# shorter sentences are padded with '<PAD>', longer ones are clipped.
SEQ_LEN = 30

In [None]:
def preprocess_fine(dataset, word_to_idx_fine, tag_to_idx_fine, seq_len=None, grow_vocab=True):
    """Turn parsed sentences into fixed-length index tensors (fine-grain setting).

    Words are lower-cased; every sentence is clipped or padded with '<PAD>'
    to exactly ``seq_len`` positions before any index is assigned, so words
    that are clipped away never enter the vocabulary.

    Args:
        dataset: list of sentences, each a list of {'word': ..., 'tag': ...} dicts.
            The dataset itself is not modified.
        word_to_idx_fine: word -> index dict, extended IN PLACE when
            ``grow_vocab`` is true.
        tag_to_idx_fine: tag -> index dict, always extended IN PLACE with
            tags that are not present yet.
        seq_len: number of positions per sentence. Defaults to the
            module-level ``SEQ_LEN`` when omitted.
        grow_vocab: when true (default, the original behaviour) every unseen
            word receives a fresh index. Pass ``False`` for held-out data to
            leave the vocabulary untouched and map unseen words to '<UNK>'.

    Returns:
        (X, Y): two ``torch.long`` tensors of shape (len(dataset), seq_len)
        holding word indices and tag indices respectively.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    unk_idx = word_to_idx_fine.get('<UNK>', 1)

    sent = []
    tags = []
    for sentence in dataset:
        # Clip first, then pad, so both lists end up exactly seq_len long.
        words = [token['word'].lower() for token in sentence][:seq_len]
        labels = [token['tag'] for token in sentence][:seq_len]
        padding = ['<PAD>'] * (seq_len - len(words))
        sent.append(words + padding)
        tags.append(labels + padding)

    # Register tags in first-seen order.
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx_fine:
                tag_to_idx_fine[tag] = len(tag_to_idx_fine)

    # Register words in first-seen order (skipped for held-out data).
    if grow_vocab:
        for sentence in sent:
            for word in sentence:
                if word not in word_to_idx_fine:
                    word_to_idx_fine[word] = len(word_to_idx_fine)

    # Convert words and tags to indices; the reshape keeps an empty dataset two-dimensional.
    X = torch.tensor(
        [[word_to_idx_fine.get(word, unk_idx) for word in sentence] for sentence in sent],
        dtype=torch.long,
    ).reshape(len(sent), seq_len)
    Y = torch.tensor(
        [[tag_to_idx_fine[tag] for tag in sentence] for sentence in tags],
        dtype=torch.long,
    ).reshape(len(tags), seq_len)

    return X, Y

In [None]:
# Lookup tables for the coarse-grain setting; '<PAD>' and '<UNK>' are pre-registered at indices 0 and 1.
word_to_idx_coarse = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx_coarse = {"<PAD>": 0}

# Build the (X, Y) index tensors for every split. The same two dictionaries are passed to each
# call and are extended in place, so words and tags first seen in dev/test are added to them too.
train_X_coarse, train_Y_coarse = preprocess_coarse(train_dataset_coarse,word_to_idx_coarse,tag_to_idx_coarse)
dev_X_coarse, dev_Y_coarse = preprocess_coarse(dev_dataset_coarse,word_to_idx_coarse,tag_to_idx_coarse)
test_X_coarse, test_Y_coarse = preprocess_coarse(test_dataset_coarse,word_to_idx_coarse,tag_to_idx_coarse)

# Print the sizes of the datasets
print(f"Number of training examples: {len(train_X_coarse)}")
print(f"Number of validation examples: {len(dev_X_coarse)}")
print(f"Number of testing examples: {len(test_X_coarse)}")

Number of training examples: 16778
Number of validation examples: 871
Number of testing examples: 249980


In [None]:
# Parse every English split into lists of sentences carrying the original fine-grain tags.
train_dataset_fine = parse_conll_format_fine(train_path)
test_dataset_fine = parse_conll_format_fine(test_path)
dev_dataset_fine = parse_conll_format_fine(dev_path)

In [None]:
# Lookup tables for the fine-grain setting; '<PAD>' and '<UNK>' are pre-registered at indices 0 and 1.
word_to_idx_fine = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx_fine = {"<PAD>": 0}

# Build the (X, Y) index tensors for every split. The same two dictionaries are passed to each
# call and are extended in place, so words and tags first seen in dev/test are added to them too.
train_X_fine, train_Y_fine = preprocess_fine(train_dataset_fine,word_to_idx_fine,tag_to_idx_fine)
dev_X_fine, dev_Y_fine = preprocess_fine(dev_dataset_fine,word_to_idx_fine,tag_to_idx_fine)
test_X_fine, test_Y_fine = preprocess_fine(test_dataset_fine,word_to_idx_fine,tag_to_idx_fine)

# Print the sizes of the datasets
print(f"Number of training examples: {len(train_X_fine)}")
print(f"Number of validation examples: {len(dev_X_fine)}")
print(f"Number of testing examples: {len(test_X_fine)}")

Number of training examples: 16778
Number of validation examples: 871
Number of testing examples: 249980


#### Model Definition
The model is defined using PyTorch Lightning's LightningModule class, which allows the training logic to be organized into separate methods, making the code easier to understand and maintain.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    """Sequence tagger: token embedding -> (optionally bidirectional) LSTM -> linear layer over tags."""

    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output classes per token.
            embedding_dim: size of each word embedding.
            hidden_dim: LSTM hidden size (per direction).
            num_layers: number of stacked LSTM layers.
            bidirectional: whether the LSTM runs in both directions.
        """
        super().__init__()
        # (B, seq_len) token ids -> (B, seq_len, embedding_dim)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # (B, seq_len, embedding_dim) -> (B, seq_len, hidden_dim * num_directions)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            num_layers=num_layers,
            bidirectional=bidirectional,
        )
        # A bidirectional LSTM concatenates both directions, doubling the feature size.
        lstm_out_dim = 2 * hidden_dim if bidirectional else hidden_dim
        self.fc = nn.Linear(lstm_out_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return per-token log-probabilities over the tag set (log-softmax over the last axis)."""
        lstm_out, _ = self.lstm(self.embedding(x))
        return nn.functional.log_softmax(self.fc(lstm_out), dim=2)

    def _loss_for_batch(self, batch, metric_name):
        """Compute the token-level loss of one batch and log it under `metric_name`."""
        x, y = batch
        y_hat = self.forward(x)
        # Flatten batch and sequence axes so every token is one classification example.
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log(metric_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Loss for one training batch, logged as 'train_loss'."""
        return self._loss_for_batch(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Loss for one validation batch, logged as 'val_loss'."""
        return self._loss_for_batch(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Loss for one test batch, logged as 'test_loss'."""
        return self._loss_for_batch(batch, 'test_loss')

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters over all model parameters."""
        return optim.Adam(self.parameters())

The class defines the constructor, the forward-propagation logic, the training, validation and test steps, and the optimizer configuration.

 `pl.LightningModule` is inherited to get access to PyTorch Lightning's training loop.

#### Training the Model
 PyTorch Lightning's Trainer class is used to train the model; it takes care of setting up the training loop, optimizing the model, and handling GPU acceleration.

In [None]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Hyper-parameters shared by the coarse-grain and fine-grain models.
EMBEDDING_DIM = 100
HIDDEN_DIM    = 500
NUM_EPOCHS    = 10 
BATCH_SIZE    = 10

# NOTE: from here on the *_dataset_* names are rebound from the parsed sentence lists
# to TensorDataset objects wrapping the (X, Y) index tensors.
# Only the training loaders shuffle.
train_dataset_coarse = TensorDataset(train_X_coarse, train_Y_coarse)
train_loader_coarse = DataLoader(train_dataset_coarse, batch_size=BATCH_SIZE, shuffle=True)

val_dataset_coarse = TensorDataset(dev_X_coarse, dev_Y_coarse)
val_loader_coarse = DataLoader(val_dataset_coarse, batch_size=BATCH_SIZE)

test_dataset_coarse = TensorDataset(test_X_coarse, test_Y_coarse)
test_loader_coarse = DataLoader(test_dataset_coarse, batch_size=BATCH_SIZE)

train_dataset_fine = TensorDataset(train_X_fine, train_Y_fine)
train_loader_fine = DataLoader(train_dataset_fine, batch_size=BATCH_SIZE, shuffle=True)

val_dataset_fine = TensorDataset(dev_X_fine, dev_Y_fine)
val_loader_fine = DataLoader(val_dataset_fine, batch_size=BATCH_SIZE)

test_dataset_fine = TensorDataset(test_X_fine, test_Y_fine)
test_loader_fine = DataLoader(test_dataset_fine, batch_size=BATCH_SIZE)

#### Uncomment the below lines of code to train the model

In [None]:
# model = NERModel(vocab_size=len(word_to_idx_coarse), tagset_size=len(tag_to_idx_coarse), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
# early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")
# trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])
# trainer.fit(model, train_dataloaders=train_loader_coarse, val_dataloaders=val_loader_coarse)
# trainer.test(dataloaders=test_loader_coarse)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 24.2 M
1 | lstm      | LSTM             | 2.4 M 
2 | fc        | Linear           | 14.0 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
26.6 M    Trainable params
0         Non-trainable params
26.6 M    Total params
106.549   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_0/checkpoints/epoch=4-step=8390.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_0/checkpoints/epoch=4-step=8390.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.29478907585144043}]

In [None]:
# torch.save(model.state_dict(), 'drive/MyDrive/en_coarse.pt')

In [None]:
# Make sure Drive is mounted and point at the saved coarse-grain weights.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/en_coarse.pt'
# NOTE(review): the download location of en_coarse.pt was never filled in here — TODO add the link.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Rebuild the coarse-grain model with the same sizes it was trained with, then restore its weights.
en_coarse = NERModel(vocab_size=len(word_to_idx_coarse), tagset_size=len(tag_to_idx_coarse), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
# map_location='cpu' lets the checkpoint load on hosts without a GPU even if it holds CUDA tensors.
en_coarse.load_state_dict(torch.load(path, map_location='cpu'))

<All keys matched successfully>

In [None]:
# Display the layer summary of the restored coarse-grain model.
en_coarse

NERModel(
  (embedding): Embedding(242153, 100)
  (lstm): LSTM(100, 500, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=1000, out_features=14, bias=True)
  (loss_fn): CrossEntropyLoss()
)

In [None]:
def accuracy(tag_to_idx_coarse,test_dataset_coarse,model):
    """Print a per-tag classification report of `model` on a test dataset.

    Args:
        tag_to_idx_coarse: tag -> index dict used to build the dataset; it is
            inverted to turn predicted and gold indices back into tag names.
        test_dataset_coarse: dataset yielding (x, y) index tensors.
        model: the tagger to evaluate; it is moved to the evaluation device
            and switched to eval mode.

    Returns:
        None. The report is printed. Every position is scored, including
        '<PAD>' positions.
    """
    # define idx_to_tag
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx_coarse.items()}

    # Use the GPU when there is one, otherwise fall back to the CPU instead of crashing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create a dataloader for the test set
    test_loader = DataLoader(test_dataset_coarse, batch_size=BATCH_SIZE)

    # Move the model once, outside the batch loop, and set it to evaluation mode
    model = model.to(device)
    model.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for x, y in test_loader:
            # Forward pass on the evaluation device
            y_hat = model(x.to(device))

            # Most likely tag per position, flattened over batch and sequence
            y_pred.extend(idx_to_tag[i] for i in y_hat.argmax(-1).flatten().tolist())

            # Gold tags in the same flattened order
            y_true.extend(idx_to_tag[i] for i in y.flatten().tolist())

    print(classification_report(y_true, y_pred))

In [None]:
# Print the per-tag precision/recall/F1 report of the coarse-grain model on the test split.
accuracy(tag_to_idx_coarse,test_dataset_coarse,en_coarse)

                 precision    recall  f1-score   support

          <PAD>       1.00      1.00      1.00   3727316
B-CreativeWorks       0.51      0.41      0.45     62124
        B-Group       0.48      0.45      0.47     60026
     B-Location       0.66      0.56      0.60     67893
      B-Medical       0.44      0.24      0.31     22490
       B-Person       0.78      0.70      0.74    137666
      B-Product       0.33      0.17      0.23     27574
I-CreativeWorks       0.64      0.50      0.57    107463
        I-Group       0.58      0.55      0.57     74136
     I-Location       0.71      0.66      0.68     63007
      I-Medical       0.52      0.29      0.37     10613
       I-Person       0.82      0.70      0.76    153751
      I-Product       0.40      0.13      0.19     17503
              O       0.92      0.97      0.94   2967838

       accuracy                           0.94   7499400
      macro avg       0.63      0.52      0.56   7499400
   weighted avg       0.93   

#### Uncomment the below lines of code to train the model

In [None]:
# model = NERModel(vocab_size=len(word_to_idx_fine), tagset_size=len(tag_to_idx_fine), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
# early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")
# trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])
# trainer.fit(model, train_dataloaders=train_loader_fine, val_dataloaders=val_loader_fine)

# trainer.test(dataloaders=test_loader_fine)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 24.2 M
1 | lstm      | LSTM             | 2.4 M 
2 | fc        | Linear           | 68.1 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
26.7 M    Trainable params
0         Non-trainable params
26.7 M    Total params
106.765   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_1/checkpoints/epoch=5-step=10068.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_1/checkpoints/epoch=5-step=10068.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.4470483958721161}]

In [None]:
# drive.mount('/content/drive')
# torch.save(model.state_dict(), 'drive/MyDrive/en_fine.pt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = '/content/drive/MyDrive/en_fine.pt'
# NOTE(review): the download location of en_fine.pt was never filled in here — TODO add the link.

# Rebuild the fine-grain model with the same sizes it was trained with, then restore its weights.
en_fine = NERModel(vocab_size=len(word_to_idx_fine), tagset_size=len(tag_to_idx_fine), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
# map_location='cpu' lets the checkpoint load on hosts without a GPU even if it holds CUDA tensors.
en_fine.load_state_dict(torch.load(path, map_location='cpu'))

<All keys matched successfully>

In [None]:
# Display the layer summary of the restored fine-grain model.
en_fine

NERModel(
  (embedding): Embedding(242153, 100)
  (lstm): LSTM(100, 500, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=1000, out_features=68, bias=True)
  (loss_fn): CrossEntropyLoss()
)

In [None]:
# Print the per-tag precision/recall/F1 report of the fine-grain model on the test split.
accuracy(tag_to_idx_fine,test_dataset_fine,en_fine)

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00   3727316
B-AerospaceManufacturer       0.36      0.32      0.34      1015
  B-AnatomicalStructure       0.42      0.20      0.27      5838
              B-ArtWork       0.17      0.17      0.17      1270
               B-Artist       0.51      0.58      0.54     57034
              B-Athlete       0.51      0.42      0.46     27624
      B-CarManufacturer       0.34      0.25      0.28      2984
               B-Cleric       0.36      0.20      0.26      4732
             B-Clothing       0.17      0.11      0.14      2243
              B-Disease       0.39      0.25      0.30      5622
                B-Drink       0.27      0.17      0.20      2246
             B-Facility       0.43      0.35      0.39     16181
                 B-Food       0.13      0.11      0.12      5317
      B-HumanSettlement       0.63      0.61      0.62     41099
     B-MedicalProcedure 

In [None]:
def predict(tag_to_idx_coarse,word_to_idx_coarse,line,model,seq_len=30):
    """Tag a single whitespace-tokenised sentence with `model`.

    The sentence is lower-cased token by token (the vocabulary is built from
    lower-cased words), clipped to `seq_len` tokens and right-padded with the
    padding index so the model always sees exactly `seq_len` positions.

    Args:
        tag_to_idx_coarse: tag -> index dict; inverted to name the predictions.
        word_to_idx_coarse: word -> index dict; words missing from it are
            mapped to index 1 ('<UNK>').
        line: the sentence to tag, tokens separated by whitespace.
        model: the tagger; it is moved to the inference device and switched
            to eval mode.
        seq_len: number of positions fed to the model (default 30).

    Returns:
        A list of `seq_len` tag names, one per position, padding positions included.
    """
    pad_idx = 0
    unk_idx = 1

    # define idx_to_tag
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx_coarse.items()}

    # Use the GPU when there is one, otherwise fall back to the CPU instead of crashing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Clip to exactly seq_len tokens (the previous loop let a 31st token through).
    tokens = [word.lower() for word in line.split()[:seq_len]]
    x = [word_to_idx_coarse.get(word, unk_idx) for word in tokens]
    x += [pad_idx] * (seq_len - len(x))

    # Set the model to evaluation mode on the inference device
    model = model.to(device)
    model.eval()

    with torch.no_grad():
        # Forward pass on a batch of one sentence
        y_hat = model(torch.tensor([x], dtype=torch.long).to(device))

    # Most likely tag for every position
    return [idx_to_tag[i] for i in y_hat.argmax(-1).flatten().tolist()]

In [None]:
# Tag a sample sentence with the coarse-grain model; the result has one tag per position,
# including the trailing '<PAD>' positions.
y_pred = predict(tag_to_idx_coarse,word_to_idx_coarse, "robert gottschalk 1939 academy award winner and founder of pranavision ",en_coarse)
y_pred

['B-Person',
 'I-Person',
 'O',
 'B-CreativeWorks',
 'I-CreativeWorks',
 'O',
 'O',
 'O',
 'O',
 'B-Group',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>']

In [None]:
# Tag the same sample sentence with the fine-grain model; the result has one tag per position,
# including the trailing '<PAD>' positions.
y_pred = predict(tag_to_idx_fine,word_to_idx_fine, "robert gottschalk 1939 academy award winner and founder of pranavision ",en_fine)
y_pred

['B-OtherPER',
 'I-OtherPER',
 'O',
 'B-VisualWork',
 'I-VisualWork',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>']

In [None]:
# Vocabulary and tag-set sizes; these are the vocab_size / tagset_size values the models above were built with.
print(len(word_to_idx_coarse))
print(len(word_to_idx_fine))
print(len(tag_to_idx_coarse))
print(len(tag_to_idx_fine))

242153
242153
14
68
