# Installing Pytorch lightning module:

In [1]:
%pip install pytorch-lightning -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m716.4/716.4 KB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# Preprocessing (Data parsing):

In [21]:
# Mapping of fine-grained entities to course-grained entities
fine_coarse_mapping = {
    'OtherPER': 'Person',
    'SportsManager': 'Person',
    'Cleric': 'Person',
    'Politician': 'Person',
    'Athlete': 'Person',
    'Artist': 'Person',
    'Scientist': 'Person',
    
	'OtherPROD': 'Product',
    'Drink': 'Product',
    'Food': 'Product',
    'Vehicle': 'Product',
    'Clothing': 'Product',
    
    'Disease': 'Medical',
    'Symptom': 'Medical',
    'AnatomicalStructure': 'Medical',
    'MedicalProcedure': 'Medical',
    'Medication/Vaccine': 'Medical',
    
	'Station': 'Location',
    'HumanSettlement': 'Location',
    'OtherLOC': 'Location',
    'Facility': 'Location',
    
	'OtherCW': 'Creative Works',
    'Software': 'Creative Works',
    'ArtWork': 'Creative Works',
    'WrittenWork': 'Creative Works',
    'MusicalWork': 'Creative Works',
    'VisualWork': 'Creative Works',
    
	'ORG': 'Group',
    'TechCorp': 'Group',
    'CarManufacturer': 'Group',
    'SportsGRP': 'Group',
    'AerospaceManufacturer': 'Group',
    'OtherCorp': 'Group',
    'PrivateCorp': 'Group',
    'PublicCorp': 'Group',
    'MusicalGRP': 'Group',

    '': '' 
}

In [22]:
def read_conll_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        instance_id = None
        domain = None
        tokens = []
        tags = []
        for line in f:
            line = line.strip()
            if line.startswith('# id'):
                dataset=[]
                for (Token, Tag) in zip(tokens, tags):
                    dataset.append((Token, Tag))
                data.append(dataset)
                tokens = []
                tags = []
            elif line!='':
                columns = line.split()
                token = columns[0]
                tag = columns[-1][:2] + fine_coarse_mapping[columns[-1][2:]]
                tokens.append(token)
                tags.append(tag)
    return data

# Example usage
train_data = read_conll_file('EN-English/en_train.conll')
dev_data = read_conll_file('EN-English/en_dev.conll')
test_data = read_conll_file('EN-English/en_test.conll')

# Remove the first example as it is empty
train_data = train_data[1:]
dev_data = dev_data[1:]
test_data = test_data[1:]

# Preprocessing (Data  pre-processing and formatting)

In [23]:
import torch

SEQ_LEN = max([len(sentence) for sentence in train_data] + [len(sentence) for sentence in dev_data] + [len(sentence) for sentence in test_data])

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y

train_X, train_Y = preprocess(train_data)
dev_X, dev_Y = preprocess(dev_data)
test_X, test_Y = preprocess(test_data)

In [24]:
print("The size of the training set is:", train_X.shape[0])
print("The size of the development set is:", dev_X.shape[0])
print("The size of the test set is:", test_X.shape[0])

The size of the training set is: 16777
The size of the development set is: 870
The size of the test set is: 249979


# Model Creation (Bi-LSTM)

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

class BiLSTM(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
        
    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss_val = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss_val)
        return loss_val
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss_val = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss_val)
        return loss_val
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss_val = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss_val)
        return loss_val
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

# Model Training (using train and dev set)

In [26]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

BATCH_SIZE = 4

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [27]:
VOCAB_SIZE = len(word_to_idx)
TAGSET_SIZE = len(tag_to_idx)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 1
NUM_EPOCHS = 10

model = BiLSTM(vocab_size=VOCAB_SIZE, tagset_size=TAGSET_SIZE, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode="min")
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 24.2 M
1 | lstm      | LSTM             | 235 K 
2 | fc        | Linear           | 3.6 K 
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
24.5 M    Trainable params
0         Non-trainable params
24.5 M    Total params
97.832    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_2/checkpoints/epoch=5-step=25170.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_2/checkpoints/epoch=5-step=25170.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.139738991856575}]

# Model Evaluation on the Test set

In [28]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                  precision    recall  f1-score   support

           <PAD>       1.00      1.00      1.00  13224913
B-Creative Works       0.47      0.39      0.43     62125
         B-Group       0.45      0.47      0.46     60023
      B-Location       0.66      0.54      0.59     67901
       B-Medical       0.37      0.29      0.33     22491
        B-Person       0.77      0.71      0.74    137681
       B-Product       0.28      0.19      0.23     27580
I-Creative Works       0.61      0.50      0.55    107467
         I-Group       0.56      0.55      0.55     74142
      I-Location       0.75      0.60      0.67     63022
       I-Medical       0.46      0.31      0.37     10614
        I-Person       0.80      0.72      0.76    153777
       I-Product       0.26      0.20      0.22     17506
               O       0.93      0.96      0.95   2969330

        accuracy                           0.97  16998572
       macro avg       0.60      0.53      0.56  16998572
    weighted

# Function to predict the NERs present in the input sentence

In [31]:
import random

# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

def predict_entities(sentence):
    y_pred = []
    with torch.no_grad():
        # Convert the sentence to a tensor
        x = torch.tensor([[word_to_idx.get(word, 1) for word in sentence]], dtype=torch.int).type(torch.LongTensor)
        
        # Forward pass
        y_hat = model(x)
        
        # Compute the predicted tags
        y_pred = [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]
        print('Entities:', y_pred)

        B_found = False
        for i in range(len(y_pred)):
            if y_pred[i][0] == 'B':
                if B_found:
                    print(']', y_pred[i-1][2:], end=' ')
                B_found = True
                print('[', sentence[i], end=' ')
            elif y_pred[i][0] == 'I' and B_found:
                print(sentence[i], end=' ')
            elif y_pred[i][0] == 'O':
                if B_found:
                    print(']', y_pred[i-1][2:], end=' ')
                B_found = False
                print(sentence[i], end=' ')
        if B_found:
            print(']', y_pred[-1][2:], end=' ')
        print()

# Get a random string from test_data
random_string = random.choice(test_data)
random_string = [word for (word, _) in random_string]
print('Sentence:', random_string)

# Predict the entities
predict_entities(random_string)

Sentence: ['8', 'd.', 'wayne', 'lukas', '(', '1990', '1991', '1995', '1999', '2000', '2009', '2013', '2017', ')']
Entities: ['O', 'B-Person', 'I-Person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
8 [ d. wayne ] Person lukas ( 1990 1991 1995 1999 2000 2009 2013 2017 ) 
