In [44]:
import torch
from torch.utils.data import Dataset, DataLoader
%pip install pytorch-lightning -q -U

In [45]:

def parse_conll_file(file_path):
    """
    Parse a conll-formatted file and return a list of sentences, where each sentence is a list of tokens and their labels.

    Args:
        file_path (str): The path to the conll file.

    Returns:
        List[List[Tuple[str, str]]]: A list of sentences, where each sentence is a list of (token, label) tuples.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n\n')

        sentences = []
        for line in lines:
            sentence = []
            # print(line)
            flag = 0
            for token in line.strip().split('\n'):
                flag = flag + 1
                if(flag==1):
                  continue
                fields = token.split(' _ _ ')
                sentence.append((fields[0], fields[-1]))
            if sentence:
                sentences.append(sentence)
    return sentences

In [46]:
# train_dataset = parse_conll_file('en_train.conll')
# dev_dataset = parse_conll_file('en_dev.conll')
# test_dataset = parse_conll_file('en_test.conll')

In [47]:
# train_dataset = parse_conll_file('bn_train.conll')
# dev_dataset = parse_conll_file('bn_dev.conll')
# test_dataset = parse_conll_file('bn_test.conll')

In [48]:
train_dataset = parse_conll_file('hi_train.conll')
dev_dataset = parse_conll_file('hi_dev.conll')
test_dataset = parse_conll_file('hi_test.conll')

In [49]:
print(train_dataset[0])

[('यह', 'O'), ('झियान', 'B-HumanSettlement'), ('चीन', 'B-HumanSettlement'), ('के', 'O'), ('केंद्र', 'O'), ('भाग', 'O'), ('में', 'O'), ('स्थित', 'O'), ('है।', 'O')]


In [50]:
# print(train_dataset)
# print(test_dataset)
# dev_dataset = parse_conll_file('en_dev.conll')
# print(dev_dataset)

In [51]:
# print(torch.utils.data.Dataset(dev_dataset))
# dev_dataset = DataLoader(dev_dataset)

In [52]:
# print(dev_dataset)
# print(list(torch.utils.data.DataLoader(dev_dataset)))

In [53]:
SEQ_LEN = 25

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y


In [54]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

In [55]:
print(tag_to_idx["O"])

1


In [56]:
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")

Number of training examples: 9632
Number of validation examples: 514
Number of testing examples: 18399


In [57]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class POSModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [58]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 200
HIDDEN_DIM    = 700
NUM_EPOCHS    = 10 
BATCH_SIZE    = 20

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [59]:
model = POSModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=5, mode="min")
# trainer = pl.Trainer(max_epochs=NUM_EPOCHS, gpus=1, callbacks=[early_stopping])
trainer = pl.Trainer(max_epochs=NUM_EPOCHS,  callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 6.1 M 
1 | lstm      | LSTM             | 2.8 M 
2 | fc        | Linear           | 68.1 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
9.0 M     Trainable params
0         Non-trainable params
9.0 M     Total params
35.823    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_2/checkpoints/epoch=5-step=2892.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_2/checkpoints/epoch=5-step=2892.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.3050287961959839}]

In [60]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []
i=0

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)
        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]
        
        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00    172076
B-AerospaceManufacturer       0.57      0.05      0.09        85
  B-AnatomicalStructure       0.74      0.61      0.67       485
              B-ArtWork       0.29      0.00      0.01       425
               B-Artist       0.54      0.41      0.47      1847
              B-Athlete       0.62      0.59      0.60      1166
      B-CarManufacturer       0.84      0.85      0.84       146
               B-Cleric       0.64      0.84      0.73       188
             B-Clothing       0.58      0.75      0.65        75
              B-Disease       0.73      0.64      0.68       628
                B-Drink       0.71      0.72      0.71       134
             B-Facility       0.50      0.47      0.49       854
                 B-Food       0.64      0.58      0.61       424
      B-HumanSettlement       0.78      0.65      0.71      5807
     B-MedicalProcedure 

In [61]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break

Sentence
['उनकी', 'विशेषताओं', 'आंदोलनों', 'और', 'खेल', 'शैली', 'के', 'कारण', 'उनकी', 'तुलना', 'वाल्टर', 'ज़ेंगा', 'से', 'की', 'गई', 'है।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'एथन', 'काट्ज़', '(', 'जन्म', '१९८३', ')', 'शिकागो', 'व्हाइट', 'सोक्स', 'के', 'लिए', 'पिचिंग', 'कोच', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'वह', 'प्रसिद्ध', 'रैंडविक', 'रेसकोर्स', 'ट्रेनर', 'इसहाक', 'अर्नशॉ', 'के', 'लिए', 'प्रेरित', 'था', 'और', 'उस', 'समय', 'के', 'कुछ', 'बेहतरीन', 'घोड़ों', 'के', 'लिए', 'सवार', 'था।', '<PAD>', '<PAD>', '<PAD>', 'चेल्सी', 'ने', 'उथल', '-पुथल', 'में', 'मैच', 'में', 'प्रवेश', 'किया', 'उनके', 'प्रबंधक', 'टॉमी', 'डोचेर्टी', 'एक', 'दिन', 'पहले', 'इस्तीफा', 'दे', 'दिया।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'शूप', '२००४', 'में', 'मुख्य', 'कोच', 'जॉन', 'ग्रुडेन', 'के', 'तहत', 'बुक्स', 'के', 'लिए', 'क्वार्टरबैक', 'कोच', 'था।', '<PAD>', '<PAD>', '<PAD>', '<P