### Sequence Tagging on Treebank dataset

In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
import torchtext
from torchtext.data.utils import get_tokenizer

# Download the necessary nltk resources
nltk.download('treebank')
from nltk.corpus import treebank

# Load the POS-tagged Penn Treebank sentences
tagged_sentences = treebank.tagged_sents()
print(len(tagged_sentences), tagged_sentences[0])

train_tagged_sentences = tagged_sentences[:3000]
valid_tagged_sentences = tagged_sentences[3000:3414]
test_tagged_sentences = tagged_sentences[3414:]

print(len(train_tagged_sentences), len(valid_tagged_sentences), len(test_tagged_sentences))

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


3914 [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
3000 414 500


In [61]:
## Data Process, Batch, Dataloader

# Tokenizer
tokenizer = get_tokenizer('basic_english')

# Build word vocabulary
def yield_tokens(data_iter):
    for sentence in data_iter:
        yield [word for word, tag in sentence]


vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(tagged_sentences), specials=['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])

# Build tag vocabulary (based on the tags in the dataset)
pos_tags = list(set(tag for sentence in tagged_sentences for word, tag in sentence))
tag_vocab = {tag: i for i, tag in enumerate(pos_tags)}
tag_vocab['<pad>'] = len(tag_vocab)  # Add a pad token

# Process each sentence to get word indices and POS tag indices
def process_sentence_with_tags(sentence):
    words, tags = zip(*sentence)  # Unzip the sentence into words and tags
    word_indices = [vocab[token] for token in words]  # Convert words to indices
    tag_indices = [tag_vocab[tag] for tag in tags] # Convert tags to indices
    return word_indices, tag_indices

# Collate function for batching and padding
def collate_batch(batch):
    text_list, tag_list = [], []
    
    for sentence in batch:
        word_indices, tag_indices = process_sentence_with_tags(sentence)
        text_list.append(torch.tensor(word_indices, dtype=torch.int64))
        tag_list.append(torch.tensor(tag_indices, dtype=torch.int64))

    # Pad sequences
    text_padded = pad_sequence(text_list, batch_first=True, padding_value=vocab['<pad>'])
    tag_padded = pad_sequence(tag_list, batch_first=True, padding_value=tag_vocab['<pad>'])
    
    return text_padded, tag_padded

# Create DataLoader
train_dataloader = DataLoader(train_tagged_sentences, batch_size=32, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_tagged_sentences, batch_size=32, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_tagged_sentences, batch_size=32, shuffle=False, collate_fn=collate_batch)

### Add pre-trained GloVe (50d) embeddings

In [79]:
#Use pre-trained word embeddings from GloVe. 

import numpy as np
# Path to the GloVe file
glove_file = 'glove.6B\glove.6B\glove.6B.50d.txt'  # Change path as needed
#glove_file = 'FastText_PCA_reduced-vectors.txt\FastText_PCA_reduced-vectors.txt'  # Change path as needed
# Initialize an empty dictionary
embeddings_index = {}

# Load the GloVe vectors
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

vocab_size = len(vocab)
embedding_dim = 50

# Initialize the embedding matrix (vocab_size x embedding_dim)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Fill the embedding matrix with GloVe vectors
for word, i in train_dataloader:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  glove_file = 'glove.6B\glove.6B\glove.6B.50d.txt'  # Change path as needed


In [80]:
class LSTMTagger(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_tags, dropout_prob=0.5):
        super(LSTMTagger, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = True

        # Layer normalization after LSTM
        self.layer_norm = nn.LayerNorm(hidden_size)
        
        # Dropout after embedding layer
        self.dropout = nn.Dropout(dropout_prob)

        # LSTM layer
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout_prob if num_layers > 1 else 0)

        # Fully connected layer
        self.fc = nn.Linear(hidden_size, num_tags)

    def forward(self, x, hidden):
        # x: [batch_size, seq_len]
        
        x = self.embedding(x)  # [batch_size, seq_len, embed_size]
        x = self.dropout(x)  # Apply dropout after embedding

        out, hidden = self.lstm(x, hidden)  # [batch_size, seq_len, hidden_size]
        out = self.layer_norm(out)  # Apply layer normalization
        
        out = self.dropout(out)  # Apply dropout after LSTM output

        out = self.fc(out)  # [batch_size, seq_len, num_tags]
        return out, hidden

    def init_hidden(self, batch_size):
        # Initialize LSTM hidden state (h_0, c_0)
        h_0 = torch.zeros(num_layers, batch_size, hidden_size)
        c_0 = torch.zeros(num_layers, batch_size, hidden_size)
        return (h_0, c_0)

# Hyperparameters
vocab_size = len(vocab)
embed_size = 50
hidden_size = 64
num_layers = 2
num_tags = len(tag_vocab)
num_epochs = 20
lr = 0.01
dropout_prob = 0.3

# Initialize the model with fine-tuning and dropout
model = LSTMTagger(vocab_size=vocab_size, embed_size=embed_size, hidden_size=hidden_size, 
                   num_layers=num_layers, num_tags=num_tags, 
                   dropout_prob=dropout_prob)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.000001)

In [82]:
def train_sequence_tagging(model, dataloader, optimizer, criterion, epoch):
    model.train()  # Set model to training mode
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    batch_count = 0

    for batch, (data, targets) in enumerate(dataloader):

        optimizer.zero_grad()  # Zero the gradients from previous batch
        batch_size = data.size(0)

        # Initialize hidden and cell states for LSTM
        hidden = model.init_hidden(batch_size)

        # Forward pass through the LSTM model
        output, hidden = model(data, hidden)  # Output shape: [batch_size, seq_len, num_tags]

        # Reshape output and targets for loss calculation
        output = output.view(-1, num_tags)  # [batch_size * seq_len, num_tags]
        targets = targets.view(-1)  # [batch_size * seq_len]

        # Compute the loss
        loss = criterion(output, targets)
        loss.backward()  # Backpropagation to compute gradients
        optimizer.step()  # Update the model parameters

        total_loss += loss.item()

        # Get predicted tags (using argmax to get the predicted tag indices)
        predicted_tags = torch.argmax(output, dim=1)

        # Ignore padding tokens in accuracy calculation
        non_pad_elements = (targets != tag_vocab['<pad>']).nonzero(as_tuple=True)

        # Calculate the number of correct predictions
        correct_predictions += (predicted_tags[non_pad_elements] == targets[non_pad_elements]).sum().item()

        # Count the number of non-pad tokens
        total_predictions += max(len(non_pad_elements[0]), len((predicted_tags != tag_vocab['<pad>']).nonzero(as_tuple=True)[0]))

        batch_count += 1

        # Print intermediate training statistics every 20 batches
        if batch_count % 20 == 0:
            print(f'Epoch [{epoch}/{num_epochs}], Batch [{batch_count}], Loss: {loss.item():.4f}, Accuracy: {correct_predictions / total_predictions:.4f}')

    # Compute average loss and accuracy over the entire epoch
    avg_loss = total_loss / batch_count
    accuracy = correct_predictions / total_predictions

    print(f'Train Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
    return avg_loss, accuracy


In [83]:
def init_hidden(self, batch_size):
    # Return two tensors (hidden state and cell state) of the correct size for LSTM
    return (torch.zeros(num_layers, batch_size, hidden_size).to(device),
            torch.zeros(num_layers, batch_size, hidden_size).to(device))

def evaluate_and_store_examples(model, dataloader, criterion, vocab, tag_vocab, num_examples=5):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    batch_count = 0
    examples = []  # List to store examples

    # Reverse the vocab and tag_vocab for easier lookup
    idx_to_word = {i: word for word, i in vocab.get_stoi().items()}
    idx_to_tag = {i: tag for tag, i in tag_vocab.items()}

    with torch.no_grad():
        for batch_idx, (data, targets) in enumerate(dataloader):
            batch_size = data.size(0)
            seq_len = data.size(1)

            # Initialize hidden and cell states for LSTM
            hidden = model.init_hidden(batch_size)

            # Forward pass through the model
            output, hidden = model(data, hidden)  # hidden is (h_n, c_n) for LSTM

            # Reshape output and targets for computing loss
            output = output.view(-1, num_tags)  # [batch_size * seq_len, num_tags]
            targets = targets.view(-1)  # [batch_size * seq_len]

            # Compute loss
            loss = criterion(output, targets)
            total_loss += loss.item()

            # Get predicted tags
            predicted_tags = torch.argmax(output, dim=1)

            # Ignore padding tokens in accuracy calculation
            non_pad_elements_target = (targets != tag_vocab['<pad>']).nonzero(as_tuple=True)
            non_pad_elements_predicted = (predicted_tags != tag_vocab['<pad>']).nonzero(as_tuple=True)

            # Calculate number of correct predictions (between predicted and actual tags, ignoring <pad>)
            if len(non_pad_elements_predicted[0]) > len(non_pad_elements_target[0]):
                correct_predictions += (predicted_tags[non_pad_elements_predicted] == targets[non_pad_elements_predicted]).sum().item()
                total_predictions += len(non_pad_elements_predicted[0])  # Number of non-pad tokens
            else:
                correct_predictions += (predicted_tags[non_pad_elements_target] == targets[non_pad_elements_target]).sum().item()
                total_predictions += len(non_pad_elements_target[0])  # Number of non-pad tokens

            # Store sample predictions (only store a few examples, controlled by num_examples)
            if len(examples) < num_examples:
                for i in range(min(5, batch_size)):  # Store up to 5 sentences from each batch
                    sentence_words = [idx_to_word[idx.item()] for idx in data[i] if idx != vocab['<pad>']]
                    true_tags = [idx_to_tag[idx.item()] for idx in targets[i * seq_len:(i + 1) * seq_len] if idx != tag_vocab['<pad>']]
                    pred_tags = [idx_to_tag[idx.item()] for idx in predicted_tags[i * seq_len:(i + 1) * seq_len] if idx != tag_vocab['<pad>']]

                    # Store the example as a tuple (sentence, predicted tags, true tags)
                    examples.append((sentence_words, pred_tags, true_tags))

            batch_count += 1
            if len(examples) >= num_examples:
                break  # Stop collecting examples once we've reached the desired number

    # Calculate average loss and accuracy
    avg_loss = total_loss / batch_count
    accuracy = correct_predictions / total_predictions

    print(f'Evaluation Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
    return avg_loss, accuracy, examples

In [84]:
# Training loop

for epoch in range(1, num_epochs + 1):
    train_loss, acc = train_sequence_tagging(model, train_dataloader, optimizer, criterion, epoch)
    valid_loss, valid_accuracy, examples = evaluate_and_store_examples(model, valid_dataloader, criterion, vocab, tag_vocab)

Epoch [1/20], Batch [20], Loss: 1.4798, Accuracy: 0.0418
Epoch [1/20], Batch [40], Loss: 0.9104, Accuracy: 0.1379
Epoch [1/20], Batch [60], Loss: 0.7660, Accuracy: 0.2463
Epoch [1/20], Batch [80], Loss: 0.4539, Accuracy: 0.3389
Train Loss: 1.0949, Accuracy: 0.3944
Evaluation Loss: 0.3325, Accuracy: 0.7593
Epoch [2/20], Batch [20], Loss: 0.1294, Accuracy: 0.8128
Epoch [2/20], Batch [40], Loss: 0.2513, Accuracy: 0.8278
Epoch [2/20], Batch [60], Loss: 0.2211, Accuracy: 0.8398
Epoch [2/20], Batch [80], Loss: 0.2362, Accuracy: 0.8477
Train Loss: 0.2505, Accuracy: 0.8549
Evaluation Loss: 0.2094, Accuracy: 0.8490
Epoch [3/20], Batch [20], Loss: 0.1047, Accuracy: 0.9290
Epoch [3/20], Batch [40], Loss: 0.1749, Accuracy: 0.9290
Epoch [3/20], Batch [60], Loss: 0.1711, Accuracy: 0.9291
Epoch [3/20], Batch [80], Loss: 0.1366, Accuracy: 0.9310
Train Loss: 0.1184, Accuracy: 0.9318
Evaluation Loss: 0.2309, Accuracy: 0.8519
Epoch [4/20], Batch [20], Loss: 0.0730, Accuracy: 0.9530
Epoch [4/20], Batch [4

In [85]:
# test performance
test_loss, test_accuracy, examples = evaluate_and_store_examples(model, test_dataloader, criterion, vocab, tag_vocab)

print("\nPrint Example Predictions:")
for i, (sentence, pred_tags, true_tags) in enumerate(examples):
    print(f"\nExample {i + 1}:")
    print(f"Sentence: {' '.join(sentence)}")
    print(f"Pred Tags: {' '.join(pred_tags)}")
    print(f"True Tags: {' '.join(true_tags)}")

Evaluation Loss: 0.1700, Accuracy: 0.9213

Print Example Predictions:

Example 1:
Sentence: An index of economic activity drawn * from the survey stood last month at 47.6 % ; a reading above 50 % would have indicated that the manufacturing sector was improving .
Pred Tags: DT NN IN JJ NN VBN -NONE- IN DT NN VBD JJ NN IN CD NN : DT NN IN CD NN MD VB VBN IN DT NN NN VBD VBG .
True Tags: DT NN IN JJ NN VBN -NONE- IN DT NN VBD JJ NN IN CD NN : DT NN IN CD NN MD VB VBN IN DT VBG NN VBD VBG .

Example 2:
Sentence: But with the index proving somewhat better than * expected and the widely anticipated report on October employment scheduled *-1 to arrive tomorrow , stock prices firmed only modestly in response to the report and then faltered .
Pred Tags: CC IN DT NN VBG NN RBR IN -NONE- VBN CC DT RB VBN NN IN NNP NN VBN -NONE- TO VB JJ , NN NNS VBP RB RB IN NN TO DT NN CC RB VBN .
True Tags: CC IN DT NN VBG RB JJR IN -NONE- VBN CC DT RB VBN NN IN NNP NN VBN -NONE- TO VB NN , NN NNS VBD RB RB IN 