In [9]:
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

# Download and prepare the Penn Treebank dataset
nltk.download("treebank")
from nltk.corpus import treebank

# Load the dataset
sentences = treebank.sents()

# Load NLTK's pre-trained POS tagger
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
from nltk import pos_tag

# Tokenize and tag the sentences using NLTK's POS tagger
tagged_sentences = [pos_tag(sentence) for sentence in sentences]

# Create a vocabulary and POS tag set
words = [word for sentence in tagged_sentences for word, _ in sentence]
tags = [tag for sentence in tagged_sentences for _, tag in sentence]
word_to_idx = {word: idx for idx, word in enumerate(set(words), 1)}
word_to_idx['<PAD>'] = 0
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
pos_to_idx = {tag: idx for idx, tag in enumerate(set(tags))}
idx_to_pos = {idx: tag for tag, idx in pos_to_idx.items()}

# Convert words and tags to numerical values
X = [[word_to_idx[word] for word, _ in sentence] for sentence in tagged_sentences]
y = [[pos_to_idx[tag] for _, tag in sentence] for sentence in tagged_sentences]

# Pad sequences to have the same length
X_padded = pad_sequence([torch.LongTensor(sentence) for sentence in X], batch_first=True, padding_value=0)
y_padded = pad_sequence([torch.LongTensor(tags) for tags in y], batch_first=True, padding_value=0)

# Define the LSTM model
class POSModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(POSModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out)
        return output

# Initialize the model and set hyperparameters
input_size = len(word_to_idx)
hidden_size = 128
output_size = len(pos_to_idx)
model = POSModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Reduced training epochs and batch size
num_epochs = 10
batch_size = 16
for epoch in range(num_epochs):
    total_loss = 0
    correct = 0
    total_samples = 0

    for i in range(0, len(X_padded), batch_size):
        batch_x = X_padded[i:i + batch_size]
        batch_y = y_padded[i:i + batch_size]
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs.view(-1, output_size), batch_y.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 2)
        correct += (predicted == batch_y).sum().item()
        total_samples += batch_x.size(0) * batch_x.size(1)

    accuracy = (correct / total_samples) * 100
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}, Accuracy: {accuracy:.2f}%")

# Function to perform POS tagging on a given sentence
def pos_tag_sentence(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = pos_tag(tokens)
    input_indices = [word_to_idx.get(token, 0) for token, _ in tagged_tokens]
    input_tensor = torch.LongTensor(input_indices).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted = torch.max(output, 2)
        predicted_tags = [idx_to_pos[idx] for idx in predicted.squeeze().tolist()]
        return list(zip(tokens, predicted_tags))

# Input sentence
input_sentence = "As the sun began to set over the serene horizon, casting a warm, golden glow upon the tranquil waters of the lake, families gathered around campfires, sharing stories, laughter, and marshmallows, creating cherished memories that would be etched in their hearts forever."
output_tags = pos_tag_sentence(input_sentence)
print("POS Tags for Input Sentence:")
print(output_tags)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Epoch [1/10], Loss: 81.3374, Accuracy: 93.30%
Epoch [2/10], Loss: 31.8887, Accuracy: 96.50%
Epoch [3/10], Loss: 21.4396, Accuracy: 97.64%
Epoch [4/10], Loss: 15.5690, Accuracy: 98.28%
Epoch [5/10], Loss: 11.9027, Accuracy: 98.69%
Epoch [6/10], Loss: 9.3916, Accuracy: 98.96%
Epoch [7/10], Loss: 7.5477, Accuracy: 99.16%
Epoch [8/10], Loss: 6.1304, Accuracy: 99.32%
Epoch [9/10], Loss: 5.0078, Accuracy: 99.44%
Epoch [10/10], Loss: 4.1015, Accuracy: 99.55%
POS Tags for Input Sentence:
[('As', 'IN'), ('the', 'DT'), ('sun', 'DT'), ('began', 'CD'), ('to', 'TO'), ('set', 'VB'), ('over', 'IN'), ('the', 'DT'), ('serene', 'DT'), ('horizon', 'DT'), (',', ','), ('casting', 'JJ'), ('a', 'DT'), ('warm', 'DT'), (',', ','), ('golden', 'JJ'), ('glow', 'DT'), ('upon', 'NN'), ('the', 'DT'), ('tranquil', 'DT'), ('waters', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('lake', 'DT'), (',', ','), ('families', 'JJ'), ('gathered', 'DT'), ('around', 'IN'), ('campfires', 'DT'), (',', ','), ('sharing', 'DT'), ('stories', '