## Imports

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


## Data Preparation

In [None]:
# Read both splits; each CSV is expected to carry a "Sentence" and a "Tags" column.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Whitespace-tokenise every sentence and its parallel tag sequence.
train_sentences = [text.split() for text in train_df['Sentence']]
train_tags = [labels.split() for labels in train_df['Tags']]
test_sentences = [text.split() for text in test_df['Sentence']]
test_tags = [labels.split() for labels in test_df['Tags']]

# Word vocabulary: ids 0 and 1 are reserved for padding / unknown words; all
# other ids are handed out in first-seen order over the training sentences.
# len(word2idx) is always the next free id, so setdefault assigns it only
# when the token has not been seen before.
word2idx = {"<PAD>": 0, "<UNK>": 1}
for tokens in train_sentences:
    for token in tokens:
        word2idx.setdefault(token, len(word2idx))

# Tag inventory built the same way; id 0 is reserved for padding positions.
tag2idx = {"<PAD>": 0}
for tag_sequence in train_tags:
    for tag_name in tag_sequence:
        tag2idx.setdefault(tag_name, len(tag2idx))

# Reverse lookup used when decoding predicted tag ids back to tag strings.
idx2tag = {index: name for name, index in tag2idx.items()}


## GloVe Embedding setup

In [None]:
# Prepare GloVe embedding matrix
# Every row starts as standard-normal noise so that words absent from the
# GloVe file (and the special tokens) still get some vector; rows for words
# found in the file are overwritten with the pretrained vector below.
embedding_matrix = np.random.randn(len(word2idx), 200)
glove_path = 'glove.6B.200d.txt'

# The glove.6B vectors are uncased, while word2idx keeps the original casing
# of the training data. Index the vocabulary by lowercase form so that e.g.
# "The" receives the vector stored under "the" instead of staying random.
# The special tokens are skipped so they are only ever matched exactly.
lowercase_rows = {}
for vocab_word, vocab_row in word2idx.items():
    if vocab_word in ("<PAD>", "<UNK>"):
        continue
    lowercase_rows.setdefault(vocab_word.lower(), []).append(vocab_row)

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        word = parts[0]
        vector = np.array(parts[1:], dtype=np.float32)
        # Exact match first (identical to the previous behaviour) ...
        if word in word2idx:
            embedding_matrix[word2idx[word]] = vector
        # ... then every vocabulary entry whose lowercase form is this word.
        for row in lowercase_rows.get(word, ()):
            embedding_matrix[row] = vector
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)


## DataLoader


In [None]:
# Dataset and DataLoader definitions
class PosDataset(Dataset):
    """Fixed-length (word ids, tag ids) pairs for sequence tagging.

    Each item is cut to at most ``max_len`` positions and right-padded with
    the ``"<PAD>"`` id of the respective mapping. Words missing from
    ``word2idx`` map to id 1 and tags missing from ``tag2idx`` map to id 0.
    """

    def __init__(self, sentences, tags, word2idx, tag2idx, max_len=50):
        """Store the tokenised sentences, their tag sequences and the id maps."""
        self.sentences, self.tags = sentences, tags
        self.word2idx, self.tag2idx = word2idx, tag2idx
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` tensors of length ``max_len``."""
        limit = self.max_len

        # Truncate first; slicing copies, so the stored lists are not mutated.
        words = self.sentences[idx][:limit]
        labels = self.tags[idx][:limit]

        word_ids = [self.word2idx.get(token, 1) for token in words]
        tag_ids = [self.tag2idx.get(label, 0) for label in labels]

        # Right-pad each sequence independently up to the fixed length.
        word_ids.extend([self.word2idx["<PAD>"]] * (limit - len(word_ids)))
        tag_ids.extend([self.tag2idx["<PAD>"]] * (limit - len(tag_ids)))

        return torch.tensor(word_ids), torch.tensor(tag_ids)

# Training split: wrapped in a dataset and served in shuffled mini-batches.
train_data = PosDataset(
    sentences=train_sentences,
    tags=train_tags,
    word2idx=word2idx,
    tag2idx=tag2idx,
)
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)

# Test split: same id mappings, batches served in file order (no shuffling).
test_data = PosDataset(
    sentences=test_sentences,
    tags=test_tags,
    word2idx=word2idx,
    tag2idx=tag2idx,
)
test_loader = DataLoader(dataset=test_data, batch_size=32)


## Model Definition

In [None]:
# RNN POS Tagger Model
class RNNTagger(nn.Module):
    """Embedding -> single ``nn.RNN`` -> linear layer producing per-token tag scores."""

    def __init__(self, embedding_matrix, hidden_dim, tagset_size):
        """Build the tagger.

        Args:
            embedding_matrix: 2-D tensor of pretrained vectors; its second
                dimension is used as the RNN input size.
            hidden_dim: size of the RNN hidden state.
            tagset_size: number of output scores per token.
        """
        super().__init__()
        # Unpacking requires exactly two dimensions; only the width is needed.
        _, embedding_dim = embedding_matrix.shape
        # from_pretrained is called with its defaults here.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, x):
        """Map a tensor of word ids to tag scores for every position."""
        embedded = self.embedding(x)
        # The final hidden state returned by the RNN is not needed.
        hidden_states, _ = self.rnn(embedded)
        return self.fc(hidden_states)


## Training and Evaluation

In [None]:
# Training and evaluation
def calculate_accuracy(preds, labels, pad_idx=None):
    """Return the fraction of non-padding positions that were predicted correctly.

    Args:
        preds: tensor of per-class scores whose last dimension indexes the
            classes; the argmax over that dimension is the predicted label.
        labels: tensor of gold label ids, same shape as ``preds`` without the
            last dimension.
        pad_idx: label id marking padding positions to ignore. When omitted,
            the module-level ``tag2idx["<PAD>"]`` is used, as before.

    Returns:
        Accuracy as a Python float. If ``labels`` contains no non-padding
        position, 0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    if pad_idx is None:
        pad_idx = tag2idx["<PAD>"]

    mask = labels != pad_idx
    total = mask.sum().item()
    if total == 0:
        # Nothing to score (e.g. a batch made only of padding).
        return 0.0

    predicted = torch.argmax(preds, dim=-1)
    # A padded position never counts as correct, even if the ids match.
    correct = ((predicted == labels) & mask).sum().item()
    return correct / total

# Tagger with a 25-unit hidden state and one output score per entry in tag2idx.
model = RNNTagger(embedding_matrix, hidden_dim=25, tagset_size=len(tag2idx))

# Padding positions are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=tag2idx["<PAD>"])
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Per-epoch history, consumed by the plots after training.
train_losses = []
test_accuracies = []
train_accuracies = []

for epoch in range(20):
    # ---- Training pass ----
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(x_batch)
        # Collapse every leading dimension so the loss sees one row of class
        # scores per position and one target id per position.
        loss = loss_fn(outputs.view(-1, outputs.shape[-1]), y_batch.view(-1))
        acc = calculate_accuracy(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc
    # History stores per-batch means. NOTE: the accuracy is a mean of batch
    # accuracies, so it is not weighted by the number of tokens per batch.
    train_losses.append(epoch_loss / len(train_loader))
    train_accuracies.append(epoch_acc / len(train_loader))

    # ---- Evaluation pass ----
    model.eval()
    test_acc = 0
    # Rebuilt every epoch; after the loop they hold the final epoch's
    # flattened predictions and gold labels (padding positions included).
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            outputs = model(x_batch)
            acc = calculate_accuracy(outputs, y_batch)
            test_acc += acc
            preds = torch.argmax(outputs, dim=-1).view(-1)
            labels = y_batch.view(-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    test_accuracies.append(test_acc / len(test_loader))
    # Report the mean batch loss (the value stored in train_losses) rather
    # than the raw sum, so the log agrees with the plotted curve.
    print(f"Epoch {epoch+1} | Train Loss: {train_losses[-1]:.4f} | Train Acc: {train_accuracies[-1]*100:.2f}% | Test Acc: {test_accuracies[-1]*100:.2f}%")

# Keep only positions whose gold label is a real tag (not padding).
# NOTE: despite its name, labels_filtered holds the *predicted* ids aligned
# with true_filtered; the names are kept unchanged for later cells.
pad_tag_idx = tag2idx["<PAD>"]
kept_pairs = [(p, l) for p, l in zip(all_preds, all_labels) if l != pad_tag_idx]
labels_filtered = [p for p, _ in kept_pairs]
true_filtered = [l for _, l in kept_pairs]

# Pass the label ids explicitly so they always line up with target_names.
# Without `labels`, sklearn infers the classes from the data and raises a
# ValueError when that set differs in size from target_names, e.g. when a
# training tag never occurs in the test data or the model predicts <PAD>.
report_labels = [i for i in sorted(idx2tag) if i != pad_tag_idx]
print("\nClassification Report:")
print(classification_report(
    true_filtered,
    labels_filtered,
    labels=report_labels,
    target_names=[idx2tag[i] for i in report_labels],
))






# Plot loss and accuracy side by side using explicit Axes objects
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training loss per epoch.
loss_ax.plot(train_losses, label='Train Loss')
loss_ax.set_title("Loss Over Epochs")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()

# Right panel: train and test accuracy per epoch.
acc_ax.plot(train_accuracies, label='Train Acc')
acc_ax.plot(test_accuracies, label='Test Acc')
acc_ax.set_title("Accuracy Over Epochs")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend()

fig.tight_layout()
plt.show()


## Inference

In [None]:
# Inference: Predict tags for sample sentences
def preprocess_sentence(sent, word2idx, max_len=10):
    """Convert a tokenised sentence into a ``(1, max_len)``-style id tensor.

    Args:
        sent: iterable of word strings.
        word2idx: word-to-id mapping; ``"<UNK>"`` and ``"<PAD>"`` entries are
            used when present, otherwise ids 1 and 0 respectively.
        max_len: number of ids kept; longer inputs are cut, shorter ones are
            right-padded with the padding id.

    Returns:
        Tensor of ids with a leading batch dimension of size 1.
    """
    unknown_id = word2idx.get("<UNK>", 1)
    padding_id = word2idx.get("<PAD>", 0)

    # Cut to the target length first, then fill whatever is still missing.
    ids = [word2idx.get(token, unknown_id) for token in sent][:max_len]
    ids.extend([padding_id] * (max_len - len(ids)))

    return torch.tensor(ids).unsqueeze(0)

# Hand-written, pre-tokenised sentences used as a quick sanity check.
sample_sentences = [
    ["I", "love", "natural", "language", "processing"],
    ["The", "quick", "brown", "fox", "jumps"],
    ["Data", "science", "is", "fun"],
]

model.eval()
with torch.no_grad():
    # Inputs must live on the same device as the model's parameters.
    device = next(model.parameters()).device
    for sent in sample_sentences:
        x_tensor = preprocess_sentence(sent, word2idx, max_len=10).to(device)
        output = model(x_tensor)
        # Drop the batch dimension and keep only the real (unpadded) positions.
        predictions = output.argmax(dim=-1)[0]
        decoded_tags = [idx2tag[tag_id] for tag_id in predictions[:len(sent)].tolist()]
        print("Sentence:", sent)
        print("Predicted Tags:", decoded_tags)
        print()
