In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

### Task 1: Data loading \& preprocessing

#### Read CoNLL-U file

In [2]:
def load_conllu(file_path):
    sentences = []
    current_sentence = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line == "":
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
                continue

            if line.startswith("#"):
                continue  # skip metadata lines

            parts = line.split("\t")
            if len(parts) != 10:
                continue

            word = parts[1]
            upos = parts[3]
            current_sentence.append((word, upos))

    if current_sentence:
        sentences.append(current_sentence)

    return sentences

In [3]:
train = load_conllu('datasets/UD_English-EWT/en_ewt-ud-train.conllu')
dev = load_conllu('datasets/UD_English-EWT/en_ewt-ud-dev.conllu')

#### Build vocabulary

In [4]:
word_to_ix = {"<UNK>": 0}
tag_to_ix = {}
word_index = 1
tag_index = 0

for sentence in train:
    for word, tag in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = word_index
            word_index += 1
        if tag not in tag_to_ix:
            tag_to_ix[tag] = tag_index
            tag_index += 1

In [5]:
print("Vocabulary size:", len(word_to_ix))
print("Number of UPOS tags:", len(tag_to_ix))

Vocabulary size: 20201
Number of UPOS tags: 18


### Task 2: PyTorch Dataset \& DataLoader

#### POSDataset class

In [6]:
class POSDataset(Dataset):
    def __init__(self, sentences, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        words, tags = zip(*self.sentences[idx])
        word_ids = torch.tensor([self.word_to_ix.get(w, 0) for w in words], dtype=torch.long)
        tag_ids = torch.tensor([self.tag_to_ix[t] for t in tags], dtype=torch.long)
        return word_ids, tag_ids

#### DataLoader

In [7]:
def collate_fn(batch):
    sentences, tags = zip(*batch)
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
    padded_tags = pad_sequence(tags, batch_first=True, padding_value=-1)  # ignore_index
    return padded_sentences, padded_tags

In [8]:
train_dataset = POSDataset(train, word_to_ix, tag_to_ix)
dev_dataset = POSDataset(dev, word_to_ix, tag_to_ix)

In [9]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

### Task 3: Build Model

In [10]:
class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)             # (B, T, E)
        output, _ = self.rnn(embedded)           # (B, T, H)
        logits = self.fc(output)                  # (B, T, C)
        return logits

### Task 4: Train \& evaluate

#### Construct model

In [12]:
embedding_dim = 64
hidden_dim = 64

model = SimpleRNNForTokenClassification(
    vocab_size=len(word_to_ix),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_classes=len(tag_to_ix)
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=-1)

#### Training

In [13]:
def evaluate(model, dataloader):
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for sentences, tags in dataloader:
            logits = model(sentences)
            predictions = torch.argmax(logits, dim=-1)

            mask = tags != -1
            correct += (predictions[mask] == tags[mask]).sum().item()
            total += mask.sum().item()

    return correct / total

In [14]:
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for sentences, tags in train_loader:
        optimizer.zero_grad()
        logits = model(sentences)

        loss = criterion(logits.view(-1, logits.size(-1)), tags.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_acc = evaluate(model, train_loader)
    dev_acc = evaluate(model, dev_loader)

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Dev Acc: {dev_acc:.4f}")

Epoch 1/5 | Loss: 564.5559 | Train Acc: 0.7131 | Dev Acc: 0.6713
Epoch 2/5 | Loss: 312.1434 | Train Acc: 0.7963 | Dev Acc: 0.7558
Epoch 3/5 | Loss: 235.5255 | Train Acc: 0.8392 | Dev Acc: 0.7999
Epoch 4/5 | Loss: 190.1215 | Train Acc: 0.8702 | Dev Acc: 0.8242
Epoch 5/5 | Loss: 158.2921 | Train Acc: 0.8910 | Dev Acc: 0.8380


### Task 5: Predict sentence

In [19]:
import re

def predict_sentence(sentence):
    model.eval()
    tokens = tokens = re.findall(r"\w+|[^\w\s]", sentence)
    ids = torch.tensor([word_to_ix.get(w, 0) for w in tokens]).unsqueeze(0)

    with torch.no_grad():
        logits = model(ids)
        preds = torch.argmax(logits, dim=-1).squeeze(0).tolist()

    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    return list(zip(tokens, [ix_to_tag[p] for p in preds]))

In [20]:
print(predict_sentence("This is a test sentence."))

[('This', 'DET'), ('is', 'AUX'), ('a', 'DET'), ('test', 'NOUN'), ('sentence', 'NOUN'), ('.', 'PUNCT')]
