In [71]:
import numpy as np
from datasets import load_dataset
from itertools import chain

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

### Task 1: Load \& preprocess data

#### Download data

In [72]:
dataset_dict = load_dataset('Davlan/conll2003_noMISC')

Using the latest cached version of the dataset since Davlan/conll2003_noMISC couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\LENOVO\.cache\huggingface\datasets\Davlan___conll2003_no_misc\default\0.0.0\56e730f1bbd9a40777ea5fbec12793e44f2a4999 (last modified on Tue Nov 18 14:41:48 2025).


In [73]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3453
    })
})

#### Extract data

In [74]:
train_sentences = dataset_dict['train']['tokens']
train_tags = dataset_dict['train']['ner_tags']
val_sentences = dataset_dict['validation']['tokens']
val_tags = dataset_dict['validation']['ner_tags']

In [75]:
train_sentences[0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [76]:
train_tags[0]

['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

#### Build vocabulary

In [77]:
all_tokens = list(chain.from_iterable(train_sentences))
vocab = sorted(set(all_tokens))

In [78]:
word_to_ix = {token: i + 1 for i, token in enumerate(vocab)}
word_to_ix["[UNK]"] = 0

In [79]:
unique_ners = sorted(set(list(chain.from_iterable(train_tags))))
print(unique_ners)

['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']


In [80]:
tag_to_ix = {tag: i for i, tag in enumerate(unique_ners)}

In [81]:
print('Word to index length:', len(word_to_ix))
print('Tag to index length:', len(tag_to_ix))

Word to index length: 23624
Tag to index length: 7


### Task 2: Build Pytorch Dataset \& DataLoader

#### NERDataset class

In [82]:
class NERDataset(Dataset):
    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.tags = tags
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        words = self.sentences[idx]
        tags = self.tags[idx]
        sentence_indices = torch.tensor([self.word_to_ix.get(w, 1) for w in words], dtype=torch.long)
        tag_indices = torch.tensor([self.tag_to_ix[t] for t in tags], dtype=torch.long)
        return sentence_indices, tag_indices

#### DataLoader

In [83]:
def collate_fn(batch):
    sentences, tags = zip(*batch)
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
    padded_tags = pad_sequence(tags, batch_first=True, padding_value=-1)  # ignore_index
    return padded_sentences, padded_tags

In [84]:
train_dataset = NERDataset(train_sentences, train_tags, word_to_ix, tag_to_ix)
val_dataset = NERDataset(val_sentences, val_tags, word_to_ix, tag_to_ix)

In [85]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

### Task 3: Build RNN Model

In [86]:
class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)             # (B, T, E)
        output, _ = self.rnn(embedded)           # (B, T, H)
        logits = self.fc(output)                  # (B, T, C)
        return logits

### Task 4+5: Train \& Evaluate model

#### Constructor

In [87]:
embedding_dim = 64
hidden_dim = 64

model = SimpleRNNForTokenClassification(
    vocab_size=len(word_to_ix),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_classes=len(tag_to_ix)
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=-1)

#### Training \& Evaluating

In [88]:
def evaluate(model, dataloader):
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for sentences, tags in dataloader:
            logits = model(sentences)
            predictions = torch.argmax(logits, dim=-1)

            mask = tags != -1
            correct += (predictions[mask] == tags[mask]).sum().item()
            total += mask.sum().item()

    return correct / total

In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for sentences, tags in train_loader:
        optimizer.zero_grad()
        logits = model(sentences)

        loss = criterion(logits.view(-1, logits.size(-1)), tags.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_acc = evaluate(model, train_loader)
    val_acc = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

Epoch 1/10 | Loss: 273.5604 | Train Acc: 0.8733 | Dev Acc: 0.8733
Epoch 2/10 | Loss: 161.2410 | Train Acc: 0.9105 | Dev Acc: 0.9054
Epoch 3/10 | Loss: 114.2361 | Train Acc: 0.9339 | Dev Acc: 0.9235
Epoch 4/10 | Loss: 86.3707 | Train Acc: 0.9486 | Dev Acc: 0.9324
Epoch 5/10 | Loss: 67.6984 | Train Acc: 0.9608 | Dev Acc: 0.9382
Epoch 6/10 | Loss: 53.6927 | Train Acc: 0.9691 | Dev Acc: 0.9412
Epoch 7/10 | Loss: 42.9359 | Train Acc: 0.9764 | Dev Acc: 0.9452
Epoch 8/10 | Loss: 34.4901 | Train Acc: 0.9804 | Dev Acc: 0.9460
Epoch 9/10 | Loss: 27.5767 | Train Acc: 0.9853 | Dev Acc: 0.9478
Epoch 10/10 | Loss: 22.3672 | Train Acc: 0.9880 | Dev Acc: 0.9483


#### Predict sentence

In [90]:
import re

def predict_sentence(sentence):
    model.eval()
    tokens = tokens = re.findall(r"\w+|[^\w\s]", sentence)
    ids = torch.tensor([word_to_ix.get(w, 0) for w in tokens]).unsqueeze(0)

    with torch.no_grad():
        logits = model(ids)
        preds = torch.argmax(logits, dim=-1).squeeze(0).tolist()

    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    return list(zip(tokens, [ix_to_tag[p] for p in preds]))

In [91]:
print(predict_sentence('VNU University is located in Hanoi'))

[('VNU', 'B-ORG'), ('University', 'I-ORG'), ('is', 'O'), ('located', 'O'), ('in', 'O'), ('Hanoi', 'B-LOC')]
