In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from collections import Counter

# Parameters
WINDOW_SIZE = 3
EMBEDDING_DIM = 50
HIDDEN_DIM = 100

# Load dataset using Hugging Face datasets
conll_dataset = load_dataset("conll2003", trust_remote_code=True)
train_data = conll_dataset["train"]
test_data = conll_dataset["test"]

# Build vocabularies
word_counter = Counter(word for example in train_data for word in example['tokens'])
label_set = set(label for example in train_data for label in example['ner_tags'])

word2idx = {word: idx + 2 for idx, word in enumerate(word_counter)}  # reserve 0 and 1
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

idx2label = conll_dataset["train"].features["ner_tags"].feature.names
label2idx = {label: idx for idx, label in enumerate(idx2label)}
OUTPUT_DIM = len(label2idx)

# Convert data to windowed format
def prepare_window_data(dataset, word2idx, label2idx, window_size):
    X, Y, original_tokens = [], [], []
    pad = [word2idx["<PAD>"]] * (window_size // 2)

    for example in dataset:
        tokens = example["tokens"]
        labels = example["ner_tags"]
        indexed_sentence = [word2idx.get(word, word2idx["<UNK>"]) for word in tokens]
        padded_sentence = pad + indexed_sentence + pad

        for i in range(len(tokens)):
            window = padded_sentence[i:i + window_size]
            X.append(window)
            Y.append(labels[i])
            original_tokens.append((tokens[i], labels[i]))

    return torch.tensor(X), torch.tensor(Y), original_tokens

X_train, y_train, _ = prepare_window_data(train_data, word2idx, label2idx, WINDOW_SIZE)
X_test, y_test, test_tokens = prepare_window_data(test_data, word2idx, label2idx, WINDOW_SIZE)

# Define the model
class WindowBasedNER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, window_size, hidden_dim, output_dim):
        super(WindowBasedNER, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * window_size, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(inputs.shape[0], -1)
        hidden = self.relu(self.fc1(embeds))
        out = self.fc2(hidden)
        return out

# Initialize model, loss, and optimizer
model = WindowBasedNER(len(word2idx), EMBEDDING_DIM, WINDOW_SIZE, HIDDEN_DIM, OUTPUT_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(5):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train)
    loss = loss_function(predictions, y_train)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

# Evaluation with detailed outputs
model.eval()
with torch.no_grad():
    predictions = model(X_test)
    predicted_labels = torch.argmax(predictions, dim=1)
    accuracy = (predicted_labels == y_test).float().mean()
    print(f"Test Accuracy: {accuracy.item() * 100:.2f}%")

    print("\nSample Predictions:")
    for i in range(20):
        token, true_label_id = test_tokens[i]
        pred_label_id = predicted_labels[i].item()
        print(f"Token: {token:15} | True: {idx2label[true_label_id]:10} | Predicted: {idx2label[pred_label_id]}")


Epoch 1, Loss: 2.2853
Epoch 2, Loss: 2.2182
Epoch 3, Loss: 2.1520
Epoch 4, Loss: 2.0869
Epoch 5, Loss: 2.0226
Test Accuracy: 53.99%

Sample Predictions:
Token: SOCCER          | True: O          | Predicted: O
Token: -               | True: O          | Predicted: O
Token: JAPAN           | True: B-LOC      | Predicted: I-MISC
Token: GET             | True: O          | Predicted: O
Token: LUCKY           | True: O          | Predicted: O
Token: WIN             | True: O          | Predicted: O
Token: ,               | True: O          | Predicted: O
Token: CHINA           | True: B-PER      | Predicted: B-LOC
Token: IN              | True: O          | Predicted: I-PER
Token: SURPRISE        | True: O          | Predicted: I-MISC
Token: DEFEAT          | True: O          | Predicted: O
Token: .               | True: O          | Predicted: O
Token: Nadim           | True: B-PER      | Predicted: O
Token: Ladki           | True: I-PER      | Predicted: O
Token: AL-AIN          | True: 

In [10]:
# Print a few training examples
print("\nSample Training Data:")
for i in range(3):
    print(f"Tokens: {train_data[i]['tokens']}")
    print(f"Labels: {[conll_dataset['train'].features['ner_tags'].feature.names[l] for l in train_data[i]['ner_tags']]}")
    print()



Sample Training Data:
Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Labels: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

Tokens: ['Peter', 'Blackburn']
Labels: ['B-PER', 'I-PER']

Tokens: ['BRUSSELS', '1996-08-22']
Labels: ['B-LOC', 'O']



In [11]:
train_data.shape

(14041, 5)

In [12]:
train_data[1]

{'id': '1',
 'tokens': ['Peter', 'Blackburn'],
 'pos_tags': [22, 22],
 'chunk_tags': [11, 12],
 'ner_tags': [1, 2]}