In [1]:
# ======================
# SETUP: Required Imports
# ======================
import numpy as np
from collections import defaultdict
import nltk
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report

# ======================
# DOWNLOAD NLTK DATA
# ======================
# Fetch the corpus and the tagset mapping that the data-preparation step reads.
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# ======================
# DATA PREPARATION
# ======================
from nltk.corpus import brown

# Sentences of (word, tag) pairs from the Brown corpus, tagged with the universal tagset
tagged_sents = brown.tagged_sents(tagset='universal')

# Frequency tables for lowercased word forms and for tags
word_counts = defaultdict(int)
tag_counts = defaultdict(int)

# Walk every (word, tag) pair of every sentence in corpus order
for token, pos in (pair for sentence in tagged_sents for pair in sentence):
    word_counts[token.lower()] += 1
    tag_counts[pos] += 1

# Index lookups; ids follow the order in which words/tags were first counted
vocab = list(word_counts)
tags = list(tag_counts)
word2idx = dict(zip(vocab, range(len(vocab))))
tag2idx = dict(zip(tags, range(len(tags))))
idx2tag = dict(enumerate(tags))

# MODEL INITIALIZATION

# One hidden state per tag, one observation symbol per vocabulary entry
n_states, n_observations = len(tags), len(vocab)

# Start vector begins with equal mass on every state
start_prob = np.full(n_states, 1.0) / n_states

# Zero-filled count tables: state -> state and state -> observation
trans_counts = np.zeros(shape=(n_states, n_states))
emit_counts = np.zeros(shape=(n_states, n_observations))

# ======================
# PARAMETER ESTIMATION
# ======================
def _normalize_rows(counts):
    """Scale each row of *counts* to sum to 1; rows that are all zero stay all zero."""
    row_totals = counts.sum(axis=1, keepdims=True)
    # A plain division would turn an all-zero row into NaNs (0 / 0), which
    # then corrupts every argmax taken over that row during decoding.
    return np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts, dtype=float),
        where=row_totals > 0,
    )


# Accumulate counts from the tagged corpus.
# NOTE: this iterates over every sentence in tagged_sents, so any sentences
# later sliced off tagged_sents for evaluation have also been used for fitting.
for sent in tagged_sents:
    if not sent:
        continue

    tag_ids = [tag2idx[tag] for _, tag in sent]
    word_ids = [word2idx[word.lower()] for word, _ in sent]

    # start_prob is used as the accumulator for sentence-initial tag counts.
    # It already holds 1 / n_states per tag from initialization, which acts
    # as a small additive prior once the vector is normalized below.
    start_prob[tag_ids[0]] += 1

    # Tag bigrams within the sentence
    for prev_idx, next_idx in zip(tag_ids, tag_ids[1:]):
        trans_counts[prev_idx, next_idx] += 1

    # Tag -> word emissions
    for tag_idx, word_idx in zip(tag_ids, word_ids):
        emit_counts[tag_idx, word_idx] += 1

# Normalize to probabilities
start_prob /= start_prob.sum()
trans_mat = _normalize_rows(trans_counts)
emit_mat = _normalize_rows(emit_counts)

# ======================
# VITERBI DECODER
# ======================
def viterbi_decode(sentence, tags, start_prob, trans_mat, emit_mat, vocab_index=None):
    """Return the most likely tag sequence for *sentence* under the given HMM.

    Scores are accumulated in log space so that long inputs do not underflow
    to zero probability.

    Args:
        sentence: Either a whitespace-separated string or an already
            tokenized sequence of word strings.
        tags: Sequence of tag labels; position ``i`` names hidden state ``i``.
        start_prob: Length-N vector of initial state probabilities.
        trans_mat: N x N matrix where ``trans_mat[i, j]`` is the probability
            of moving from state ``i`` to state ``j``.
        emit_mat: N x V matrix where ``emit_mat[j, w]`` is the probability
            of state ``j`` emitting vocabulary entry ``w``.
        vocab_index: Optional mapping from lowercased word to its column in
            ``emit_mat``. Defaults to the module-level ``word2idx``.

    Returns:
        A list with one tag label per token; an empty list when the sentence
        contains no tokens.
    """
    if vocab_index is None:
        vocab_index = word2idx

    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)
    if not tokens:
        return []

    start_prob = np.asarray(start_prob, dtype=float)
    trans_mat = np.asarray(trans_mat, dtype=float)
    emit_mat = np.asarray(emit_mat, dtype=float)
    n_steps, n_tags = len(tokens), len(tags)

    # log(0) = -inf is the intended encoding of an impossible event here.
    with np.errstate(divide="ignore"):
        log_start = np.log(start_prob)
        log_trans = np.log(trans_mat)

        # Per-token emission log-scores. A row left at 0.0 (= log 1) means
        # "uninformative": the word is unknown, or has no emission mass under
        # any tag, so the tag is decided by the transition context alone
        # instead of borrowing the statistics of an unrelated vocabulary word.
        log_emit = np.zeros((n_steps, n_tags))
        for t, token in enumerate(tokens):
            col = vocab_index.get(token.lower())
            if col is None:
                continue
            probs = emit_mat[:, col]
            if probs.any():
                log_emit[t] = np.log(probs)

    delta = np.empty((n_steps, n_tags))          # best log-score ending in each state
    psi = np.zeros((n_steps, n_tags), dtype=int)  # back-pointers to the best predecessor

    delta[0] = log_start + log_emit[0]

    state_ids = np.arange(n_tags)
    for t in range(1, n_steps):
        # scores[i, j]: best path ending in state i at t-1, then moving to j
        scores = delta[t - 1][:, np.newaxis] + log_trans
        psi[t] = scores.argmax(axis=0)
        delta[t] = scores[psi[t], state_ids] + log_emit[t]

    # Follow the back-pointers from the best final state.
    path = np.empty(n_steps, dtype=int)
    path[-1] = delta[-1].argmax()
    for t in range(n_steps - 2, -1, -1):
        path[t] = psi[t + 1, path[t + 1]]

    return [tags[i] for i in path]

# ======================
# EVALUATION METRICS
# ======================
def evaluate_model(test_sents, word2idx, tag2idx, tags, start_prob, trans_mat, emit_mat):
    """Tag every test sentence with viterbi_decode and report token-level metrics.

    Prints the overall accuracy and a per-tag classification report.

    Args:
        test_sents: Iterable of sentences, each a sequence of (word, tag) pairs.
        word2idx: Accepted for interface compatibility; not used by this function.
        tag2idx: Accepted for interface compatibility; not used by this function.
        tags: Tag labels, forwarded to viterbi_decode.
        start_prob: Initial state probabilities, forwarded to viterbi_decode.
        trans_mat: Transition matrix, forwarded to viterbi_decode.
        emit_mat: Emission matrix, forwarded to viterbi_decode.

    Returns:
        The token-level accuracy that is also printed.

    Raises:
        ValueError: If the decoder returns a different number of tags than the
            sentence has tokens (e.g. a token containing whitespace, which
            changes the token count once the words are joined into a string).
    """
    y_true, y_pred = [], []

    for sent in test_sents:
        if not sent:
            # Nothing to score for an empty sentence.
            continue

        words, true_tags = zip(*sent)
        pred_tags = viterbi_decode(' '.join(words), tags, start_prob, trans_mat, emit_mat)

        # Fail here, with the offending sentence, rather than letting gold and
        # predicted labels drift out of alignment across sentences.
        if len(pred_tags) != len(true_tags):
            raise ValueError(
                f"decoder returned {len(pred_tags)} tags for a sentence of "
                f"{len(true_tags)} tokens: {words!r}"
            )

        y_true.extend(true_tags)
        y_pred.extend(pred_tags)

    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    return accuracy

# ======================
# SAMPLE TEST
# ======================
# Split into train/test if desired
# Score the tagger on the final 100 corpus sentences.
# NOTE: the parameter-estimation loop iterates over all of tagged_sents, so
# these sentences were also used for fitting; the figures printed here are
# therefore not a held-out estimate.
test_sents = tagged_sents[-100:]
evaluate_model(
    test_sents=test_sents,
    word2idx=word2idx,
    tag2idx=tag2idx,
    tags=tags,
    start_prob=start_prob,
    trans_mat=trans_mat,
    emit_mat=emit_mat,
)


[nltk_data] Downloading package brown to /home/ridoy/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/ridoy/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


Accuracy: 0.9706005879882402
Classification Report:
              precision    recall  f1-score   support

           .       1.00      1.00      1.00       334
         ADJ       0.88      0.94      0.91       140
         ADP       0.97      0.96      0.97       283
         ADV       0.85      0.87      0.86       124
        CONJ       1.00      1.00      1.00        84
         DET       1.00      0.99      0.99       295
        NOUN       0.97      0.98      0.98       483
         NUM       0.95      0.95      0.95        21
        PRON       0.99      1.00      0.99       160
         PRT       0.92      0.93      0.92        70
        VERB       0.99      0.96      0.98       370
           X       1.00      1.00      1.00        17

    accuracy                           0.97      2381
   macro avg       0.96      0.96      0.96      2381
weighted avg       0.97      0.97      0.97      2381

