In [1]:
import nltk

# Fetch the NLTK resources this notebook relies on: the Brown corpus itself
# and the mapping used to request universal POS tags.
nltk.download("brown")
nltk.download("universal_tagset")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [2]:
from nltk.corpus import brown

# Read the Brown corpus sentences with universal POS tags and materialize
# the result as a plain list of sentences.
tagged_sentences = list(brown.tagged_sents(tagset="universal"))

In [3]:
import numpy as np
from typing import List, Tuple, Dict
from sklearn.model_selection import KFold

In [4]:
class HMMPOSTagger:
    """
    Estimate HMM transition and emission probabilities for POS tagging.

    The tag inventory is the sorted set of tags found in the supplied
    sentences, preceded by the artificial ``<BOS>`` and ``<EOS>`` tags. The
    vocabulary is the sorted set of words found in the same sentences.
    """

    def __init__(self, tagged_sentences: List[List[Tuple[str, str]]]):
        """
        Initialize HMM POS Tagger with BOS and EOS tokens

        :param tagged_sentences: Sentences given as sequences of (word, tag)
            tuples. Any iterable of sentences is accepted; it is copied into
            a list once, so a one-shot iterator is not exhausted before the
            vocabulary has been built.
        """
        # Add special tokens
        self.BOS_TOKEN = '<BOS>'
        self.EOS_TOKEN = '<EOS>'

        # Materialize exactly once: the sentences are needed both here and
        # later in cross_validate().
        self.tagged_sentences = list(tagged_sentences)

        # Collect tags and words in a single pass over the data.
        tag_set = set()
        word_set = set()
        for sent in self.tagged_sentences:
            for word, tag in sent:
                word_set.add(word)
                tag_set.add(tag)

        # BOS and EOS always occupy tag indices 0 and 1.
        self.tags = [self.BOS_TOKEN, self.EOS_TOKEN] + sorted(tag_set)

        # Words get no special treatment (no BOS/EOS pseudo-words).
        self.words = sorted(word_set)

        # Create mappings
        self.tag_to_index = {tag: i for i, tag in enumerate(self.tags)}
        self.word_to_index = {word: i for i, word in enumerate(self.words)}

    @staticmethod
    def _normalize_rows(counts: np.ndarray) -> np.ndarray:
        """Scale each row to sum to 1; rows whose total is 0 are left as zeros."""
        totals = counts.sum(axis=1, keepdims=True)
        # The guarded division avoids 0/0 = NaN for rows with no mass at all,
        # which happens when alpha == 0 and a tag was never observed.
        return np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0)

    def train(self, training_sentences: List[List[Tuple[str, str]]], alpha: float = 1.0) -> Tuple[np.ndarray, np.ndarray]:
        """
        Train HMM model with additive smoothing, handling BOS and EOS tokens

        Every cell of both count matrices starts at ``alpha``. This includes
        the rows of the artificial tags, so with ``alpha > 0`` the ``<EOS>``
        transition row and the ``<BOS>``/``<EOS>`` emission rows come out
        uniform. With ``alpha == 0`` rows without any observation are
        returned as all zeros instead of NaN.

        :param training_sentences: Training sentences with (word, tag) tuples
        :param alpha: Smoothing parameter (default 1.0 for Laplace smoothing)
        :return: Transition matrix of shape (n_tags, n_tags) and emission
            matrix of shape (n_tags, n_words), both row-normalized
        :raises ValueError: If ``alpha`` is negative
        :raises KeyError: If a sentence contains a tag that was not present
            in the sentences passed to the constructor
        """
        if alpha < 0:
            raise ValueError("alpha must be non-negative")

        n_tags = len(self.tags)
        n_words = len(self.words)

        # Initialize count matrices with smoothing
        transition_counts = np.full((n_tags, n_tags), alpha, dtype=float)
        emission_counts = np.full((n_tags, n_words), alpha, dtype=float)

        bos_idx = self.tag_to_index[self.BOS_TOKEN]
        eos_idx = self.tag_to_index[self.EOS_TOKEN]

        # One pass per sentence: walk BOS -> t1 -> ... -> tn -> EOS and
        # record each word's emission along the way.
        for sent in training_sentences:
            prev_idx = bos_idx
            for word, tag in sent:
                tag_idx = self.tag_to_index[tag]
                transition_counts[prev_idx, tag_idx] += 1

                # Words outside the vocabulary contribute no emission count.
                word_idx = self.word_to_index.get(word)
                if word_idx is not None:
                    emission_counts[tag_idx, word_idx] += 1

                prev_idx = tag_idx
            # An empty sentence contributes a single BOS -> EOS transition.
            transition_counts[prev_idx, eos_idx] += 1

        # Normalize to get probabilities
        transition_matrix = self._normalize_rows(transition_counts)
        emission_matrix = self._normalize_rows(emission_counts)

        return transition_matrix, emission_matrix

    def cross_validate(self, n_folds: int = 5, alpha: float = 1.0) -> List[Tuple[np.ndarray, np.ndarray]]:
        """
        Fit the model on the training part of each of n shuffled folds

        Only the training indices of each split are used; the held-out
        sentences are not evaluated here. The tag set and vocabulary come
        from the constructor, i.e. from all sentences rather than from the
        training part of the fold.

        :param n_folds: Number of folds
        :param alpha: Smoothing parameter
        :return: List of (transition_matrix, emission_matrix) for each fold
        """
        # Fixed random_state keeps the shuffled fold assignment repeatable.
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

        # Already a list (materialized in __init__), so it can be indexed.
        all_sentences = self.tagged_sentences

        fold_results = []
        for train_index, _ in kf.split(all_sentences):
            train_sentences = [all_sentences[i] for i in train_index]
            fold_results.append(self.train(train_sentences, alpha))

        return fold_results




In [5]:
def main():
    """Run the default cross-validation on the Brown corpus and print a summary of the first fold."""
    # Load the Brown corpus sentences tagged with the universal tagset
    corpus_sentences = brown.tagged_sents(tagset="universal")

    # Build the tagger and fit it once per fold (defaults: 5 folds, alpha 1.0)
    tagger = HMMPOSTagger(corpus_sentences)
    results = tagger.cross_validate()

    # Overall summary
    print(f"Number of folds processed: {len(results)}")

    # Everything below reports on the first fold only
    print("\nFirst Fold Analysis:")
    first_transitions, first_emissions = results[0]

    print("\nTags (including BOS and EOS):")
    print(tagger.tags)

    print("\nTransition Matrix Shape:", first_transitions.shape)
    print("Emission Matrix Shape:", first_emissions.shape)

    # Show the outgoing transition distribution of each artificial tag
    boundary_rows = (
        ("\nBOS Transition Probabilities:", tagger.BOS_TOKEN),
        ("\nEOS Transition Probabilities:", tagger.EOS_TOKEN),
    )
    for heading, token in boundary_rows:
        print(heading)
        print(first_transitions[tagger.tag_to_index[token], :])

if __name__ == "__main__":
    main()

Number of folds processed: 5

First Fold Analysis:

Tags (including BOS and EOS):
['<BOS>', '<EOS>', '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']

Transition Matrix Shape: (14, 14)
Emission Matrix Shape: (14, 56057)

BOS Transition Probabilities:
[2.17931395e-05 2.17931395e-05 8.91775269e-02 3.39755045e-02
 1.23763239e-01 9.01582182e-02 4.84461492e-02 2.12962559e-01
 1.40914440e-01 1.67807174e-02 1.60223162e-01 3.70701303e-02
 4.60706969e-02 4.14069651e-04]

EOS Transition Probabilities:
[0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
 0.07142857 0.07142857]
