In [1]:
import nltk
from nltk import corpus
import numpy as np

# Fetch the NLTK resources used below: the Brown corpus and the universal tagset
for resource in ("brown", "universal_tagset"):
    nltk.download(resource)

# Materialise the Brown sentences, tagged with the universal tagset, as a list
tagged_sentences = list(
    corpus.brown.tagged_sents(tagset="universal")
)

[nltk_data] Downloading package brown to /home/labuser/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/labuser/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
# for sentence in tagged_sentences[:10]:
#     for word_tuple in sentence:
#         print(word_tuple[0], end=" ")
#     print()

# Integer state index for every universal POS tag, numbered from 1 in the
# order listed. Indices 0 and 13 are deliberately not assigned to a tag:
# beginning = 0, ending = 13.
pos = {
    tag: index
    for index, tag in enumerate(
        [
            "ADJ",
            "ADP",
            "ADV",
            "CONJ",
            "DET",
            "NOUN",
            "NUM",
            "PRON",
            "PRT",
            "VERB",
            ".",
            "X",
        ],
        start=1,
    )
}

In [3]:
class HiddenMarkov:
    """Estimate HMM tagger parameters from tagged sentences with 5-fold splits.

    The model has one state per tag in ``pos`` plus two sentinel states:
    ``START_STATE`` (index 0) for the beginning of a sentence and an end
    state whose index is one past the largest tag index in ``pos``.

    Attributes:
        tagged_sentences: Sequence of sentences, each a sequence of
            ``(word, tag)`` pairs.
        pos: Mapping from tag name to its integer state index.
        word_map: Mapping from word to column index in the emission
            matrices; filled by ``get_map``.
        train_sets / test_sets: One entry per fold, filled by ``five_fold``.
        transition_matrix / emission_matrices: One matrix per fold, filled
            by ``get_five_fold_matrices``.
    """

    N_FOLDS = 5
    START_STATE = 0

    def __init__(self, tagged_sentences, pos):
        self.tagged_sentences = tagged_sentences
        self.pos = pos
        self.word_map = {}

        self.train_sets = []
        self.test_sets = []
        self.transition_matrix = []
        self.emission_matrices = []

    def _end_state(self) -> int:
        """Return the index of the sentence-end state (largest tag index + 1)."""
        return max(self.pos.values(), default=0) + 1

    def get_map(self) -> dict:
        """Assign every distinct word in ``self.tagged_sentences`` a column index.

        Indices are handed out in first-seen order. The vocabulary covers all
        sentences, including those that end up in a held-out fold, so every
        word has a column in every emission matrix.

        Returns:
            The populated ``self.word_map``.
        """
        for sentence in self.tagged_sentences:
            for word, _ in sentence:
                # len() is evaluated before insertion, so a new word receives
                # the next free index and a known word keeps its index.
                self.word_map.setdefault(word, len(self.word_map))

        return self.word_map

    def get_transition_matrix(self, tagged_sentence) -> np.ndarray:
        """Estimate add-one-smoothed tag transition probabilities.

        For each non-empty sentence the chain
        ``start -> tag_1 -> ... -> tag_n -> end`` is counted, so every token
        contributes its outgoing transition (including the first token, and
        including the end transition of a one-word sentence).

        Args:
            tagged_sentence: Iterable of sentences of ``(word, tag)`` pairs.

        Returns:
            A square array with one row/column per state; each row sums to 1.

        Raises:
            KeyError: If a tag is not present in ``self.pos``.
        """
        end_state = self._end_state()
        # Every cell starts at 1 (add-one smoothing), as in the original
        # implementation; this also covers the sentinel rows and columns.
        counts = np.ones((end_state + 1, end_state + 1))
        for sentence in tagged_sentence:
            previous = self.START_STATE
            for _, tag in sentence:
                current = self.pos[tag]
                counts[previous, current] += 1
                previous = current
            if sentence:
                # Close the chain only when the sentence had at least one token.
                counts[previous, end_state] += 1

        return counts / counts.sum(axis=1, keepdims=True)

    def get_emission_matrix(self, tagged_sentence) -> np.ndarray:
        """Estimate add-one-smoothed word emission probabilities per state.

        Args:
            tagged_sentence: Iterable of sentences of ``(word, tag)`` pairs.

        Returns:
            An array of shape ``(number of states, vocabulary size)`` whose
            rows each sum to 1.

        Raises:
            KeyError: If a tag is missing from ``self.pos`` or a word is
                missing from ``self.word_map``.
        """
        if not self.word_map:
            # Build the vocabulary on demand so the method works standalone.
            self.get_map()

        counts = np.ones((self._end_state() + 1, len(self.word_map)))
        for sentence in tagged_sentence:
            for word, tag in sentence:
                counts[self.pos[tag], self.word_map[word]] += 1

        return counts / counts.sum(axis=1, keepdims=True)

    def five_fold(self):
        """Split ``self.tagged_sentences`` into 5 contiguous train/test folds.

        Fold ``i`` uses part ``i`` as the test set and the concatenation of
        the other four parts as the training set. Part boundaries are placed
        at ``i * n // 5`` so that no sentence is dropped when ``n`` is not a
        multiple of 5. Previously stored folds are discarded first.
        """
        self.train_sets = []
        self.test_sets = []

        n = len(self.tagged_sentences)
        bounds = [i * n // self.N_FOLDS for i in range(self.N_FOLDS + 1)]
        parts = [
            self.tagged_sentences[lo:hi] for lo, hi in zip(bounds, bounds[1:])
        ]
        for i, held_out in enumerate(parts):
            self.test_sets.append(held_out)
            train_set = []
            for j, part in enumerate(parts):
                if j != i:
                    # Train on every part except the held-out one.
                    train_set.extend(part)

            self.train_sets.append(train_set)

        return

    def get_five_fold_matrices(self):
        """Rebuild the folds, vocabulary and per-fold HMM matrices.

        Results from any earlier call are discarded, so after this method
        ``self.transition_matrix`` and ``self.emission_matrices`` each hold
        exactly one matrix per fold.
        """
        self.transition_matrix = []
        self.emission_matrices = []
        self.five_fold()
        self.get_map()
        for train_set in self.train_sets:
            self.transition_matrix.append(self.get_transition_matrix(train_set))
            self.emission_matrices.append(self.get_emission_matrix(train_set))

        return

In [4]:
# Build the model over the loaded sentences and tag table, then compute the
# per-fold transition and emission matrices.
model = HiddenMarkov(tagged_sentences, pos)
model.get_five_fold_matrices()