In [5]:
import nltk

# Sample training data for POS tagging
training_data = [
    ("I", "PRON"), ("am", "VERB"), ("reading", "VERB"), ("a", "DET"), ("book", "NOUN"),
    ("on", "ADP"), ("natural", "ADJ"), ("language", "NOUN"), ("processing", "NOUN")
]

# Define the HMM tagger
class HMMTagger(nltk.TaggerI):
    def __init__(self, train_data):
        # Calculate the tag frequencies
        self.tag_freq = nltk.FreqDist(tag for (word, tag) in train_data)
        
        # Calculate the transition probabilities
        self.transition_probs = nltk.ConditionalFreqDist(
            (tag1, tag2) for ((word1, tag1), (word2, tag2)) in nltk.bigrams(train_data)
        )
        
        # Calculate the emission probabilities
        self.emission_probs = nltk.ConditionalFreqDist(
            (tag, word) for (word, tag) in train_data
        )
    
    def tag(self, sentence):
        # Initialize the Viterbi matrix
        viterbi = []
        backpointer = []
        first_viterbi = {}
        first_backpointer = {}
        for tag in self.tag_freq.keys():
            if tag == 'START':
                continue
            first_viterbi[tag] = self.transition_probs['START'].freq(tag) * self.emission_probs[tag].freq(sentence[0])
            first_backpointer[tag] = 'START'
        viterbi.append(first_viterbi)
        backpointer.append(first_backpointer)
        
        # Perform the Viterbi algorithm
        for t in range(1, len(sentence)):
            current_viterbi = {}
            current_backpointer = {}
            for tag in self.tag_freq.keys():
                if tag == 'START':
                    continue
                max_prob = max(
                    viterbi[t-1][prev_tag] * self.transition_probs[prev_tag].freq(tag) * self.emission_probs[tag].freq(sentence[t])
                    for prev_tag in self.tag_freq.keys() if prev_tag != 'END'
                )
                for prev_tag in self.tag_freq.keys():
                    if prev_tag == 'START':
                        continue
                    if viterbi[t-1][prev_tag] * self.transition_probs[prev_tag].freq(tag) * self.emission_probs[tag].freq(sentence[t]) == max_prob:
                        current_viterbi[tag] = max_prob
                        current_backpointer[tag] = prev_tag
                        break
            viterbi.append(current_viterbi)
            backpointer.append(current_backpointer)
        
        # Find the best path
        max_prob = max(viterbi[-1][tag] for tag in self.tag_freq.keys() if tag != 'END')
        best_path = ['END']
        current_tag = max(viterbi[-1], key=viterbi[-1].get)
        for i in range(len(sentence) - 1, -1, -1):
            best_path.insert(0, current_tag)
            current_tag = backpointer[i][current_tag]
        best_path.insert(0, 'START')
        
        return list(zip(sentence, best_path))
        

# Test the POS tagger
sentence = ["I", "am", "reading", "a", "book", "on", "natural", "language", "processing"]
tagger = HMMTagger(training_data)
tags = tagger.tag(sentence)
print(tags)


[('I', 'START'), ('am', 'PRON'), ('reading', 'PRON'), ('a', 'PRON'), ('book', 'PRON'), ('on', 'PRON'), ('natural', 'PRON'), ('language', 'PRON'), ('processing', 'PRON')]
