In [12]:
import nltk

# Make sure the Brown corpus and the universal tagset mapping are available locally
for resource in ("brown", "universal_tagset"):
    nltk.download(resource)

# Brown sentences as lists of (word, tag) pairs, tags taken from the universal tagset
tagged_sentences = list(nltk.corpus.brown.tagged_sents(tagset="universal"))

# Same sentences with every word lower-cased; tags are left untouched
tagged_sentences_lower = [
    [(word.lower(), tag) for word, tag in sentence]
    for sentence in tagged_sentences
]

[nltk_data] Downloading package brown to /Users/emsr/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/emsr/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [13]:
# Tag name -> row/column index. The first twelve entries are the real
# part-of-speech tags; START and END are artificial sentence-boundary markers.
pos = {
    "ADJ": 0,
    "ADP": 1,
    "ADV": 2,
    "CONJ": 3,
    "DET": 4,
    "NOUN": 5,
    "NUM": 6,
    "PRON": 7,
    "PRT": 8,
    "VERB": 9,
    ".": 10,
    "X": 11,
    "START": 12,
    "END": 13,
}
pos_keys = list(pos)
num_tags = len(pos)      # 14: real tags plus START and END
num_pos = num_tags - 2   # 12: real tags only

# Word -> integer id, assigned in order of first appearance in the corpus
word_map = {}
for sentence in tagged_sentences_lower:
    for word, _ in sentence:
        # setdefault hands out the next free id only the first time a word is seen
        word_map.setdefault(word, len(word_map))
num_words = len(word_map)

In [14]:
import numpy as np

def get_transition_matrix(dataset):
    """Estimate tag-to-tag transition probabilities with add-one smoothing.

    Args:
        dataset: iterable of sentences, each a sequence of ``(word, tag)``
            pairs whose tags are keys of ``pos``.

    Returns:
        ``(num_tags, num_tags)`` array; entry ``[a, b]`` is the smoothed
        probability of moving from tag index ``a`` to tag index ``b``.
        Every row sums to 1.
    """
    # Every cell starts at 1 (add-one smoothing), including START/END cells
    counts = np.ones((num_tags, num_tags))
    start_idx = pos["START"]
    end_idx = pos["END"]

    for sentence in dataset:
        # Index path START -> t1 -> ... -> tn -> END for this sentence
        path = [start_idx]
        path.extend(pos[tag] for _, tag in sentence)
        path.append(end_idx)
        for src, dst in zip(path, path[1:]):
            counts[src, dst] += 1

    row_totals = counts.sum(axis=1, keepdims=True)
    return counts / row_totals

def get_emission_matrix(dataset):
    """Estimate per-tag word emission probabilities with add-one smoothing.

    Args:
        dataset: iterable of sentences, each a sequence of ``(word, tag)``
            pairs; words must be keys of ``word_map`` and tags keys of ``pos``.

    Returns:
        ``(num_pos, num_words + 1)`` array whose rows each sum to 1. Columns
        follow the ids in ``word_map``; the extra last column receives no
        counts here and therefore keeps only its smoothing mass.
    """
    counts = np.ones((num_pos, num_words + 1))

    # Walk all (word, tag) pairs of all sentences as one flat stream
    token_pairs = (pair for sentence in dataset for pair in sentence)
    for word, tag in token_pairs:
        row = pos[tag]
        col = word_map[word]
        counts[row, col] += 1

    row_totals = counts.sum(axis=1, keepdims=True)
    return counts / row_totals

class Dataset:
    """A train/test split plus the HMM parameters estimated from its training part."""

    def __init__(self, train_set, test_set):
        """Store the split and fit the transition and emission matrices.

        Args:
            train_set: sentences (sequences of ``(word, tag)`` pairs) used to
                estimate both matrices.
            test_set: sentences in the same format, kept for evaluation.
        """
        self.train_set = train_set
        self.test_set = test_set
        # Plain-text versions of the test sentences; filled by get_test_sentences()
        self.test_sentences = []
        self.transition_matrix = get_transition_matrix(train_set)
        self.emission_matrix = get_emission_matrix(train_set)

    def get_test_sentences(self):
        """Fill ``self.test_sentences`` with each test sentence as a space-joined string.

        The list is rebuilt on every call, so calling this method more than
        once (for example when a notebook cell is re-run) does not append
        duplicate sentences.
        """
        # Slice assignment replaces the contents while keeping the same list object
        self.test_sentences[:] = [
            " ".join(word for word, _ in sentence) for sentence in self.test_set
        ]

In [15]:
# Build the five cross-validation folds: contiguous slices of the corpus,
# each used once as the test set while the other four form the training set.
datasets = []
n = len(tagged_sentences_lower)
num_folds = 5
sz = n // num_folds

parts = []
for i in range(num_folds):
    start = i * sz
    # The last fold also takes the n % num_folds leftover sentences so that
    # no sentence is silently dropped when n is not a multiple of num_folds.
    end = n if i == num_folds - 1 else start + sz
    parts.append(tagged_sentences_lower[start:end])

for i in range(num_folds):
    # Training data: every fold except the held-out one, in corpus order
    train_set_arr = [
        sentence
        for j, part in enumerate(parts)
        if j != i
        for sentence in part
    ]
    test_set_arr = parts[i]

    datasets.append(Dataset(train_set_arr, test_set_arr))

In [16]:
def viterbi_general(transition, emission, sentence):
    """Tag a raw sentence string with its most likely part-of-speech sequence.

    The string is lower-cased and split on whitespace, then decoded with
    ``viterbi`` so both entry points share a single implementation.

    Args:
        transition: tag transition probability matrix indexed by the values
            of ``pos`` (see ``viterbi``).
        emission: tag-by-word emission probability matrix (see ``viterbi``).
        sentence: the sentence as a single string.

    Returns:
        List of tag names, one per whitespace-separated token; an empty list
        when the string contains no tokens.
    """
    parsed_sentence = sentence.lower().split()
    if not parsed_sentence:
        # Empty or whitespace-only input: there is nothing to tag
        return []

    # viterbi only reads the first element of each item, so the tag slot is a placeholder
    return viterbi(transition, emission, [(word, None) for word in parsed_sentence])

def viterbi(transition, emission, sentence):
    """Return the most likely tag sequence for a tokenised sentence (Viterbi decoding).

    Args:
        transition: array of tag-to-tag probabilities indexed by the values
            of ``pos``. The ``START`` row supplies the initial distribution;
            only the first ``num_pos`` rows/columns are used for the
            tag-to-tag steps.
        emission: array whose rows are tags and whose columns are the word
            ids of ``word_map``; column ``num_words`` is used for words that
            are not in ``word_map``.
        sentence: sequence of items whose first element is the word, e.g.
            ``(word, tag)`` pairs. Any further elements are ignored.

    Returns:
        List of tag names (keys of ``pos``), one per token; an empty list for
        an empty sentence.
    """
    sentence_len = len(sentence)
    if sentence_len == 0:
        # Nothing to decode; indexing the first token below would fail
        return []

    # Emission column of every token, looked up once instead of once per state pair
    word_columns = [word_map.get(token[0], num_words) for token in sentence]

    # Log-probabilities computed once up front rather than inside the DP loops
    log_start = np.log(transition[pos["START"], :num_pos])
    log_transition = np.log(transition[:num_pos, :num_pos])
    log_emission = np.log(emission[:num_pos][:, word_columns])  # (num_pos, sentence_len)

    log_prob = np.full((sentence_len, num_pos), -np.inf)
    prev = np.full((sentence_len, num_pos), -1)
    log_prob[0] = log_start + log_emission[:, 0]

    states = np.arange(num_pos)
    for word_idx in range(1, sentence_len):
        # candidates[p, c]: best score ending in state p at the previous word,
        # followed by the transition p -> c and emitting the current word from c
        candidates = (
            log_prob[word_idx - 1][:, np.newaxis] + log_transition
        ) + log_emission[:, word_idx]
        # argmax returns the first maximum, i.e. the lowest previous-state index on ties
        best_prev = candidates.argmax(axis=0)
        best_score = candidates[best_prev, states]
        # States whose best score is -inf keep their -inf / -1 placeholders
        reachable = best_score > -np.inf
        log_prob[word_idx, reachable] = best_score[reachable]
        prev[word_idx, reachable] = best_prev[reachable]

    # Backtracking from the best final state
    rev_tags = [int(np.argmax(log_prob[-1]))]
    for word_idx in range(sentence_len - 1, 0, -1):
        rev_tags.append(int(prev[word_idx, rev_tags[-1]]))

    return [pos_keys[tag_idx] for tag_idx in reversed(rev_tags)]

In [17]:
# Decode every held-out sentence with the model trained on the other folds and
# pool predicted / gold tags across all folds for the accuracy report.
predicted_tags = []
actual_tags = []
for fold in datasets:
    for sentence in fold.test_set:
        fold_prediction = viterbi(fold.transition_matrix, fold.emission_matrix, sentence)
        predicted_tags.extend(fold_prediction)
        actual_tags.extend(tag for _, tag in sentence)

In [18]:
import sklearn.metrics as metrics

# Per-tag precision / recall / F1 over the pooled predictions of all folds
report = metrics.classification_report(actual_tags, predicted_tags)
print(report)

              precision    recall  f1-score   support

           .       0.98      1.00      0.99    147565
         ADJ       0.86      0.87      0.86     83721
         ADP       0.91      0.97      0.94    144766
         ADV       0.88      0.87      0.87     56239
        CONJ       0.97      0.99      0.98     38151
         DET       0.90      0.99      0.94    137019
        NOUN       0.94      0.89      0.91    275558
         NUM       0.99      0.77      0.87     14874
        PRON       0.86      0.95      0.90     49334
         PRT       0.90      0.84      0.87     29829
        VERB       0.96      0.91      0.94    182750
           X       0.58      0.29      0.39      1386

    accuracy                           0.93   1161192
   macro avg       0.89      0.86      0.87   1161192
weighted avg       0.93      0.93      0.93   1161192

