In [2]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from nltk.corpus import conll2000
import nltk
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('conll2000', quiet=True)
nltk.download('tagsets', quiet=True)
tagged_sentences = list(conll2000.tagged_sents())

In [5]:
print(tagged_sentences[0])

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')]


In [18]:
print(len(tagged_sentences))

10948


In [6]:
random.seed(1)
train_set, test_set = train_test_split(tagged_sentences,test_size=0.05)

In [7]:
vocabulary = set([word for sentence in train_set for word, tag in sentence])
tags = set([tag for sentence in train_set for word, tag in sentence])

In [8]:
n_obs = len(vocabulary)
n_states = len(tags)

In [9]:
word_to_idx = {word: i for i, word in enumerate(vocabulary)}
tag_to_idx = {tag: i for i, tag in enumerate(tags)}

In [10]:
n_obs, n_states

(21076, 44)

In [11]:
#Emission Probabilities
emission_prob = np.zeros((n_states, n_obs))
epsilon = 1e-6
for sentence in train_set:
    for word, tag in sentence:
        emission_prob[tag_to_idx[tag], word_to_idx[word]] += 1
emission_prob = (emission_prob + epsilon) / (np.sum(emission_prob, axis=1, keepdims=True) + epsilon * n_obs)# Laplace smoothening

In [12]:
#Transition Probabilities
transition_prob = np.zeros((n_states, n_states))
for sentence in train_set:
    for i in range(len(sentence) - 1):
        tag1, tag2 = sentence[i][1], sentence[i + 1][1]
        transition_prob[tag_to_idx[tag1], tag_to_idx[tag2]] += 1
most_common_tag = np.argmax(np.sum(transition_prob, axis=1))
transition_prob = (transition_prob + epsilon) / (np.sum(transition_prob, axis=1, keepdims=True) + epsilon * n_states) # Laplace smoothening

In [13]:
#Initial State Probabilities
initial_state_prob = np.zeros(n_states)
for sentence in train_set:
    tag = sentence[0][1]
    initial_state_prob[tag_to_idx[tag]] += 1
initial_state_prob = (initial_state_prob + epsilon) / (np.sum(initial_state_prob) + epsilon * n_states) # Laplace smoothening

In [14]:
most_common_tag

40

For unknown words in test set, we are going to use the most common tag for it

In [15]:
def viterbi_algorithm(sentence):
    viterbi = np.zeros((n_states, len(sentence)))
    backpointer = np.zeros((n_states, len(sentence)), dtype=int)
    for i, word in enumerate(sentence):
        if i == 0:
            viterbi[:, i] = np.log(emission_prob[:, word_to_idx.get(word, most_common_tag)]) + np.log(initial_state_prob)
        else:
            viterbi[:, i] = np.log(emission_prob[:, word_to_idx.get(word, most_common_tag)]) + np.max(viterbi[:, i - 1] + np.log(transition_prob.T), axis=1)
            backpointer[:, i] = np.argmax(viterbi[:, i - 1] + np.log(transition_prob.T), axis=1)
    prediction = [np.argmax(viterbi[:, -1])]
    for i in range(len(sentence) - 1, 0, -1):
        prediction.append(backpointer[prediction[-1], i])
    prediction = [list(tag_to_idx.keys())[list(tag_to_idx.values()).index(i)] for i in prediction[::-1]]
    return list(prediction)


In [16]:
correct = 0
total = 0
for sentence in test_set:
    tags = [tag for word, tag in sentence]
    sentence = [word for word, tag in sentence]
    prediction = viterbi_algorithm(sentence)
    for p, tag in zip(prediction, tags):
        if p == tag:
            correct += 1
        total += 1
accuracy = correct / total
print('Tets Accuracy:', accuracy)


Tets Accuracy: 0.9433873250511087


In [17]:
test_sentence = 'You are going to have so much fun'.split()
predicted_tags = viterbi_algorithm(test_sentence)
print('Predicted Tags:', predicted_tags)
[nltk.help.upenn_tagset(pred) for pred in predicted_tags]

Predicted Tags: ['PRP', 'VBP', 'VBG', 'TO', 'VB', 'RB', 'RB', 'VBG']
PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
    appear tend stray glisten obtain comprise detest tease attract
    emphasize mold postpone sever return wag ...
VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...
TO: "to" as preposition or infinitive marker
    to
VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...
RB: adverb
    occa

[None, None, None, None, None, None, None, None]