In [1]:
import nltk
import collections
from sklearn.model_selection import train_test_split

In [2]:
# Load the Treebank sample as a list of sentences, each a list of (word, tag)
# pairs, using the coarse 'universal' tagset.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Hold out 5% of the sentences for validation; the fixed random_state keeps
# the split identical from run to run.
train_tagged_sentences, validation_tagged_sentences = train_test_split(
    nltk_data,
    train_size=0.95,
    random_state=9999,
)
print('Total number of sentences in training data', len(train_tagged_sentences))
print('Total number of sentences in validation data', len(validation_tagged_sentences))

# Flatten the training sentences into one stream of (word, tag) pairs and
# pull the words out of it.
train_tagged_words = [
    pair
    for tagged_sentence in train_tagged_sentences
    for pair in tagged_sentence
]
train_tokens = [token for token, _ in train_tagged_words]
print('Total number of words in training data', len(train_tokens))

# Pull the tags out of the same stream (one tag per word, same order).
train_tags = [tag for _, tag in train_tagged_words]
print('Total number of tags in training data', len(train_tags))
print('Total number of unique tags in training data', len(set(train_tags)))
# A set has no defined order, so the tags may print in a different order on
# another run.
print('Unique tags in training data', set(train_tags))

Total number of sentences in training data 3718
Total number of sentences in validation data 196
Total number of words in training data 95615
Total number of tags in training data 95615
Total number of unique tags in training data 12
Unique tags in training data {'ADP', 'NOUN', 'ADV', 'X', 'NUM', 'PRT', 'VERB', 'PRON', 'CONJ', '.', 'DET', 'ADJ'}


In [3]:
#Counts needed for emission probabilities (word given tag) and transition probabilities (tag2 following tag1)
def emission_transition_data(word_bag):
    """Tally the raw counts used to estimate emission and transition probabilities.

    Parameters
    ----------
    word_bag : sequence of (word, tag) pairs
        Tagged words in corpus order. It is traversed more than once, so it
        must be re-iterable (e.g. a list).

    Returns
    -------
    tuple of three dicts
        ``tags_count``                 maps ``tag``          -> occurrences of the tag
        ``word_given_tag_count``       maps ``'word|tag'``   -> occurrences of the pair
        ``tag2_following_tag1_count``  maps ``'tag1-tag2'``  -> times tag2 directly
                                       follows tag1 in ``word_bag``

        Keys appear in first-seen order.
    """
    tag_sequence = [pair[1] for pair in word_bag]
    tags_count = dict(collections.Counter(tag_sequence))

    # Emission evidence: each word joined to its own tag with '|'.
    word_given_tag_count = dict(
        collections.Counter(pair[0] + '|' + pair[1] for pair in word_bag)
    )

    # Transition evidence: every adjacent pair of tags in the flat sequence,
    # joined with '-'. Zipping the sequence with itself shifted by one yields
    # exactly the (i, i+1) neighbours.
    tag2_following_tag1_count = dict(
        collections.Counter(
            first + '-' + second
            for first, second in zip(tag_sequence, tag_sequence[1:])
        )
    )

    return tags_count, word_given_tag_count, tag2_following_tag1_count

In [4]:
#Viterbi Heuristic
def Viterbi(words, words_bag):
    """Tag ``words`` one token at a time using counts taken from ``words_bag``.

    Despite the name, this is a greedy left-to-right heuristic rather than the
    dynamic-programming Viterbi decoder: each tag is scored as

        P(word | tag) * P(tag | previously chosen tag)

    and the best-scoring tag is committed immediately, with no backtracking.

    Parameters
    ----------
    words : sequence of str
        Tokens to tag, in order.
    words_bag : sequence of (word, tag) pairs
        Tagged words handed to ``emission_transition_data`` to obtain the
        tag, word-given-tag and tag-following-tag counts.

    Returns
    -------
    list of (word, tag) tuples
        One entry per input word, in input order. Empty when ``words`` is empty.

    Raises
    ------
    ValueError
        If there are words to tag but ``words_bag`` yields no tags.

    Notes
    -----
    * The first word is scored as if the previous tag were ``'.'``.
    * If ``'.'`` never occurs in ``words_bag`` there is no transition evidence
      for the first word; it is then scored by emission probability alone
      instead of dividing by zero.
    * When every tag scores 0 (for example a word absent from ``words_bag``),
      the first tag in the first-seen order of ``words_bag`` is chosen.
    """
    tags_count, word_given_tag_count, tag2_following_tag1_count = emission_transition_data(words_bag)
    tags = list(tags_count)
    state = []

    # Tag assumed to precede the first word.
    previous_tag = '.'

    for word in words:
        if not tags:
            raise ValueError('words_bag contains no tagged words, so no tag can be assigned')

        # Denominator of the transition probability; it is the same for every
        # candidate tag, so look it up once per word. It can only be 0 for the
        # first word, when the assumed start tag '.' is absent from words_bag.
        previous_tag_count = tags_count.get(previous_tag, 0)

        #Initialising list of probability for a given observation
        state_proba = []
        for tag in tags:
            #Computing transition probabilities
            if previous_tag_count:
                transition_proba = tag2_following_tag1_count.get(previous_tag + '-' + tag, 0) / previous_tag_count
            else:
                # No transition evidence at all: let the emission term decide.
                transition_proba = 1.0

            #Computing emission probabilities (every tag in `tags` has a count >= 1)
            emission_proba = word_given_tag_count.get(word + '|' + tag, 0) / tags_count[tag]

            #Computing state probabilities
            state_proba.append(emission_proba * transition_proba)

        #Getting state for which probability is maximum (first one wins on ties)
        max_state_proba = tags[state_proba.index(max(state_proba))]
        state.append(max_state_proba)
        previous_tag = max_state_proba

    return list(zip(words, state))

In [5]:
%%time
# Training-set accuracy: tag the same tokens the counts were built from,
# then measure the share of (word, tag) pairs that match the gold pairs.
words = train_tokens
train_tagged_seq = Viterbi(words, train_tagged_words)
check = [
    predicted
    for predicted, gold in zip(train_tagged_seq, train_tagged_words)
    if predicted == gold
]
train_accuracy = len(check) / len(train_tagged_seq)
print('Tagging accuracy on training set', train_accuracy)

Tagging accuracy on training set 0.9771374784291168
Wall time: 1.65 s


In [6]:
%%time
# Validation-set accuracy: flatten the held-out sentences, tag their tokens
# using counts from the training words, and measure the share of
# (word, tag) pairs that match the gold pairs.
validation_tagged_words = [
    pair
    for tagged_sentence in validation_tagged_sentences
    for pair in tagged_sentence
]
validation_tokens = [token for token, _ in validation_tagged_words]
words = validation_tokens
validation_tagged_seq_original = Viterbi(words, train_tagged_words)
check = [
    predicted
    for predicted, gold in zip(validation_tagged_seq_original, validation_tagged_words)
    if predicted == gold
]
validation_accuracy = len(check) / len(validation_tagged_seq_original)
print('Tagging accuracy on validation set', validation_accuracy)

Tagging accuracy on validation set 0.9087136929460581
Wall time: 188 ms
