In [2]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
 
#download the treebank corpus from nltk
nltk.download('treebank')
 
#download the universal tagset from nltk
nltk.download('universal_tagset')
 
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
 
#print the first two sentences along with tags
print(nltk_data[:2])

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\syeds\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\syeds\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [4]:

# split data into training and validation set in the ratio 80:20
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)
# create list of train and test tagged words
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [5]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
 
     
    return (count_w_given_tag, count_tag)

# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

[[6.83371304e-03 1.41230067e-02 4.19134386e-02 5.01138950e-03
  2.12756261e-01 3.69020514e-02 9.56719834e-03 4.84738052e-01
  6.83371304e-03 2.23234631e-02 8.83826911e-02 7.06150308e-02]
 [1.76125243e-02 1.17416831e-03 4.50097844e-02 2.34833662e-03
  2.50489235e-01 9.39334650e-03 1.01369865e-01 4.01174158e-01
  5.67514673e-02 1.95694715e-02 1.21330721e-02 8.29745606e-02]
 [6.87694475e-02 2.78940029e-03 9.23720598e-02 6.00793920e-02
  2.18538776e-01 5.25694676e-02 1.72191828e-01 8.96899477e-02
  7.82104954e-02 9.29084867e-02 2.56410260e-02 4.61323895e-02]
 [6.03732169e-02 4.39077942e-03 3.51262353e-02 5.48847427e-04
  3.49066973e-01 5.70801310e-02 1.23490669e-01 1.50384188e-01
  4.06147093e-02 5.59824370e-02 9.33040585e-03 1.13611415e-01]
 [4.65906132e-03 4.39345129e-02 2.40094051e-01 4.24540639e-02
  2.62344331e-01 1.68945398e-02 1.31063312e-02 1.49133503e-01
  9.14395228e-03 1.76826611e-01 2.88252197e-02 1.25838192e-02]
 [1.20248254e-02 1.47401085e-02 1.39255241e-01 6.98215654e-03
  3

In [6]:
	
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [7]:
test_sent="Will can see Marry"
#pred_tags_rule=Viterbi_rule_based(test_sent.split())
pred_tags_withoutRules= Viterbi(test_sent.split())
#print(pred_tags_rule)
print(pred_tags_withoutRules)

[('Will', 'PRON'), ('can', 'VERB'), ('see', 'VERB'), ('Marry', 'PRON')]


In [None]:
pred_tags_withoutRules= Viterbi(test_sent.split())
#print(pred_tags_rule)
print(pred_tags_withoutRules)