**Implement HMM/CRF on sequence tagging task**



In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Error loading corpus: Package 'corpus' not found in index


False

In [4]:
import nltk
nltk.download('brown')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [5]:
nltk.download('universal_tagset')

from nltk.corpus import treebank,brown

corpus = brown.tagged_sents(tagset='universal')[:-100] 
#print(corpus[0])
tag_dict={}
word_dict={}

for sent in corpus:
    for elem in sent:
        w = elem[0]
        tag= elem[1]
        if w not in word_dict:
            word_dict[w]=0
        if tag not in tag_dict:
            tag_dict[tag]=0
        word_dict[w]+=1
        tag_dict[tag]+=1

print('Number of words(M): ',len(word_dict))
print('Number of tags(N): ',len(tag_dict))
print(tag_dict)
        
test_data= brown.tagged_sents(tagset='universal')[-10:]


[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
Number of words(M):  55907
Number of tags(N):  12
{'DET': 136724, 'NOUN': 275075, 'ADJ': 83581, 'VERB': 182380, 'ADP': 144483, '.': 147231, 'ADV': 56115, 'CONJ': 38067, 'PRT': 29759, 'PRON': 49174, 'NUM': 14853, 'X': 1369}


In [6]:
brown_tags_words = [ ]
for sent in corpus:  
    brown_tags_words.append( ("START", "START") )
    brown_tags_words.extend([ (tag, word) for (word, tag) in sent ])
    brown_tags_words.append( ("END", "END") )
    
print(brown_tags_words[0:30])


[('START', 'START'), ('DET', 'The'), ('NOUN', 'Fulton'), ('NOUN', 'County'), ('ADJ', 'Grand'), ('NOUN', 'Jury'), ('VERB', 'said'), ('NOUN', 'Friday'), ('DET', 'an'), ('NOUN', 'investigation'), ('ADP', 'of'), ('NOUN', "Atlanta's"), ('ADJ', 'recent'), ('NOUN', 'primary'), ('NOUN', 'election'), ('VERB', 'produced'), ('.', '``'), ('DET', 'no'), ('NOUN', 'evidence'), ('.', "''"), ('ADP', 'that'), ('DET', 'any'), ('NOUN', 'irregularities'), ('VERB', 'took'), ('NOUN', 'place'), ('.', '.'), ('END', 'END'), ('START', 'START'), ('DET', 'The'), ('NOUN', 'jury')]


In [7]:
CFD_tag_words = nltk.ConditionalFreqDist(brown_tags_words)
CPD_tag_words = nltk.ConditionalProbDist(CFD_tag_words, nltk.MLEProbDist)

brown_tags = [tag for (tag, word) in brown_tags_words ]

CFD_tags= nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))

CPD_tags = nltk.ConditionalProbDist(CFD_tags, nltk.MLEProbDist)

tag_trans_matrix=[]
j=-1
for tag1 in tag_dict:
    tag_trans_matrix.append([])
    j=j+1
    for tag2 in tag_dict:
        tag_trans_matrix[j].append(CPD_tags[tag1].prob(tag2))
print("State Transition matrix:")
print(tag_trans_matrix)

State Transition matrix:
[[0.005917029928907872, 0.6263860039203066, 0.23980427723004008, 0.06470700096544864, 0.009091308036628536, 0.012748310464878149, 0.017502413621602646, 0.0006436324273719318, 0.002011351335537287, 0.009895848570843451, 0.009764196483426465, 0.0013969749275913519], [0.01550122693810779, 0.14934835953830775, 0.012894665091338726, 0.1589093883486322, 0.24437698809415614, 0.2835844769608289, 0.026370989730073617, 0.0596782695628465, 0.01782786512769245, 0.01978369535581205, 0.008077796964464238, 0.0003308188675815687], [0.005850611981191898, 0.65281583134923, 0.05690288462688889, 0.01746808485182039, 0.08844115289360022, 0.10033380792285328, 0.009643339993539201, 0.037604240198131154, 0.019286679987078403, 0.003804692454026633, 0.006987233940728156, 0.0004905421088524905], [0.16292904923785503, 0.0975710055927185, 0.05753920386007238, 0.1843020067989911, 0.1691797346200241, 0.08060642614321746, 0.10326241912490404, 0.014376576378988924, 0.06559381511130606, 0.05492

In [8]:
def Viterbi(sentence):
    sentlen = len(sentence)
    distinct_tags = set(brown_tags)

    viterbi = [ ]
    backpointer = [ ]

    viterbi_init = { }
    backpointer_init = { }

    for tag in distinct_tags:
        if tag == "START": continue
        viterbi_init[ tag ] = CPD_tags["START"].prob(tag) * CPD_tag_words[tag].prob(sentence[0])
        backpointer_init[ tag ] = "START"
    viterbi.append(viterbi_init)
    backpointer.append(backpointer_init)

    for wordindex in range(1, len(sentence)):
        cur_viterbi = { }
        cur_backpointer = { }
        prev_viterbi = viterbi[-1]
            
        for tag in distinct_tags:
            if tag == "START": continue
            if sentence[wordindex] not in word_dict.keys():
                best_prevtag = max(prev_viterbi.keys(),key = lambda prevtag: \
                    prev_viterbi[ prevtag ] * CPD_tags[prevtag].prob(tag) *0.0001)
                cur_viterbi[tag] = prev_viterbi[ best_prevtag ] *\
                                    CPD_tags[best_prevtag].prob(tag) * 0.0001
            else:
                best_prevtag = max(prev_viterbi.keys(),key = lambda prevtag: \
                    prev_viterbi[ prevtag ] * CPD_tags[prevtag].prob(tag) * 
                                    CPD_tag_words[tag].prob(sentence[wordindex]))
                cur_viterbi[tag] = prev_viterbi[ best_prevtag ] *\
                                    CPD_tags[best_prevtag].prob(tag) *\
                                    CPD_tag_words[tag].prob(sentence[wordindex])
            cur_backpointer[tag] = best_prevtag

        viterbi.append(cur_viterbi)
        backpointer.append(cur_backpointer)

    prev_viterbi = viterbi[-1]
    best_prevtag = max(prev_viterbi.keys(),key = lambda prevtag: prev_viterbi[ prevtag ] *\
                       CPD_tags[prevtag].prob("END"))

    prob_best_seq = prev_viterbi[ best_prevtag ] * CPD_tags[ best_prevtag].prob("END")
    best_tag_seq = [ "END", best_prevtag ]

    backpointer.reverse()
    current_best_tag = best_prevtag
    for bp in backpointer:
        best_tag_seq.append(bp[current_best_tag])
        current_best_tag = bp[current_best_tag]

    best_tag_seq.reverse()
    
    return best_tag_seq[1:-1]

In [9]:
sentence = ['Hi','there','these','are','college','practicals', '.']

tag_seq=Viterbi(sentence)
print("\nThe given sentence:",sentence)
print("\nThe POS tags:",tag_seq)



The given sentence: ['Hi', 'there', 'these', 'are', 'college', 'practicals', '.']

The POS tags: ['PRT', 'PRT', 'DET', 'VERB', 'NOUN', 'NOUN', '.']


In [10]:
from sklearn.metrics import accuracy_score

test_list=[]
y_true=[]
y_pred=[]
i=-1
for sent in test_data:
    test_list.append([])
    i=i+1
    for elem in sent:
        test_list[i].append(elem[0])
        y_true.append(elem[1])
        
print(test_list[0])
        
print("\nActual tags:")
print("************")
print(y_true)

for sent in test_list:
    y_pred=y_pred+Viterbi(sent)
    
print("\nPredicted tags:")
print("************")
print(y_pred)
print("\nTotal testing accuracy: ",accuracy_score(y_true, y_pred))


['you', "can't", 'very', 'well', 'sidle', 'up', 'to', 'people', 'on', 'the', 'street', 'and', 'ask', 'if', 'they', 'want', 'to', 'buy', 'a', 'hot', 'Bodhisattva', '.']

Actual tags:
************
['PRON', 'VERB', 'ADV', 'ADV', 'VERB', 'ADP', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'CONJ', 'VERB', 'ADP', 'PRON', 'VERB', 'PRT', 'VERB', 'DET', 'ADJ', 'NOUN', '.', 'ADV', '.', 'ADP', 'PRT', 'VERB', 'PRT', 'VERB', 'X', 'X', 'X', 'ADV', 'ADV', 'ADP', 'NOUN', '.', 'NOUN', '.', 'NOUN', 'NOUN', '.', 'DET', 'NOUN', 'NOUN', '.', 'NOUN', 'NOUN', '.', 'NOUN', '.', 'CONJ', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PRT', 'VERB', '.', 'PRON', 'VERB', 'VERB', 'PRT', 'VERB', 'VERB', 'ADV', '.', 'DET', 'NOUN', '.', 'ADP', 'PRON', 'VERB', 'ADJ', 'ADV', 'PRT', 'VERB', 'DET', 'NOUN', '.', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'PRT', 'VERB', 'ADP', 'DET', 'ADJ', '.', 'VERB', 'PRON', 'PRT', '.', 'PRT', 'NOUN', 'CONJ', 'DET', 'NOUN', '.', 'ADP', 'PRON', 'VERB', 'VERB', 'ADP', 'PRON', 'PRT', 'VERB',

In [None]:
# It is using viterbe
