In [39]:
import nltk, unittest, itertools, numpy as np
from nltk.corpus import brown
import copy

####  For ease of grading please use the following template for your code.  Replace the DefaultTagger with code for HMM Tagger.  You may add other classes and functions, but please do not remove the existing functions like untag(), evaluate(), etc.  We should be able to simply run all the cells in your script to get the accuracies of your model.

In [70]:
class DefaultTagger:
    
    def __init__(self):
        self.individual_tag = {}
        self.transitional_prob_counter = {}
        self.emission_prob_counter = {}
        self.transitional_prob = {}
        self.emission_prob = {}
        
    def train(self, tagged_sent):
        prev_tag = '<s>'
        self.individual_tag[prev_tag] = self.individual_tag.get(prev_tag, 0) + 1
        for sentence in tagged_sent:
            
            #print (len(sentence))
            #for x in sentence:
            #print (x)
            word = sentence[0]
            tag = sentence[1]
            #Add 1 to the counter of tags
            self.individual_tag[tag] = self.individual_tag.get(tag, 0) + 1

            #Add to the transitional probability counter
            if prev_tag not in self.transitional_prob_counter.keys():
                self.transitional_prob_counter[prev_tag] = {tag:0}
            #if tag not in self.transitional_prob[prev_tag].keys():
                #self.transitional_prob[prev_tag][tag] = 0
            self.transitional_prob_counter[prev_tag][tag] = self.transitional_prob_counter[prev_tag].get(tag, 0) + 1
            prev_tag = tag  #change the value of preious tag to the next tag

            #Add to the emission probability counter
            if word not in self.emission_prob_counter.keys():
                self.emission_prob_counter[word] = {tag:0}
            self.emission_prob_counter[word][tag] = self.emission_prob_counter[word].get(tag, 0) + 1
        
    
    def convert_to_prob(self):
        '''
        Converts the Transitional and Emission counter to probabilities
        '''
        self.transitional_prob = copy.deepcopy (self.transitional_prob_counter)
        self.emission_prob = copy.deepcopy(self.emission_prob_counter)
        for prev in self.transitional_prob:
            for pos in [tag for tag in self.individual_tag if tag != '<s>']:
                #print (prev, pos)
                self.transitional_prob[prev][pos] = (self.transitional_prob[prev].get(pos, 0)+1)/(self.individual_tag[prev]+len(self.individual_tag.keys()))
                
        #Really??
        #del(self.individual_tag['<s>'])
        
        for word in self.emission_prob:
            for pos in [tag for tag in self.individual_tag if tag != '<s>']:
                self.emission_prob[word][pos] = (self.emission_prob[word].get(pos, 0)+1)/self.individual_tag[pos]+(len(self.individual_tag.keys()))
                #Smoothed??
    
    def test_word_unseen(self, word):
        '''
        Smoothin when a word on the test set has not been seen
        '''
        for tag in self.individual_tag:
            self.emission_prob[word] = {tag: 1/self.individual_tag[tag]}
        
    def predict (self, s):
        '''
        Returns a list of POS for a given test sentence s
        '''
        self.convert_to_prob()
        #s = ['<s>'] + s
        prev = '<s>'
        t = len(self.individual_tag.keys())  #  Must be == len(self.emission_prob[word])
        results = [[1.0, []]]*len(s)*t
        for i in range(len(s)):
            word = s[i]
            j = 0
            if word not in self.emission_prob:
                self.test_word_unseen(word)
            for tag in self.emission_prob[word]:  #TODO Unseen words #Done
                em_prob = self.emission_prob[word][tag]  #Emission probability
                tran_prob = self.transitional_prob[prev][tag]  #Transitional Probability
                prob = em_prob*tran_prob  #Joint probability
                index = i*t+j  
                results[index][0] *= prob
                results[index][1].append(tag)
                prev = tag
                j =+1
                if (j>=t): print ('ALERT!')
        return max(results)[1]
        
    def tag(self, s):
        return list(zip(s, self.predict(s)))
        #return list(zip(s, ['NOUN']*len(s)))


In [73]:
def untag(tagged_sentence):
    return [w for (w,t) in tagged_sentence]

def evaluate(gold, predicted):
    if len(gold) != len(predicted):
        raise Exception("Mismatching length")
    count = 0
    for (g,p) in zip(gold, predicted):
        if g[1] == p[1]:
            count += 1
    l = len(gold)
    print (count, l)
    return(count == l, count, l)

def tagger_train(tagger, train):
    for ts in train:
        tagger.train(ts)

def tagger_accuracy(tagger, test):
    total_words = 0
    total_sentences = len(test)
    correct_words = 0
    correct_sentences = 0
    for ts in test:
        pred = tagger.tag(untag(ts))
        is_correct, num_correct, total =  evaluate(ts, pred)
        if is_correct: 
            correct_sentences += 1
        correct_words += num_correct
        total_words += total
    return(correct_sentences/total_sentences, correct_words/total_words)
                          

In [74]:
k = 5
max_tag_length = 4
# Use smaller values during development and code testing

brown_tagged_sents = [s for s in brown.tagged_sents(tagset="universal") if len(s) <= max_tag_length]
num_in_fold = len(brown_tagged_sents) // k

sentence_accuracies = []
word_accuracies = []
for i in range(k):
    training_set = (brown_tagged_sents[0:i*num_in_fold] + 
                        brown_tagged_sents[(i+1)*num_in_fold:])
    test_set = brown_tagged_sents[i*num_in_fold: (i+1)*num_in_fold]
    #
    # IN THE FOLLOWING REPLACE THE DefaultTagger() WITH YOUR HMM TAGGER
    #
    tagger = DefaultTagger()
    tagger_train(tagger, training_set)
    sentence_accuracy, word_accuracy = tagger_accuracy(tagger, test_set)
    sentence_accuracies.append(sentence_accuracy)
    word_accuracies.append(word_accuracy)
print('Sentence', np.array(sentence_accuracies).mean(), 'Word', np.array(word_accuracies).mean())

#
# WITH HMM TAGGING YOU SHOULD GET SENTENCE LEVEL ACCURACY OF AT LEAST 0.3, 
# AND WORD LEVEL ACCURACY OF AT LEAST 0.6.  
#


0 2
0 2
0 3
0 3
0 2
0 3
0 1
0 3
0 3
0 3
0 3
0 3
1 3
0 3
1 4
0 3
0 3
0 3
0 3
1 4
0 3
0 3
0 1
0 1
0 1
0 3
0 3
1 3
0 4
0 3
1 3
1 4
0 3
0 1
0 2
0 2
0 2
0 1
0 2
0 2
0 2
0 3
0 1
0 1
0 2
0 4
0 1
0 3
0 2
0 2
0 2
0 3
0 3
0 1
0 1
0 2
0 2
0 1
0 2
0 3
0 4
0 4
0 4
1 4
0 3
0 4
0 2
0 2
1 4
1 4
1 4
0 4
0 1
0 2
1 3
0 3
0 3
0 2
0 2
0 4
0 2
0 2
0 2
0 2
0 2
1 3
0 4
1 3
0 2
0 2
0 3
0 3
0 3
0 4
1 4
1 4
0 2
0 3
0 4
0 3
0 3
0 3
0 3
0 3
0 3
0 3
0 2
0 3
0 2
0 2
0 3
0 3
0 3
0 4
0 3
0 2
1 4
0 2
0 3
0 4
1 4
1 4
1 4
0 3
0 4
0 3
1 4
0 3
0 4
1 3
1 4
1 4
1 4
0 3
0 1
1 4
0 2
0 2
0 3
0 1
0 3
0 3
0 1
0 3
0 4
0 4
0 2
0 2
0 3
1 3
0 4
0 3
0 2
0 2
0 3
0 2
0 2
0 4
0 1
1 4
1 4
0 2
0 2
0 2
1 4
0 4
0 3
1 4
1 3
0 3
1 4
0 4
1 3
0 1
0 4
0 2
0 3
1 3
0 4
0 2
1 3
1 3
0 3
0 2
0 2
0 2
0 3
1 3
0 2
0 3
0 3
0 2
1 3
0 2
0 3
0 3
1 3
0 2
0 2
0 3
0 3
0 3
0 2
0 2
0 3
0 4
0 3
0 3
0 2
0 4
0 3
0 3
0 4
0 2
1 4
0 3
0 3
0 3
0 3
0 3
0 4
0 3
1 4
0 2
0 2
0 2
0 2
0 3
0 2
0 3
1 3
0 1
0 1
0 4
0 4
0 2
0 4
0 2
1 3
0 3
0 3
0 1
0 2
0 1
1 3
0 3
1 4
1 3
0 2
0 3


In [68]:
Sentence 0.0 Word 0.0585442575123

KeyError: '<s>'