# Modifying the Viterbi Algorithm

### Required Imports

In [1]:
import json
import math
from sklearn.metrics import classification_report

In [2]:
# Padding symbol placed twice at the start of every word/tag sequence (see read_data).
START_SYMBOL = '*'
# Marker appended once at the end of every word/tag sequence.
STOP_SYMBOL = 'STOP'
# Placeholder that replace_rare() and do_viterbi() substitute for words outside the known-word set.
RARE_SYMBOL = '_RARE_'
# A word must occur MORE than this many times to count as known (see calc_known).
RARE_WORD_MAX_FREQ = 5
# Stand-in log probability used by do_viterbi() for transitions/emissions missing from the model.
LOG_PROB_OF_ZERO = -1000

### Preparing the dataset

In [3]:
# Receives a list of tagged sentences and processes each sentence to generate a list of words and a list of tags.
# Each sentence is a pair: a string of space-separated words, followed by a list holding one tag per word.
# Remember to include start and stop symbols in your returned lists, as defined by the constants START_SYMBOL and STOP_SYMBOL.
# penn_words (the list of words) should be a list where every element is a list of the words of a particular sentence.
# penn_tags (the list of tags) should be a list where every element is a list of the tags of a particular sentence.
def read_data(data):
    """Turn raw records into padded, parallel word and tag sequences.

    Every record is read as ``record[0]`` (a string of space-separated
    words) and ``record[1]`` (a list of tags).  Both sequences get two
    START_SYMBOLs in front and one STOP_SYMBOL at the end.

    Returns:
        A pair ``(words, tags)``; each is a list with one padded list per
        record.

    Raises:
        AssertionError: if a record has a different number of words and tags.
    """
    padded_words = []
    padded_tags = []
    for record in data:
        tokens = [START_SYMBOL, START_SYMBOL, *record[0].split(' '), STOP_SYMBOL]
        labels = [START_SYMBOL, START_SYMBOL] + record[1] + [STOP_SYMBOL]
        # Words and tags must line up one-to-one once padded.
        assert len(tokens) == len(labels)
        padded_words.append(tokens)
        padded_tags.append(labels)
    return padded_words, padded_tags

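# get_transitions_probs() (defined further below) takes tags from the training data and calculates tag trigram probabilities.
# It returns a python dictionary where the keys are tuples that represent the tag trigram, and the values are the log probability of that trigram.
# The two helpers below, get_bigrams() and get_trigrams(), list the consecutive pairs and triples of a sequence that it counts.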
def get_bigrams(item):
    """Return the list of adjacent pairs ``(item[i], item[i+1])`` of a sequence."""
    # Pairing the sequence with itself shifted by one yields every neighbour pair.
    return list(zip(item, item[1:]))

def get_trigrams(item):
    """Return the list of adjacent triples ``(item[i], item[i+1], item[i+2])``."""
    # Three staggered views of the sequence; zip stops at the shortest one.
    return list(zip(item, item[1:], item[2:]))

### Hidden Markov Model

In [4]:
def get_transitions_probs(penn_tags):
    """Estimate base-2 log probabilities of tag trigrams.

    Args:
        penn_tags: list of tag sequences (one list per sentence).

    Returns:
        Dict mapping each observed trigram ``(t1, t2, t3)`` to
        ``log2(count(t1, t2, t3)) - log2(count(t1, t2))``.
    """
    pair_totals = {}
    triple_totals = {}

    for tag_sequence in penn_tags:
        for pair in get_bigrams(tag_sequence):
            pair_totals[pair] = pair_totals.get(pair, 0) + 1
        for triple in get_trigrams(tag_sequence):
            triple_totals[triple] = triple_totals.get(triple, 0) + 1

    # The first two tags of a trigram form the conditioning bigram.
    return {
        triple: math.log(seen, 2) - math.log(pair_totals[triple[:2]], 2)
        for triple, seen in triple_totals.items()
    }

# This function takes output from get_transitions_probs() and outputs it in the proper format
def s2_output(transition_values, filename):
    """Write trigram log probabilities to ``filename``, one trigram per line.

    Args:
        transition_values: dict mapping a tag trigram (3-tuple of strings) to
            its log probability, as returned by get_transitions_probs().
        filename: path of the file to create or overwrite.

    Each line has the form ``TRIGRAM <tag1> <tag2> <tag3> <log probability>``
    and lines are sorted by trigram.
    """
    # The context manager closes the file even if formatting a line raises.
    with open(filename, "w") as outfile:
        for trigram in sorted(transition_values):
            fields = ['TRIGRAM', trigram[0], trigram[1], trigram[2],
                      str(transition_values[trigram])]
            outfile.write(" ".join(fields) + '\n')

# Takes the words from the training data and returns a set of all of the words that occur more than 5 times (use RARE_WORD_MAX_FREQ)
# penn_words is a python list where every element is a python list of the words of a particular sentence.
# Note: words that appear exactly 5 times should be considered rare!
def calc_known(penn_words):
    """Return the set of words seen more than RARE_WORD_MAX_FREQ times.

    Args:
        penn_words: list of sentences, each a list of words.

    A word occurring exactly RARE_WORD_MAX_FREQ times is NOT included.
    """
    frequencies = {}
    for sentence in penn_words:
        for token in sentence:
            frequencies[token] = frequencies.get(token, 0) + 1

    return {
        token
        for token, seen in frequencies.items()
        if seen > RARE_WORD_MAX_FREQ
    }

# Takes the words from the training data and a set of words that should not be replaced for '_RARE_'
# Returns the equivalent to penn_words but replacing the unknown words by '_RARE_' (use RARE_SYMBOL constant)
def replace_rare(penn_words, known_words):
    """Return a copy of ``penn_words`` with unknown words masked.

    Args:
        penn_words: list of sentences, each a list of words.
        known_words: collection of words to keep unchanged.

    Returns:
        A new list of lists in which every word not found in ``known_words``
        is replaced by RARE_SYMBOL.
    """
    return [
        [token if token in known_words else RARE_SYMBOL for token in sentence]
        for sentence in penn_words
    ]

# This function takes the output from replace_rare and outputs it to a file
def s3_output(rare, filename):
    """Write rare-replaced sentences to ``filename``, one sentence per line.

    Args:
        rare: list of sentences (lists of words) as produced by replace_rare().
        filename: path of the file to create or overwrite.

    The first two elements and the last element of every sentence are
    skipped (the start/stop padding added by read_data); the remaining words
    are joined with single spaces.
    """
    # The context manager closes the file even if joining a sentence raises.
    with open(filename, 'w') as outfile:
        for sentence in rare:
            outfile.write(' '.join(sentence[2:-1]) + '\n')

# Calculates emission probabilities and creates a set of all possible tags
# The first return value is a python dictionary where each key is a tuple in which the first element is a word
# and the second is a tag, and the value is the log probability of the emission of the word given the tag
# The second return value is a set of all possible tags for this data set
def get_emission_probs(penn_words_rare, penn_tags):
    """Estimate emission log probabilities conditioned on tag AND previous tag.

    Args:
        penn_words_rare: list of sentences (lists of words).
        penn_tags: list of tag sequences parallel to ``penn_words_rare``.

    Returns:
        A pair ``(e_values, taglist)``:
        * ``e_values`` maps ``(word, tag, previous_tag)`` to
          ``log2(count(word, tag, previous_tag)) - log2(count(tag, previous_tag))``;
        * ``taglist`` is the set of every tag seen in either position of a
          counted ``(tag, previous_tag)`` pair.
    """
    emission_counts = {}
    context_counts = {}

    for idx, tags in enumerate(penn_tags):
        sentence = penn_words_rare[idx]
        # Position 0 has no predecessor, so counting starts at position 1.
        for pos in range(1, len(tags)):
            context = (tags[pos], tags[pos - 1])
            observation = (sentence[pos],) + context
            emission_counts[observation] = emission_counts.get(observation, 0) + 1
            context_counts[context] = context_counts.get(context, 0) + 1

    e_values = {
        observation: math.log(seen, 2) - math.log(context_counts[observation[1:]], 2)
        for observation, seen in emission_counts.items()
    }
    taglist = {tag for context in context_counts for tag in context}

    return e_values, taglist

# This function takes the emission probabilities returned by get_emission_probs() and writes them to a file
def s4_output(e_values, filename):
    """Write emission log probabilities to ``filename``, one entry per line.

    Args:
        e_values: dict mapping a 3-tuple of strings to a log probability, as
            returned by get_emission_probs().
        filename: path of the file to create or overwrite.

    Each line holds the three key elements followed by the log probability,
    separated by single spaces; lines are sorted by key.
    """
    # The context manager closes the file even if formatting a line raises.
    with open(filename, "w") as outfile:
        for item in sorted(e_values):
            fields = [item[0], item[1], item[2], str(e_values[item])]
            outfile.write(" ".join(fields) + '\n')



### The Modified Viterbi Algorithm
This function takes data to tag (penn_dev_words), a set of all possible tags (taglist), a set of all known words (known_words), trigram probabilities (transition_values) and emission probabilities (e_values), and outputs a list where every element is a tagged sentence (in the WORD_/TAG format, separated by spaces and with a newline at the end, just like our input tagged data). penn_dev_words is a python list where every element is a python list of the words of a particular sentence. taglist is a set of all possible tags. known_words is a set of all known words. transition_values is the return value of get_transitions_probs(). e_values is the first return value of get_emission_probs(). The return value is a list of tagged sentences in the format "WORD_/TAG", separated by spaces. Each sentence is a string with a terminal newline, not a list of tokens. Remember also that the output should not contain the "_RARE_" symbol, but rather the original words of the sentence!

In [5]:
def do_viterbi(penn_dev_words, taglist, known_words, transition_values, e_values):
    """Tag sentences with a trigram Viterbi search using tag-pair emissions.

    The score of tag ``v`` at a position, given the two previous tags
    ``w, u``, is ``transition_values[(w, u, v)] + e_values[(word, v, u)]``;
    any key missing from either dict scores LOG_PROB_OF_ZERO.  Neither dict is
    modified.

    Args:
        penn_dev_words: list of sentences, each a list of word strings.
        taglist: collection of all tags.  It must contain START_SYMBOL,
            otherwise no path can leave the initial state.
        known_words: collection of known words; any other word is scored as
            RARE_SYMBOL (the original word is still used in the output).
        transition_values: dict ``(w, u, v) -> log probability``.
        e_values: dict ``(word, tag, previous_tag) -> log probability``.

    Returns:
        A list with one string per input sentence.  Each string holds one
        ``WORD_/TAG`` token per element of the input sentence, separated by
        single spaces, and ends with a newline.

    Raises:
        ValueError: if no tag path could be built for a sentence.

    Side effects:
        Prints the number of sentences up front and a running count after
        every 100 tagged sentences.
    """
    tagged = []

    print(len(penn_dev_words))

    for item in penn_dev_words:
        # A final STOP position forces every path to end in the STOP tag.
        sentence = item + [STOP_SYMBOL]
        last = len(sentence) - 1

        # best_score[(u, v)]: best log score of any path ending in tags u, v.
        # best_path[(u, v)]: the tag sequence achieving that score.
        best_score = {(START_SYMBOL, START_SYMBOL): 0}
        best_path = {(START_SYMBOL, START_SYMBOL): [START_SYMBOL, START_SYMBOL]}

        for position, raw_word in enumerate(sentence):
            if position == last:
                word = STOP_SYMBOL
                tagspace = [STOP_SYMBOL]
            else:
                word = raw_word if raw_word in known_words else RARE_SYMBOL
                tagspace = list(taglist)

            next_score = {}
            next_path = {}
            for v in tagspace:
                for u in taglist:
                    # .get() keeps the caller's dicts free of filler entries.
                    emission = e_values.get((word, v, u), LOG_PROB_OF_ZERO)
                    top_w = None
                    top_score = None
                    for w in taglist:
                        if (w, u) not in best_score:
                            continue
                        transition = transition_values.get((w, u, v), LOG_PROB_OF_ZERO)
                        candidate = best_score[(w, u)] + transition + emission
                        # Strict '>' keeps the first of several equal scores.
                        if top_score is None or candidate > top_score:
                            top_score = candidate
                            top_w = w
                    if top_score is not None:
                        next_score[(u, v)] = top_score
                        next_path[(u, v)] = best_path[(top_w, u)] + [v]

            best_score = next_score
            best_path = next_path

        if not best_score:
            raise ValueError(
                "no tag path found; taglist must contain START_SYMBOL")

        # Pick the highest-scoring final tag pair (the first one wins ties).
        final_bigram = max(best_score, key=best_score.get)
        # Drop the two START tags and the final STOP tag from the path.
        chosen_tags = best_path[final_bigram][2:-1]
        tokens = [word + '_/' + tag for word, tag in zip(sentence, chosen_tags)]
        tagged.append(' '.join(tokens).strip() + '\n')
        if len(tagged) % 100 == 0:
            print(len(tagged))

    return tagged

def s5_read(filename):
    """Return the first line of ``filename`` (including its newline, if any).

    NOTE(review): only the first line is read; confirm whether callers expect
    the whole file.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as infile:
        return infile.readline()

# This function takes the output of do_viterbi() and outputs it to file
def s5_output(tagged, filename):
    """Write tagged sentences to ``filename``.

    Args:
        tagged: the output of do_viterbi(): a list of sentence strings, each
            already ending with a newline.  A single string is also accepted
            and written as is.
        filename: path of the file to create or overwrite.

    The data passed in is what gets written.  Earlier code re-read
    ``filename`` and wrote its own first line back, which discarded
    ``tagged`` and failed when the file did not exist yet.
    """
    with open(filename, 'w') as outfile:
        if isinstance(tagged, str):
            outfile.write(tagged)
        else:
            # Sentences carry their own trailing newline; no separator needed.
            outfile.writelines(tagged)


### Evaluation and Testing

In [6]:
def split_devtags(dev_sentences):
    """Split tagged sentence strings back into word lists and tag lists.

    Args:
        dev_sentences: iterable of strings made of space-separated
            ``WORD_/TAG`` tokens (surrounding whitespace is stripped).

    Returns:
        A pair ``(penn_words, penn_tags)`` of parallel lists of lists.

    Raises:
        IndexError: if a token does not contain the ``_/`` separator.
    """
    penn_words = []
    penn_tags = []

    for line in dev_sentences:
        # Split on the LAST '_/' so a word may itself contain the separator.
        pieces = [token.rsplit('_/', 1) for token in line.strip().split(' ')]
        penn_words.append([piece[0] for piece in pieces])
        penn_tags.append([piece[1] for piece in pieces])

    return penn_words, penn_tags

def print_accuracy(test_penn_tags, test_tags, padding):
    """Print a classification report comparing reference and predicted tags.

    Args:
        test_penn_tags: list of reference tag sequences.
        test_tags: list of predicted tag sequences.
        padding: unused.

    The first two and the last element of every sequence are dropped
    (presumably the start/stop padding) before the sequences are flattened.

    Raises:
        AssertionError: if both sides do not flatten to the same length.
    """
    # Flatten the reference tags.
    reference = [tag for sequence in test_penn_tags for tag in sequence[2:-1]]

    # Flatten the predicted tags.
    predicted = [tag for sequence in test_tags for tag in sequence[2:-1]]

    assert len(reference) == len(predicted)
    print(classification_report(reference, predicted))

def change_tags(all_tags, choice):
    """Reduce hyphenated tags to one of their components.

    Args:
        all_tags: list of tag sequences.
        choice: ``'1'`` keeps the part before the first hyphen; any other
            value keeps the part between the first and second hyphen.

    Returns:
        A new list of tag sequences; tags without a hyphen are unchanged.
    """
    part_index = 0 if choice == '1' else 1

    def simplify(tag):
        parts = tag.split('-')
        return tag if len(parts) == 1 else parts[part_index]

    return [[simplify(tag) for tag in sequence] for sequence in all_tags]

In [7]:
def main():
    """Train the tagger on ./penn-data.json and tag one hard-coded sentence.

    The tagged sentence is handed to s5_output() together with the file name
    ``taggedresult.txt``.
    """
    # Load the Penn training data.
    with open('./penn-data.json', 'r') as infile:
        json_data = json.load(infile)

    # Question 1: padded word and tag sequences.
    all_words, all_tags = read_data(json_data)
    padding = 'result'

    # Question 2: tag trigram log probabilities.
    transition_values = get_transitions_probs(all_tags)

    # Question 3: words with count > 5, then the rare-masked training words.
    known_words = calc_known(all_words)
    penn_words_rare = replace_rare(all_words, known_words)

    # Question 4: emission log probabilities and the tag inventory.
    e_values, taglist = get_emission_probs(penn_words_rare, all_tags)

    # The masked copy is no longer needed once the emissions are computed.
    del penn_words_rare

    # Test data: a single sentence framed like the training sequences.
    t_sentence = 'That former Sri Lanka skipper and ace batsman Aravinda De Silva is a man of few words was very much evident on Wednesday when the legendary batsman , who has always let his bat talk , struggled to answer a barrage of questions at a function to_F promote.'
    framed = [START_SYMBOL, START_SYMBOL] + t_sentence.split(' ') + [STOP_SYMBOL]
    test_data = [framed]

    tag_out = do_viterbi(test_data, taglist, known_words, transition_values, e_values)
    s5_output(tag_out, 'tagged' + padding + '.txt')

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__": 
    main()

1
