In [117]:
import pandas as pd
import numpy as np
import os
import math

### Import Data as Dataframe

In [161]:
# Locate the training CSV relative to the directory the notebook runs from.
# `cwd` and `rel_path` are reused by the later cell that loads the evaluation file.
cwd = os.getcwd()
rel_path = '/data_release'
filename = '/train.csv'
train_path = ''.join([cwd, rel_path, filename])
train_data = pd.read_csv(train_path, encoding="ISO-8859-1")
train_data.head()

Unnamed: 0,sentence,pos_seq,label_seq
0,Ca n't fail to be entertaining .,"['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...","[0, 0, 0, 0, 0, 0, 0]"
1,How much was he going to tell her ?,"['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'PART',...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Up until that news hit the Committee , Don had...","['ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', '...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,Could go on to the rugby and go with them coul...,"['VERB', 'VERB', 'PART', 'ADP', 'DET', 'NOUN',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"Finally , we went to the office and they gave ...","['ADV', 'PUNCT', 'PRON', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."


### Unknown Word Handling

In [214]:
def unknown_words(wordlist, p=0.005, unk_token='UNK'):
    """Randomly replace a fraction of the words with an unknown-word token.

    Each position is replaced independently with probability ``p``. The
    replaced positions give the model counts for a placeholder token that can
    stand in for words never seen during training.

    Parameters
    ----------
    wordlist : list of str
        Words to process. The list is modified in place.
    p : float, optional
        Per-word replacement probability (default 0.005).
    unk_token : str, optional
        Token written in place of a replaced word (default ``'UNK'``).

    Returns
    -------
    list of str
        The same list object that was passed in, after replacement.

    Notes
    -----
    Draws from NumPy's global random state; call ``np.random.seed`` beforehand
    if the result needs to be reproducible.
    """
    # One vectorised draw for the whole list instead of one RNG call per word.
    tosses = np.random.binomial(n=1, p=p, size=len(wordlist))
    for i in np.flatnonzero(tosses):
        wordlist[i] = unk_token
    return wordlist

### Converting all sentences into corresponding Words in a List

In [215]:
# Flatten the whole training corpus into one list of lowercase tokens.

sentence = list(train_data['sentence'])

# Every sentence is followed by a single space so that neighbouring
# sentences do not fuse together when the text is tokenised on whitespace.
word_str = ''.join(item + ' ' for item in sentence).lower()

# Lowercasing happens before UNK substitution, so the 'UNK' token stays uppercase.
word_list = unknown_words(word_str.split())          # def unknown_words()
print(len(word_list))


116622


### Converting all metaphor labels into one huge list of labels of 0's and 1's

In [216]:
# Flatten the per-sentence label strings into one list of integer labels.

tag = train_data['label_seq'].tolist()

# Each entry is the text of a list such as "[0, 0, 1]": drop the surrounding
# brackets and terminate it with ', ' so all sentences form one long
# comma-separated string.
tag_str = ''.join(item[1:-1] + ', ' for item in tag)

# The trailing separator produces one empty field at the end; discard it.
tag_list_str = tag_str.split(', ')[:-1]
tag_list = [int(item) for item in tag_list_str]

len(tag_list)

116622

### Creating Dictionary for each word and its count of tags

In [217]:
# Build word_dict: word -> [count seen with label 0, count seen with label 1].

word_dict = {}

# zip() silently stops at the shorter input, so a length mismatch would
# pair words with the wrong labels without raising any error. Fail loudly.
assert len(word_list) == len(tag_list), (
    "word/label count mismatch: {} words vs {} labels".format(
        len(word_list), len(tag_list)
    )
)

for word, label in zip(word_list, tag_list):
    # setdefault creates the [0, 0] counter the first time a word is seen.
    word_dict.setdefault(word, [0, 0])[label] += 1

print(word_dict['UNK'])

[530, 75]


### Creating Dictionary of Tag counts (Unigrams and Bigrams)

In [218]:
# Count tag unigrams and tag bigrams into a single dictionary.
# Unigram keys are '0', '1' and '<s>'; bigram keys are the two tokens
# concatenated, e.g. '01' or '<s>0'.

tag_dict = {}

tag = train_data['label_seq'].tolist()

# Strip the brackets from every label string and append a '<s>' marker after
# each sentence, so the marker sits between consecutive sentences.
tag_str = ''.join(item[1:-1] + ', ' + '<s>' + ', ' for item in tag)

# The trailing separator yields one empty field at the end; drop it.
tag_list_str_s = tag_str.split(', ')[:-1]

# Adjacent pairs of the unigram stream, joined into one key per pair.
tag_list_str_bigram = [
    first + second
    for first, second in zip(tag_list_str_s, tag_list_str_s[1:])
]

# Unigrams are counted before bigrams so the key order matches first occurrence.
for item in tag_list_str_s + tag_list_str_bigram:
    tag_dict[item] = tag_dict.get(item, 0) + 1

print(tag_dict)

{'0': 103571, '<s>': 6323, '1': 13051, '00': 87135, '0<s>': 6267, '<s>0': 5829, '01': 10169, '10': 10606, '11': 2389, '<s>1': 493, '1<s>': 56}


### Transition Probability

In [92]:
def t_prob(tag_dict, i, j):
    """Return the transition probability of tag ``i`` given previous tag ``j``.

    Parameters
    ----------
    tag_dict : dict
        Maps unigram keys (``str(tag)``) and bigram keys
        (``str(prev) + str(tag)``) to their counts.
    i : int
        Current tag.
    j : int
        Previous tag, or ``-1`` when there is no previous tag. In that case
        the relative frequency of tag ``i`` among tags ``0`` and ``1`` is
        returned.

    Returns
    -------
    float
        ``count(j, i) / count(j)``, or the unigram relative frequency when
        ``j == -1``. Returns ``0.0`` when the bigram was never observed or
        the denominator count is zero, instead of raising.
    """
    if j == -1:
        numerator = tag_dict.get(str(i), 0)
        denominator = tag_dict.get(str(0), 0) + tag_dict.get(str(1), 0)
    else:
        # A bigram absent from the training data simply has count 0.
        numerator = tag_dict.get(str(j) + str(i), 0)
        denominator = tag_dict.get(str(j), 0)

    if denominator == 0:
        return 0.0
    return numerator / denominator

### Emission Probability

In [170]:
def emiss_prob(word_dict,tag_dict, word, i):
    """Return the emission probability of ``word`` under tag ``i``.

    The probability is the number of times the word was seen with tag ``i``
    divided by the total count of tag ``i``. A word missing from
    ``word_dict`` is scored using the counts stored under ``'UNK'``.

    Parameters
    ----------
    word_dict : dict
        Maps a word to a sequence of counts indexed by tag.
    tag_dict : dict
        Maps ``str(tag)`` to the total count of that tag.
    word : str
        Word being emitted.
    i : int
        Tag index.
    """
    lookup = word if word in word_dict else 'UNK'
    return word_dict[lookup][i] / tag_dict[str(i)]

### Viterbi Algorithm on HMM

In [178]:
def viterbi(item, word_dict, tag_dict, ntags):
    """Decode the most likely tag sequence for one sentence with Viterbi.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities, so long sentences do not underflow to 0.0 (which would
    make every path tie and collapse the prediction to all zeros). A zero
    probability is represented as ``-inf``.

    Parameters
    ----------
    item : str
        The sentence; it is tokenised on whitespace.
    word_dict : dict
        Word -> per-tag counts, passed through to ``emiss_prob``.
    tag_dict : dict
        Tag unigram/bigram counts, passed through to ``t_prob`` and
        ``emiss_prob``.
    ntags : int
        Number of tags; tags are the integers ``0 .. ntags - 1``.

    Returns
    -------
    list of int
        One predicted tag per token. Empty list for an empty or
        whitespace-only sentence.
    """
    words = item.split()
    n_words = len(words)
    if n_words == 0:
        # Nothing to tag; avoids indexing the first word of an empty sentence.
        return []

    neg_inf = float('-inf')

    def _log(p):
        # math.log raises on 0, so map impossible events to -inf explicitly.
        return math.log(p) if p > 0 else neg_inf

    # log_trans[j][c] = log P(tag c | previous tag j); independent of position,
    # so it is computed once instead of inside the token loop.
    log_trans = [
        [_log(t_prob(tag_dict, c, j)) for c in range(ntags)]
        for j in range(ntags)
    ]

    score = np.full((ntags, n_words), -np.inf)
    b_ptr = np.zeros((ntags, n_words), dtype=int)

    # First token: t_prob is queried with previous tag -1 (no previous tag).
    for c in range(ntags):
        score[c, 0] = (
            _log(t_prob(tag_dict, c, -1))
            + _log(emiss_prob(word_dict, tag_dict, words[0], c))
        )

    for t in range(1, n_words):
        for c in range(ntags):
            log_emission = _log(emiss_prob(word_dict, tag_dict, words[t], c))
            candidates = [
                score[j, t - 1] + log_trans[j][c] + log_emission
                for j in range(ntags)
            ]
            # argmax returns the first maximum, so ties resolve to the lowest tag.
            best_prev = int(np.argmax(candidates))
            score[c, t] = candidates[best_prev]
            b_ptr[c, t] = best_prev

    # Follow the back-pointers from the best final tag to recover the path.
    Tseq = [0] * n_words
    Tseq[-1] = int(np.argmax(score[:, -1]))
    for t in range(n_words - 2, -1, -1):
        Tseq[t] = int(b_ptr[Tseq[t + 1], t + 1])

    return Tseq

### Loading the Evaluation Set (validation or unlabeled test) and running the Viterbi Algorithm on each sentence

In [226]:
# Evaluation file to tag. Swap in the commented line to use the validation split.
# filename = '/val.csv'
filename = '/test_no_label.csv'
val_path = ''.join([cwd, rel_path, filename])
val_data = pd.read_csv(val_path, encoding="ISO-8859-1")
val_data.head()

Unnamed: 0,sentence,pos_seq
0,For all his use of the model of the boxing mat...,"['ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'N..."
1,"Trade you in for a couple of camels , he 'd co...","['NOUN', 'PRON', 'ADV', 'ADP', 'DET', 'NOUN', ..."
2,She does n't stoop in and,"['PRON', 'VERB', 'ADV', 'VERB', 'PART', 'CCONJ']"
3,take you up .,"['VERB', 'PRON', 'PART', 'PUNCT']"
4,I will go now and deliver them myself to Narok .,"['PRON', 'VERB', 'VERB', 'ADV', 'CCONJ', 'VERB..."


In [227]:
val_sentence = val_data['sentence'].tolist()
ntags = 2

# Predicted labels for every token, concatenated across sentences in file order.
tagseq = []
for item in val_sentence:
    # Sentences are lowercased before decoding, as the training words were.
    tagseq.extend(viterbi(item.lower(), word_dict, tag_dict, ntags))

print(len(tagseq))

# Ground-truth labels: only applicable when the loaded file has a 'label_seq'
# column (the validation split); kept disabled for the unlabeled test file.
# # NOT REQUIRED
# val_label =  val_data['label_seq'].tolist()
# tag_str_val = ''
# for item in val_label:
#     item = item[1:-1]
#     tag_str_val += item + ', '

# tag_list_str_val = tag_str_val.split(', ')
# tag_list_str_val = tag_list_str_val[0:-1]
# labelseq = []                                # Truthful Labels (Ground Truths)
# for item in tag_list_str_val:
#     labelseq.append(int(item))

# print(len(labelseq))

# One row per token: a 1-based 'idx' index and the predicted 'label'.
output = pd.DataFrame(tagseq)
output.columns = ['label']
output.index = output.index + 1
output.index.name = 'idx'

path = '{}{}'.format(os.getcwd(), '/result_HMM_test.csv')
output.to_csv(path)
output.head()


50175


Unnamed: 0_level_0,label
idx,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0
