In [1]:
import pandas as pd
import numpy as np

# Task 1: Vocabulary Creation

In [196]:
train = pd.read_csv("data/train", sep='\t', header=None, names=['index', 'word', 'postag'])

In [197]:
vocabulary = {}
def create_vocab(word, vocabulary):
    if word in vocabulary:
        vocabulary[word] += 1
    else:
        vocabulary[word] = 1 
    return word

train['word_duplicate'] = train['word'].apply(lambda x: create_vocab(x, vocabulary))

In [198]:
def filter_vocab(vocabulary, threshold):
    filtered_vocabulary = {}
    filtered_vocabulary['< unk >'] = 0
    
    for word, value in vocabulary.items():
        if value <= threshold:
            filtered_vocabulary['< unk >'] += value
        else:
            filtered_vocabulary[word] = value
    
    return dict(sorted(filtered_vocabulary.items(), key = lambda x:x[1], reverse=True))
            
threshold = 1
vocab = filter_vocab(vocabulary, threshold)          

In [199]:
print("Size of vocabulary: ", len(vocab))
print("Number of occurrences of < unk >: ", vocab['< unk >'])

Size of vocabulary:  23183
Number of occurrences of < unk >:  20011


In [200]:
vocabtxt = open('vocab.txt', 'w')
s = '< unk >\t' + '0' + '\t' + str(vocab['< unk >']) + '\n'
vocabtxt.write(s)

index = 1
for word, value in vocab.items():
    if word != '< unk >':
        s = word + '\t' + str(index) + '\t' + str(vocab[word]) + '\n'
        vocabtxt.write(s)
        index += 1
        
vocabtxt.close()

# Task 2: Model Learning

In [201]:
postag_vocab = {}

train['postag_duplicate'] = train['postag'].apply(lambda x: create_vocab(x, postag_vocab))

In [202]:
print(postag_vocab)
print(len(postag_vocab))

{'NNP': 87608, ',': 46480, 'CD': 34876, 'NNS': 57859, 'JJ': 58944, 'MD': 9437, 'VB': 25489, 'DT': 78775, 'NN': 127534, 'IN': 94758, '.': 37883, 'VBZ': 20982, 'VBG': 14348, 'CC': 22817, 'VBD': 28309, 'VBN': 19330, 'RB': 29621, 'TO': 21461, 'PRP': 16766, 'RBR': 1675, 'WDT': 4194, 'VBP': 12326, 'RP': 2515, 'PRP$': 7989, 'JJS': 1867, 'POS': 8284, '``': 6782, 'EX': 833, "''": 6622, 'WP': 2285, ':': 4680, 'JJR': 3174, 'WRB': 2050, '$': 6937, 'NNPS': 2505, 'WP$': 166, '-LRB-': 1305, '-RRB-': 1321, 'PDT': 333, 'RBS': 435, 'FW': 224, 'UH': 87, 'SYM': 55, 'LS': 47, '#': 127}
45


In [203]:
transition_nums = {}
postags_extracted = train['postag'].values

ptesize = postags_extracted.shape[0]

for i in range(1,ptesize):
    sstuple = (postags_extracted[i-1], postags_extracted[i])
    if sstuple in transition_nums:
        transition_nums[sstuple] += 1
    else:
        transition_nums[sstuple] = 1

In [204]:
emission_nums = {}
words_extracted = train['word'].values

for i in range(0,ptesize):
    word = words_extracted[i]
    if word not in vocab:
        word = '< unk >'
        
    xstuple = (postags_extracted[i], word)
    if xstuple in emission_nums:
        emission_nums[xstuple] += 1
    else:
        emission_nums[xstuple] = 1

In [205]:
for key, value in transition_nums.items():
    s = key[0]
    denom = postag_vocab[s]
    transition_nums[key] = float(value/denom)

In [206]:
for key, value in emission_nums.items():
    s = key[0]
    denom = postag_vocab[s]
    emission_nums[key] = float(value/denom)

In [264]:
print("# of transition parameters: ", len(transition_nums))
print("# of emission parameters: ", len(emission_nums))

# of transition parameters:  1378
# of emission parameters:  30303


In [207]:
def tup2str(tup):
    s = "('" + tup[0] + "', '" + tup[1] + "')"
    return s

In [208]:
start_probabilities = {}
index_extracted = train['index'].values
sentences = 0

for i in range(0, len(index_extracted)):
    index = index_extracted[i]
    pos = postags_extracted[i]
    
    if pos in start_probabilities:
        if index == 1:
            sentences += 1
            start_probabilities[pos] += 1
    else:
        if index == 1:
            sentences += 1
            start_probabilities[pos] = 1

In [209]:
sums = 0
for key, value in start_probabilities.items():
    sums += value
    start_probabilities[key] = float(value/sentences)

In [210]:
emission_nums_str = {tup2str(key): value for key, value in emission_nums.items()}
transition_nums_str = {tup2str(key): value for key, value in transition_nums.items()}

In [211]:
import json

hmm = {}
hmm['transition'] = transition_nums_str
hmm['emission'] = emission_nums_str

hmmjson = json.dumps(hmm, indent=4)
with open("hmm.json", "w") as outfile:
    outfile.write(hmmjson)

# Task 3: Greedy Decoding with HMM

In [212]:
dev = pd.read_csv("data/dev", sep='\t', header=None, names=['index', 'word', 'postag'])

In [213]:
dev.info

<bound method DataFrame.info of         index          word postag
0           1           The     DT
1           2       Arizona    NNP
2           3  Corporations    NNP
3           4    Commission    NNP
4           5    authorized    VBD
...       ...           ...    ...
131763     13          join     VB
131764     14           the     DT
131765     15       winning    VBG
131766     16        bidder     NN
131767     17             .      .

[131768 rows x 3 columns]>

In [214]:
dev.head()

Unnamed: 0,index,word,postag
0,1,The,DT
1,2,Arizona,NNP
2,3,Corporations,NNP
3,4,Commission,NNP
4,5,authorized,VBD


In [215]:
dev_words_extracted = dev['word'].values
dev_postags_extracted = dev['postag'].values

In [216]:
for postag, value in postag_vocab.items():
    print(postag)

NNP
,
CD
NNS
JJ
MD
VB
DT
NN
IN
.
VBZ
VBG
CC
VBD
VBN
RB
TO
PRP
RBR
WDT
VBP
RP
PRP$
JJS
POS
``
EX
''
WP
:
JJR
WRB
$
NNPS
WP$
-LRB-
-RRB-
PDT
RBS
FW
UH
SYM
LS
#


In [238]:
# GREEDY DECODE

def greedy_decoding(words, transition_nums, emission_nums):
    postag_preds = []
    findmax = {}
    word = words[0]
    if word not in vocab:
        word = '< unk >'
        
    for postag, value in postag_vocab.items():
        tupled_key = (postag, word)
        if postag in start_probabilities and tupled_key in emission_nums:
            findmax[postag] = start_probabilities[postag] * emission_nums[tupled_key]
    last_postag = max(findmax, key=findmax.get)

    postag_preds.append(last_postag)
    
    words_len = words.shape[0]
    for i in range(1, words_len):
        findmax.clear()
        word = words[i]
        if word not in vocab:
                word = '< unk >'
            
        for postag, value in postag_vocab.items():          
            trans_key = (last_postag, postag)
            emiss_key = (postag, word)
            if trans_key in transition_nums and emiss_key in emission_nums:
                findmax[postag] = transition_nums[trans_key] * emission_nums[emiss_key]

        if not findmax:
            last_postag = ':'
        else:
            last_postag = max(findmax, key=findmax.get)
        postag_preds.append(last_postag)
        
    return np.array(postag_preds)
    
    
dev_greedy_preds = greedy_decoding(dev_words_extracted, transition_nums, emission_nums)

In [239]:
print(('DT','NN') in transition_nums)
print(('NNP','Arizona') in emission_nums)
print(('Corporations') in vocab)

True
True
True


In [240]:
print(dev_words_extracted.shape)

(131768,)


In [241]:
def accuracy(preds, trues):
    length = preds.shape[0]
    right = 0
    for i in range(0, length):
        if preds[i] == trues[i]:
            right += 1
            
    return float(right/length)
        

In [242]:
print("Accuracy of greedy decoding on dev dataset: ", accuracy(dev_greedy_preds,dev_postags_extracted))

Accuracy of greedy decoding on dev dataset:  0.93507528383219


In [243]:
print(dev_greedy_preds.shape[0])

131768


In [244]:
print(len(vocab))

23183


In [245]:
# ON TEST DATASET

test = pd.read_csv("data/test", sep='\t', header=None, names=['index', 'word'])

In [246]:
test.head()

Unnamed: 0,index,word
0,1,Influential
1,2,members
2,3,of
3,4,the
4,5,House


In [247]:
test_index_extracted = test['index'].values
test_words_extracted = test['word'].values

In [248]:
test_greedy_preds = greedy_decoding(test_words_extracted, transition_nums, emission_nums)

In [249]:
print(test_greedy_preds.shape[0])

129654


In [250]:
print(test_words_extracted.shape[0])

129654


In [251]:
greedytest = open('greedy.out', 'w')

s = str(test_index_extracted[0]) + '\t' + test_words_extracted[0] + '\t' + test_greedy_preds[0] + '\n'
greedytest.write(s)

test_length = test_greedy_preds.shape[0]
for i in range(1, test_length):
    if test_index_extracted[i] == 1:
        greedytest.write('\n')
    s = str(test_index_extracted[i]) + '\t' + test_words_extracted[i] + '\t' + test_greedy_preds[i] + '\n'
    greedytest.write(s)
        
greedytest.close()

# Task 4: Viterbi Decoding with HMM

In [252]:
postag_array = np.array(list(postag_vocab.keys()))
print(postag_array)

['NNP' ',' 'CD' 'NNS' 'JJ' 'MD' 'VB' 'DT' 'NN' 'IN' '.' 'VBZ' 'VBG' 'CC'
 'VBD' 'VBN' 'RB' 'TO' 'PRP' 'RBR' 'WDT' 'VBP' 'RP' 'PRP$' 'JJS' 'POS'
 '``' 'EX' "''" 'WP' ':' 'JJR' 'WRB' '$' 'NNPS' 'WP$' '-LRB-' '-RRB-'
 'PDT' 'RBS' 'FW' 'UH' 'SYM' 'LS' '#']


In [253]:
print(postag_array.shape)

(45,)


In [254]:
# VITERBI DECODE

def viterbi_decoding(words, postag_array, transition_nums, emission_nums):
    rows = postag_array.shape[0]
    cols = words.shape[0]
    
    pi = np.zeros((cols,rows), dtype=np.float128)
    prevs = np.zeros((cols, rows), dtype=int)
    
    word = words[0]
    if word not in vocab:
            word = '< unk >'
    
    for i in range(0, rows):
        postag = postag_array[i]
        
        start_prob = 0
        if postag in start_probabilities:
            start_prob = start_probabilities[postag]
            
        emission_prob = 0
        emiss_key = (postag, word)
        if emiss_key in emission_nums:
            emission_prob = emission_nums[emiss_key]
            
        pi[0,i] = start_prob * emission_prob
        
    words_len = words.shape[0]
    
    for j in range(1, words_len):
        word = words[j]
        if word not in vocab:
            word = '< unk >'
        for s in range(0, rows):
            postag = postag_array[s]
            findmax = []
            for sprime in range(0, rows):
                past_postag = postag_array[sprime]

                trans_prob = 0
                trans_key = (past_postag, postag)
                if trans_key in transition_nums:
                    trans_prob = transition_nums[trans_key]

                emission_prob = 0
                emiss_key = (postag, word)
                if emiss_key in emission_nums:
                    emission_prob = emission_nums[emiss_key]
                    
                pivalue = pi[j-1,sprime] + np.log(trans_prob + 0.00000001) + np.log(emission_prob + 0.00000001)
                
                findmax.append(pivalue)
                
            findmax = np.array(findmax)
            maxval = np.max(findmax)
            max_sprime = np.argmax(findmax)
            
            pi[j,s] = maxval
            prevs[j,s] = max_sprime
            
    return pi, prevs
            
            
            
        
pi, prevs = viterbi_decoding(dev_words_extracted, postag_array, transition_nums, emission_nums)

In [255]:
last = pi[prevs.shape[0] - 1,:]

def reverse_traverse(last, prevs, postag_array):
    viterbi_preds = []
    last_index = np.argmax(last)
    viterbi_preds.append(postag_array[last_index])
    
    length = prevs.shape[0]
    
    for i in reversed(range(1,length)):
        index = prevs[i,last_index]
        viterbi_preds.append(postag_array[index])
        last_index = index
        
    viterbi_preds = viterbi_preds[::-1]
    return np.array(viterbi_preds)
    
viterbi_preds = reverse_traverse(last, prevs, postag_array)

In [256]:
print(dev_postags_extracted[:37])


['DT' 'NNP' 'NNP' 'NNP' 'VBD' 'DT' 'CD' 'NN' 'NN' 'NN' 'IN' 'NNP' 'NNP'
 'NNP' 'NNP' ',' 'RB' 'JJR' 'IN' 'VBN' 'JJ' 'NN' 'IN' 'DT' 'NN' 'NN' 'NN'
 'CC' 'RB' 'PDT' 'DT' 'NN' 'VBN' 'IN' 'DT' 'NN' '.']


In [257]:
print("Accuracy of viterbi decoding on dev dataset: ", accuracy(viterbi_preds, dev_postags_extracted))

Accuracy of viterbi decoding on dev dataset:  0.9480754052577257


In [258]:
print(viterbi_preds)

['NNP' 'NNP' 'NNS' ... 'JJ' 'NN' '.']


In [259]:
test_pi, test_prevs = viterbi_decoding(test_words_extracted, postag_array, transition_nums, emission_nums)

In [260]:
test_last = test_pi[test_prevs.shape[0] - 1,:]

In [261]:
test_viterbi_preds = reverse_traverse(test_last, test_prevs, postag_array)

In [262]:
print(test_viterbi_preds.shape)

(129654,)


In [263]:
viterbitest = open('viterbi.out', 'w')

s = str(test_index_extracted[0]) + '\t' + test_words_extracted[0] + '\t' + test_viterbi_preds[0] + '\n'
viterbitest.write(s)

test_length = test_viterbi_preds.shape[0]
for i in range(1, test_length):
    if test_index_extracted[i] == 1:
        viterbitest.write('\n')
    s = str(test_index_extracted[i]) + '\t' + test_words_extracted[i] + '\t' + test_viterbi_preds[i] + '\n'
    viterbitest.write(s)
        
viterbitest.close()