In [1]:
from utils_pos import get_word_tag, preprocess  
import pandas as pd
from collections import defaultdict
import math
import numpy as np

In [2]:
with open("WSJ_02-21.pos", 'r') as f:
    training_corpus = f.readlines()

print(f"A few items of the training corpus list")
print(training_corpus[0:100])

A few items of the training corpus list
['In\tIN\n', 'an\tDT\n', 'Oct.\tNNP\n', '19\tCD\n', 'review\tNN\n', 'of\tIN\n', '``\t``\n', 'The\tDT\n', 'Misanthrope\tNN\n', "''\t''\n", 'at\tIN\n', 'Chicago\tNNP\n', "'s\tPOS\n", 'Goodman\tNNP\n', 'Theatre\tNNP\n', '(\t(\n', '``\t``\n', 'Revitalized\tVBN\n', 'Classics\tNNS\n', 'Take\tVBP\n', 'the\tDT\n', 'Stage\tNN\n', 'in\tIN\n', 'Windy\tNNP\n', 'City\tNNP\n', ',\t,\n', "''\t''\n", 'Leisure\tNN\n', '&\tCC\n', 'Arts\tNNS\n', ')\t)\n', ',\t,\n', 'the\tDT\n', 'role\tNN\n', 'of\tIN\n', 'Celimene\tNNP\n', ',\t,\n', 'played\tVBN\n', 'by\tIN\n', 'Kim\tNNP\n', 'Cattrall\tNNP\n', ',\t,\n', 'was\tVBD\n', 'mistakenly\tRB\n', 'attributed\tVBN\n', 'to\tTO\n', 'Christina\tNNP\n', 'Haag\tNNP\n', '.\t.\n', '\n', 'Ms.\tNNP\n', 'Haag\tNNP\n', 'plays\tVBZ\n', 'Elianti\tNNP\n', '.\t.\n', '\n', 'Rolls-Royce\tNNP\n', 'Motor\tNNP\n', 'Cars\tNNPS\n', 'Inc.\tNNP\n', 'said\tVBD\n', 'it\tPRP\n', 'expects\tVBZ\n', 'its\tPRP$\n', 'U.S.\tNNP\n', 'sales\tNNS\n', 'to\tTO\n',

In [3]:
with open("hmm_vocab.txt", 'r') as f:
    voc = f.read().split('\n')

print("A few items of the vocabulary list")
print(voc[0:50])
print()
print("A few items at the end of the vocabulary list")
print(voc[-50:])

A few items of the vocabulary list
['!', '#', '$', '%', '&', "'", "''", "'40s", "'60s", "'70s", "'80s", "'86", "'90s", "'N", "'S", "'d", "'em", "'ll", "'m", "'n'", "'re", "'s", "'til", "'ve", '(', ')', ',', '-', '--', '--n--', '--unk--', '--unk_adj--', '--unk_adv--', '--unk_digit--', '--unk_noun--', '--unk_punct--', '--unk_upper--', '--unk_verb--', '.', '...', '0.01', '0.0108', '0.02', '0.03', '0.05', '0.1', '0.10', '0.12', '0.13', '0.15']

A few items at the end of the vocabulary list
['yards', 'yardstick', 'year', 'year-ago', 'year-before', 'year-earlier', 'year-end', 'year-on-year', 'year-round', 'year-to-date', 'year-to-year', 'yearlong', 'yearly', 'years', 'yeast', 'yelled', 'yelling', 'yellow', 'yen', 'yes', 'yesterday', 'yet', 'yield', 'yielded', 'yielding', 'yields', 'you', 'young', 'younger', 'youngest', 'youngsters', 'your', 'yourself', 'youth', 'youthful', 'yuppie', 'yuppies', 'zero', 'zero-coupon', 'zeroing', 'zeros', 'zinc', 'zip', 'zombie', 'zone', 'zones', 'zoning', '{',

In [4]:
import string


# Punctuation characters
punct = set(string.punctuation)

# Morphology rules used to assign unknown word tokens
noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]

In [5]:
def assign_unk(tok):
    """
    Assign unknown word tokens
    """
    # Digits
    if any(char.isdigit() for char in tok):
        return "--unk_digit--"

    # Punctuation
    elif any(char in punct for char in tok):
        return "--unk_punct--"

    # Upper-case
    elif any(char.isupper() for char in tok):
        return "--unk_upper--"

    # Nouns
    elif any(tok.endswith(suffix) for suffix in noun_suffix):
        return "--unk_noun--"

    # Verbs
    elif any(tok.endswith(suffix) for suffix in verb_suffix):
        return "--unk_verb--"

    # Adjectives
    elif any(tok.endswith(suffix) for suffix in adj_suffix):
        return "--unk_adj--"

    # Adverbs
    elif any(tok.endswith(suffix) for suffix in adv_suffix):
        return "--unk_adv--"

    return "--unk--"

In [6]:
def preprocess(vocab,txt_path):
    orig=[]
    processed=[]
    with open(txt_path,"r") as f:
        for cnt,word in enumerate(f):
            
            if not word.strip():
                orig.append("--n--")
                processed.append("--n--")
            elif word.strip() not in vocab:
                orig.append(word.strip())
                word=assign_unk(word)
                processed.append(word.strip())
                continue
            else:
                orig.append(word.strip())
                processed.append(word.strip())
    return orig,processed
                
                

In [7]:
_, prep = preprocess(voc, "test.words") 
print(prep[0:10])

['The', 'economy', "'s", 'temperature', 'will', 'be', 'taken', 'from', 'several', '--unk--']


In [8]:
def get_word_tag(line, vocab): 
    if not line.split():
        word = "--n--"
        tag = "--s--"
        return word, tag
    else:
        word, tag = line.split()
        if word not in vocab: 
            # Handle unknown words
            word = assign_unk(word)
        return word, tag
    return None 

In [9]:
print(get_word_tag("review\tNN\n",voc))

('review', 'NN')


In [10]:
def trainer(corpus,vocab):
    prev_tag="--s--"
    trans=defaultdict(int)
    emis=defaultdict(int)
    tag_count=defaultdict(int)
    i=0
    for k in corpus:
        i+=1
        if i%50000==0:
            print(i)
        
        word,tag=get_word_tag(k,vocab)
        trans[(prev_tag,tag)]+=1
        emis[(tag,word)]+=1
        tag_count[tag]+=1
        prev_tag=tag
    return trans,emis,tag_count

In [11]:
trans,emis,tag_count=trainer(training_corpus,voc)

50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
700000
750000
800000
850000
900000
950000


In [12]:
print(tag_count)
states = sorted(tag_count.keys())

defaultdict(<class 'int'>, {'IN': 98554, 'DT': 81842, 'NNP': 91466, 'CD': 36568, 'NN': 132935, '``': 7092, "''": 6919, 'POS': 8701, '(': 1366, 'VBN': 20024, 'NNS': 59856, 'VBP': 12491, ',': 48727, 'CC': 23947, ')': 1376, 'VBD': 29889, 'RB': 30970, 'TO': 22357, '.': 39478, '--s--': 39832, 'VBZ': 21672, 'NNPS': 2673, 'PRP': 17436, 'PRP$': 8407, 'VB': 26438, 'JJ': 61217, 'MD': 9803, 'VBG': 14846, 'RBR': 1768, ':': 4772, 'WP': 2363, 'WDT': 4294, 'JJR': 3238, 'PDT': 370, 'RBS': 451, 'WRB': 2143, 'JJS': 1947, '$': 7372, 'RP': 2662, 'FW': 234, 'EX': 863, 'SYM': 58, '#': 142, 'LS': 36, 'UH': 97, 'WP$': 168})


In [13]:
def predict(prep,vocab,trans,tagz,emis):
    
    k=0
    
    for word,y_tup in zip(prep,y):
        if len(y_tup.split())!=2:
            
            continue
        else:
            true=y_tup.split()[1]
            
        pred_key=""
        if word in vocab:
            pos=0
            for key in tagz:
                if (key,word) not in emis.keys():
                    
                    continue
                elif pos<emis[(key,word)]:
                    pos=emis[(key,word)]
                    pred_key=key
            if pred_key==true:
              k=k+1
        
        
        
    return k/len(y)

In [14]:
with open("WSJ_24.pos", 'r') as f:
    y = f.readlines()
print(y[0:100])

['The\tDT\n', 'economy\tNN\n', "'s\tPOS\n", 'temperature\tNN\n', 'will\tMD\n', 'be\tVB\n', 'taken\tVBN\n', 'from\tIN\n', 'several\tJJ\n', 'vantage\tNN\n', 'points\tNNS\n', 'this\tDT\n', 'week\tNN\n', ',\t,\n', 'with\tIN\n', 'readings\tNNS\n', 'on\tIN\n', 'trade\tNN\n', ',\t,\n', 'output\tNN\n', ',\t,\n', 'housing\tNN\n', 'and\tCC\n', 'inflation\tNN\n', '.\t.\n', '\n', 'The\tDT\n', 'most\tRBS\n', 'troublesome\tJJ\n', 'report\tNN\n', 'may\tMD\n', 'be\tVB\n', 'the\tDT\n', 'August\tNNP\n', 'merchandise\tNN\n', 'trade\tNN\n', 'deficit\tNN\n', 'due\tJJ\n', 'out\tIN\n', 'tomorrow\tNN\n', '.\t.\n', '\n', 'The\tDT\n', 'trade\tNN\n', 'gap\tNN\n', 'is\tVBZ\n', 'expected\tVBN\n', 'to\tTO\n', 'widen\tVB\n', 'to\tTO\n', 'about\tIN\n', '$\t$\n', '9\tCD\n', 'billion\tCD\n', 'from\tIN\n', 'July\tNNP\n', "'s\tPOS\n", '$\t$\n', '7.6\tCD\n', 'billion\tCD\n', ',\t,\n', 'according\tVBG\n', 'to\tTO\n', 'a\tDT\n', 'survey\tNN\n', 'by\tIN\n', 'MMS\tNNP\n', 'International\tNNP\n', ',\t,\n', 'a\tDT\n', 'unit\tNN

In [16]:
k=predict(prep,voc,trans,states,emis)

In [17]:
print(k)


0.8888563993099213


In [90]:
i=0
vocabulary={}
for k in voc:
    vocabulary[k]=i
    i=i+1
print(vocabulary)



In [18]:
tagz=states


In [19]:
def transmat(alpha,tag_count,tagz,trans,emis):
    mat=np.zeros((len(tagz),len(tagz)))
    for i in range(len(tagz)):
                   for j in range(len(tagz)):
                      mat[i][j]=(trans[(tagz[i],tagz[j])]+alpha)/(tag_count[tagz[i]]+alpha*len(tagz))
    return mat              

In [20]:
mat_trans=transmat(0.001,tag_count,tagz,trans,emis)


In [22]:
def emismat(alpha,tag_count,tagz,emis,voc):
    mat=np.zeros((len(tagz),len(voc)))
    for i in range(len(tagz)):
        for j in range(len(voc)):
            mat[i][j]=(emis[(tagz[i],voc[j])]+alpha)/(tag_count[tagz[i]]+alpha*len(voc))
    return mat

In [23]:
emis_mat=emismat(0.001,tag_count,tagz,emis,voc)

In [27]:
i=0

NameError: name 'vocab' is not defined

In [40]:
def initialize(states, tag_counts, trans_mat, emis_mat, corpus, vocab):
    n=len(states)
    best_probs=np.zeros((n,len(corpus)))
    best_paths = np.zeros((n, len(corpus)))
    s_idx = int(states.index("--s--"))
    for i in range(n):
        if trans_mat[s_idx,i]==0:
            best_probs[i][0]=-999999999999
        else:
            best_probs[i][0]=math.log(trans_mat[s_idx,i])+math.log(emis_mat[i,voc.index(corpus[0])])
    return best_probs,best_paths

In [41]:
best_probs,best_paths=initialize(states,tag_count,mat_trans,emis_mat,prep,voc)

In [99]:
def viterbi_forward(trans_mat, emismat, corpus, best_prob, best_paths, vocabulary):
    n=best_prob.shape[0]
    
    for i in range(1,len(corpus)):
        if i%1000==0:
         print(i)
        
       
        
       
        for j in range(n):
            least=float('-inf')
            best=None
            for k in range(n):
                
                calc=best_prob[k][i-1]+math.log(trans_mat[k][j])+math.log(emismat[j][vocabulary[corpus[i]]])
                if calc>least:
                    least=calc
                    best=k
            best_probs[j][i]=least
            best_paths[j][i]=best      
    return best_probs,best_paths
                    
            

In [100]:
best_probs,best_paths=viterbi_forward(mat_trans, emis_mat, prep, best_probs, best_paths, vocabulary)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000


In [98]:
print(best_probs)

[[-22.60982633          inf          inf ...          inf          inf
           inf]
 [-23.07660654          inf          inf ...          inf          inf
           inf]
 [-23.57298822          inf          inf ...          inf          inf
           inf]
 ...
 [-22.75551606          inf          inf ...          inf          inf
           inf]
 [-19.6637215           inf          inf ...          inf          inf
           inf]
 [-18.36288463          inf          inf ...          inf          inf
           inf]]


In [None]:
def viterbi_backward(best_probs, best_paths, corpus, states):
    minprob=float('-inf')
    m=len(corpus)
    
    pred = [None] * m
    z=[None]*m
    tag=0
    for i in range(best_probs.shape[0]):
        if best_probs[i,-1]<minprob:
            minprob=best_probs[i,-1]
            z[m-1]=i
            pred[m-1]=states[i]
    for i in range(len(corpus)-1,-1,-1):
        z[i-1]=best_paths[z[i],i]
        pred[i-1]=states[i-1]
        
        
        
    
    
    
    
            

In [106]:
def viterbi_backward(best_probs, best_paths, corpus, states):
    m = len(corpus)
    z = [None] * m
    pred = [None] * m
    
    # Step 1: Find best POS tag index for the last word
    maxprob = float('-inf')
    for i in range(best_probs.shape[0]):
        if best_probs[i, -1] > maxprob:
            maxprob = best_probs[i, -1]
            z[m - 1] = i
    pred[m - 1] = states[z[m - 1]]
    
    # Step 2: Walk backward to find the full best path
    for i in range(m - 1, 0, -1):
        z[i - 1] = best_paths[int(z[i]), i]
        pred[i - 1] = states[int(z[i - 1])]
    
    return pred


In [107]:
pred=viterbi_backward(best_probs, best_paths, prep, states)

In [108]:
print(pred)

['DT', 'NN', 'POS', 'NN', 'MD', 'VB', 'VBN', 'IN', 'JJ', 'NN', 'VBZ', 'DT', 'NN', ',', 'IN', 'NNS', 'IN', 'NN', ',', 'NN', ',', 'NN', 'CC', 'NN', '.', '--s--', 'DT', 'RBS', 'JJ', 'NN', 'MD', 'VB', 'DT', 'NNP', 'NN', 'NN', 'NN', 'JJ', 'IN', 'NN', '.', '--s--', 'DT', 'NN', 'NN', 'VBZ', 'VBN', 'TO', 'VB', 'TO', 'RB', '$', 'CD', 'CD', 'IN', 'NNP', 'POS', '$', 'CD', 'CD', ',', 'VBG', 'TO', 'DT', 'NN', 'IN', 'NNP', 'NNP', ',', 'DT', 'NN', 'IN', 'NNP', 'NNP', ',', 'NNP', 'NNP', '.', '--s--', 'NNP', 'POS', 'NN', 'IN', 'DT', 'NNP', 'NN', 'NN', 'NN', 'VBZ', 'VBN', 'TO', 'VB', ',', 'IN', 'RB', 'RB', 'RB', 'IN', 'DT', 'CD', 'NN', 'NN', 'VBD', 'NNP', 'IN', 'DT', 'NN', 'NN', 'NN', '.', '--s--', 'DT', 'NN', 'VBD', 'VBG', 'VBN', 'IN', 'DT', 'NN', 'DT', 'NN', 'NN', 'VBD', 'RB', 'RB', 'IN', 'NNP', 'POS', 'NN', ',', 'IN', 'PRP', 'VBD', 'VBN', 'IN', 'PRP$', 'JJ', 'JJ', 'NN', '.', '--s--', 'NNS', 'VBP', 'VBN', 'RB', 'TO', 'WRB', 'JJ', 'NN', 'NN', 'PRP', 'VBP', 'TO', 'VB', 'IN', 'NNP', 'NNS', 'IN', 'JJ', 'N

In [109]:
def compute_accuracy(pred, y):
    '''
    Input: 
        pred: a list of the predicted parts-of-speech 
        y: a list of lines where each word is separated by a '\t' (i.e. word \t tag)
    Output: 
        
    '''
    num_correct = 0
    total = 0
    
    # Zip together the prediction and the labels
    for prediction, y in zip(pred, y):
        ### START CODE HERE (Replace instances of 'None' with your code) ###
        # Split the label into the word and the POS tag
        word_tag_tuple = y.split()
        
        # Check that there is actually a word and a tag
        # no more and no less than 2 items
        if len(word_tag_tuple)!=2: # complete this line
            continue 

        # store the word and tag separately
        word, tag = word_tag_tuple
        
        # Check if the POS tag label matches the prediction
        if prediction == tag: # complete this line
            
            # count the number of times that the prediction
            # and label match
            num_correct += 1
            
        # keep track of the total number of examples (that have valid labels)
        total += 1
        
        ### END CODE HERE ###
    return num_correct/total

In [111]:
print(compute_accuracy(pred,y))

0.953063647155511
