In [17]:
import os
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
from tqdm import tqdm


ROOT_DIR = '../' if 'HMM' in os.getcwd() else os.getcwd() # setting the root dir
POS_DIR = os.path.join(ROOT_DIR, 'dataset') # setting the pos dir

pos_train = os.path.join(POS_DIR, "train.txt") 
pos_test = os.path.join(POS_DIR, "test.txt") 

In [20]:
def format_data(fname):
    sentences = [] # master list
    with open(fname) as f:
        content = f.readlines()
    
    sentence = [] # local list
    for line in content:
        if line !='\n':
            line = line.strip() # remove leading/trailing spaces
            word = line.split()[0].lower() # get the word
            pos = ""
            pos = line.split()[1] # get the pos tag
            sentence.append((word, pos)) # create a pair and save to local list
        else:
            sentences.append(sentence) # once a \n is detected, append the local sentence to master sentence
            sentence = []
    return sentences

data = format_data(pos_train)
train_set, test_set = train_test_split(data, train_size=.75, test_size=.25, random_state=42)
print(len(train_set))
print(len(test_set))

6702
2234


In [6]:
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
print(len(train_tagged_words), len(test_tagged_words))

211727 47377


In [7]:
tags = {tag for word,tag in train_tagged_words}
print(tags, len(tags))
vocab = {word for word,tag in train_tagged_words}

{'JJS', 'VBN', ':', 'WRB', 'NNS', 'SYM', 'MD', 'CC', 'RBR', 'RBS', 'WP', 'PRP$', 'FW', '.', '``', 'JJ', 'VBP', ',', 'NNPS', 'TO', 'IN', 'VBD', 'POS', 'PDT', '(', 'UH', '#', 'CD', 'RP', ')', 'DT', 'JJR', 'WP$', 'VBZ', 'EX', 'VBG', '$', 'PRP', 'NN', "''", 'VB', 'RB', 'WDT', 'NNP'} 44


In [8]:
def compute_emmision(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
 
     
    return count_w_given_tag /count_tag

transition_tags = [pair[1] for pair in train_tagged_words]


def compute_transition(t2, t1):
    count_t1 = len([t for t in transition_tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(transition_tags)-1):
        if transition_tags[index]==t1 and transition_tags[index+1] == t2:
            count_t2_t1 += 1
    return count_t2_t1/count_t1

In [9]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = compute_transition(t2, t1)
print(tags_matrix)

[[0.         0.         0.00534759 ... 0.02139037 0.         0.00802139]
 [0.         0.0312828  0.00398908 ... 0.04241025 0.0004199  0.0308629 ]
 [0.         0.01432665 0.00286533 ... 0.05921681 0.0191022  0.12225406]
 ...
 [0.00060542 0.08627214 0.00378387 ... 0.05887695 0.00045406 0.00681096]
 [0.         0.         0.00104712 ... 0.02827225 0.         0.01465969]
 [0.         0.00040233 0.0066888  ... 0.0078958  0.00125729 0.39011267]]


In [10]:
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,JJS,VBN,:,WRB,NNS,SYM,MD,CC,RBR,RBS,...,EX,VBG,$,PRP,NN,'',VB,RB,WDT,NNP
JJS,0.0,0.0,0.005348,0.002674,0.144385,0.0,0.002674,0.005348,0.0,0.0,...,0.0,0.013369,0.005348,0.013369,0.40107,0.002674,0.008021,0.02139,0.0,0.008021
VBN,0.0,0.031283,0.003989,0.002939,0.0443,0.0,0.00147,0.013227,0.00168,0.00021,...,0.0,0.015956,0.008188,0.008188,0.090909,0.002939,0.00105,0.04241,0.00042,0.030863
:,0.0,0.014327,0.002865,0.008596,0.0468,0.0,0.007641,0.066858,0.0,0.00191,...,0.002865,0.019102,0.017192,0.042025,0.038204,0.002865,0.016237,0.059217,0.019102,0.122254
WRB,0.0,0.004184,0.0,0.0,0.075314,0.0,0.01046,0.0,0.0,0.0,...,0.008368,0.004184,0.0,0.248954,0.073222,0.0,0.002092,0.035565,0.002092,0.089958
NNS,0.000147,0.022983,0.019678,0.00235,0.011234,0.0,0.02871,0.059623,0.001689,0.000294,...,0.000441,0.014759,0.000147,0.004479,0.018504,0.003818,0.001248,0.031941,0.011601,0.003084
SYM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
MD,0.0,0.000461,0.0,0.0,0.000923,0.0,0.0,0.000461,0.000923,0.0,...,0.0,0.0,0.0,0.006922,0.0,0.0,0.787264,0.174435,0.0,0.0
CC,0.000186,0.021035,0.0,0.004095,0.071668,0.0,0.011355,0.0,0.002048,0.003351,...,0.004468,0.022338,0.025503,0.032204,0.122487,0.0,0.033321,0.045421,0.000745,0.16102
RBR,0.0,0.021807,0.009346,0.0,0.0,0.0,0.0,0.015576,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003115,0.109034,0.0,0.003115
RBS,0.0,0.057592,0.0,0.0,0.136126,0.0,0.0,0.0,0.0,0.0,...,0.0,0.005236,0.0,0.0,0.031414,0.0,0.005236,0.104712,0.0,0.015707


In [11]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in tqdm(enumerate(words),total=len(words)):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = compute_emmision(words[key], tag)
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [12]:
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(1234)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]
 
# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]
 
# list of tagged words
test_run_tagged_words = [tup for sent in test_run for tup in sent]
 
# list of untagged words
test_run_untagged_words = [tup[0] for sent in test_run for tup in sent]


In [13]:
def test_accuracy(algorithm, tagged, untagged):
    start = time.time()
    tagged_seq = algorithm(untagged)
    end = time.time()
    difference = end-start
    
    print("Time taken in seconds: ", difference)
    
    # accuracy
    check = [i for i, j in zip(tagged_seq, tagged) if i == j] 
    
    accuracy = len(check)/len(tagged_seq)
    print('Viterbi Algorithm Accuracy: ',accuracy*100)
    return tagged_seq, check, accuracy

In [21]:
#Code to test all the test sentences
#(takes alot of time to run s0 we wont run it here)
# tagging the test sentences()
test_tagged_words = [tup for sent in test_set for tup in sent]
test_untagged_words = [tup[0] for sent in test_set for tup in sent]


In [16]:
#To improve the performance,we specify a rule base tagger for unknown words 
# specify patterns for tagging
patterns = [
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    (r'.*able$', 'JJ'),                # adjectives
    (r'.*ness$', 'NN'),                # nouns formed from adjectives
    (r'.*ly$', 'RB'),                  # adverbs
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # past tense verbs
    (r'.*', 'NN'),                     # nouns
]
 
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)

In [42]:
import tables


#modified Viterbi to include rule based tagger in it
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    ROW_SIZE = len(tags)
    filename = 'probabilities.h5'
    with tables.open_file(filename, mode='w') as f:
        atom = tables.Float64Atom()
        array_c = f.create_earray(f.root, 'data', atom, (0, ROW_SIZE))
    
     
    for key, word in tqdm(enumerate(words), total=len(words)):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = compute_emmision(words[key], tag)
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        state_max = rule_based_tagger.tag([word])[0][1]       
        
         
        if(pmax==0):
            state_max = rule_based_tagger.tag([word])[0][1] # assign based on rule based tagger
        else:
            if state_max != 'X':
                # getting state for which probability is maximum
                state_max = T[p.index(pmax)]  
        p = np.array(p).reshape(1, ROW_SIZE)  
        with tables.open_file(filename, mode='a') as f:
            f.root.data.append(p)               
             
        state.append(state_max)
    return list(zip(words, state))

In [43]:
#test accuracy on subset of test data 
# tagged_seq, check, accuracy = test_accuracy(Viterbi_rule_based, test_run_tagged_words, test_run_untagged_words)

100%|██████████| 231/231 [03:05<00:00,  1.25it/s]

Time taken in seconds:  185.13668608665466
Viterbi Algorithm Accuracy:  93.93939393939394





In [46]:
with tables.open_file('probabilities.h5', mode='r') as f:
    print(f.root.data[:])

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 8.89947047e-06]
 [0.00000000e+00 0.00000000e+00 3.44203261e-03 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 5.36612618e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [None]:
tagged_seq, check, accuracy = test_accuracy(Viterbi_rule_based, test_tagged_words, test_untagged_words)

100%|██████████| 47377/47377 [3:33:56<00:00,  3.69it/s]  

Time taken in seconds:  12836.207618236542
Viterbi Algorithm Accuracy:  91.80192920615488





In [31]:
import pickle
with open('output.pkl', 'wb') as f:
    pickle.dump([tagged_seq, check, accuracy], f)

In [22]:
with open('output.pkl', 'rb') as f:
    output = pickle.load(f)