## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk
import random
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import time
import sys

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# first few tagged sentences
print(nltk_data[:40])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], [('Rudolph', 'NOUN'), ('Agnew', 'NOUN'), (',', '.'), ('55', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), ('and', 'CONJ'), ('former', 'ADJ'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Consolidated', 'NOUN'), ('Gold', 'NOUN'), ('Fields', 'NOUN'), ('PLC', 'NOUN'), (',', '.'), ('was', 'VERB'), ('named', 'VERB'), ('*-1', 'X'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('British', 'ADJ'), ('industrial', 'ADJ'), ('

In [4]:
# Splitting into train and test
train_set, test_set = train_test_split(nltk_data,test_size=0.05,random_state=100)

print(len(train_set))
print(len(test_set))

3718
196


In [5]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95949

In [6]:
train_tagged_words[:5]

[('One', 'NUM'),
 ('bright', 'ADJ'),
 ('sign', 'NOUN'),
 ('is', 'VERB'),
 ('that', 'ADP')]

In [7]:
# getting vocabulary
tokens = [pair[0] for pair in train_tagged_words]
vocabulary = set(tokens)
print(len(vocabulary))

12106


In [8]:
# number of tags
tags = set([pair[1] for pair in train_tagged_words])
print(len(tags))
tags

12


{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

#### Emission Probability

In [9]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

#### Transiiton Probability Calculation

In [10]:
# Method to count the tags
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [11]:
# Transition Probability Calculattion
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [12]:
#Corelation
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
tags_df

Unnamed: 0,CONJ,VERB,ADP,.,ADV,ADJ,X,PRT,NOUN,NUM,PRON,DET
CONJ,0.000464,0.155308,0.053778,0.035698,0.053778,0.118683,0.008809,0.003709,0.350487,0.042188,0.058414,0.118683
VERB,0.005186,0.168744,0.091184,0.034291,0.08205,0.06564,0.218438,0.031427,0.111386,0.022448,0.035916,0.133292
ADP,0.000848,0.008481,0.017492,0.039754,0.013357,0.107389,0.034984,0.001484,0.321213,0.06191,0.069119,0.323969
.,0.058032,0.088708,0.092206,0.092923,0.052292,0.043681,0.026908,0.002511,0.222531,0.081353,0.065208,0.173558
ADV,0.006991,0.344541,0.119507,0.135153,0.07723,0.13016,0.023302,0.014314,0.031624,0.031624,0.015646,0.069907
ADJ,0.016052,0.011794,0.078624,0.063882,0.004914,0.067158,0.020311,0.010156,0.700901,0.020803,0.000491,0.004914
X,0.010316,0.204571,0.144898,0.162831,0.025393,0.016505,0.074433,0.184891,0.062371,0.002857,0.055705,0.055229
PRT,0.002297,0.405184,0.019357,0.043635,0.010171,0.083661,0.013123,0.001969,0.245735,0.056102,0.017717,0.10105
NOUN,0.042921,0.146955,0.177058,0.239951,0.016813,0.012165,0.028868,0.043357,0.26428,0.00955,0.004721,0.013363
NUM,0.013072,0.016934,0.035056,0.118835,0.002674,0.033571,0.211824,0.026144,0.352347,0.184195,0.001485,0.003862


### Build the vanilla Viterbi based POS tagger

In [13]:
#Vanilla Viterbi
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

### Evaluate Vailla

In [14]:
#To get accuracy matrix
accuracy_matrix={}

In [15]:
#To get test tagged words
test_tagged_words = [tup for sent in test_set for tup in sent]
test_words = [tup[0] for sent in test_set for tup in sent]
test_words[0:10]

['Investors',
 'took',
 'advantage',
 'of',
 'Tuesday',
 "'s",
 'stock',
 'rally',
 '*-1',
 'to']

In [16]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_words)
print("Time taken in seconds: ", time.time()-start)

Time taken in seconds:  629.9195320606232


In [17]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_tagged_words) if i == j] 
accuracy = len(check)/len(tagged_seq)
accuracy_matrix['Vanilla'] = accuracy
accuracy

0.9033213454622382

In [18]:
#To identify incorrect tagging by Vanilla Viterbi
incorrect_tags = [[j,i[1]] for i, j in zip(tagged_seq, test_tagged_words) if i != j] 
incorrect_tags

[[('book', 'VERB'), 'NOUN'],
 [('stocks', 'ADV'), 'NOUN'],
 [('up', 'ADP'), 'PRT'],
 [('over', 'PRT'), 'ADP'],
 [('ignored', 'VERB'), 'CONJ'],
 [('mine', 'ADJ'), 'NOUN'],
 [('Palestinian', 'NOUN'), 'ADJ'],
 [('first', 'ADV'), 'ADJ'],
 [('Preston', 'NOUN'), 'CONJ'],
 [('Birmingham', 'NOUN'), 'CONJ'],
 [('Ala', 'NOUN'), 'CONJ'],
 [('clamped', 'VERB'), 'CONJ'],
 [('ankle', 'NOUN'), 'CONJ'],
 [('third-largest', 'ADJ'), 'CONJ'],
 [('fifth-largest', 'ADJ'), 'CONJ'],
 [('Z.', 'NOUN'), 'CONJ'],
 [('Wick', 'NOUN'), 'CONJ'],
 [('89.7', 'NUM'), 'CONJ'],
 [('141.9', 'NUM'), 'CONJ'],
 [('94.8', 'NUM'), 'CONJ'],
 [('149.9', 'NUM'), 'CONJ'],
 [('argues', 'VERB'), 'CONJ'],
 [('Sit', 'VERB'), 'CONJ'],
 [('British', 'NOUN'), 'ADJ'],
 [('halt', 'VERB'), 'NOUN'],
 [('slides', 'VERB'), 'NOUN'],
 [('most', 'ADV'), 'ADJ'],
 [('athletic', 'ADJ'), 'CONJ'],
 [('to', 'ADJ'), 'PRT'],
 [('better', 'ADV'), 'ADJ'],
 [('illustrates', 'VERB'), 'CONJ'],
 [('attempt', 'NOUN'), 'VERB'],
 [('usurp', 'VERB'), 'CONJ'],
 [('

# Common Issues
1. Numbers are defined as Det
2. Nouns are defines as Det.
3. Verts are defined as Det.
4. other are defied as Det.
5. Verbs are defined as Noun.

Most of the incorrect POS  are Determinent.

### Solve the problem of unknown words.

###  1. Most Frequent Tag

As per the logic viterbi algorithm assigns the first tag from the tag list for unknown words. 

As determenent is first in the list. all unknown words are assigned as determinent.

Hence, using the sorted list. (As per frequency) to that most frequent comes in the list and all unknown words are assigned as Noun

In [19]:
def sorted_tags(train_bag = train_tagged_words):
    T = list(set([pair[1] for pair in train_bag]))
    tag_freq={}
    for  tag in tags:
        tag_freq[tag] =  len([pair for pair in train_bag if pair[1]==tag])
    tag_freq = pd.DataFrame(tag_freq.items(),columns=['tag','freq'])
    return list(tag_freq.sort_values(by='freq',ascending=False).tag)

In [20]:
#Most common tag
sorted_tags()[0]

'NOUN'

In [21]:
#Viterbi -  Most common tag
def Viterbi_frequent(words, train_bag = train_tagged_words):
    state = []
    T = sorted_tags()
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

#### Evaluate Most Frequent Tag

In [22]:
# tagging the test sentences
start = time.time()
tagged_seq_frequent = Viterbi_frequent(test_words)
print("Time taken in seconds: ", time.time()-start)

Time taken in seconds:  687.4354703426361


In [23]:
# accuracy
check = [i for i, j in zip(tagged_seq_frequent, test_tagged_words) if i == j] 
accuracy = len(check)/len(tagged_seq_frequent)
accuracy_matrix['Most Frequent POS'] = accuracy
accuracy

0.9348423947535435

In [24]:
#To identify incorrect tagging by Viterbi  - most frequent words
incorrect_tags = [[j,i[1]] for i, j in zip(tagged_seq_frequent, test_tagged_words) if i != j] 

In [25]:
incorrect_tags

[[('book', 'VERB'), 'NOUN'],
 [('stocks', 'ADV'), 'NOUN'],
 [('up', 'ADP'), 'PRT'],
 [('over', 'PRT'), 'ADP'],
 [('ignored', 'VERB'), 'NOUN'],
 [('mine', 'ADJ'), 'NOUN'],
 [('Palestinian', 'NOUN'), 'ADJ'],
 [('first', 'ADV'), 'ADJ'],
 [('clamped', 'VERB'), 'NOUN'],
 [('third-largest', 'ADJ'), 'NOUN'],
 [('fifth-largest', 'ADJ'), 'NOUN'],
 [('89.7', 'NUM'), 'NOUN'],
 [('141.9', 'NUM'), 'NOUN'],
 [('94.8', 'NUM'), 'NOUN'],
 [('149.9', 'NUM'), 'NOUN'],
 [('argues', 'VERB'), 'NOUN'],
 [('Sit', 'VERB'), 'NOUN'],
 [('down', 'ADV'), 'ADP'],
 [('British', 'NOUN'), 'ADJ'],
 [('halt', 'VERB'), 'NOUN'],
 [('slides', 'VERB'), 'NOUN'],
 [('most', 'ADV'), 'ADJ'],
 [('athletic', 'ADJ'), 'NOUN'],
 [('to', 'ADJ'), 'PRT'],
 [('better', 'ADV'), 'ADJ'],
 [('illustrates', 'VERB'), 'NOUN'],
 [('attempt', 'NOUN'), 'VERB'],
 [('usurp', 'VERB'), 'NOUN'],
 [('executive', 'ADJ'), 'NOUN'],
 [('609', 'NUM'), 'NOUN'],
 [('administer', 'VERB'), 'NOUN'],
 [('disapproved', 'VERB'), 'NOUN'],
 [('*-58', 'X'), 'NOUN'],
 

Common Issues
1. Numners are tagged as Noun.
2. verb is tagged as noun.
3. *-226 are defined as noun
4. adjectives are defined as noun.
5. DET are tagged as ADP
6. alphabet-apphabet are tagged as Noun. It shuld be adjective

### 2. Lexicon based taggers

We will create below rules for words and assign tags. And back plan will be most frequent tag

In [26]:
#regex pattern
patterns = [
    (r'.*ed$', 'VERB'),                          # past tense
    (r'.*es$', 'VERB'),                       # 3rd singular present
    (r'[^-]*ing$', 'VERB'),                         # 3rd singular present
    (r'.*ly$', 'ADV'),                           # Adverb ending with ly
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'),                # cardinal numbers
    (r'.[*T]?\*-[0-9]*$', 'X'),                         # other
    (r'\*-[0-9]*$', 'X'),                                 # other
    (r'\*-[0-9]*$', 'X'),                                 # other
    (r'[a-zA-Z]*-[a-zA-Z]*-?[a-zA-Z]*', 'ADJ'),            # adj
    (r'.*', 'NOUN')                                        # nouns
]

In [27]:
#Viterbi rule
def Viterbi_rule(words, patterns, train_bag = train_tagged_words):
    regexp_tagger = nltk.RegexpTagger(patterns)
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        if pmax==0:
            state.append(regexp_tagger.tag([word])[0][1])
        else:
            # getting state for which probability is maximum
            state_max = T[p.index(pmax)] 
            state.append(state_max)
    return list(zip(words, state))

In [28]:
# tagging the test sentences
start = time.time()
tagged_seq_rule = Viterbi_rule(test_words,patterns)
print("Time taken in seconds: ", time.time()-start)

Time taken in seconds:  674.0070085525513


In [29]:
# accuracy
check = [i for i, j in zip(tagged_seq_rule, test_tagged_words) if i == j] 
accuracy = len(check)/len(tagged_seq_rule)
accuracy_matrix['Lexicon Based'] = accuracy
accuracy

0.9504971440660038

In [30]:
#To identify incorrect tagging by Viterbi  - rule based
incorrect_tags = [[j,i[1]] for i, j in zip(tagged_seq_rule, test_tagged_words) if i != j] 
incorrect_tags

[[('book', 'VERB'), 'NOUN'],
 [('stocks', 'ADV'), 'NOUN'],
 [('up', 'ADP'), 'PRT'],
 [('over', 'PRT'), 'ADP'],
 [('mine', 'ADJ'), 'NOUN'],
 [('Palestinian', 'NOUN'), 'ADJ'],
 [('first', 'ADV'), 'ADJ'],
 [('Sit', 'VERB'), 'NOUN'],
 [('down', 'ADV'), 'ADP'],
 [('British', 'NOUN'), 'ADJ'],
 [('halt', 'VERB'), 'NOUN'],
 [('slides', 'VERB'), 'NOUN'],
 [('most', 'ADV'), 'ADJ'],
 [('athletic', 'ADJ'), 'NOUN'],
 [('to', 'ADJ'), 'PRT'],
 [('better', 'ADV'), 'ADJ'],
 [('attempt', 'NOUN'), 'VERB'],
 [('usurp', 'VERB'), 'NOUN'],
 [('executive', 'ADJ'), 'NOUN'],
 [('executive-office', 'NOUN'), 'ADJ'],
 [('administer', 'VERB'), 'NOUN'],
 [('applicable', 'ADJ'), 'NOUN'],
 [('Louisiana-Pacific', 'NOUN'), 'ADJ'],
 [('Five', 'NUM'), 'NOUN'],
 [('Proper', 'ADJ'), 'NOUN'],
 [('English', 'ADJ'), 'NOUN'],
 [('highest-pitched', 'ADJ'), 'VERB'],
 [('as', 'ADV'), 'ADP'],
 [('62-year-old', 'ADJ'), 'NOUN'],
 [('executive', 'ADJ'), 'NOUN'],
 [('forest-product', 'NOUN'), 'ADJ'],
 [('unsolicited', 'ADJ'), 'VERB'],


Common issues
1. 1st form of verb is tagged as noun.
2. nouns ending with ed are tagged as verb.

### 3. Ignoring zero state probability for unknown words

In [31]:
#Viterbi -  ignore zero
def Viterbi_ignorezero(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            if emission_p ==0:emission_p=sys.float_info.epsilon
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

#### Evaluate State probability ignoring

In [32]:
# tagging the test sentences
start = time.time()
tagged_seq_ignorezero = Viterbi_ignorezero(test_words)
print("Time taken in seconds: ", time.time()-start)

Time taken in seconds:  702.0532686710358


In [33]:
# accuracy
check = [i for i, j in zip(tagged_seq_ignorezero, test_tagged_words) if i == j] 
accuracy = len(check)/len(tagged_seq_ignorezero)
accuracy_matrix['Viterbi - Ignore state probability for unknown words'] = accuracy
accuracy

0.9331499894224666

In [34]:
#To identify incorrect tagging by Viterbi  - ignore zero state probability algo
incorrect_tags = [[j,i[1]] for i, j in zip(tagged_seq_ignorezero, test_tagged_words) if i != j] 
incorrect_tags

[[('book', 'VERB'), 'NOUN'],
 [('stocks', 'ADV'), 'NOUN'],
 [('up', 'ADP'), 'PRT'],
 [('over', 'PRT'), 'ADP'],
 [('mine', 'ADJ'), 'NOUN'],
 [('Palestinian', 'NOUN'), 'ADJ'],
 [('first', 'ADV'), 'ADJ'],
 [('clamped', 'VERB'), 'X'],
 [('ankle', 'NOUN'), 'VERB'],
 [('third-largest', 'ADJ'), 'NOUN'],
 [('fifth-largest', 'ADJ'), 'NOUN'],
 [('89.7', 'NUM'), 'NOUN'],
 [('141.9', 'NUM'), 'NOUN'],
 [('94.8', 'NUM'), 'NOUN'],
 [('149.9', 'NUM'), 'NOUN'],
 [('British', 'NOUN'), 'ADJ'],
 [('halt', 'VERB'), 'NOUN'],
 [('slides', 'VERB'), 'NOUN'],
 [('most', 'ADV'), 'ADJ'],
 [('athletic', 'ADJ'), 'DET'],
 [('to', 'ADJ'), 'PRT'],
 [('better', 'ADV'), 'ADJ'],
 [('attempt', 'NOUN'), 'VERB'],
 [('609', 'NUM'), 'NOUN'],
 [('administer', 'VERB'), 'NOUN'],
 [('disapproved', 'VERB'), 'X'],
 [('*-58', 'X'), 'VERB'],
 [('disapproval', 'NOUN'), 'DET'],
 [('accordance', 'NOUN'), 'DET'],
 [('applicable', 'ADJ'), 'NOUN'],
 [('Five', 'NUM'), 'NOUN'],
 [('Proper', 'ADJ'), 'NOUN'],
 [('English', 'ADJ'), 'NOUN'],
 [(

Common issues:
1. verb is defined as noun.
2. num is defined as noun.
3. other words are incorrectly assigned to any randon tag.
4. Adj are defined as noun.
5. words joined by (-) are incorrectly assigned

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [35]:
#Accuracy matrix
accuracy_matrix

{'Vanilla': 0.9033213454622382,
 'Most Frequent POS': 0.9348423947535435,
 'Lexicon Based': 0.9504971440660038,
 'Viterbi - Ignore state probability for unknown words': 0.9331499894224666}

##### Lexicon Based is giving the highest probability

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [36]:
#Words incorrectly tagged by Vanilla but corrected by Regex based tagger.
fixed_rule_corrected = [[i[0],i[1],j[1],k[1]] for i, j,k in zip(test_tagged_words,tagged_seq,tagged_seq_rule) if i != j and i==k]
fixed_rule_corrected = pd.DataFrame(fixed_rule_corrected,columns=['word','correct_pos','vanilla_pos','rule_pos'])
fixed_rule_corrected

Unnamed: 0,word,correct_pos,vanilla_pos,rule_pos
0,ignored,VERB,CONJ,VERB
1,Preston,NOUN,CONJ,NOUN
2,Birmingham,NOUN,CONJ,NOUN
3,Ala,NOUN,CONJ,NOUN
4,clamped,VERB,CONJ,VERB
...,...,...,...,...
224,NCNB,NOUN,CONJ,NOUN
225,Connections,NOUN,CONJ,NOUN
226,adults,NOUN,CONJ,NOUN
227,broadly,ADV,CONJ,ADV


In [37]:
#Analysis
fixed_rule_corrected['vanilla_pos'].value_counts()

CONJ    225
VERB      2
DET       1
ADJ       1
Name: vanilla_pos, dtype: int64

In [38]:
#Words incorrectly tagged by Vanilla but corrected by Most frequent tagger.
frequent_corrected = [[i[0],i[1],j[1],k[1]] for i, j,k in zip(test_tagged_words,tagged_seq,tagged_seq_frequent) if i != j and i==k]
frequent_corrected = pd.DataFrame(frequent_corrected,columns=['word','correct_pos','vanilla_pos','frequent_pos'])
frequent_corrected

Unnamed: 0,word,correct_pos,vanilla_pos,frequent_pos
0,Preston,NOUN,CONJ,NOUN
1,Birmingham,NOUN,CONJ,NOUN
2,Ala,NOUN,CONJ,NOUN
3,ankle,NOUN,CONJ,NOUN
4,Z.,NOUN,CONJ,NOUN
...,...,...,...,...
151,shipboard,NOUN,CONJ,NOUN
152,Malta,NOUN,CONJ,NOUN
153,NCNB,NOUN,CONJ,NOUN
154,Connections,NOUN,CONJ,NOUN


In [39]:
#Analysis
frequent_corrected['vanilla_pos'].value_counts()

CONJ    151
VERB      2
ADV       1
DET       1
ADJ       1
Name: vanilla_pos, dtype: int64

In [40]:
#Words incorrectly tagged by Vanilla but corrected by Viterbi - ignore zero state probability
zero_state_corrected = [[i[0],i[1],j[1],k[1]] for i, j,k in zip(test_tagged_words,tagged_seq,tagged_seq_ignorezero) if i != j and i==k]
zero_state_corrected = pd.DataFrame(zero_state_corrected,columns=['word','correct_pos','vanilla_pos','zero_state_pos'])
zero_state_corrected

Unnamed: 0,word,correct_pos,vanilla_pos,zero_state_pos
0,ignored,VERB,CONJ,VERB
1,Preston,NOUN,CONJ,NOUN
2,Birmingham,NOUN,CONJ,NOUN
3,Ala,NOUN,CONJ,NOUN
4,Z.,NOUN,CONJ,NOUN
...,...,...,...,...
142,bombers,NOUN,CONJ,NOUN
143,shipboard,NOUN,CONJ,NOUN
144,NCNB,NOUN,CONJ,NOUN
145,Connections,NOUN,CONJ,NOUN


In [41]:
#Analysis
zero_state_corrected['vanilla_pos'].value_counts()

CONJ    142
VERB      2
ADV       1
DET       1
ADJ       1
Name: vanilla_pos, dtype: int64

1. In Vanilla -  Most common issue was -  Most tags were in-correctly assigned as Determinent. After analysis, we found thay viterbi algo is assigining all unknown words are determinent. (accuracy 90 percent)
2. Later we used, most frequent tagger (Noun) -  It removed the problem of Determinent and increased the accuract to 93 percent. But now, most is-correct POS tagging was Noun.
3. We further analyzed the problem and created the rules to correctly identify verbs, adverb, number, etc. After the accuracy increased to --- 95 %

4. Later, we tried one more approch by ignoring state probability of unknown words. The accuracy is about 93 percent. And problem remain the same. i.e - Most words are incorrectly assigned as noun. But, the issue of inccorrent words tagged as DET is resolved.