## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pprint, time
from nltk.tokenize import word_tokenize

In [3]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
# Reading the data from the sample test data file provided and Preparing Test DataSet
data = pd.read_csv("Test_sentences.txt", sep = "\n",names=['Sentence'])
test_data=[]
for index,row in data.iterrows():
    test_data.append(row['Sentence'])
test_data_words=[]
for sentence in test_data:
    words=word_tokenize(sentence)
    for word in words:
        test_data_words.append(word)
test_data_words 

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google',
 '.',
 'Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013',
 '.',
 'Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 'Twitter',
 "'s",
 'firehose',
 '.',
 'Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets',
 '.',
 'Before',
 'entering',
 'politics',
 ',',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality',
 '.',
 'The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup',
 ',',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years',
 '.'

In [5]:
#tagging the given sample test dataset
from nltk import pos_tag
test_data_tagged=pos_tag(test_data_words,tagset='universal')

In [7]:
#Splitting the data into training set and validation set
train_set,valid_set=train_test_split(nltk_data,test_size=0.05)

In [8]:
#Extracting list of tagged words from the training list
train_set_words=[word for sentence in train_set for word in sentence]

In [9]:
#Preparing the validation set
valid_set_words=[word for sentence in valid_set for word in sentence]
valid_set_untagged_words=[tag[0] for tag in valid_set_words]

In [10]:
print(len(valid_set_untagged_words))
print(len(test_data_tagged))

5194
181


In [11]:
#To store the Model Accuracies
Valid_Accuracies={}
Test_Accuracies={}

### Build the vanilla Viterbi based POS tagger

In [12]:
#Function To compute the Emission probabilities:Probability of word w for tag t
def word_given_tag(word, tag, train_bag = train_set_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [13]:
#Function to compute the Transition Probabilities:Probability of t1 followed by t2
def t2_given_t1(t2, t1, train_bag = train_set_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [14]:
#Building the transition probability Matrix
# number of tags
T = set([pair[1] for pair in train_set_words])
#T.add('UNKNOWN')
# creating t x t transition matrix of tags where each column is t2, each row is t1 so M(i, j) represents P(tj given ti)
tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)):
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
#Converting the final Matrix to a DataFrame
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))
tags_df

Unnamed: 0,DET,NUM,NOUN,.,X,ADV,PRON,CONJ,PRT,ADP,ADJ,VERB
DET,0.005344,0.022346,0.637843,0.018217,0.045664,0.012631,0.003279,0.000486,0.000243,0.008987,0.205125,0.039835
NUM,0.003546,0.185579,0.35195,0.115248,0.21247,0.002955,0.001182,0.013002,0.027778,0.033983,0.034574,0.01773
NOUN,0.013351,0.009411,0.265084,0.240461,0.028744,0.016926,0.004815,0.042533,0.043226,0.176114,0.012074,0.147261
.,0.171626,0.082183,0.223338,0.093476,0.027066,0.05207,0.065962,0.057985,0.002509,0.090428,0.044184,0.089084
X,0.055803,0.002862,0.061367,0.164229,0.075199,0.026073,0.05628,0.010334,0.181081,0.145787,0.016852,0.204134
ADV,0.068588,0.031809,0.03214,0.13784,0.023526,0.080517,0.014911,0.006958,0.013917,0.115308,0.131875,0.342611
PRON,0.010035,0.006947,0.210729,0.040139,0.094558,0.032806,0.008105,0.005017,0.011193,0.022385,0.072945,0.485141
CONJ,0.120859,0.039664,0.351377,0.035931,0.007933,0.055996,0.058796,0.000467,0.004666,0.04993,0.116192,0.158189
PRT,0.101291,0.057266,0.250248,0.044687,0.01291,0.00993,0.017213,0.002317,0.001986,0.020854,0.083747,0.39755
ADP,0.320846,0.063654,0.323851,0.040683,0.034457,0.013632,0.06827,0.000644,0.001395,0.017067,0.107128,0.008373


In [15]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_set_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))



### Evaluating Vanilla Viterbi with the validation set 

In [16]:
#Running the Vanilla Viterbi Model
#start = time.time()
vanilla_output=Viterbi(valid_set_untagged_words)
#end= time.time()
#difference=end-start
#print("Time taken in seconds: ", difference)

In [17]:
# Calculate the accuracy of the Vanilla Viterbi Model
check = [i for i, j in zip(vanilla_output, valid_set_words) if i == j]
vanilla_accuracy = (len(check)/len(vanilla_output))*100
Valid_Accuracies[0]=round(vanilla_accuracy,2)
Valid_Accuracies[0]

90.8

### Solve the problem of unknown words

#### Building the first Modified Viterbi Model 

For this model, when Emission Probability of a word turns our to be zero i.e. the word is an unknown word, we look at the tag assigned to the previous word in the set. We then check the transition probabilities when a tag follows the previous tag for all tags in the Tag list. The tag which has maximum transition probability is assigned to the word. The entire process of selecting the tag with maximum transition probability is done by the AssignTag() function which takes the prev_tag asssigned to the previous word as an input.

In [18]:
# For assigning tag to the unknown word based on the transition probabilities with the previous word's assigned tag
def AssignTag(prev_tag):
    T = list(set([pair[1] for pair in train_set_words]))
    t_prob1=[]
    for tg in T:
        transition_p1 = tags_df.loc[prev_tag, tg]
        t_prob1.append(transition_p1)
    p1max=max(t_prob1)
    s1= T[t_prob1.index(p1max)] 
    return s1

In [19]:
# Modified Viterbi Heuristic Model based on Transition Probabilities
def ModifiedViterbiTransition(words, train_bag = train_set_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
            
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        if pmax==0:
            if key==0:
                state_max=AssignTag('.')
            else:
                state_max=AssignTag(state[-1])
        else:
            state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))

#### Evaluating tagging accuracy for the first Modified Viterbi model

In [20]:
model1_output=ModifiedViterbiTransition(valid_set_untagged_words)
model1_output

[('An', 'DET'),
 ('official', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('Palestinian', 'ADJ'),
 ('Olympic', 'NOUN'),
 ('Committee', 'NOUN'),
 ('said', 'VERB'),
 ('0', 'X'),
 ('the', 'DET'),
 ('committee', 'NOUN'),
 ('first', 'ADJ'),
 ('applied', 'VERB'),
 ('for', 'ADP'),
 ('membership', 'NOUN'),
 ('in', 'ADP'),
 ('1979', 'NUM'),
 ('and', 'CONJ'),
 ('renewed', 'VERB'),
 ('its', 'PRON'),
 ('application', 'NOUN'),
 ('in', 'ADP'),
 ('August', 'NOUN'),
 ('of', 'ADP'),
 ('this', 'DET'),
 ('year', 'NOUN'),
 ('.', '.'),
 ('The', 'DET'),
 ('meeting', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('*T*-62', 'X'),
 ('is', 'VERB'),
 ('expected', 'VERB'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('draw', 'VERB'),
 ('20,000', 'NUM'),
 ('to', 'PRT'),
 ('Bangkok', 'VERB'),
 (',', '.'),
 ('was', 'VERB'),
 ('going', 'VERB'),
 ('*-2', 'X'),
 ('to', 'PRT'),
 ('be', 'VERB'),
 ('held', 'VERB'),
 ('*-61', 'X'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('Central', 'NOUN'),
 ('Plaza', 'NOUN'),
 ('Hotel', 'NOUN'),
 (',', '.'),
 

In [21]:
# Calculate the accuracy of the first Modified Viterbi Model
check = [i for i, j in zip(model1_output, valid_set_words) if i == j]
model1_accuracy = (len(check)/len(model1_output))*100
Valid_Accuracies[1]=round(model1_accuracy,2)
Valid_Accuracies[1]

94.2

#### Building the second Modified Viterbi Model

For the second model, We use a Bigram Tagger backed up with unigram tagger(which is in turn backed up with a regex tagger) for tagging the words incorrectly tagged by the Vanilla Viterbi Algorithm. The function ret_Lexicon returns the tag of the word after running a Bigram Tagger through it.

In [22]:
#Defining patterns for regex tagger
word_patterns=[
        (r'.*(ing|ed|es|ould)$','VERB'),  #Verbs (all tenses and modes)
        (r'.*(ness|s|\'s|tion)$','NOUN'), #NOUN - nouns (common and proper)
        (r'(The|the|A|a|An|an)$', 'DET'), #DET - determiners
        (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), #NUM - cardinal numbers
        (r'.*ly$', 'ADV'), #ADV - adverbs
    ]
regex=nltk.RegexpTagger(word_patterns)
unigram=nltk.UnigramTagger(train_set,backoff=regex)
bigram=nltk.BigramTagger(train_set,backoff=unigram)
def ret_Lexicon(word):
    lexicon_state=bigram.tag([word])
    lexicon_tag=[tup[1] for tup in lexicon_state]
    return lexicon_tag

In [23]:
# Modified Viterbi Heuristic Model using a Bigram Tagger backed up with Unigram and Regex
def ModifiedViterbi(words, train_bag = train_set_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
            
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            if int(emission_p)==0:  #Assigning emission_p for unknown words
                emission_p=1
            else:
                state_probability = emission_p * transition_p
            state_probability = emission_p * transition_p
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        tag_again=ret_Lexicon(words[key])
        if tag_again[0]!=state_max:
            if tag_again[0]==None:
                state.append(state_max)
            else:
                state.append(tag_again[0])
        else:
            state.append(state_max)
    return list(zip(words, state))

#### Evaluating tagging accuracy for the second Modified Viterbi model

In [24]:
model2_output=ModifiedViterbi(valid_set_untagged_words)
model2_output

[('An', 'DET'),
 ('official', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('Palestinian', 'ADJ'),
 ('Olympic', 'NOUN'),
 ('Committee', 'NOUN'),
 ('said', 'VERB'),
 ('0', 'X'),
 ('the', 'DET'),
 ('committee', 'NOUN'),
 ('first', 'ADJ'),
 ('applied', 'VERB'),
 ('for', 'ADP'),
 ('membership', 'NOUN'),
 ('in', 'ADP'),
 ('1979', 'NUM'),
 ('and', 'CONJ'),
 ('renewed', 'VERB'),
 ('its', 'PRON'),
 ('application', 'NOUN'),
 ('in', 'ADP'),
 ('August', 'NOUN'),
 ('of', 'ADP'),
 ('this', 'DET'),
 ('year', 'NOUN'),
 ('.', '.'),
 ('The', 'DET'),
 ('meeting', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('*T*-62', 'X'),
 ('is', 'VERB'),
 ('expected', 'VERB'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('draw', 'VERB'),
 ('20,000', 'NUM'),
 ('to', 'PRT'),
 ('Bangkok', 'VERB'),
 (',', '.'),
 ('was', 'VERB'),
 ('going', 'VERB'),
 ('*-2', 'X'),
 ('to', 'PRT'),
 ('be', 'VERB'),
 ('held', 'VERB'),
 ('*-61', 'X'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('Central', 'NOUN'),
 ('Plaza', 'NOUN'),
 ('Hotel', 'NOUN'),
 (',', '.'),
 

In [26]:
# Calculate the accuracy of the second Modified Viterbi Model
check = [i for i, j in zip(model2_output, valid_set_words) if i == j]
model2_accuracy = (len(check)/len(model2_output))*100
Valid_Accuracies[2]=round(model2_accuracy,2)
Valid_Accuracies[2]

92.66

### Compare the tagging accuracies on the validation set of the modifications with the vanilla Viterbi algorithm

Validation Accuracies obtained with different models is as given below:

In [45]:
print('\t0\tVanilla Viterbi Algorithm     : ',Valid_Accuracies[0])
print('\t1\tFirst Modified Viterbi Model  : ',Valid_Accuracies[1])
print('\t2\tSecond Modified Viterbi Model : ',Valid_Accuracies[2])

	0	Vanilla Viterbi Algorithm     :  90.8
	1	First Modified Viterbi Model  :  94.2
	2	Second Modified Viterbi Model :  92.66


Upon observation,it is seen that the first Modified Viterbi Model gives maximum accuracy on validation set.

### Evaluating the models on the test set

In [46]:
#Running the Vanilla Viterbi model on the test set and computing the accuracy
vanilla_output_test=Viterbi(test_data_words)

In [52]:
#Accuracies on Test Set
# Calculate the accuracy of the Vanilla Viterbi Model
check = [i for i, j in zip(vanilla_output_test, test_data_tagged) if i == j]
vanilla_accuracy_test = (len(check)/len(vanilla_output_test))*100
Test_Accuracies[0]=round(vanilla_accuracy_test,2)

In [53]:
Test_Accuracies[0]

75.14

In [54]:
#Running the first modified Viterbi model on the test set and computing the accuracy
model1_output_test=ModifiedViterbiTransition(test_data_words)

In [55]:
#Accuracies on Test Set
# Calculate the accuracy of the first modified Viterbi model
check = [i for i, j in zip(model1_output_test, test_data_tagged) if i == j]
model1_accuracy_test = (len(check)/len(model1_output_test))*100
Test_Accuracies[1]=round(model1_accuracy_test,2)

In [56]:
Test_Accuracies[1]

89.5

In [34]:
#Running the second modified Viterbi model on the test set and computing the accuracy
model2_output_test=ModifiedViterbi(test_data_words)

In [57]:
#Accuracies on Test Set
# Calculate the accuracy of the second modified Viterbi model
check = [i for i, j in zip(model2_output_test, test_data_tagged) if i == j]
model2_accuracy_test = (len(check)/len(model2_output_test))*100
Test_Accuracies[2]=round(model2_accuracy_test,2)

In [58]:
Test_Accuracies[2]

88.4

### Compare the tagging accuracies on the test set of the modifications with the vanilla Viterbi algorithm

Test Accuracies after aplying different models is given below

In [60]:
print('\t0\tVanilla Viterbi Algorithm     : ',Test_Accuracies[0])
print('\t1\tFirst Modified Viterbi Model  : ',Test_Accuracies[1])
print('\t2\tSecond Modified Viterbi Model : ',Test_Accuracies[2])

	0	Vanilla Viterbi Algorithm     :  75.14
	1	First Modified Viterbi Model  :  89.5
	2	Second Modified Viterbi Model :  88.4


Upon observation,it is seen that the first Modified Viterbi Model gives maximum accuracy on validation set.

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [63]:
incorrect_tagged_cases_0 = [[test_data_tagged[i-1],j] for i, j in enumerate(zip(vanilla_output_test, test_data_tagged)) if j[0]!=j[1]]
incorrect_tagged_cases_0

[[('.', '.'), (('Android', 'DET'), ('Android', 'NOUN'))],
 [('by', 'ADP'), (('Google', 'DET'), ('Google', 'NOUN'))],
 [('.', '.'), (('Android', 'DET'), ('Android', 'NOUN'))],
 [('best-selling', 'ADJ'), (('OS', 'DET'), ('OS', 'NOUN'))],
 [('OS', 'NOUN'), (('worldwide', 'DET'), ('worldwide', 'NOUN'))],
 [('on', 'ADP'), (('smartphones', 'DET'), ('smartphones', 'NOUN'))],
 [('since', 'ADP'), (('2011', 'DET'), ('2011', 'NUM'))],
 [('since', 'ADP'), (('2013', 'DET'), ('2013', 'NUM'))],
 [('.', '.'), (('Google', 'DET'), ('Google', 'NOUN'))],
 [('and', 'CONJ'), (('Twitter', 'DET'), ('Twitter', 'NOUN'))],
 [('in', 'ADP'), (('2015', 'DET'), ('2015', 'NUM'))],
 [('2015', 'NUM'), (('that', 'ADP'), ('that', 'DET'))],
 [('gave', 'VERB'), (('Google', 'DET'), ('Google', 'NOUN'))],
 [('to', 'PRT'), (('Twitter', 'DET'), ('Twitter', 'NOUN'))],
 [('Twitter', 'NOUN'), (("'s", 'VERB'), ("'s", 'PRT'))],
 [("'s", 'PRT'), (('firehose', 'DET'), ('firehose', 'NOUN'))],
 [('.', '.'), (('Twitter', 'DET'), ('Twitte

In [64]:
incorrect_tagged_cases_1 = [[test_data_tagged[i-1],j] for i, j in enumerate(zip(model1_output_test, test_data_tagged)) if j[0]!=j[1]]
incorrect_tagged_cases_1

[[('since', 'ADP'), (('2011', 'NOUN'), ('2011', 'NUM'))],
 [('since', 'ADP'), (('2013', 'NOUN'), ('2013', 'NUM'))],
 [('in', 'ADP'), (('2015', 'NOUN'), ('2015', 'NUM'))],
 [('2015', 'NUM'), (('that', 'ADP'), ('that', 'DET'))],
 [('gave', 'VERB'), (('Google', 'X'), ('Google', 'NOUN'))],
 [('to', 'PRT'), (('Twitter', 'VERB'), ('Twitter', 'NOUN'))],
 [("'s", 'PRT'), (('firehose', 'VERB'), ('firehose', 'NOUN'))],
 [('an', 'DET'), (('online', 'NOUN'), ('online', 'ADJ'))],
 [('a', 'DET'), (('domineering', 'NOUN'), ('domineering', 'ADJ'))],
 [('The', 'DET'), (('2018', 'NOUN'), ('2018', 'NUM'))],
 [('the', 'DET'), (('21st', 'NOUN'), ('21st', 'NUM'))],
 [('tournament', 'NOUN'), (('contested', 'NOUN'), ('contested', 'VERB'))],
 [('the', 'DET'), (('11th', 'NOUN'), ('11th', 'NUM'))],
 [('.', '.'), (('Show', 'NOUN'), ('Show', 'VERB'))],
 [('would', 'VERB'), (('like', 'ADP'), ('like', 'VERB'))],
 [('.', '.'), (('Show', 'NOUN'), ('Show', 'VERB'))],
 [('at', 'ADP'), (('about', 'ADP'), ('about', 'ADV')

In [65]:
incorrect_tagged_cases_2 = [[test_data_tagged[i-1],j] for i, j in enumerate(zip(model2_output_test, test_data_tagged)) if j[0]!=j[1]]
incorrect_tagged_cases_2

[[('is', 'VERB'), (('a', 'X'), ('a', 'DET'))],
 [('on', 'ADP'), (('smartphones', 'VERB'), ('smartphones', 'NOUN'))],
 [('made', 'VERB'), (('a', 'X'), ('a', 'DET'))],
 [('2015', 'NUM'), (('that', 'ADP'), ('that', 'DET'))],
 [('gave', 'VERB'), (('Google', 'X'), ('Google', 'NOUN'))],
 [('to', 'PRT'), (('Twitter', 'VERB'), ('Twitter', 'NOUN'))],
 [("'s", 'PRT'), (('firehose', 'VERB'), ('firehose', 'NOUN'))],
 [('an', 'DET'), (('online', 'NOUN'), ('online', 'ADJ'))],
 [('social', 'ADJ'), (('networking', 'VERB'), ('networking', 'NOUN'))],
 [('with', 'ADP'), (('messages', 'VERB'), ('messages', 'NOUN'))],
 [('was', 'VERB'), (('a', 'X'), ('a', 'DET'))],
 [('a', 'DET'), (('domineering', 'VERB'), ('domineering', 'ADJ'))],
 [('and', 'CONJ'), (('a', 'X'), ('a', 'DET'))],
 [('the', 'DET'), (('21st', 'NOUN'), ('21st', 'NUM'))],
 [('the', 'DET'), (('11th', 'NOUN'), ('11th', 'NUM'))],
 [('.', '.'), (('Show', 'NOUN'), ('Show', 'VERB'))],
 [('would', 'VERB'), (('like', 'ADP'), ('like', 'VERB'))],
 [('.',

In [66]:
set0=set([val[1][0] for i,val in enumerate(incorrect_tagged_cases_0)])
set1=set([val[1][0] for i,val in enumerate(incorrect_tagged_cases_1)])
set2=set([val[1][0] for i,val in enumerate(incorrect_tagged_cases_2)])

Cases which were incorrectly tagged by the Vanilla Viterbi(original POS tagger) but were rectified in the first model are as given below

In [67]:
print(set0.difference(set1))

{('Android', 'DET'), ('domineering', 'DET'), ('21st', 'DET'), ('trips', 'DET'), ('online', 'DET'), ('messages', 'DET'), ('contested', 'DET'), ('worldwide', 'DET'), ('11th', 'DET'), ('Twitter', 'DET'), ('personality', 'DET'), ('smartphones', 'DET'), ('2013', 'DET'), ('firehose', 'DET'), ("'s", 'VERB'), ('2018', 'DET'), ('NASA', 'DET'), ('interact', 'DET'), ('Google', 'DET'), ('invited', 'DET'), ('ICESAT-2', 'DET'), ('Satellite', 'DET'), ('2011', 'DET'), ('networking', 'DET'), ('Cup', 'DET'), ('tweets', 'DET'), ('2015', 'DET'), ('tournament', 'DET'), ('OS', 'DET'), ('FIFA', 'DET'), ('arriving', 'DET')}


Cases which were incorrectly tagged by the Vanilla Viterbi(original POS tagger) but were rectified in the second model are as given below

In [68]:
print(set0.difference(set2))

{('Android', 'DET'), ('domineering', 'DET'), ('21st', 'DET'), ('trips', 'DET'), ('online', 'DET'), ('messages', 'DET'), ('contested', 'DET'), ('worldwide', 'DET'), ('11th', 'DET'), ('Twitter', 'DET'), ('personality', 'DET'), ('smartphones', 'DET'), ('2013', 'DET'), ('firehose', 'DET'), ("'s", 'VERB'), ('2018', 'DET'), ('NASA', 'DET'), ('interact', 'DET'), ('Google', 'DET'), ('invited', 'DET'), ('ICESAT-2', 'DET'), ('Satellite', 'DET'), ('2011', 'DET'), ('networking', 'DET'), ('Cup', 'DET'), ('tweets', 'DET'), ('2015', 'DET'), ('tournament', 'DET'), ('OS', 'DET'), ('FIFA', 'DET'), ('arriving', 'DET')}
