## POS tagging using modified Viterbi

## 1. Data Preparation

In [1]:
# Importing libraries

import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import random
import numpy as np
import pandas as pd
import pprint, time
import random
import collections
import time
nltk.download('universal_tagset')
nltk.download('treebank')
nltk.download('punkt')


[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# reading the Treebank tagged sentences

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# Train Test Split
train_set, test_set = train_test_split(nltk_data,test_size=0.05, random_state=100)

print(len(train_set))
print(len(test_set))

3718
196


In [4]:
# Extract words with tags from all sentences
train_tagged_words = [tup for sentence in train_set for tup in sentence]

In [5]:
# List out all unique words
unique_words = set([tup[0] for tup in train_tagged_words])

In [6]:
#List out all unique tags
unique_tags = set([tup[1] for tup in train_tagged_words])
#len(T)

## 2. HMM



In [7]:
# compute word given tag: Emission Probability

def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [8]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
             count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [9]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(unique_tags), len(unique_tags)), dtype='float32')
for i, t1 in enumerate(list(unique_tags)):
    for j, t2 in enumerate(list(unique_tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]


In [10]:
tags_df = pd.DataFrame(tags_matrix, columns = list(unique_tags), index=list(unique_tags))
tags_df

Unnamed: 0,NUM,PRT,.,ADP,VERB,CONJ,DET,X,ADJ,NOUN,PRON,ADV
NUM,0.184195,0.026144,0.118835,0.035056,0.016934,0.013072,0.003862,0.211824,0.033571,0.352347,0.001485,0.002674
PRT,0.056102,0.001969,0.043635,0.019357,0.405184,0.002297,0.10105,0.013123,0.083661,0.245735,0.017717,0.010171
.,0.081353,0.002511,0.092923,0.092206,0.088708,0.058032,0.173558,0.026908,0.043681,0.222531,0.065208,0.052292
ADP,0.06191,0.001484,0.039754,0.017492,0.008481,0.000848,0.323969,0.034984,0.107389,0.321213,0.069119,0.013357
VERB,0.022448,0.031427,0.034291,0.091184,0.168744,0.005186,0.133292,0.218438,0.06564,0.111386,0.035916,0.08205
CONJ,0.042188,0.003709,0.035698,0.053778,0.155308,0.000464,0.118683,0.008809,0.118683,0.350487,0.058414,0.053778
DET,0.02164,0.00024,0.017913,0.009618,0.040394,0.000481,0.005771,0.045323,0.204977,0.637293,0.003727,0.012623
X,0.002857,0.184891,0.162831,0.144898,0.204571,0.010316,0.055229,0.074433,0.016505,0.062371,0.055705,0.025393
ADJ,0.020803,0.010156,0.063882,0.078624,0.011794,0.016052,0.004914,0.020311,0.067158,0.700901,0.000491,0.004914
NOUN,0.00955,0.043357,0.239951,0.177058,0.146955,0.042921,0.013363,0.028868,0.012165,0.26428,0.004721,0.016813


## 3.  Build the vanilla Viterbi based POS tagger

In [11]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []    
    unique_tags_list = list(unique_tags)
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in unique_tags:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]  
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = unique_tags_list[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))


In [12]:
# list of untagged words in test set 
test_tagged_words = [tup[0] for sent in test_set for tup in sent]

In [13]:
#Executing Viterbi algorithm on test data
tagged_seq = Viterbi(test_tagged_words)

In [14]:
#Accuracy check for tags
test_run_base = [tup for sent in test_set for tup in sent]
correctly_tagged = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 

accuracy = len(correctly_tagged)/len(tagged_seq)
print("Vanilla Viterbi Accuracy ----> ",accuracy)

Vanilla Viterbi Accuracy ---->  0.91030251745293


In [15]:
#List of incorrectly tagged words
incorrect_tagged_cases = [j for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[(('book', 'NOUN'), ('book', 'VERB')),
 (('stocks', 'NOUN'), ('stocks', 'ADV')),
 (('up', 'PRT'), ('up', 'ADP')),
 (('over', 'ADP'), ('over', 'PRT')),
 (('ignored', 'NUM'), ('ignored', 'VERB')),
 (('mine', 'NOUN'), ('mine', 'ADJ')),
 (('Palestinian', 'ADJ'), ('Palestinian', 'NOUN')),
 (('first', 'ADJ'), ('first', 'ADV')),
 (('Preston', 'NUM'), ('Preston', 'NOUN')),
 (('Birmingham', 'NUM'), ('Birmingham', 'NOUN')),
 (('Ala', 'NUM'), ('Ala', 'NOUN')),
 (('clamped', 'NUM'), ('clamped', 'VERB')),
 (('ankle', 'NUM'), ('ankle', 'NOUN')),
 (('third-largest', 'NUM'), ('third-largest', 'ADJ')),
 (('fifth-largest', 'NUM'), ('fifth-largest', 'ADJ')),
 (('Z.', 'NUM'), ('Z.', 'NOUN')),
 (('Wick', 'NUM'), ('Wick', 'NOUN')),
 (('argues', 'NUM'), ('argues', 'VERB')),
 (('Sit', 'NUM'), ('Sit', 'VERB')),
 (('down', 'PRT'), ('down', 'ADV')),
 (('British', 'ADJ'), ('British', 'NOUN')),
 (('halt', 'NOUN'), ('halt', 'VERB')),
 (('slides', 'NOUN'), ('slides', 'VERB')),
 (('most', 'ADJ'), ('most', 'ADV')),
 (

## 4.  Solve the problem of unknown words

###  Modification 1 - Lexicon based (tagging unknown words with the most common tag)

In [16]:
#Lexicon based model

# Obtain most frequent tag to assign it for the unknown words

most_frequent_tag = collections.Counter([tup[1] for tup in train_tagged_words]).most_common(1)[0][0]
most_frequent_tag

'NOUN'

In [17]:
# Viterbi Heuristic with lexicon

def Viterbi_mod1(words, train_bag = train_tagged_words):
    state = []    
    unique_tags_list = list(unique_tags)
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in unique_tags:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]  
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        pmax = max(p)
        # getting state for which probability is maximum
        if pmax == 0:
            state.append(most_frequent_tag)  # assigning most frequent tag to the unknown words
        else:
            state_max = unique_tags_list[p.index(pmax)] 
            state.append(state_max)
    return list(zip(words, state))


In [18]:
# Tagging with Lexicon based viterbi

tagged_seq_mod1 = Viterbi_mod1(test_tagged_words)

### Evaluating tagging accuracy

In [19]:
test_run_base_mod1 = [tup for sent in test_set for tup in sent]
correctly_tagged_mod1 = [i for i, j in zip(tagged_seq_mod1, test_run_base_mod1) if i == j] 

accuracy_mod1 = len(correctly_tagged_mod1)/len(tagged_seq_mod1)
print("Lexicon based Viterbi Accuracy -----> ",accuracy_mod1)

Lexicon based Viterbi Accuracy ----->  0.9348423947535435


In [20]:
#List of incorrectly tagged words
incorrect_tagged_cases_mod1 = [j for i, j in enumerate(zip(tagged_seq_mod1, test_run_base_mod1)) if j[0]!=j[1]]
incorrect_tagged_cases_mod1

[(('book', 'NOUN'), ('book', 'VERB')),
 (('stocks', 'NOUN'), ('stocks', 'ADV')),
 (('up', 'PRT'), ('up', 'ADP')),
 (('over', 'ADP'), ('over', 'PRT')),
 (('ignored', 'NOUN'), ('ignored', 'VERB')),
 (('mine', 'NOUN'), ('mine', 'ADJ')),
 (('Palestinian', 'ADJ'), ('Palestinian', 'NOUN')),
 (('first', 'ADJ'), ('first', 'ADV')),
 (('clamped', 'NOUN'), ('clamped', 'VERB')),
 (('third-largest', 'NOUN'), ('third-largest', 'ADJ')),
 (('fifth-largest', 'NOUN'), ('fifth-largest', 'ADJ')),
 (('89.7', 'NOUN'), ('89.7', 'NUM')),
 (('141.9', 'NOUN'), ('141.9', 'NUM')),
 (('94.8', 'NOUN'), ('94.8', 'NUM')),
 (('149.9', 'NOUN'), ('149.9', 'NUM')),
 (('argues', 'NOUN'), ('argues', 'VERB')),
 (('Sit', 'NOUN'), ('Sit', 'VERB')),
 (('down', 'ADP'), ('down', 'ADV')),
 (('British', 'ADJ'), ('British', 'NOUN')),
 (('halt', 'NOUN'), ('halt', 'VERB')),
 (('slides', 'NOUN'), ('slides', 'VERB')),
 (('most', 'ADJ'), ('most', 'ADV')),
 (('athletic', 'NOUN'), ('athletic', 'ADJ')),
 (('to', 'PRT'), ('to', 'ADJ')),
 ((

### Modification 2 - Rule based POS tagging

In [21]:
#Rule based model

#Define Rules
patterns = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'.*es$', 'VERB'),               # 3rd singular present
    (r'.*ould$', 'VERB'),              # modals
    (r'.*\'s$', 'NOUN'),              # possessive nouns
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'),
    (r'^(the|a)$', 'DET'),# cardinal numbers
    (r'.*', 'NOUN')
    # nouns
]

# Initiating a regex tagger 
regexp_tagger = nltk.RegexpTagger(patterns)

In [22]:
#Rule Based tagger

def tag_by_rules(word):
    tagged_by_rule = regexp_tagger.tag([word])
    return tagged_by_rule[0][1]

In [23]:
# Viterbi Heuristic with Rule based tagger
def Viterbi_mod2(words, train_bag = train_tagged_words):
    state = []    
    unique_tags_list = list(unique_tags)
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in unique_tags:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]  
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        pmax = max(p)
        # getting state for which probability is maximum
        if pmax == 0:
            state.append(tag_by_rules(word))  # assigning most frequent tag to the unknown words
        else:
            state_max = unique_tags_list[p.index(pmax)] 
            state.append(state_max)
    return list(zip(words, state))


In [24]:
# Tagging with Rule based Viterbi

tagged_seq_mod2 = Viterbi_mod2(test_tagged_words)

#### Evaluating tagging accuracy

In [25]:
test_run_base_mod2 = [tup for sent in test_set for tup in sent]
correctly_tagged = [i for i, j in zip(tagged_seq_mod2, test_run_base_mod2) if i == j] 

accuracy_mod2 = len(correctly_tagged)/len(tagged_seq_mod2)
print("Rule based with Viterbi Accuracy------> ", accuracy_mod2)

Rule based with Viterbi Accuracy------>  0.946689232071081


In [26]:
#List of incorrectly tagged words
incorrect_tagged_cases_mod2 = [j for i, j in enumerate(zip(tagged_seq_mod2, test_run_base_mod2)) if j[0]!=j[1]]
incorrect_tagged_cases_mod2

[(('book', 'NOUN'), ('book', 'VERB')),
 (('stocks', 'NOUN'), ('stocks', 'ADV')),
 (('up', 'PRT'), ('up', 'ADP')),
 (('over', 'ADP'), ('over', 'PRT')),
 (('mine', 'NOUN'), ('mine', 'ADJ')),
 (('Palestinian', 'ADJ'), ('Palestinian', 'NOUN')),
 (('first', 'ADJ'), ('first', 'ADV')),
 (('third-largest', 'NOUN'), ('third-largest', 'ADJ')),
 (('fifth-largest', 'NOUN'), ('fifth-largest', 'ADJ')),
 (('Sit', 'NOUN'), ('Sit', 'VERB')),
 (('down', 'ADP'), ('down', 'ADV')),
 (('British', 'ADJ'), ('British', 'NOUN')),
 (('halt', 'NOUN'), ('halt', 'VERB')),
 (('slides', 'NOUN'), ('slides', 'VERB')),
 (('most', 'ADJ'), ('most', 'ADV')),
 (('athletic', 'NOUN'), ('athletic', 'ADJ')),
 (('to', 'PRT'), ('to', 'ADJ')),
 (('better', 'ADJ'), ('better', 'ADV')),
 (('attempt', 'VERB'), ('attempt', 'NOUN')),
 (('usurp', 'NOUN'), ('usurp', 'VERB')),
 (('executive', 'NOUN'), ('executive', 'ADJ')),
 (('administer', 'NOUN'), ('administer', 'VERB')),
 (('*-58', 'NOUN'), ('*-58', 'X')),
 (('applicable', 'NOUN'), ('ap

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [27]:
#Vanilla Viterbi accuracy
print("Vanilla Viterb accuracy --------> ",accuracy)

#Lexicom based modification accuracy
print("Lexicon based approach with Viterbi accuracy--------> ",accuracy_mod1)

#Rule based modification accuracy
print("Rule based approach with Viterbi accuracy--------> ",accuracy_mod2)

Vanilla Viterb accuracy -------->  0.91030251745293
Lexicon based approach with Viterbi accuracy-------->  0.9348423947535435
Rule based approach with Viterbi accuracy-------->  0.946689232071081


## 5. List down words which were incorrectly tagged by Vanilla Viterbi and got corrected with Modification 1 & 2 (Lexicon and Rule based)

In [28]:
# Words incorrectly tagged by Vanilla Viterbi

incorrect_tag_words_vanilla = [item[0][0].lower() for item in incorrect_tagged_cases ]
len(set(incorrect_tag_words_vanilla))

372

In [29]:
# Words incorrectly tagged by Lexicon with Viterbi

incorrect_tag_words_mod1 = [item[0][0].lower() for item in incorrect_tagged_cases_mod1 ]
len(set(incorrect_tag_words_mod1))

259

In [30]:
# Words incorrectly tagged by Rule based with Viterbi

incorrect_tag_words_mod2 = [item[0][0].lower() for item in incorrect_tagged_cases_mod2 ]
len(set(incorrect_tag_words_mod2))

203

In [31]:
# Words Correctly tagged by Lexicon tagger which are wrongly tagged by Vanilla Viterbi

incorrect_tag_words_vanilla_tmp = set([x.lower() for x in incorrect_tag_words_vanilla])
incorrect_tag_words_mod1_tmp = set([x.lower() for x in incorrect_tag_words_mod1])
vanilla_mod1_correctly_tagged_tmp = [item for item in incorrect_tag_words_mod1_tmp
                                 if item not in incorrect_tag_words_vanilla_tmp]
print(list(set(vanilla_mod1_correctly_tagged_tmp)))

['51.6', '94.8', '221.4', '1997', '89.7', '126,000', '879', '8.75', '960', '550,000', '14.75', '12.52', '143.93', "'82", '115', '143.08', '472', '11.95', '77.6', '88.32', '609', '306', '13.90', '692', '858,000', 'four', '99.1', '141.9', '150.00', '3.19', '149.9', '618.1', 'five']


In [32]:
# Words Correctly tagged by Rule based tagger which are wrongly tagged by lexicon tagger

incorrect_tag_words_mod1_tmp = set([x.lower() for x in incorrect_tag_words_mod1])
incorrect_tag_words_mod2_tmp = set([x.lower() for x in incorrect_tag_words_mod2])
mod1_mod2_correctly_tagged = [item for item in incorrect_tag_words_mod1_tmp if item not in incorrect_tag_words_mod2_tmp]
print(list(set(mod1_mod2_correctly_tagged)))

['51.6', 'broadened', 'delayed', '94.8', 'curbed', '221.4', 'burned', '1997', 'sagged', 'removing', 'shopped', '89.7', 'owning', '126,000', '879', '8.75', 'flirted', 'stemmed', '960', '550,000', 'descending', 'outlawing', 'overvalued', 'tempted', '14.75', 'disapproved', 'dreamed', '12.52', 'faded', 'reclaimed', 'illustrates', '143.93', 'safeguarding', 'diagnosed', '115', 'deteriorating', 'outlawed', '143.08', '472', '11.95', '77.6', 'headed', 'wrestling', '88.32', 'clamped', 'leveraging', '609', 'pushes', '306', 'crying', '13.90', '692', '858,000', 'filled', 'ignored', 'soaring', '99.1', 'cultivated', '150.00', '141.9', 'inserted', 'argues', '3.19', '149.9', '618.1', 'maturing', 'construed']


### 6. Tagging the Sample test file 

In [33]:
# Read File
file1 = open("Test_sentences.txt","r+")

sample_test_data= file1.read()
sample_test_data = sample_test_data.replace("\n"," ")
sample_test_data = sample_test_data.lower()
words = word_tokenize(sample_test_data)


In [34]:
# Tagging with Vanilla Viterbi

tagged_seq = Viterbi(words)

In [35]:
# Tagging with Lexicon based tagger

tagged_seq_1 = Viterbi_mod1(words)
tagged_seq_1

[('android', 'NOUN'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('google', 'NOUN'),
 ('.', '.'),
 ('android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('os', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'NOUN'),
 ('since', 'ADP'),
 ('2011', 'NOUN'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013.', 'NOUN'),
 ('google', 'NOUN'),
 ('and', 'CONJ'),
 ('twitter', 'NOUN'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'NOUN'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('google', 'NOUN'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('twitter', 'NOUN'),
 ("'s", 'PRT'),
 ('firehose', 'NOUN'),
 ('.', '.'),
 ('twitter', 'NOUN'),
 ('is', 'VERB'),
 ('an', 'DET'),
 ('online', 'NOUN'),
 ('news', 'NOUN'),
 ('and', 'CONJ'),
 ('social', 'ADJ'),
 ('networking', 'NOUN'),
 ('service', 

In [36]:
# Tagging with Rule based Tagger

tagged_seq_2 = Viterbi_mod2(words)
tagged_seq_2

[('android', 'NOUN'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('google', 'NOUN'),
 ('.', '.'),
 ('android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('os', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'VERB'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013.', 'NOUN'),
 ('google', 'NOUN'),
 ('and', 'CONJ'),
 ('twitter', 'NOUN'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'NUM'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('google', 'NOUN'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('twitter', 'NOUN'),
 ("'s", 'PRT'),
 ('firehose', 'NOUN'),
 ('.', '.'),
 ('twitter', 'NOUN'),
 ('is', 'VERB'),
 ('an', 'DET'),
 ('online', 'NOUN'),
 ('news', 'NOUN'),
 ('and', 'CONJ'),
 ('social', 'ADJ'),
 ('networking', 'NOUN'),
 ('service', 'N

In [37]:
# Tagging of words in Lexicon tagger in comparison with Vanilla Viterbi
unkown_set_tagging_vanilla_mod1 = [ (x[0],"Current-tag: " + x[1], "Previous-tag: "+ y[1]) for x in tagged_seq_1 for y in tagged_seq if x[1] != y[1]  and x[0] == y[0]]
print(len(set(unkown_set_tagging_vanilla_mod1)))

# Tagging of words in Rule based tagger in comparison with Vanilla Viterbi
unkown_set_tagging_vanilla_mod2 = [ (x[0],"Current-tag: " + x[1],"Previous-tag: "+ y[1]) for x in tagged_seq_2 for y in tagged_seq if x[1] != y[1]  and x[0] == y[0]]
print(len(set(unkown_set_tagging_vanilla_mod2)))

# Tagging of words in Rule based tagger in comparison with Lexicon tagger
unkown_set_tagging_mod1_mod2 = [ (x[0],"Current-tag: " + x[1],"Previous-tag: "+ y[1]) for x in tagged_seq_2 for y in tagged_seq_1 if x[1] != y[1]  and x[0] == y[0]]
print(len(set(unkown_set_tagging_mod1_mod2)))


39
36
9


In [39]:
print("===============Tagging of lexicon tagger VS Viterbi Vanilla============")
print()
print(list(set(unkown_set_tagging_vanilla_mod1)))
print()
print("--------Tag Correction Case 1-----------------------------------")
print(list(set(unkown_set_tagging_vanilla_mod1))[0])
print("----------------------------------------------------------------")
print()
print("--------Tag Correction Case 2-----------------------------------")
print(list(set(unkown_set_tagging_vanilla_mod1))[1])
print("----------------------------------------------------------------")
print()
print("--------Tag Correction Case 3-----------------------------------")
print(list(set(unkown_set_tagging_vanilla_mod1))[5])
print("----------------------------------------------------------------")
print()
print("===================================================================================")



[('tournament', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('francisco', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('2015', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('2018', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('invited', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('smartphones', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('domineering', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('denver', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('2011', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('google', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('contested', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('android', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('interact', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('arriving', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('donald', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('2013.', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('eastern', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('philadelphia', 'Current-tag: NOUN', 'Previous-tag: NUM'), 

In [41]:
print("===============Tagging of Rule based tagger VS Viterbi Vanilla============")
print()
print(list(set(unkown_set_tagging_vanilla_mod2)))
print()
print("--------Tag Correction Case 1------------------------------------")
print(list(set(unkown_set_tagging_vanilla_mod2))[13])
print("-----------------------------------------------------------------")
print()
print("--------Tag Correction Case 2------------------------------------")
print(list(set(unkown_set_tagging_vanilla_mod2))[10])
print("-----------------------------------------------------------------")
print()
print("--------Tag Correction Case 3------------------------------------")
print(list(set(unkown_set_tagging_vanilla_mod2))[12])
print("-----------------------------------------------------------------")
print()
print("===================================================================================")



[('tournament', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('francisco', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('domineering', 'Current-tag: VERB', 'Previous-tag: NUM'), ('messages', 'Current-tag: VERB', 'Previous-tag: NUM'), ('denver', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('google', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('android', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('interact', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('donald', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('2013.', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('invited', 'Current-tag: VERB', 'Previous-tag: NUM'), ('eastern', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('philadelphia', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('dallas', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('cup', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('icesat-2', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('satellite', 'Current-tag: NOUN', 'Previous-tag: NUM'), ('trump', 'Current-tag: NOUN', 'Previous-tag: NUM'), (

In [43]:
print("===============Tagging of Rule based tagger VS Lexicon Tagger============")
print()
print(list(set(unkown_set_tagging_mod1_mod2)))
print()
print("--------Tag Correction Case 1------------------------------------")
print(list(set(unkown_set_tagging_mod1_mod2))[0])
print("-----------------------------------------------------------------")
print()
print("--------Tag Correction Case 2------------------------------------")
print(list(set(unkown_set_tagging_mod1_mod2))[1])
print("-----------------------------------------------------------------")
print()
print("--------Tag Correction Case 3------------------------------------")
print(list(set(unkown_set_tagging_mod1_mod2))[3])
print("-----------------------------------------------------------------")
print()
print("===================================================================================")


[('2018', 'Current-tag: NUM', 'Previous-tag: NOUN'), ('2011', 'Current-tag: NUM', 'Previous-tag: NOUN'), ('arriving', 'Current-tag: VERB', 'Previous-tag: NOUN'), ('contested', 'Current-tag: VERB', 'Previous-tag: NOUN'), ('domineering', 'Current-tag: VERB', 'Previous-tag: NOUN'), ('2015', 'Current-tag: NUM', 'Previous-tag: NOUN'), ('invited', 'Current-tag: VERB', 'Previous-tag: NOUN'), ('smartphones', 'Current-tag: VERB', 'Previous-tag: NOUN'), ('messages', 'Current-tag: VERB', 'Previous-tag: NOUN')]

--------Tag Correction Case 1------------------------------------
('2018', 'Current-tag: NUM', 'Previous-tag: NOUN')
-----------------------------------------------------------------

--------Tag Correction Case 2------------------------------------
('2011', 'Current-tag: NUM', 'Previous-tag: NOUN')
-----------------------------------------------------------------

--------Tag Correction Case 3------------------------------------
('contested', 'Current-tag: VERB', 'Previous-tag: NOUN')
--