<b>Data Preparation</b>

In [1]:
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [2]:
#downloading treebank and universal tagset
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\shreyash\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\shreyash\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
print(nltk_data[:5])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], [('Rudolph', 'NOUN'), ('Agnew', 'NOUN'), (',', '.'), ('55', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), ('and', 'CONJ'), ('former', 'ADJ'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Consolidated', 'NOUN'), ('Gold', 'NOUN'), ('Fields', 'NOUN'), ('PLC', 'NOUN'), (',', '.'), ('was', 'VERB'), ('named', 'VERB'), ('*-1', 'X'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('British', 'ADJ'), ('industrial', 'ADJ'), ('

<b> Splitting Data into Train and Validation set </b><br>Sample size - 95:5

In [4]:
random.seed(1234)
Train_set, Validation_set = train_test_split(nltk_data,test_size=0.05)

In [5]:
print("len of Train set :")
print(len(Train_set))
print("len of Validation set :")
print(len(Validation_set))

len of Train set :
3718
len of Validation set :
196


In [6]:
# Getting list of tagged words
train_tagged_words = [tup for sent in Train_set for tup in sent]
len(train_tagged_words)

95939

In [7]:
train_tagged_words[:10]

[('Copperweld', 'NOUN'),
 ('said', 'VERB'),
 ('0', 'X'),
 ('it', 'PRON'),
 ('does', 'VERB'),
 ("n't", 'ADV'),
 ('expect', 'VERB'),
 ('a', 'DET'),
 ('protracted', 'ADJ'),
 ('strike', 'NOUN')]

In [8]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]

# vocabulary
Vocab = set(tokens)


In [9]:
noOfTags = set([pair[1] for pair in train_tagged_words])
print("Number of Tags:")
print(len(noOfTags))

Number of Tags:
12


In [10]:
print ("List of Tags:")
print(list(noOfTags))

List of Tags:
['ADJ', 'PRT', 'VERB', 'ADP', 'CONJ', 'ADV', 'PRON', '.', 'NOUN', 'X', 'DET', 'NUM']


In [11]:
from collections import Counter
frequentTags = Counter([pair[1] for pair in train_tagged_words])
print("Most Frequent Tags:")
print(frequentTags.most_common(5))

Most Frequent Tags:
[('NOUN', 27485), ('VERB', 12916), ('.', 11180), ('ADP', 9389), ('DET', 8318)]


<b>POS Tagging - HMM </b>

Computing Token X Vocab matrix

In [12]:
t = len(noOfTags)
v = len(Vocab)
WordTagMatrix = np.zeros((t, v))

<b> Emission Probabilities </b>

In [13]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

<b> Transition Probabilities </b>

In [14]:

# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [15]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(noOfTags), len(noOfTags)), dtype='float32')
for i, t1 in enumerate(list(noOfTags)):
    for j, t2 in enumerate(list(noOfTags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [16]:
tags_df = pd.DataFrame(tags_matrix, columns = list(noOfTags), index=list(noOfTags))

In [17]:
tags_df

Unnamed: 0,ADJ,PRT,VERB,ADP,CONJ,ADV,PRON,.,NOUN,X,DET,NUM
ADJ,0.064183,0.010506,0.012147,0.077643,0.017236,0.004432,0.000657,0.064839,0.70174,0.021011,0.004596,0.021011
PRT,0.0842,0.001951,0.400195,0.021456,0.001951,0.009753,0.018531,0.041612,0.249675,0.012679,0.100455,0.057542
VERB,0.06519,0.031589,0.168396,0.090198,0.00542,0.081217,0.035615,0.034841,0.111257,0.218876,0.134252,0.02315
ADP,0.106082,0.001491,0.008627,0.016935,0.000852,0.01342,0.069549,0.03994,0.321333,0.034934,0.324103,0.062733
CONJ,0.12046,0.004138,0.156782,0.054253,0.00046,0.054713,0.058851,0.035402,0.345747,0.008276,0.12092,0.04
ADV,0.128027,0.014925,0.344942,0.120066,0.006633,0.079602,0.015257,0.13466,0.032172,0.022554,0.068325,0.032836
PRON,0.073552,0.012576,0.486662,0.022485,0.004573,0.03468,0.008003,0.040396,0.209604,0.090701,0.009909,0.00686
.,0.044275,0.002504,0.089445,0.09186,0.058855,0.052504,0.066279,0.093918,0.21932,0.027191,0.173792,0.079964
NOUN,0.012261,0.044242,0.145934,0.176533,0.043005,0.0171,0.004657,0.241004,0.263889,0.028852,0.013171,0.009351
X,0.017137,0.184227,0.204697,0.144716,0.009997,0.025547,0.055697,0.162805,0.063313,0.074262,0.054745,0.002856


<b> Modelling <b>

<b>Vanila Viterbi POS tagger</b>

In [18]:
def Viterbi(words):
    state = []
    Tags = list(set([pair[1] for pair in train_tagged_words]))
    
    for key, word in enumerate(words):
        prob = [] 
        for tag in Tags:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            prob.append(state_probability)
            
        pmax = max(prob)
        state_max = Tags[prob.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

Determing Accuracy

In [19]:
import time

In [20]:
random.seed(1234)

# choose random 10 sents
rndom = [random.randint(1,len(Validation_set)) for x in range(10)]
validation_run = [Validation_set[i] for i in rndom]
validation_run_base = [tup for sent in validation_run for tup in sent]
validation_tagged_words = [tup[0] for sent in validation_run for tup in sent]
len(validation_tagged_words)


217

In [21]:
start = time.time()
tagged_seq = Viterbi(validation_tagged_words)
print(tagged_seq[:10])

# accuracy
check = [i for i, j in zip(tagged_seq, validation_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
end = time.time()
print("Accuracy:")
print(accuracy)
print("Runtime:")
print(end-start)

[('In', 'ADP'), ('this', 'DET'), ('one', 'NUM'), (',', '.'), ('the', 'DET'), ('screen', 'NOUN'), ('fills', 'VERB'), ('with', 'ADP'), ('photographs', 'NOUN'), ('of', 'ADP')]
Accuracy:
0.9170506912442397
Runtime:
28.750932693481445


In [22]:
#incorrectly tagged cases
incorrect_tagged_cases = [[validation_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, validation_run_base)) if j[0]!=j[1]]
print("Incorrectly tagged cases")
print(incorrect_tagged_cases)



Incorrectly tagged cases
[[('this', 'DET'), (('one', 'NUM'), ('one', 'NOUN'))], [('*', 'X'), (('Possibly', 'ADJ'), ('Possibly', 'ADV'))], [('offsetting', 'VERB'), (('that', 'ADP'), ('that', 'DET'))], [('$', '.'), (('45.2', 'ADJ'), ('45.2', 'NUM'))], [('$', '.'), (('84.9', 'ADJ'), ('84.9', 'NUM'))], [('$', '.'), (('1.24', 'ADJ'), ('1.24', 'NUM'))], [('year', 'NOUN'), (('earlier', 'ADV'), ('earlier', 'ADJ'))], [('the', 'DET'), (('backdrop', 'ADJ'), ('backdrop', 'NOUN'))], [('.', '.'), (('Hiroshi', 'ADJ'), ('Hiroshi', 'NOUN'))], [('Hiroshi', 'NOUN'), (('Asada', 'ADJ'), ('Asada', 'NOUN'))], [('industrial', 'ADJ'), (('average', 'ADJ'), ('average', 'NOUN'))], [('takeover', 'NOUN'), (('targets', 'VERB'), ('targets', 'NOUN'))], [('.', '.'), (('Could', 'ADJ'), ('Could', 'VERB'))], [('evil', 'ADJ'), (('deeds', 'ADJ'), ('deeds', 'NOUN'))], [('of', 'ADP'), (('program-trading', 'ADJ'), ('program-trading', 'NOUN'))], [('program-trading', 'NOUN'), (('goblins', 'ADJ'), ('goblins', 'NOUN'))], [('the', 

In [23]:
tag_list = []
for case in incorrect_tagged_cases:
    tag_list.append(case[0])
    tag_list.append(case[1][0])
    tag_list.append(case[1][1])
tag_counts = Counter(tag_list)
print("Most common incorrectly tagged cases are ")
print(tag_counts.most_common(5))

Most common incorrectly tagged cases are 
[(('$', '.'), 3), (('the', 'DET'), 2), (('.', '.'), 2), (('Hiroshi', 'NOUN'), 2), (('program-trading', 'NOUN'), 2)]


<b> Solving problems of unkown words </b>

Unigram Tagger : 

In [24]:
unigram_tagger = nltk.UnigramTagger(Train_set)
accuracyUnigramTagger = unigram_tagger.evaluate(Validation_set)
print("Accuracy of Unigram Tagger:")
print(accuracyUnigramTagger)

Accuracy of Unigram Tagger:
0.8955034832172261


In [25]:
#specifying generic patterns
patterns = [
    (r'.*ing$', 'VBG'),              # gerund
    (r'.*ed$', 'VBD'),               # past tense
    (r'.*es$', 'VBZ'),               # 3rd singular present
    (r'.*ould$', 'MD'),              # modals
    (r'.*\'s$', 'NN$'),              # possessive nouns
    (r'.*s$', 'NNS'),                # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
    (r'.*', 'NN')                    # nouns
]

In [26]:
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)


lexicon_tagger = nltk.UnigramTagger(Train_set, backoff=rule_based_tagger)

accuracy_lexicon_tagger = lexicon_tagger.evaluate(Validation_set)
print("Accuracy with lexicon and rule based tagger")
print(accuracy_lexicon_tagger)

Accuracy with lexicon and rule based tagger
0.8955034832172261


Bigram Tagger:

In [27]:
bigram_tagger = nltk.BigramTagger(Train_set,backoff=lexicon_tagger)
accuracy_bigram_tagger = bigram_tagger.evaluate(Validation_set)
print("Accuracy of Bigram Tagger:")
print(accuracy_bigram_tagger)

Accuracy of Bigram Tagger:
0.8982478361832383


In [28]:
trigram_tagger = nltk.TrigramTagger(Train_set,backoff=lexicon_tagger)
accuracy_trigram_tagger = trigram_tagger.evaluate(Validation_set)
print("Accuracy of trigram Tagger:")
print(accuracy_trigram_tagger)

Accuracy of trigram Tagger:
0.8982478361832383


<b>Trying to build Bigram tagger into viterbi</b>

In [29]:
#modifying patterns to out data
def bigram_tagger(word,train_set = Train_set):
    patterns1 = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'.*es$', 'VERB'),               # 3rd singular present
    (r'.*ould$', 'X'),              # modals
    (r'.*\'s$', 'NOUN'),              # possessive nouns
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
    (r'.*', 'NOUN')                    # nouns
    ]
    regex_based_tagger = nltk.RegexpTagger(patterns1)
    
    # trigram backed up by the regex tagger
    bigram_regex_tagger = nltk.BigramTagger(train_set, backoff=regex_based_tagger)
    return bigram_regex_tagger.tag_sents([[(word)]])

In [30]:
def Viterbi_modified(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        # unknown words from bigram taggr
        if word not in tokens:
            unk_word_tag=bigram_tagger(word)
            for sent in unk_word_tag:
                for tup in sent:
                    state.append(tup[1])
        # rest remains same            
        else:            
            p = [] 
            for tag in T:
                if key == 0:
                    transition_p = tags_df.loc['.', tag]
                else:
                    transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
                emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
                state_probability = emission_p * transition_p    
                p.append(state_probability)
            
            pmax = max(p)
            # getting state for which probability is maximum
            state_max = T[p.index(pmax)] 
            state.append(state_max)
            
    return list(zip(words, state))

In [31]:
tagged_seq = Viterbi_modified(validation_tagged_words)

In [32]:
validation_run_base = [tup for sent in validation_run for tup in sent]

# list of untagged words
validation_tagged_words = [tup[0] for sent in validation_run for tup in sent]
validation_tagged_words

['In',
 'this',
 'one',
 ',',
 'the',
 'screen',
 'fills',
 'with',
 'photographs',
 'of',
 'both',
 'candidates',
 '.',
 '*',
 'Possibly',
 'offsetting',
 'that',
 ',',
 'Columbia',
 'recently',
 'estimated',
 '0',
 'it',
 'has',
 'unrealized',
 'gains',
 'on',
 'publicly',
 'traded',
 'equity',
 'investments',
 'of',
 'more',
 'than',
 '$',
 '70',
 'million',
 '*U*',
 '.',
 'IBM',
 ',',
 'the',
 'giant',
 'computer',
 'maker',
 ',',
 'offered',
 '$',
 '750',
 'million',
 '*U*',
 'of',
 'non-callable',
 '30-year',
 'debentures',
 'priced',
 '*',
 '*-1',
 'to',
 'yield',
 '8.47',
 '%',
 ',',
 'or',
 'about',
 '1\\/2',
 'percentage',
 'point',
 'higher',
 'than',
 'the',
 'yield',
 'on',
 '30-year',
 'Treasury',
 'bonds',
 '.',
 'The',
 'St.',
 'Louis',
 'company',
 'earned',
 '$',
 '45.2',
 'million',
 '*U*',
 ',',
 'or',
 '65',
 'cents',
 'a',
 'share',
 ',',
 'compared',
 'with',
 '$',
 '84.9',
 'million',
 '*U*',
 ',',
 'or',
 '$',
 '1.24',
 '*U*',
 'a',
 'share',
 ',',
 'a',
 'year

In [33]:
start = time.time()
tagged_seq_modified = Viterbi_modified(validation_tagged_words)
end = time.time()

In [34]:
# accuracy
check1 = [i for i, j in zip(tagged_seq_modified, validation_run_base) if i == j]
accuracy_viterbi_modified = len(check1)/len(tagged_seq_modified)
print("Accuracy of modified Viterbi:")
print(accuracy_viterbi_modified)
print("Time taken:")
print(end-start)

Accuracy of modified Viterbi:
0.9493087557603687
Time taken:
38.270033836364746


<b> Comparing Tagging Accuracies: </b>
<br> Viterbi - 0.9170506912442397
<br> Unigram - 0.8955034832172261
<br> Bigram - 0.8982478361832383
<br> Trigram - 0.8982478361832383
<br> Viterbi Modified - 0.9493087557603687

<b> Sampling some cases where modified viterbi might have rectified the tagging </b>

In [43]:
#nltk.download('punkt')
sentence_1 = "Google, Twitter, Facebook, Instagram, Whatsapp are the various social media platforms."
words = word_tokenize(sentence_1)
tagged_seq = Viterbi(words)
print(tagged_seq)

[('Google', 'ADJ'), (',', '.'), ('Twitter', 'ADJ'), (',', '.'), ('Facebook', 'ADJ'), (',', '.'), ('Instagram', 'ADJ'), (',', '.'), ('Whatsapp', 'ADJ'), ('are', 'VERB'), ('the', 'DET'), ('various', 'ADJ'), ('social', 'ADJ'), ('media', 'NOUN'), ('platforms', 'NOUN'), ('.', '.')]


In [44]:
tagged_seq_modified = Viterbi_modified(words)
print(tagged_seq_modified)

[('Google', 'NOUN'), (',', '.'), ('Twitter', 'NOUN'), (',', '.'), ('Facebook', 'NOUN'), (',', '.'), ('Instagram', 'NOUN'), (',', '.'), ('Whatsapp', 'NOUN'), ('are', 'VERB'), ('the', 'DET'), ('various', 'ADJ'), ('social', 'ADJ'), ('media', 'NOUN'), ('platforms', 'NOUN'), ('.', '.')]


In [45]:

sentence_1 = "Even with this technology called money, trade has been difficult."
words = word_tokenize(sentence_1)
tagged_seq = Viterbi(words)
print(tagged_seq)

[('Even', 'ADV'), ('with', 'ADP'), ('this', 'DET'), ('technology', 'NOUN'), ('called', 'VERB'), ('money', 'NOUN'), (',', '.'), ('trade', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('difficult', 'ADJ'), ('.', '.')]


In [46]:
tagged_seq_modified = Viterbi_modified(words)
print(tagged_seq_modified)

[('Even', 'ADV'), ('with', 'ADP'), ('this', 'DET'), ('technology', 'NOUN'), ('called', 'VERB'), ('money', 'NOUN'), (',', '.'), ('trade', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('difficult', 'ADJ'), ('.', '.')]


In [47]:
sentence_1 = "upGrad is an online higher education platform providing rigorous industry-relevant programs designed and delivered in collaboration with world-class faculty"
words = word_tokenize(sentence_1)
tagged_seq = Viterbi(words)
print(tagged_seq)


[('upGrad', 'ADJ'), ('is', 'VERB'), ('an', 'DET'), ('online', 'ADJ'), ('higher', 'ADJ'), ('education', 'NOUN'), ('platform', 'ADJ'), ('providing', 'VERB'), ('rigorous', 'ADJ'), ('industry-relevant', 'ADJ'), ('programs', 'NOUN'), ('designed', 'VERB'), ('and', 'CONJ'), ('delivered', 'VERB'), ('in', 'ADP'), ('collaboration', 'ADJ'), ('with', 'ADP'), ('world-class', 'ADJ'), ('faculty', 'NOUN')]


In [48]:
tagged_seq_modified = Viterbi_modified(words)
print(tagged_seq_modified)

[('upGrad', 'NOUN'), ('is', 'VERB'), ('an', 'DET'), ('online', 'NOUN'), ('higher', 'ADJ'), ('education', 'NOUN'), ('platform', 'NOUN'), ('providing', 'VERB'), ('rigorous', 'NOUN'), ('industry-relevant', 'NOUN'), ('programs', 'NOUN'), ('designed', 'VERB'), ('and', 'CONJ'), ('delivered', 'VERB'), ('in', 'ADP'), ('collaboration', 'NOUN'), ('with', 'ADP'), ('world-class', 'NOUN'), ('faculty', 'NOUN')]
