## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries

import nltk, re
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [2]:
# Read the NLTK Treebank corpus as a list of sentences; each sentence is a
# list of (word, tag) tuples with tags mapped to the 'universal' tagset.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# Peek at the first sentence to confirm the (word, tag) tuple layout
nltk_data[:1]

[[('Pierre', 'NOUN'),
  ('Vinken', 'NOUN'),
  (',', '.'),
  ('61', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  (',', '.'),
  ('will', 'VERB'),
  ('join', 'VERB'),
  ('the', 'DET'),
  ('board', 'NOUN'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
  ('director', 'NOUN'),
  ('Nov.', 'NOUN'),
  ('29', 'NUM'),
  ('.', '.')]]

In [4]:
# Splitting into train and test (5% of the sentences are held out for testing).
# train_test_split shuffles with NumPy's random state, not Python's `random`
# module, so the seed must be passed through `random_state` for the split to be
# reproducible after a kernel restart.
random.seed(42)  # kept so that any later use of the `random` module stays seeded
train_set, test_set = train_test_split(nltk_data, test_size=0.05, random_state=42)

print("Training Set Length :", len(train_set))
print("Test Set Length :", len(test_set))
print("==" * 89)
print("Training Data sample :\n")
print(train_set[:10])

Training Set Length : 3718
Training Set Length : 196
Training Data sample :

[[('Any', 'DET'), ('money', 'NOUN'), ('in', 'ADP'), ('excess', 'NOUN'), ('of', 'ADP'), ('$', '.'), ('40', 'NUM'), ('million', 'NUM'), ('*U*', 'X'), ('collected', 'VERB'), ('*', 'X'), ('from', 'ADP'), ('the', 'DET'), ('fees', 'NOUN'), ('in', 'ADP'), ('fiscal', 'ADJ'), ('1990', 'NUM'), ('would', 'VERB'), ('go', 'VERB'), ('to', 'PRT'), ('the', 'DET'), ('Treasury', 'NOUN'), ('at', 'ADP'), ('large', 'ADJ'), ('.', '.')], [('Stock-index', 'NOUN'), ('futures', 'NOUN'), ('--', '.'), ('Contracts', 'NOUN'), ('*', 'X'), ('to', 'PRT'), ('buy', 'VERB'), ('or', 'CONJ'), ('sell', 'VERB'), ('the', 'DET'), ('cash', 'NOUN'), ('value', 'NOUN'), ('of', 'ADP'), ('a', 'DET'), ('stock', 'NOUN'), ('index', 'NOUN'), ('by', 'ADP'), ('a', 'DET'), ('certain', 'ADJ'), ('date', 'NOUN'), ('.', '.')], [('All', 'DET'), ('came', 'VERB'), ('from', 'ADP'), ('Cray', 'NOUN'), ('Research', 'NOUN'), ('.', '.')], [('Yasser', 'NOUN'), ('Arafat', 'NOUN'

In [5]:
# Flatten the training sentences into a single list of (word, tag) tuples

train_tagged_tup = [
    tagged_pair
    for sentence in train_set
    for tagged_pair in sentence
]
print(f" Num of tuples : {len(train_tagged_tup)}")
train_tagged_tup[:10]

 Num of tuples : 95558


[('Any', 'DET'),
 ('money', 'NOUN'),
 ('in', 'ADP'),
 ('excess', 'NOUN'),
 ('of', 'ADP'),
 ('$', '.'),
 ('40', 'NUM'),
 ('million', 'NUM'),
 ('*U*', 'X'),
 ('collected', 'VERB')]

In [6]:
# Keep only the word (first element) of every tagged tuple

train_tagged_words = [word for word, _ in train_tagged_tup]
train_tagged_words[:10]

['Any',
 'money',
 'in',
 'excess',
 'of',
 '$',
 '40',
 'million',
 '*U*',
 'collected']

In [7]:
# Keep only the tag (second element) of every tagged tuple

train_tagged_tags = [tag for _, tag in train_tagged_tup]
train_tagged_tags[:10]

['DET', 'NOUN', 'ADP', 'NOUN', 'ADP', '.', 'NUM', 'NUM', 'X', 'VERB']

In [8]:
## Building the vocabulary and the POS tag set (unique words and unique tags)

train_vocab, train_tags = set(train_tagged_words), set(train_tagged_tags)

print(f"Vocabulary_length : {len(train_vocab)} \nTagset_length : {len(train_tags)}")
print("\n")
print(f"Available tags : {train_tags}")

Vocabulary_length : 12077 
Tagset_length : 12


Available tags : {'.', 'VERB', 'PRON', 'NOUN', 'ADV', 'DET', 'ADP', 'ADJ', 'X', 'NUM', 'PRT', 'CONJ'}


### Emission and Transition probabilities

In [9]:
len_tags = len(train_tags)
len_vocab = len(train_vocab)

# (number of tags) x (vocabulary size) array of zeros.
# Named `emission_matrix` rather than `word_given_tag`: the emission-probability
# function defined below is also called `word_given_tag`, and its definition
# would overwrite this array, leaving one name bound to two kinds of object.
emission_matrix = np.zeros((len_tags, len_vocab))

In [10]:
## computing emission probability (Word given tag = P(word | tag))

def word_given_tag(word, tag, train_bag = train_tagged_tup):
    """Return the counts needed for the emission probability P(word | tag).

    Parameters
    ----------
    word : the word whose emission is being counted.
    tag : the POS tag the word is conditioned on. It is an explicit parameter
        so the function does not depend on a variable named `tag` happening to
        exist in the notebook's global namespace.
    train_bag : iterable of (word, tag) pairs to count over; defaults to the
        flattened training tuples.

    Returns
    -------
    (count_w_given_tag, count_tag) : tuple of ints
        The number of pairs equal to (word, tag), and the number of pairs
        carrying `tag`. The caller divides the two; `count_tag` is 0 when the
        tag never occurs in `train_bag`, so guard the division in that case.
    """
    count_tag = 0
    count_w_given_tag = 0
    # Single pass: count every occurrence of the tag, and among those the
    # occurrences where it is attached to the requested word.
    for pair in train_bag:
        if pair[1] == tag:
            count_tag += 1
            if pair[0] == word:
                count_w_given_tag += 1

    return (count_w_given_tag, count_tag)

In [11]:
# computing Transition Probability (tag2(t2) given tag1 (t1) = P(tag2 | tag1))

def t2_given_t1(t2, t1, train_bag = train_tagged_tup):
    """Return the counts needed for the transition probability P(t2 | t1).

    The tags are read from `train_bag` (a sequence of (word, tag) pairs) in
    order, as one flat sequence. The result is the tuple
    (number of positions where t1 is immediately followed by t2,
     number of occurrences of t1); the caller performs the division.
    """
    tag_sequence = [tagged[1] for tagged in train_bag]
    occurrences_t1 = tag_sequence.count(t1)
    # Pair every tag with its successor and count the (t1, t2) bigrams.
    bigram_hits = sum(
        1
        for current_tag, next_tag in zip(tag_sequence, tag_sequence[1:])
        if current_tag == t1 and next_tag == t2
    )
    return (bigram_hits, occurrences_t1)

In [13]:
# Transition matrix: entry [i, j] is P(tag j | tag i), i.e. how often tag j
# immediately follows tag i divided by how often tag i occurs.
# Rows and columns follow the iteration order of `list(train_tags)`. That order
# comes from a set, so anything that labels this matrix must use the same
# `list(train_tags)` ordering within the same kernel session.
tag_order = list(train_tags)
tag_toTag_matrix = np.zeros((len_tags, len_tags), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # One call per entry: each call scans the whole training set, so
        # calling it separately for numerator and denominator doubles the cost.
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tag_toTag_matrix[i, j] = count_t2_t1 / count_t1

tag_toTag_matrix

array([[9.27253217e-02, 8.87833685e-02, 6.60275966e-02, 2.22809538e-01,
        5.17828353e-02, 1.72997668e-01, 9.13814753e-02, 4.38989438e-02,
        2.65185442e-02, 8.17057863e-02, 2.32933159e-03, 5.89500107e-02],
       [3.53535339e-02, 1.69075370e-01, 3.62082347e-02, 1.10644914e-01,
        8.18958804e-02, 1.34343430e-01, 9.14529935e-02, 6.41025677e-02,
        2.17094019e-01, 2.30769236e-02, 3.13908309e-02, 5.36130555e-03],
       [4.14921977e-02, 4.85725164e-01, 7.61324726e-03, 2.08983630e-01,
        3.27369608e-02, 9.89722088e-03, 2.24590786e-02, 7.42291585e-02,
        9.13589671e-02, 7.61324726e-03, 1.25618577e-02, 5.32927271e-03],
       [2.40799502e-01, 1.46296099e-01, 4.55921516e-03, 2.65091002e-01,
        1.70697011e-02, 1.33129079e-02, 1.76058650e-01, 1.21822227e-02,
        2.87777651e-02, 9.51964129e-03, 4.39508334e-02, 4.23824638e-02],
       [1.36287898e-01, 3.44885051e-01, 1.56614464e-02, 3.06564476e-02,
        8.09730068e-02, 6.79773390e-02, 1.17627457e-01, 1.29

### Build the vanilla Viterbi based POS tagger

### Solve the problem of unknown words

#### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### List the cases that were incorrectly tagged by the original POS tagger and were corrected by the modifications