# POS Tagging using the Hidden Markov Model
Hidden Markov model is a chain of invisble states. Each state emits observable output

![image-2.png](attachment:image-2.png)

In [1]:
!pip install pomegranate==0.14.4
# !conda install -c anaconda pomegranate

Collecting pomegranate==0.14.4
  Downloading pomegranate-0.14.4.tar.gz (4.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/4.3 MB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m2.6/4.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pomegranate
  Building wheel for pomegranate (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pomegranate: filename=pomegranate-0.14.4-cp310-cp310-linux_x86_64.whl size=20180327 sha256=ef7f

In [2]:
import numpy as np
from collections import Counter, defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
from pprint import pprint

In [3]:
# training data
sent1='Ram plays in parks'
sent1_POS='NOUN VERB MODIFIER NOUN'
sent2='Robert parks car at nights'
sent2_POS='NOUN VERB NOUN MODIFIER NOUN'

In [4]:
s1 = sent1.split()
s2 = sent2.split()
s1_POS = sent1_POS.split()
s2_POS = sent2_POS.split()

print(s1)
print(s1_POS)
print(s2)
print(s2_POS)
words_list = s1 + s2
POS_list = s1_POS + s2_POS
print(POS_list)
print(words_list)

['Ram', 'plays', 'in', 'parks']
['NOUN', 'VERB', 'MODIFIER', 'NOUN']
['Robert', 'parks', 'car', 'at', 'nights']
['NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']
['NOUN', 'VERB', 'MODIFIER', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']
['Ram', 'plays', 'in', 'parks', 'Robert', 'parks', 'car', 'at', 'nights']


## Hidden Markov Model

In [5]:
#Build Hidden Markov Model
hmm_model = HiddenMarkovModel(name="POS-Tagger")

## Add Hidden states with their emission probabilities to the model

### Hidden states with their emissions counts
- keys will be the hidden states (POS tags)
- value of each key will be emissions from that hidden state (a dictionary of word frequency for that POS)

In [6]:
print(POS_list)
print(words_list)

['NOUN', 'VERB', 'MODIFIER', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']
['Ram', 'plays', 'in', 'parks', 'Robert', 'parks', 'car', 'at', 'nights']


In [7]:
# defaultdict is simpler to use as you don't need to initialize the value of any key
# when a key (POS) is not found, the lambda function tells to create another defaultdict object which will have values as int
# so, for the outer dict the keyValues will be a dict, and for the inner dict the keyValues will be type int

POS_wordsFreq = defaultdict(lambda: defaultdict(int))
for POS, word in zip(POS_list, words_list):
    POS_wordsFreq[POS][word] += 1
pprint(POS_wordsFreq)


defaultdict(<function <lambda> at 0x7fc42b616c20>,
            {'MODIFIER': defaultdict(<class 'int'>, {'in': 1, 'at': 1}),
             'NOUN': defaultdict(<class 'int'>,
                                 {'Ram': 1,
                                  'Robert': 1,
                                  'car': 1,
                                  'nights': 1,
                                  'parks': 1}),
             'VERB': defaultdict(<class 'int'>, {'plays': 1, 'parks': 1})})


### Calculate emission probabilities and add each POS 'State' object to the model

In [8]:
# POS_words_count has counts for emission under each POS
# we convert that count to probability, which is the emission probability
# to_states will store the emission probability of each state
to_states = []
for POS, wordsFreq in POS_wordsFreq.items():
    total = float(sum(wordsFreq.values()))
    print('------------------------------------')
    print(POS,' total',total)
    emission_prob = {word: count/total for word, count in wordsFreq.items()}
    print(emission_prob)
    #print('------------------------------------')
    POS_state = State(DiscreteDistribution(emission_prob), name=POS)
    #print('POS_state',POS_state)
    to_states.append(POS_state)
    #input("any key")

------------------------------------
NOUN  total 5.0
{'Ram': 0.2, 'parks': 0.2, 'Robert': 0.2, 'car': 0.2, 'nights': 0.2}
------------------------------------
VERB  total 2.0
{'plays': 0.5, 'parks': 0.5}
------------------------------------
MODIFIER  total 2.0
{'in': 0.5, 'at': 0.5}


In [None]:
#print(type(to_states[0]))
# print(len(to_states))
#print([state.name for state  in to_states])
# print(to_states[0])


## Add Start and End Tag transition probabilities

### Counts for Transition from Start tag and Transition to End tag

In [None]:
POS_list

['NOUN',
 'VERB',
 'MODIFIER',
 'NOUN',
 'NOUN',
 'VERB',
 'NOUN',
 'MODIFIER',
 'NOUN']

In [9]:
print(s1_POS)
print(s2_POS)

start_POS_list = [s1_POS[0],s2_POS[0]]
print('Start POS list:',start_POS_list)
end_POS_list = [s1_POS[-1],s2_POS[-1]]
print('End POS list:',end_POS_list)
start_POS_count = Counter(start_POS_list)
print('Start POS counts:',start_POS_count)
end_POS_count = Counter(end_POS_list)
print('End POS counts:', end_POS_count)
print('------------------------')
POS_count_ug = Counter(POS_list)
print('POS unigram counts:',POS_count_ug)
print(POS_list)

['NOUN', 'VERB', 'MODIFIER', 'NOUN']
['NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']
Start POS list: ['NOUN', 'NOUN']
End POS list: ['NOUN', 'NOUN']
Start POS counts: Counter({'NOUN': 2})
End POS counts: Counter({'NOUN': 2})
------------------------
POS unigram counts: Counter({'NOUN': 5, 'VERB': 2, 'MODIFIER': 2})
['NOUN', 'VERB', 'MODIFIER', 'NOUN', 'NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']


### Start and End probability for each POS tag

In [10]:
start_prob, end_prob = {}, {}
for ps in POS_count_ug:
    start_prob[ps]=start_POS_count[ps]/POS_count_ug[ps]
    end_prob[ps] = end_POS_count[ps]/POS_count_ug[ps]

print(start_prob)
print(end_prob)

{'NOUN': 0.4, 'VERB': 0.0, 'MODIFIER': 0.0}
{'NOUN': 0.4, 'VERB': 0.0, 'MODIFIER': 0.0}


### Add Start & End probalities to the model

In [11]:
for POS_state in to_states :
    hmm_model.add_transition(hmm_model.start,POS_state,start_prob[POS_state.name])
    hmm_model.add_transition(POS_state,hmm_model.end,end_prob[POS_state.name])

## Add Transition probabilities between POS states

### Hidden state Transition counts - using POS_List Bigrams

In [12]:
bigrams = [(POS_list[i],POS_list[i+1]) for i in range(0,len(POS_list)-1,1)]
pprint(bigrams)

POS_count_bg = Counter(bigrams)
pprint(POS_count_bg)

[('NOUN', 'VERB'),
 ('VERB', 'MODIFIER'),
 ('MODIFIER', 'NOUN'),
 ('NOUN', 'NOUN'),
 ('NOUN', 'VERB'),
 ('VERB', 'NOUN'),
 ('NOUN', 'MODIFIER'),
 ('MODIFIER', 'NOUN')]
Counter({('NOUN', 'VERB'): 2,
         ('MODIFIER', 'NOUN'): 2,
         ('VERB', 'MODIFIER'): 1,
         ('NOUN', 'NOUN'): 1,
         ('VERB', 'NOUN'): 1,
         ('NOUN', 'MODIFIER'): 1})


### Transition probabilities for each POS tag pair

In [13]:
# Get the transition probability
transition_prob_POS_word={}
for key in POS_count_bg.keys():
    transition_prob_POS_word[key]=POS_count_bg.get(key)/POS_count_ug[key[0]]

transition_prob_POS_word

{('NOUN', 'VERB'): 0.4,
 ('VERB', 'MODIFIER'): 0.5,
 ('MODIFIER', 'NOUN'): 1.0,
 ('NOUN', 'NOUN'): 0.2,
 ('VERB', 'NOUN'): 0.5,
 ('NOUN', 'MODIFIER'): 0.2}

**If a certain pair of POS don't occur in traning set, make them ZEROES. Since our training set is very less, this issue occurs**

In [14]:
# If a certain pair of POS don't occur in traning set, make them ZEROES. Since our training set is very less, this issue occurs
transition_prob_POS_word[('VERB', 'VERB')]=0
transition_prob_POS_word[('MODIFIER', 'VERB')]=0
transition_prob_POS_word[('MODIFIER', 'MODIFIER')]=0

In [15]:
transition_prob_POS_word

{('NOUN', 'VERB'): 0.4,
 ('VERB', 'MODIFIER'): 0.5,
 ('MODIFIER', 'NOUN'): 1.0,
 ('NOUN', 'NOUN'): 0.2,
 ('VERB', 'NOUN'): 0.5,
 ('NOUN', 'MODIFIER'): 0.2,
 ('VERB', 'VERB'): 0,
 ('MODIFIER', 'VERB'): 0,
 ('MODIFIER', 'MODIFIER'): 0}

In [16]:
# Add transition probabilities to all POS
for POS_state in to_states :
    for next_POS_state in to_states :
        hmm_model.add_transition(POS_state,next_POS_state,transition_prob_POS_word[(POS_state.name,next_POS_state.name)])

In [17]:
hmm_model.bake()

## Decode POS for a new sentence

In [18]:
# Decode POS for a new sentence
def POS_decoding(sentence, model):
    _, state_path = model.viterbi(sentence)
    return [state[1].name for state in state_path[1:-1]]

In [19]:
print(s1)
print(s2)

['Ram', 'plays', 'in', 'parks']
['Robert', 'parks', 'car', 'at', 'nights']


In [20]:
s3 = ('Ram','parks','car','in','parks')
print(s3)
POS_tags = POS_decoding(s3, hmm_model)
print(str(POS_tags))

('Ram', 'parks', 'car', 'in', 'parks')
['NOUN', 'VERB', 'NOUN', 'MODIFIER', 'NOUN']


In [None]:
s3 = ['Ram','plays','at','nights','in','parks']
print(s3)
POS_tags = POS_decoding(s3, hmm_model)
print(str(POS_tags))

['Ram', 'plays', 'at', 'nights', 'in', 'parks']
['NOUN', 'VERB', 'MODIFIER', 'NOUN', 'MODIFIER', 'NOUN']


In [None]:
_, state_path = hmm_model.viterbi(('Ram','parks','car','in','parks'))
[state[1].name for state in state_path[1:-1]]

In [None]:
state_path
# state_path[0] is the 'start'
# state_path[-1] is the 'end'
# So, we consider the values between these
# each state_path[i] is a tuple, and we need the state_path[i,1].name for i between 1 and last but 1 index.

### Getting emission matrix from model

In [None]:
print(hmm_model)

In [None]:
hmm_model.dense_transition_matrix()

In [None]:
hmm_model.sample(length=20)

In [None]:
custom_model = HiddenMarkovModel.from_matrix(transition_matrix, distributions, start_probs, end_probs)