In [1]:
# Importing the necessary libraries

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import brown
import re
from nltk.tokenize import word_tokenize

In [31]:
# Listing the category names available in the Brown corpus
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [40]:
# Load every (word, tag) pair of the Brown corpus 'adventure' category,
# tagged with the universal tagset
words = brown.tagged_words(categories='adventure', tagset='universal')

# Keep only the tag of each pair, in corpus order
pos = [tag for _, tag in words]

In [41]:
# Defining the Markov model that stores the transition probabilities between consecutive states

def hmm(text):
    """Estimate first-order transition probabilities between consecutive states.

    Args:
        text: Sequence of hashable states in observed order. Must support
            ``len()`` and integer indexing.

    Returns:
        dict: Nested mapping ``{state: {next_state: probability}}``, where
        each probability is the relative frequency with which ``next_state``
        immediately followed ``state`` in ``text``. A state that never has a
        successor (it only occurs as the last element) gets no key. Inputs
        with fewer than two elements yield an empty dict.
    """
    markov = {}

    # Count every adjacent pair. The last valid start index is len(text) - 2,
    # so the range must stop at len(text) - 1 to include the final pair.
    for i in range(len(text) - 1):
        curr_state = text[i]
        next_state = text[i + 1]
        transitions = markov.setdefault(curr_state, {})
        transitions[next_state] = transitions.get(next_state, 0) + 1

    # Turn each state's successor counts into probabilities.
    for transitions in markov.values():
        total = sum(transitions.values())
        for state, count in transitions.items():
            transitions[state] = count / total
    return markov

In [42]:
# Making HMM of pos tags
markov = hmm(pos)

# Creating the transition matrix
# The tag order is materialised once so that the matrix rows, the matrix
# columns and the DataFrame labels are all guaranteed to line up, and the
# set is not rebuilt from the full tag list on every loop iteration.
tags = list(set(pos))
L = len(tags)
A = np.zeros([L, L])
for i, x in enumerate(tags):
    # A tag without any outgoing transition has no entry in `markov`;
    # its row simply stays all zeros instead of raising KeyError.
    row = markov.get(x, {})
    for j, y in enumerate(tags):
        # Unobserved transitions keep probability 0
        A[i][j] = row.get(y, 0)

A_ = pd.DataFrame(A, index=tags, columns=tags)
print("The transition matrix is following:\n")
A_

The transition matrix is following:



Unnamed: 0,DET,PRT,CONJ,ADP,ADJ,ADV,NUM,.,VERB,X,PRON,NOUN
DET,0.003311,0.003679,0.00049,0.006009,0.188228,0.012508,0.008829,0.010791,0.058492,0.001226,0.009687,0.69675
PRT,0.100575,0.019294,0.019704,0.124795,0.021346,0.066092,0.003284,0.134647,0.470443,0.0,0.013136,0.026683
CONJ,0.132996,0.033134,0.0,0.056144,0.057524,0.092039,0.016107,0.016567,0.307869,0.0,0.129775,0.157846
ADP,0.523412,0.015419,0.001556,0.025746,0.037488,0.017824,0.015419,0.007498,0.027161,0.000707,0.145565,0.182204
ADJ,0.010702,0.019917,0.037753,0.10107,0.043401,0.013377,0.005351,0.151605,0.013971,0.0,0.007432,0.595422
ADV,0.062129,0.031451,0.020882,0.145141,0.089714,0.086362,0.008765,0.266564,0.191544,0.0,0.074762,0.022686
NUM,0.015021,0.004292,0.045064,0.16309,0.053648,0.030043,0.023605,0.085837,0.04721,0.0,0.015021,0.517167
.,0.115311,0.049144,0.073671,0.064336,0.020134,0.07605,0.008053,0.227693,0.093621,0.001464,0.165096,0.105427
VERB,0.167101,0.085221,0.013606,0.148688,0.038618,0.102086,0.005459,0.107789,0.151377,8.1e-05,0.102249,0.077725
X,0.026316,0.0,0.026316,0.026316,0.0,0.026316,0.0,0.578947,0.105263,0.105263,0.0,0.105263


In [43]:
# Calculating the stable state probabilities

# Number of times the transition matrix is multiplied by itself
step = 10000

# Multiplying A by itself `step` times gives A raised to the power
# step + 1; matrix_power computes that by repeated squaring instead of
# `step` sequential matrix products.
A_n = np.linalg.matrix_power(A, step + 1)

A_n = pd.DataFrame(A_n, columns=list(set(pos)))
print('The pi matrix is given by:')
# The first row of the powered matrix is displayed as the pi vector
A_n.iloc[0:1]

The pi matrix is given by:


Unnamed: 0,DET,PRT,CONJ,ADP,ADJ,ADV,NUM,.,VERB,X,PRON,NOUN
0,0.117609,0.035132,0.031338,0.101945,0.048515,0.055942,0.006721,0.157599,0.177012,0.000548,0.075066,0.192573


In [52]:
# Normalising the example sentence into a list of lowercase word tokens

sentence = "They are watching the match"
# Replace every non-alphanumeric character with a space, then lowercase
cleaned = re.sub('[^A-Za-z0-9]', ' ', sentence).lower()
string = word_tokenize(cleaned)

In [53]:
# Distinct POS tags, in the set's iteration order
pos_ = [*set(pos)]

# Show the tag list followed by the tokenised sentence, one per line
print(pos_, string, sep='\n')

['DET', 'PRT', 'CONJ', 'ADP', 'ADJ', 'ADV', 'NUM', '.', 'VERB', 'X', 'PRON', 'NOUN']
['they', 'are', 'watching', 'the', 'match']


In [54]:
# Computing the raw emission values: B[m][n] is the number of times token
# string[m] carries tag pos_[n] in `words`, divided by (1 + the number of
# times the token occurs in `words`).

# Single pass over the corpus collecting only the counts that are needed,
# instead of rescanning the whole corpus for every (token, tag) pair.
wanted = set(string)
word_totals = dict.fromkeys(wanted, 0)
pair_counts = {}
for word, tag in words:
    # NOTE(review): the comparison is case-sensitive, while the sentence
    # tokens were lowercased earlier -- capitalised corpus occurrences are
    # not counted. Confirm whether that is intended.
    if word in wanted:
        word_totals[word] += 1
        pair_counts[(word, tag)] = pair_counts.get((word, tag), 0) + 1

B = np.zeros([len(string), len(pos_)])
for m, token in enumerate(string):
    # The denominator starts at 1, so a token absent from the corpus yields
    # a row of zeros rather than a division by zero.
    # NOTE(review): because of this +1 a row does not sum exactly to 1.
    denominator = word_totals[token] + 1
    for n, tag in enumerate(pos_):
        B[m][n] = pair_counts.get((token, tag), 0) / denominator

In [58]:
# Labelling the emission values: one row per sentence token, one column
# per POS tag
emission_table = pd.DataFrame(B)
emission_table.index, emission_table.columns = string, pos_
B = emission_table
print("The emission matrix is: \n")
B

The emission matrix is: 



Unnamed: 0,DET,PRT,CONJ,ADP,ADJ,ADV,NUM,.,VERB,X,PRON,NOUN
they,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.995169,0.0
are,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.979592,0.0,0.0,0.0
watching,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.916667,0.0,0.0,0.0
the,0.999703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
match,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.714286
