# Implementing a Hidden Markov Model (HMM) with the hmmlearn Library

Install the library

In [1]:
pip install hmmlearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hmmlearn
  Downloading hmmlearn-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.3.0


## Hidden States: the topic of conversation (cat or dog)
## Observed States: the words we say, drawn from the vocabulary: mouse, fetch, food, tail

## Import Libraries

In [2]:
import numpy as np
from hmmlearn import hmm

import warnings
# Silences every warning for the rest of the session, not only those from hmmlearn.
# NOTE(review): presumably meant to hide hmmlearn's notices about MultinomialHMM — confirm,
# and consider narrowing the filter to that specific warning.
warnings.filterwarnings('ignore')

## Setup HMM Parameters

## Hidden States

In [3]:

# Hidden states of the HMM: the topic being talked about.
states = ['cat', 'dog']

# Lookup from a state's integer index to its name, used to label decoded output.
id2topic = {position: topic for position, topic in enumerate(states)}
id2topic

{0: 'cat', 1: 'dog'}

## Initial probabilities of the Hidden States

In [4]:
# Probability of starting in each hidden state, in the order of `states`: cat = 0.6, dog = 0.4.
start_probs = np.array([0.6, 0.4])

## Observed States

In [5]:
# Words that can be observed; a word's position in this list becomes its integer id (see `vocab2id`).
vocabulary = ['tail', 'fetch', 'mouse', 'food']

## Emission Probabilities

In [6]:
# Emission matrix: one row per hidden state (row 0 = cat, row 1 = dog) and one
# column per vocabulary word (tail, fetch, mouse, food). Each row sums to 1.
emission_probs = np.array([[0.25, 0.1, 0.4, 0.25],
                           [0.2, 0.5, 0.1, 0.2]])
emission_probs

array([[0.25, 0.1 , 0.4 , 0.25],
       [0.2 , 0.5 , 0.1 , 0.2 ]])

# Transition Probability

In [7]:
# Transition matrix between hidden states, indexed in the order of `states`
# (cat, dog). Each row sums to 1; the 0.8 diagonal makes staying on the same topic the likelier move.
trans_mat = np.array([[0.8, 0.2], [0.2, 0.8]])
trans_mat

array([[0.8, 0.2],
       [0.2, 0.8]])

# Create Training Data

Every sentence in the training data contains exactly 5 words.

In [8]:
# Training sentences: ten sentences of five vocabulary words each.
observations = [
    ["tail", "mouse", "mouse", "food", "mouse"],
    ["food", "mouse", "mouse", "food", "mouse"],
    ["tail", "mouse", "mouse", "tail", "mouse"],
    ["food", "mouse", "food", "food", "tail"],
    ["tail", "fetch", "mouse", "food", "tail"],
    ["tail", "fetch", "fetch", "food", "fetch"],
    ["fetch", "fetch", "fetch", "food", "tail"],
    ["food", "mouse", "food", "food", "tail"],
    ["tail", "mouse", "mouse", "tail", "mouse"],
    ["fetch", "fetch", "fetch", "fetch", "fetch"],
]

## Build a mapping from each vocabulary word to its integer id

In [9]:
# Map each vocabulary word to its position in `vocabulary`.
vocab2id = {word: position for position, word in enumerate(vocabulary)}
vocab2id

{'tail': 0, 'fetch': 1, 'mouse': 2, 'food': 3}

## Custom function to convert a sentence into a bag-of-words (BoW) vector: the count of each vocabulary word in the sentence

In [14]:
#print(vocab2id)

def sentence2counts(sentence, vocab2id):
    """Convert a sentence into a bag-of-words count vector.

    Args:
        sentence: Sequence of words (e.g. a list of strings).
        vocab2id: Mapping from each vocabulary word to its integer id.

    Returns:
        A list with one count per vocabulary word, ordered by ascending id,
        so the k-th entry belongs to the word with the k-th smallest id.
        Words in ``sentence`` that are not keys of ``vocab2id`` are ignored.
    """
    # Order by id rather than by dict insertion order, so the columns stay
    # correct even when the mapping was not built in id order.
    words_by_id = sorted(vocab2id, key=vocab2id.get)
    return [sentence.count(word) for word in words_by_id]

In [15]:
# One bag-of-words count row per training sentence.
X = [sentence2counts(words, vocab2id) for words in observations]
X

[[1, 0, 3, 1],
 [0, 0, 3, 2],
 [2, 0, 3, 0],
 [1, 0, 1, 3],
 [2, 1, 1, 1],
 [1, 3, 0, 1],
 [1, 3, 0, 1],
 [1, 0, 1, 3],
 [2, 0, 3, 0],
 [0, 5, 0, 0]]

## Convert to Array and Replicate Data

In [16]:
# Number of times the data set is repeated. A single constant keeps `lengths`
# and `sequences` in step: the entries of `lengths` must add up to the number
# of rows in `sequences`.
n_repeats = 5

data = np.array(X, dtype=int)

# One entry per repeated copy, each as long as the original data set.
lengths = [len(data)] * n_repeats

# Stack the copies vertically into a single (n_repeats * len(data), n_words) array.
sequences = np.tile(data, (n_repeats, 1))
sequences

array([[1, 0, 3, 1],
       [0, 0, 3, 2],
       [2, 0, 3, 0],
       [1, 0, 1, 3],
       [2, 1, 1, 1],
       [1, 3, 0, 1],
       [1, 3, 0, 1],
       [1, 0, 1, 3],
       [2, 0, 3, 0],
       [0, 5, 0, 0],
       [1, 0, 3, 1],
       [0, 0, 3, 2],
       [2, 0, 3, 0],
       [1, 0, 1, 3],
       [2, 1, 1, 1],
       [1, 3, 0, 1],
       [1, 3, 0, 1],
       [1, 0, 1, 3],
       [2, 0, 3, 0],
       [0, 5, 0, 0],
       [1, 0, 3, 1],
       [0, 0, 3, 2],
       [2, 0, 3, 0],
       [1, 0, 1, 3],
       [2, 1, 1, 1],
       [1, 3, 0, 1],
       [1, 3, 0, 1],
       [1, 0, 1, 3],
       [2, 0, 3, 0],
       [0, 5, 0, 0],
       [1, 0, 3, 1],
       [0, 0, 3, 2],
       [2, 0, 3, 0],
       [1, 0, 1, 3],
       [2, 1, 1, 1],
       [1, 3, 0, 1],
       [1, 3, 0, 1],
       [1, 0, 1, 3],
       [2, 0, 3, 0],
       [0, 5, 0, 0],
       [1, 0, 3, 1],
       [0, 0, 3, 2],
       [2, 0, 3, 0],
       [1, 0, 1, 3],
       [2, 1, 1, 1],
       [1, 3, 0, 1],
       [1, 3, 0, 1],
       [1, 0,

## Create and Initialize the HMM Model

In [21]:
# Two-state multinomial HMM. Each observation is a count vector over the
# vocabulary whose entries sum to n_trials (the number of words per sentence).
# init_params='' is meant to keep fit() from re-initialising the parameters set
# below, so training starts from these hand-specified values — see the hmmlearn docs.
model = hmm.MultinomialHMM(
    n_components=len(states),
    n_trials=len(observations[0]),
    n_iter=50,
    init_params='',
)

# Hand-specified starting parameters for training.
model.n_features = len(vocabulary)
model.startprob_ = start_probs
model.transmat_ = trans_mat
model.emissionprob_ = emission_probs

https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


# Train the Model

In [22]:
# Fit the model to the stacked count rows; `lengths` tells hmmlearn where each
# individual sequence inside `sequences` begins and ends.
model.fit(sequences, lengths)

# Trained Model Parameters

In [23]:
# Emission probabilities after training: rows are hidden states, columns follow the order of `vocabulary`.
model.emissionprob_

array([[2.57129200e-01, 2.86190571e-02, 4.28541642e-01, 2.85710101e-01],
       [1.33352852e-01, 7.33292496e-01, 2.67548571e-05, 1.33327897e-01]])

In [24]:
# Transition probabilities between hidden states after training.
model.transmat_

array([[0.71429762, 0.28570238],
       [0.50007593, 0.49992407]])

# Decode a Sequence

In [26]:
# Pass `lengths` so decoding uses the same sequence boundaries as training;
# without it the stacked copies are decoded as one long chain, with a state
# transition applied across every boundary between copies.
logprob, received = model.decode(sequences, lengths)
print('\n Topics discussed: ')
print([id2topic[x] for x in received])


 Topics discussed: 
['cat', 'cat', 'cat', 'cat', 'cat', 'dog', 'dog', 'cat', 'cat', 'dog', 'cat', 'cat', 'cat', 'cat', 'cat', 'dog', 'dog', 'cat', 'cat', 'dog', 'cat', 'cat', 'cat', 'cat', 'cat', 'dog', 'dog', 'cat', 'cat', 'dog', 'cat', 'cat', 'cat', 'cat', 'cat', 'dog', 'dog', 'cat', 'cat', 'dog', 'cat', 'cat', 'cat', 'cat', 'cat', 'dog', 'dog', 'cat', 'cat', 'dog']
