# Hidden Markov Model

In [2]:
%pip install hmmlearn

Collecting hmmlearn
  Downloading hmmlearn-0.2.7-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (129 kB)
[?25l[K     |██▌                             | 10 kB 19.6 MB/s eta 0:00:01[K     |█████                           | 20 kB 11.8 MB/s eta 0:00:01[K     |███████▋                        | 30 kB 9.2 MB/s eta 0:00:01[K     |██████████                      | 40 kB 8.7 MB/s eta 0:00:01[K     |████████████▋                   | 51 kB 4.8 MB/s eta 0:00:01[K     |███████████████▏                | 61 kB 5.7 MB/s eta 0:00:01[K     |█████████████████▊              | 71 kB 5.7 MB/s eta 0:00:01[K     |████████████████████▏           | 81 kB 4.3 MB/s eta 0:00:01[K     |██████████████████████▊         | 92 kB 4.8 MB/s eta 0:00:01[K     |█████████████████████████▎      | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████████▊    | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 122 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████

In [4]:
import numpy as np
from hmmlearn import hmm
import math

In [None]:
# Observation symbols in index order, as used by the scoring cells:
# 0 = Walk, 1 = Shop, 2 = Clean.
"Walk", "Shop", "Clean"

In [10]:
# Two hidden states (0 = rainy, 1 = sunny), each emitting one of three
# observation symbols (0 = walk, 1 = shop, 2 = clean).
#
# hmmlearn >= 0.2.8 exposes this categorical-emission model as CategoricalHMM,
# and MultinomialHMM later became a true multinomial model. Older releases
# such as 0.2.7 only ship MultinomialHMM, so fall back to it when needed.
_DiscreteHMM = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = _DiscreteHMM(n_components=2)

# startprob_[i] = P(first hidden state is i)
model.startprob_ = np.array([0.6, 0.4])
# transmat_[i, j] = P(next state is j | current state is i); rows sum to 1
model.transmat_ = np.array([[0.7, 0.3],
                            [0.4, 0.6]])
# emissionprob_[i, k] = P(observing symbol k | hidden state i); rows sum to 1
model.emissionprob_ = np.array([[0.1, 0.4, 0.5],
                                [0.6, 0.3, 0.1]])

In [11]:
# Likelihood of a single "walk" observation (symbol 0) under the model.
walk_obs = np.array([[0]])
walk_loglik = model.score(walk_obs)
math.exp(walk_loglik)

0.30000000000000004

In [12]:
# Likelihood of a single "shop" observation (symbol 1) under the model.
shop_obs = np.array([[1]])
shop_loglik = model.score(shop_obs)
math.exp(shop_loglik)

0.36000000000000004

In [13]:
# Likelihood of a single "clean" observation (symbol 2) under the model.
clean_obs = np.array([[2]])
clean_loglik = model.score(clean_obs)
math.exp(clean_loglik)

0.3400000000000001

In [14]:
# Likelihood of the sequence shop, clean, walk.
# hmmlearn expects one observation per row, i.e. shape (n_samples, 1); a
# (1, 3) array would be a single sample with three features, so transpose it
# into a column exactly as the decode cells do.
math.exp(model.score(np.array([[1, 2, 0]]).transpose()))

0.03276

In [17]:
# Most likely hidden-state path for the observations shop, clean, walk.
# decode() wants one observation per row, hence the column vector.
shop_clean_walk = np.array([[1, 2, 0]]).T
logprob, seq = model.decode(shop_clean_walk)

In [18]:
# decode() returns a log-probability; exponentiate it to get the probability
# of the decoded state path together with the observations.
math.exp(logprob)

0.015120000000000003

In [20]:
# Decoded hidden state for each observation (0 = rainy, 1 = sunny).
seq # rainy, rainy, sunny

array([0, 0, 1])

In [22]:
# Decode a longer sequence: shop, clean, walk repeated three times,
# laid out as a column (one observation per row).
repeated_obs = np.array([[1, 2, 0] * 3]).T
logprob, seq = model.decode(repeated_obs)
print(math.exp(logprob), seq, sep="\n")

1.5362887680000006e-06
[0 0 1 0 0 1 0 0 1]


# Reinforcement Learning (Q-Learning)

In [29]:
# Discount factor applied to the best value reachable from the next state.
gamma = 0.8

# Reward table: row = current state, column = action (the state moved to).
# Negative entries mark moves that are not available; 100 rewards reaching
# state 5.
_reward_rows = [
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [ 0, -1, -1,  0, -1, 100],
    [-1,  0, -1, -1,  0, 100],
]
R = np.matrix(_reward_rows)

# Q-table with the same shape as R, initialised to zero.
Q = np.matrix(np.zeros(R.shape))

In [33]:
# Train Q by random exploration: repeatedly pick a random state, take a random
# allowed action from it, and apply the Q-learning update
#   Q[s, a] = R[s, a] + gamma * max(Q[a, :])
# where taking action `a` moves the agent to state `a`.
N_UPDATES = 10000

for _ in range(N_UPDATES):
  current_state = np.random.randint(0, int(Q.shape[0]))
  # Flatten the reward row so the lookup works for np.matrix and ndarray alike.
  current_state_row = np.asarray(R[current_state, :]).ravel()
  # Allowed actions are the ones with a non-negative reward; pick one at random.
  avail_actions = np.where(current_state_row >= 0)[0]
  action = int(np.random.choice(avail_actions))
  # Only the best value of the next state matters, so read the max directly
  # instead of choosing (and tie-breaking) an index just to look it up again.
  Q[current_state, action] = R[current_state, action] + gamma * np.max(Q[action, :])

In [34]:
# Show the learned Q-table under its heading.
print("Trained Q matrix:", Q, sep="\n")

Trained Q matrix:
[[  0.   0.   0.   0. 400.   0.]
 [  0.   0.   0. 320.   0. 500.]
 [  0.   0.   0. 320.   0.   0.]
 [  0. 400. 256.   0. 400.   0.]
 [320.   0.   0. 320.   0. 500.]
 [  0. 400.   0.   0. 400. 500.]]


In [36]:
# Rescale the Q-table so that its largest entry becomes 100, then show it.
normalized_q = Q / np.max(Q) * 100
print("Normalized Trained Q matrix:", normalized_q, sep="\n")

Normalized Trained Q matrix:
[[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [ 64.    0.    0.   64.    0.  100. ]
 [  0.   80.    0.    0.   80.  100. ]]


In [37]:
# Follow the learned policy greedily from START_STATE until GOAL_STATE is
# reached, recording every state visited on the way.
START_STATE = 2
GOAL_STATE = 5

current_state = START_STATE
steps = [current_state]

while current_state != GOAL_STATE:
  # A path that never repeats a state has at most one entry per state, so a
  # longer walk means the greedy policy is going in circles (e.g. Q untrained).
  if len(steps) > Q.shape[0]:
    raise RuntimeError("Greedy walk did not reach the goal state; is Q trained?")
  # Flatten the row so the lookup works for np.matrix and ndarray alike.
  q_row = np.asarray(Q[current_state, :]).ravel()
  # Every action tied for the best Q-value is a candidate; break ties at random.
  best_actions = np.where(q_row == q_row.max())[0]
  # Keep the state a plain int: converting an ndim>0 array with int() is
  # deprecated in NumPy, and an int keeps the loop condition a plain bool.
  current_state = int(np.random.choice(best_actions))
  steps.append(current_state)

print("Path:", steps)

Path: [2, 3, 4, 5]
