In [1]:
import numpy as np
print('numpy version:', np.__version__)


numpy version: 1.23.5


In [2]:
def update_v(V, T, R, gamma):
    """
    Returns updated value array (V)

    """
    reward = R.reshape(-1, 1) + gamma * V.reshape(1, -1) # computing reward part (R(s) + gamma * V_k(s')) !!! it's IMPORTANT that s' !!!
    reward_lh = T * reward # computing reward weighted by likelihood (probabilities) for each current_state, future_state and action => shape (3, 5, 5)
    Q = reward_lh.sum(axis=2) # summing up across last dimension (future_states) => shape (3, 5) actions x current_states
    V_new = Q.max(axis=0) # determining best variant for each state across possible actions => shape (5,)
    return V_new

In [10]:
def update_v_mod(V, T, R, gamma):
    Q = [[sum([T[a][s][t] * (R[a][s][t] + gamma * V[t]) for t in range(len(V))]) for s in range(len(V))] for a in range(T.shape[0])]
    V = np.max(Q, axis=0)
    return V

In [14]:
S = ['A', 'B', 'C', 'D']
A = ['up', 'down']
T = np.array(
    [
        [[0., 1., 0., 0.], # action == 'up'
         [0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 0.]],
        [[0., 0., 0., 0.], # action == 'down'
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.]]
    ]
)
R = np.array(
    [
        [[0., 1., 0., 0.], # action == 'up'
         [0., 0., 1., 0.],
         [0., 0., 0., 10.],
         [0., 0., 0., 0.]],
        [[0., 0., 0., 0.], # action == 'down'
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 10., 0.]]
    ]
)
gamma = 0.75
V = np.zeros((4,))

In [13]:
V = update_v_mod(V=V, T=T, R=R, gamma=gamma)
V

array([ 1.75,  8.5 , 17.5 , 17.5 ])

In [15]:
for _ in range(100):
    V = update_v_mod(V=V, T=T, R=R, gamma=gamma)
V

array([24.25, 31.  , 40.  , 40.  ])

In [16]:
for _ in range(100):
    V = update_v_mod(V=V, T=T, R=R, gamma=gamma)
V

array([24.25, 31.  , 40.  , 40.  ])

In [18]:
S = [x for x in range(6)]
A = ['C', 'M']
# for states 1, 2, 3: T(s, M, s-1) = 1, T(s, C, s+2) = 0.7, T(s, C, s) = 0.3
# for state 0: T(s, M, s) = 1, T(s, C, s) = 1 => state 0 is terminal state
# for states 4, 5: T(s, M, s-1) = 1, T(s, C, s) = 1

# WARNING: T tensor here is defined as s, a, s' instead of a, s, s' as were earlier
T = np.array(
    [
        [[1., 0., 0., 0., 0., 0.], # state 0, action M
         [1., 0., 0., 0., 0., 0.]], # action C
        [[1., 0., 0., 0., 0., 0.], # state 1, action M
         [0., .3, 0., .7, 0., 0.]], # action C
        [[0., 1., 0., 0., 0., 0.], # state 2, action M
         [0., 0., .3, 0., .7, 0.]], # action C
        [[0., 0., 1., 0., 0., 0.], # state 3, action M
         [0., 0., 0., .3, 0., .7]], # action C
        [[0., 0., 0., 1., 0., 0.], # state 4, action M
         [0., 0., 0., 0., 1., 0.]], # action C
        [[0., 0., 0., 0., 1., 0.], # state 5, action M
         [0., 0., 0., 0., 0., 1.]], # action C
    ]
)

# R(s, a, s') = |s' - s| ^ (1/3) for each s != s'
# R(s, a, s) = (s + 4)^(-1/2) for each s != 0
# R(0, M, 0) = R(0, C, 0) = 0

# WARNING: R tensor here is defined as s, a, s' instead of a, s, s' as were earlier
R = np.array([[[np.abs(s - s_prime)**(1/3) if s_prime != s else (s + 4)**(-1/2) for s_prime in range(len(S))] for a in range(len(A))] for s in range(len(S))])
R[0, 0, 0] = 0
R[0, 1, 0] = 0
gamma = 0.6
Q = np.zeros((len(S), len(A)))

In [20]:
R

array([[[0.        , 1.        , 1.25992105, 1.44224957, 1.58740105,
         1.70997595],
        [0.        , 1.        , 1.25992105, 1.44224957, 1.58740105,
         1.70997595]],

       [[1.        , 0.4472136 , 1.        , 1.25992105, 1.44224957,
         1.58740105],
        [1.        , 0.4472136 , 1.        , 1.25992105, 1.44224957,
         1.58740105]],

       [[1.25992105, 1.        , 0.40824829, 1.        , 1.25992105,
         1.44224957],
        [1.25992105, 1.        , 0.40824829, 1.        , 1.25992105,
         1.44224957]],

       [[1.44224957, 1.25992105, 1.        , 0.37796447, 1.        ,
         1.25992105],
        [1.44224957, 1.25992105, 1.        , 0.37796447, 1.        ,
         1.25992105]],

       [[1.58740105, 1.44224957, 1.25992105, 1.        , 0.35355339,
         1.        ],
        [1.58740105, 1.44224957, 1.25992105, 1.        , 0.35355339,
         1.        ]],

       [[1.70997595, 1.58740105, 1.44224957, 1.25992105, 1.        ,
         0.

In [21]:
def update_q(Q, T, R, gamma):
    Q = [[sum([T[s][a][t] * (R[s][a][t] + gamma * np.max(Q, axis=1)[t]) for t in range(Q.shape[0])]) for a in range(Q.shape[1])] for s in range(Q.shape[0])]
    return Q

In [22]:
Q

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [23]:
Q = update_q(Q=Q, T=T, R=R, gamma=gamma)
Q

[[0.0, 0.0],
 [1.0, 1.0161088135763985],
 [1.0, 1.00441922206557],
 [1.0, 0.9953340768291793],
 [1.0, 0.3535533905932738],
 [1.0, 0.3333333333333333]]

![Task description](images/homework5_q_iteration.png)