In [1]:
import numpy as np
import random

#define the transition (default size = 5x5)
def next_state_reward(state, action, height = 5, width = 5):
  action_move = [(-1, 0), (1, 0), (0, -1), (0, 1)] #4 possible actions -  0: Up / 1: Down / 2: Left / 3: Right

  #state transition: states = (height, width)
  state[0] += action_move[action][0]
  state[1] += action_move[action][1]

  #treating some blocked moves
  if state[0] < 0:
    state[0] = 0
  elif state[0] >  height-1:
    state[0] = height-1
  if state[1] < 0:
    state[1] = 0
  elif state[1] > width-1:
    state[1] = width-1

  #return the new state according to the selected action and reward = -1
  return [state[0], state[1]], -1


In [2]:
def mysterious_state_reward(state, action, height = 5, width = 5):
  action_move = [(-1, 0), (1, 0), (0, -1), (0, 1)]

  mys = []

  mys_1 = [1,3]
  mys_2 = [0,3]

  if state in ([0,2] or [2,1]):
    mys = mys_1
  else:
    mys = mys_2

  a = random.choice(mys)
  state[0] += action_move[a][0]
  state[1] += action_move[a][1]

  if state[0] < 0:
    state[0] = 0
  elif state[0] >  height-1:
    state[0] = height-1
  if state[1] < 0:
    state[1] = 0
  elif state[1] > width-1:
    state[1] = width-1

  return [state[0], state[1]], 2

In [6]:
def policy_evaluation(action, policy, height = 5, width = 5, theta=1e-5, gamma=0.9):
  #initialize state value function
  value_table = np.zeros(shape=(height, width))

  #two-array version (try in-place version by yourself)
  iter = 0
  while iter<2000:
    #update the new state value function
    next_value_table = np.zeros(shape=(height, width))
    delta = 0
    for i in range(height):
      for j in range(width):

        #always assign 0 to the terminal states
        if ((i == 0) and (j == 0)) or ((i == height-1) and (j == height-1)):
          value_iter = 0

        else:
          #update the non-terminal states using the Bellman equation
          value_iter = 0

          for act in action:
            if [i,j] in [[0,2],[2,1],[3,3]]:
                transition_prob = 0.5
                next_s, r = mysterious_state_reward([i,j], act)

            else:
                transition_prob = 1 #deterministic
                next_s, r = next_state_reward([i,j], act)
            value_iter += policy[i][j][act]*transition_prob*(r + gamma*value_table[next_s[0]][next_s[1]])

          #deterministic state transition (do not consider the transition probability in this case)
          # for act in action:
          #   next_s, r = next_state_reward([i,j], act)
          #   value_iter += policy[i][j][act]*transition_prob*(r + gamma*value_table[next_s[0]][next_s[1]])

        #update the state value function
        next_value_table[i][j] = round(value_iter, 3)

        #compare the error and the error bound
        delta = max(delta, abs(next_value_table[i][j] - value_table[i][j]))

    value_table = next_value_table
    iter += 1

    #termination condition
    if delta < theta:
      # print('Final results ({} iterations): \n {}'.format(iter, next_value_table))
      break

    # print the results
    iter_visual = [1, 2, 10, 50] + [n*100 for n in range(20)]
    if iter in iter_visual:
      print('iteration {}: \n {}'.format(iter, next_value_table))

  return next_value_table

In [7]:
#grid environment
grid_height, grid_width = 5, 5 #5x5 gridworld
action = [0, 1, 2, 3] #0: up / 1: down / 2: left / 3: right

#policy initialization
policy = np.zeros(shape=(grid_height, grid_width, len(action)))

#random equiprobable policy
for i in range(grid_height):
  for j in range(grid_width):
    for k in range(len(action)):
      if ((i == 0) and (j == 0)) or ((i == grid_height-1) and (j == grid_height-1)):
        policy[i][j][k] = 0
      else:
        policy[i][j][k] = 0.25


state_value = policy_evaluation(action, policy)
# in_place_state_value = policy_evaluation_in_place(action, policy)
print('Vㅠ(s):',state_value)
# print('V(s):',in_place_state_value)

iteration 1: 
 [[ 0. -1.  1. -1. -1.]
 [-1. -1. -1. -1. -1.]
 [-1.  1. -1. -1. -1.]
 [-1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  0.]]
iteration 2: 
 [[ 0.    -1.225  1.225 -1.45  -1.9  ]
 [-1.675 -1.45  -1.45  -1.9   -1.9  ]
 [-1.45   0.55  -1.45  -1.45  -1.9  ]
 [-1.9   -1.45  -1.45   0.55  -1.225]
 [-1.9   -1.9   -1.9   -1.225  0.   ]]
iteration 10: 
 [[ 0.    -2.14   0.564 -4.026 -5.322]
 [-3.118 -3.029 -3.378 -4.619 -5.255]
 [-3.617 -0.441 -3.45  -3.821 -4.502]
 [-4.947 -4.058 -3.693 -0.509 -2.652]
 [-5.622 -5.182 -4.444 -2.637  0.   ]]
iteration 50: 
 [[ 0.    -2.328 -0.661 -4.704 -6.26 ]
 [-3.44  -3.422 -3.821 -5.372 -6.117]
 [-4.213 -0.599 -4.034 -4.364 -5.206]
 [-5.85  -4.834 -4.267 -0.808 -2.985]
 [-6.724 -6.137 -5.167 -2.972  0.   ]]
iteration 100: 
 [[ 0.    -1.984  1.513 -4.367 -6.213]
 [-3.461 -3.267 -3.572 -5.234 -6.143]
 [-4.189 -0.564 -3.947 -4.359 -5.205]
 [-5.847 -4.82  -4.238 -0.658 -2.949]
 [-6.726 -6.135 -5.176 -2.94   0.   ]]
iteration 200: 
 [[ 0.    -2.337  0.326 -