In [1]:
# Policy Evaluation
# Given a policy, show the value function table for the gridworld example
# Credit to Nguyen Dang Quang, PhD Student, Kyung Hee University, Suwon, Korea.

## Random policy
import numpy as np

def print_table(V, row=4, col=4, step=0):
  """Print a flat sequence as a grid of `row` lines with `col` entries each.

  Args:
    V: flat, sliceable sequence (list or 1-D array) holding at least
      row*col entries in row-major order.
    row: number of grid rows to print.
    col: number of entries per grid row.
    step: iteration number shown in the header line.
  """
  # One pre-formatted argument, so the header prints identically under
  # Python 2 and Python 3 (including the double space before the number).
  print('Table @ step =  ' + str(step))
  for i in range(row):
    # Row-major layout: consecutive rows are `col` entries apart (the stride
    # is the row length, not the number of rows).
    start = i * col
    print(V[start:start + col])

# Initial value estimate: zero for each of the 16 states of the 4x4 grid.
V = np.zeros(4 * 4)
print_table(V, row=4, col=4, step=0)


Table @ step =  0
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]


In [2]:
# Reward Table, Transition Probability Table, and Policy Table for the 4x4 grid.
# States are numbered row-major: state = 4*row + col.
#   P_trans[s, s'] : probability of moving from s to s' (uniform over the
#                    in-grid neighbours of s after normalisation below)
#   R[s, s']       : reward for the move s -> s' (-1 for every grid move)
#   pi[s, s', a]   : 1 when action a is the move that takes s to s'
P_trans = np.zeros([16,16])
R = np.zeros([16,16])
pi = np.zeros([16,16,4])

# (row offset, col offset) of the successor state, indexed by action id:
# 0 = left, 1 = right, 2 = up, 3 = down.
moves = [(0, -1), (0, 1), (-1, 0), (1, 0)]

for i in range(16):
  row_i, col_i = divmod(i, 4)
  for a, (d_row, d_col) in enumerate(moves):
    row_j = row_i + d_row
    col_j = col_i + d_col
    # Moves that would leave the grid get no entry in any table.
    if 0 <= row_j < 4 and 0 <= col_j < 4:
      j = 4*row_j + col_j
      P_trans[i,j] = 1
      R[i,j] = -1
      pi[i,j,a] = 1

# Turn the 0/1 adjacency rows into probability distributions.
P_trans = P_trans/P_trans.sum(axis=1, keepdims=True)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Transition Probability Table')
print(np.round(P_trans,1))

print('Reward Table')
print(R)

Transition Probability Table
[[0.  0.5 0.  0.  0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.3 0.  0.3 0.  0.  0.3 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.3 0.  0.3 0.  0.  0.3 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.  0.  0.  0.  0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.3 0.  0.  0.  0.  0.3 0.  0.  0.3 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.2 0.  0.  0.2 0.  0.2 0.  0.  0.2 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.2 0.  0.  0.2 0.  0.2 0.  0.  0.2 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.3 0.  0.  0.3 0.  0.  0.  0.  0.3 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.3 0.  0.  0.  0.  0.3 0.  0.  0.3 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.2 0.  0.  0.2 0.  0.2 0.  0.  0.2 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.2 0.  0.  0.2 0.  0.2 0.  0.  0.2 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.3 0.  0.  0.3 0.  0.  0.  0.  0.3]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.  0.  0.  0.  0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.3 0.  0.  0.3 0.  0.3 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0. 

In [4]:
def print_policy(V, iter):
  """Print the greedy policy implied by value table V on the 4x4 grid.

  For every non-terminal state, the moves leading to the highest-valued
  in-grid neighbour are listed ("U", "L", "R", "D"); exactly tied moves are
  concatenated in that order. States 0 and 15 are shown as '-'.

  Args:
    V: indexable sequence of 16 state values in row-major order.
    iter: iteration number forwarded to print_table for the header line.
  """
  # Neighbours in increasing state-index order (up, left, right, down), so
  # tied moves are always reported as U, L, R, D.
  moves = (("U", -1, 0), ("L", 0, -1), ("R", 0, 1), ("D", 1, 0))
  policy_table = []
  for i in range(16):
    if (i == 0) or (i == 15):
      policy_table.append('-')
      continue
    row_i, col_i = divmod(i, 4)
    # -inf rather than a finite sentinel: a fixed floor such as -100 would
    # leave the policy empty once neighbour values fall below it.
    V_max = float('-inf')
    policy = ""
    for letter, d_row, d_col in moves:
      row_j = row_i + d_row
      col_j = col_i + d_col
      if not (0 <= row_j < 4 and 0 <= col_j < 4):
        continue
      v_j = V[4*row_j + col_j]
      if V_max == v_j:
        # Exact tie with the best neighbour so far: keep both moves.
        policy += letter
      elif V_max < v_j:
        policy = letter
        V_max = v_j
    policy_table.append(policy)
  print_table(policy_table, 4, 4, iter)

    
def policyEvaluation(V_old, R, P_trans, pi, gamma, theta, iter_num):
  """Iterative policy evaluation on the 16-state gridworld.

  Each sweep replaces the value of every non-terminal state s by
    sum over a, s' of pi[s, s', a] * P_trans[s, s'] * (R[s, s'] + gamma * V[s'])
  computed from the previous sweep's values. States 0 and 15 are terminal
  and held at 0. Tables are printed for the first nine sweeps, every 50th
  sweep, and the sweep on which the stopping criterion is met.

  Args:
    V_old: initial values, indexable by state 0..15; not modified.
    R: reward table indexed [s, s'].
    P_trans: transition probability table indexed [s, s'].
    pi: policy table indexed [s, s', a] with four actions.
    gamma: discount factor.
    theta: stop once the largest per-state change in a sweep is below this.
    iter_num: maximum number of sweeps.

  Returns:
    The value table after the last sweep (a copy of V_old when no sweep ran).
  """
  terminal_states = (0, 15)
  print_table(V_old, 4, 4, 0)
  V = np.copy(V_old)
  sweep = 0
  while sweep < iter_num:
    V_new = np.zeros(16)
    for s in range(16):
      if s in terminal_states:
        # Terminal value stays 0. Skipping the backup also keeps it out of
        # delta: its raw backup is <= -1 under a -1 step reward, which would
        # hold delta >= 1 forever and make the theta test unreachable.
        continue
      for a in range(4):
        for s_prime in range(16):
          V_new[s] += pi[s, s_prime, a] * P_trans[s,s_prime]*(R[s,s_prime] + gamma * V[s_prime])
    # Largest change over all states, measured after terminals are pinned.
    delta = np.max(np.abs(V - V_new))
    V = V_new
    sweep += 1

    converged = delta < theta
    # Print the updated V table; the converged sweep is always shown so the
    # final result is visible even when it falls between print intervals.
    if converged or (sweep < 10) or (sweep % 50) == 0:
      print_table(np.round(V,1), 4, 4, sweep)
      print_policy(V, sweep)
      print("\n")

    if converged:
      break
  return V
      
# Evaluate the policy encoded in `pi`: undiscounted, stop threshold 0.1, at most 100 sweeps.
policyEvaluation(V, R, P_trans, pi, gamma=1, theta=0.1, iter_num=100)

Table @ step =  0
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
Table @ step =  1
[ 0. -1. -1. -1.]
[-1. -1. -1. -1.]
[-1. -1. -1. -1.]
[-1. -1. -1.  0.]
Table @ step =  1
['-', 'L', 'LRD', 'LD']
['U', 'ULRD', 'ULRD', 'ULD']
['URD', 'ULRD', 'ULRD', 'D']
['UR', 'ULR', 'R', '-']


Table @ step =  2
[ 0.  -1.7 -2.  -2. ]
[-1.7 -2.  -2.  -2. ]
[-2.  -2.  -2.  -1.7]
[-2.  -2.  -1.7  0. ]
Table @ step =  2
['-', 'L', 'L', 'LD']
['U', 'UL', 'ULRD', 'D']
['U', 'ULRD', 'RD', 'D']
['UR', 'R', 'R', '-']


Table @ step =  3
[ 0.  -2.3 -2.9 -3. ]
[-2.3 -2.8 -3.  -2.9]
[-2.9 -3.  -2.8 -2.3]
[-3.  -2.9 -2.3  0. ]
Table @ step =  3
['-', 'L', 'L', 'LD']
['U', 'UL', 'LD', 'D']
['U', 'UR', 'D', 'D']
['UR', 'R', 'R', '-']


Table @ step =  4
[ 0.  -2.9 -3.8 -3.9]
[-2.9 -3.7 -3.9 -3.8]
[-3.8 -3.9 -3.7 -2.9]
[-3.9 -3.8 -2.9  0. ]
Table @ step =  4
['-', 'L', 'L', 'LD']
['U', 'UL', 'L', 'D']
['U', 'U', 'RD', 'D']
['UR', 'R', 'R', '-']


Table @ step =  5
[ 0.  -3.5 -4.6 -4.8]
[-3.5 -4.4 -4.7 -4.6]