# Dynamic Programming

In [4]:
import numpy as np

## 1. get_state

In [5]:
def get_state(s,a,grid_size):
    """Return the grid cell reached from state ``s`` by taking action ``a``.

    Parameters
    ----------
    s : sequence of two ints
        Current ``(row, col)`` position. It is only read, never modified,
        so both lists and tuples are accepted.
    a : int
        Index into the action table below:
        0 -> row - 1, 1 -> row + 1, 2 -> col - 1, 3 -> col + 1.
    grid_size : int
        Side length of the square grid; valid indices are ``0 .. grid_size-1``.

    Returns
    -------
    tuple of (int, int)
        The next ``(row, col)``. A move that would leave the grid is clamped
        to the nearest edge cell, so the agent stays in place at a wall.
    """
    # 4 actions that agent can take, stored as (row delta, col delta)
    A = [(-1,0),(1,0),(0,-1),(0,1)]
    d_row, d_col = A[a]

    # can not get outside of the grid: clamp each coordinate to [0, grid_size-1].
    # Work on fresh locals so the caller's `s` is left untouched.
    row = min(max(s[0] + d_row, 0), grid_size - 1)
    col = min(max(s[1] + d_col, 0), grid_size - 1)

    return row, col

In [27]:
# Demo: apply action 3 (column + 1) to every cell of the top-left 3x3 corner
# of a 4x4 grid and print the resulting (row, col) pairs.
action = list(range(4))

for row in range(3):
    for col in range(3):
        print(*get_state(s=[row, col], a=3, grid_size=4))

0 1
0 2
0 3
1 1
1 2
1 3
2 1
2 2
2 3


## 2. Policy Evaluation
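- Computes the state-value function. Note: the implementation below backs up the **maximum** over actions (a value-iteration-style update) instead of weighting each action by the policy's probabilities.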

In [41]:
def policy_eval(grid_size,action,policy,iter_num,reward=-1,dis=0.9,print_every=1):
    """Run synchronous value sweeps over a square grid and return the value table.

    Each sweep builds a new table from the previous one. For every non-terminal
    cell the new value is ``max_a (reward + dis * V_prev[next_state(a)])``.
    The two terminal cells, ``(0, 0)`` and ``(grid_size-1, grid_size-1)``,
    always keep the value 0.

    Parameters
    ----------
    grid_size : int
        Side length of the square grid.
    action : iterable of int
        Action indices forwarded to ``get_state``.
    policy : any
        Accepted for interface compatibility but not read: the backup takes the
        max over actions instead of weighting by action probabilities.
    iter_num : int
        Number of sweeps to perform. With 0 the all-zero initial table is returned.
    reward : float, optional
        Reward received on every transition (default -1).
    dis : float, optional
        Discount factor applied to the successor state's value (default 0.9).
    print_every : int, optional
        Print the table after every ``print_every``-th sweep (default 1, i.e.
        every sweep). Pass 0 or None to disable printing.

    Returns
    -------
    numpy.ndarray
        ``(grid_size, grid_size)`` array holding the values after the last sweep.
    """
    post_value_table = np.zeros([grid_size,grid_size])
    terminal_states = {(0, 0), (grid_size - 1, grid_size - 1)}

    # Sweeps are numbered from 1 so the printed counter is human-friendly.
    for sweep in range(1, iter_num + 1):
        val_table = np.zeros([grid_size,grid_size])

        for m in range(grid_size):
            for n in range(grid_size):
                if (m, n) in terminal_states:
                    # Terminal cells stay at the 0 they were initialised with.
                    continue

                value_t_list = []
                for a in action:
                    m_, n_ = get_state([m,n],a,grid_size) # get s(t+1)
                    v = reward+dis*post_value_table[m_][n_] # no probability!
                    value_t_list.append(v)
                val_table[m][n] = max(value_t_list) # just choose the MAX

        if print_every and sweep % print_every == 0:
            print('Iteration : {} \n {} \n'.format(sweep,val_table))
        post_value_table = val_table

    # Returning the "previous" table also covers iter_num == 0, where no
    # sweep ran and the initial zeros are the only values available.
    return post_value_table

In [42]:
# Problem setup: a 4x4 grid with four action indices.
grid_size = 4
action = list(range(4))
# One probability slot per (row, col, action). np.empty leaves the contents
# uninitialised, so the array must be filled before it is read.
policy = np.empty((grid_size, grid_size, len(action)))

In [43]:
# Uniform random policy: every action is equally likely in a non-terminal cell.
# The two terminal cells (top-left and bottom-right corners) get probability 0
# for every action. The last valid index is grid_size - 1, not grid_size.
terminal_states = {(0, 0), (grid_size - 1, grid_size - 1)}
uniform_prob = 1.0 / len(action)

for i in range(grid_size):
    for j in range(grid_size):
        # Assigning a scalar to policy[i, j] fills the whole action axis at once.
        policy[i, j] = 0.0 if (i, j) in terminal_states else uniform_prob

In [45]:
# Run six sweeps on the grid configured above and keep the resulting value table.
final_value = policy_eval(grid_size, action, policy, iter_num=6)

Iteration : 1 
 [[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]] 

Iteration : 2 
 [[ 0.  -1.  -1.9 -1.9]
 [-1.  -1.9 -1.9 -1.9]
 [-1.9 -1.9 -1.9 -1. ]
 [-1.9 -1.9 -1.   0. ]] 

Iteration : 3 
 [[ 0.   -1.   -1.9  -2.71]
 [-1.   -1.9  -2.71 -1.9 ]
 [-1.9  -2.71 -1.9  -1.  ]
 [-2.71 -1.9  -1.    0.  ]] 

Iteration : 4 
 [[ 0.   -1.   -1.9  -2.71]
 [-1.   -1.9  -2.71 -1.9 ]
 [-1.9  -2.71 -1.9  -1.  ]
 [-2.71 -1.9  -1.    0.  ]] 

Iteration : 5 
 [[ 0.   -1.   -1.9  -2.71]
 [-1.   -1.9  -2.71 -1.9 ]
 [-1.9  -2.71 -1.9  -1.  ]
 [-2.71 -1.9  -1.    0.  ]] 

Iteration : 6 
 [[ 0.   -1.   -1.9  -2.71]
 [-1.   -1.9  -2.71 -1.9 ]
 [-1.9  -2.71 -1.9  -1.  ]
 [-2.71 -1.9  -1.    0.  ]] 



In [46]:
# Show the value table returned by policy_eval as the cell's output.
final_value

array([[ 0.  , -1.  , -1.9 , -2.71],
       [-1.  , -1.9 , -2.71, -1.9 ],
       [-1.9 , -2.71, -1.9 , -1.  ],
       [-2.71, -1.9 , -1.  ,  0.  ]])

참고 : https://sumniya.tistory.com/10?category=781573