<a href="https://colab.research.google.com/github/songqsh/foo1/blob/master/src/sb18-exm-4-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gridworld

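This notebook implements an n-dimensional generalization of the 2-d gridworld in Example 4.1 of [SB18] (Sutton & Barto, *Reinforcement Learning: An Introduction*, 2nd ed., 2018). The state values are computed for the equiprobable random policy.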

## Code

In [0]:
"""
Created on Wed Dec  4 17:36:40 2019

@author: songqsh
"""

import numpy as np

class gridworld:
    """n-dimensional gridworld with reflecting walls and two absorbing corners.

    States are integer points of the lattice {0, ..., WORLD_SIZE-1}^DIM.
    Each of the 2*DIM actions moves one unit along a single axis.  A move
    that would leave the grid is reflected (the agent stays where it is).
    The all-zeros corner and the all-(WORLD_SIZE-1) corner are terminal.
    Every move earns reward -1 and actions are chosen uniformly at random.

    Attributes:
        WORLD_SIZE: number of cells along each side.
        DIM: number of dimensions.
        ACTIONS: float array of shape (2*DIM, DIM); each row is one action.
        ACTION_PROB: probability of each action under the random policy.
    """

    def __init__(
            self,
            WORLD_SIZE=4,  # length of each side
            DIM=2,  # dimension number
            verbose=True
            ):
        """Create the environment and optionally print a short summary.

        Args:
            WORLD_SIZE: number of cells along each side (must be >= 1).
            DIM: number of dimensions (must be >= 1).
            verbose: when true, print a description of the world.

        Raises:
            ValueError: if ``DIM`` or ``WORLD_SIZE`` is smaller than 1.
        """
        # DIM == 0 would otherwise fail below with a bare ZeroDivisionError.
        if DIM < 1:
            raise ValueError('DIM must be at least 1, got ' + repr(DIM))
        if WORLD_SIZE < 1:
            raise ValueError(
                'WORLD_SIZE must be at least 1, got ' + repr(WORLD_SIZE))

        self.WORLD_SIZE = WORLD_SIZE
        self.DIM = DIM
        # Rows 0..DIM-1 are +1 moves along each axis, rows DIM..2*DIM-1 are -1.
        self.ACTIONS = np.append(np.eye(DIM), -np.eye(DIM), axis=0)
        self.ACTION_PROB = 1./(2*DIM)  # equiprobable random policy
        if verbose:
            print(str(DIM) +
                  '-d Gridworld, \n length of each side: '
                  + str(WORLD_SIZE)
                  + '\n reflecting boundary \n and absorbing corner'
                  )

    def is_reflecting(self, state):
        """Return True if ``state`` lies outside the grid.

        Args:
            state: array-like of DIM coordinates.

        Returns:
            bool: True when any coordinate is below 0 or above WORLD_SIZE-1.
        """
        state = np.asarray(state)
        return bool(np.any(state > self.WORLD_SIZE - 1) or np.any(state < 0))

    def is_absorbing(self, state):
        """Return True if ``state`` is one of the two terminal corners.

        Args:
            state: array-like of DIM coordinates.

        Returns:
            bool: True when every coordinate is 0, or every coordinate is
            WORLD_SIZE-1.
        """
        state = np.asarray(state)
        return bool(np.all(state == 0)
                    or np.all(state == self.WORLD_SIZE - 1))

    def step(self, state, action):
        """Apply ``action`` to ``state`` and return the successor and reward.

        Args:
            state: n-d np.array, the current state.
            action: n-d np.array, one row of ``self.ACTIONS``.

        Returns:
            (next_state, reward) where
            next_state is ``state`` itself if ``state`` is terminal or if the
            move would leave the grid (reflecting boundary), and
            ``state + action`` otherwise;
            reward is -1 for every move.
        """
        next_state = state + action
        if self.is_absorbing(state) or self.is_reflecting(next_state):
            next_state = state
        reward = -1
        return next_state, reward

    def value_iteration(self, tol=1e-4, max_iter=None):
        """Evaluate the equiprobable random policy by synchronous sweeps.

        Despite its name, this performs iterative policy evaluation: each
        sweep replaces every non-terminal value by the expectation, under
        the random policy, of ``reward + value(next_state)``; there is no
        maximisation over actions.  Terminal states are held at 0.

        Args:
            tol: stop once the summed absolute change of one sweep is
                strictly below this threshold.
            max_iter: optional cap on the number of sweeps; ``None`` means
                iterate until ``tol`` is met.  At least one sweep is always
                performed.

        Returns:
            (values, iter_n): the state-value array of shape
            ``(WORLD_SIZE,) * DIM`` and the number of sweeps performed.
        """
        shape = (int(self.WORLD_SIZE),) * int(self.DIM)
        v0 = np.zeros(shape)  # values from the previous sweep (read only)
        v1 = np.zeros(shape)  # values being written in the current sweep

        iter_n = 0
        while True:
            iter_n += 1
            for index in np.ndindex(*shape):
                state0 = np.array(index)
                if self.is_absorbing(state0):
                    v1[index] = 0.
                    continue
                rhs = 0
                for a in self.ACTIONS:
                    state1, reward = self.step(state0, a)
                    # ACTIONS is a float array, so cast back to ints to index.
                    state1_tuple = tuple(state1.astype(int))
                    rhs += self.ACTION_PROB*(reward + v0[state1_tuple])
                v1[index] = rhs

            delta = np.sum(np.abs(v1 - v0))
            # Swap buffers instead of copying: every entry of the write
            # buffer is overwritten on the next sweep, so stale data is safe.
            v0, v1 = v1, v0
            if delta < tol:
                break
            if max_iter is not None and iter_n >= max_iter:
                break

        return v0, iter_n

## Example for 2-d

In [0]:
import timeit

In [3]:
# Build the 2-d world with 4 cells per side. verbose defaults to True, so the
# constructor prints a short description of the environment.
g2 = gridworld(DIM=2, WORLD_SIZE=4)

2-d Gridworld, 
 length of each side: 4
 reflecting boundary 
 and absorbing corner


In [4]:
# Time a single solver run on the 2-d world.
start = timeit.default_timer()
# value_iteration returns a pair: (state-value array, number of iterations).
out = g2.value_iteration()
stop = timeit.default_timer()
print('Run Time: ', stop - start) 
print('>>>number of iteration: \n'+ str(out[1]))  # out[1]: iteration count
print('>>>the state value function: \n'+ str(out[0]))  # out[0]: value array

Run Time:  0.37140565699999684
>>>number of iteration: 
218
>>>the state value function: 
[[  0.         -13.99990931 -19.99986561 -21.99984961]
 [-13.99990931 -17.99988161 -19.99986651 -19.99986561]
 [-19.99986561 -19.99986651 -17.99988161 -13.99990931]
 [-21.99984961 -19.99986561 -13.99990931   0.        ]]


## Example for 4-d

In [5]:
# NOTE: despite the name g3, this is the 4-d world (4 cells per side).
g3 = gridworld(DIM=4, WORLD_SIZE=4)

4-d Gridworld, 
 length of each side: 4
 reflecting boundary 
 and absorbing corner


In [6]:
# Time a single solver run on the 4-d world (g3 holds the DIM=4 instance).
start = timeit.default_timer()
# value_iteration returns a pair: (state-value array, number of iterations).
out = g3.value_iteration()
stop = timeit.default_timer()
print('Run Time: ', stop - start) 
print('>>>number of iteration: \n'+ str(out[1]))  # out[1]: iteration count
# Printing the full value array is disabled here.
# NOTE(review): the saved cell output still shows the array, so it appears to
# come from a run made before this line was commented out -- re-run to refresh.
#print('>>>the state value function: \n'+ str(out[0]))

Run Time:  279.966130668
>>>number of iteration: 
5132
>>>the state value function: 
[[[[   0.         -253.99990046 -327.85701407 -349.85700528]
   [-253.99990046 -311.38083017 -342.47605586 -354.52367007]
   [-327.85701407 -342.47605586 -353.66652755 -358.76176359]
   [-349.85700528 -354.52367007 -358.76176359 -360.76176277]]

  [[-253.99990046 -311.38083017 -342.47605586 -354.52367007]
   [-311.38083017 -333.66653559 -349.7141482  -356.7617644 ]
   [-342.47605586 -349.7141482  -355.76176481 -358.42843039]
   [-354.52367007 -356.7617644  -358.42843039 -358.76176359]]

  [[-327.85701407 -342.47605586 -353.66652755 -358.76176359]
   [-342.47605586 -349.7141482  -355.76176481 -358.42843039]
   [-353.66652755 -355.76176481 -356.95224052 -356.7617644 ]
   [-358.76176359 -358.42843039 -356.7617644  -354.52367007]]

  [[-349.85700528 -354.52367007 -358.76176359 -360.76176277]
   [-354.52367007 -356.7617644  -358.42843039 -358.76176359]
   [-358.76176359 -358.42843039 -356.7617644  -354.5236

## Remark: Can you run the 5-d gridworld?