In [17]:
from __future__ import print_function, division
from builtins import range
import numpy as np

In [18]:
#Environment
class Grid:
    """Deterministic gridworld environment.

    A state is an ``(i, j)`` tuple. Moving ``'U'``/``'D'`` decrements/increments
    ``i`` and moving ``'L'``/``'R'`` decrements/increments ``j``, so ``i`` acts
    as the row index (growing downward) and ``j`` as the column index.

    The environment is configured in two steps: construct it, then call
    :meth:`set` with the reward table and the per-state legal actions.
    """

    # Position delta (di, dj) applied for each recognised action label.
    _DELTAS = {
        'U': (-1, 0),
        'D': (1, 0),
        'R': (0, 1),
        'L': (0, -1),
    }

    def __init__(self, width, height, start):
        """Create a grid with the agent placed at ``start``.

        Parameters
        ----------
        width, height : int
            Grid extents. NOTE: the printing helpers in this notebook iterate
            ``range(grid.width)`` over the first index ``i`` and
            ``range(grid.height)`` over ``j``, i.e. ``width`` is used as the
            row count and ``height`` as the column count.
        start : tuple of (int, int)
            Initial ``(i, j)`` position.
        """
        self.width = width
        self.height = height
        self.i = start[0]
        self.j = start[1]
        # Start with empty tables so the object is usable before set() is called.
        self.rewards = {}
        self.actions = {}

    def set(self, rewards, actions):
        """Install the reward and action tables.

        Parameters
        ----------
        rewards : dict
            Maps ``(i, j)`` to the reward returned when a move ends there.
        actions : dict
            Maps ``(i, j)`` to the collection of action labels allowed there.
            States without an entry are treated as having no legal action.
        """
        self.rewards = rewards
        self.actions = actions

    def set_state(self, state):
        """Teleport the agent to ``state`` (an ``(i, j)`` tuple)."""
        self.i = state[0]
        self.j = state[1]

    def current_state(self):
        """Return the agent's position as an ``(i, j)`` tuple."""
        return (self.i, self.j)

    def move(self, action):
        """Apply ``action`` if it is legal here and return the resulting reward.

        An action that is not listed for the current state leaves the position
        unchanged. This includes every action in a state that has no entry in
        the action table (e.g. a terminal state), which previously raised
        ``KeyError``.

        Returns
        -------
        The reward registered for the position the agent occupies after the
        call, or ``0`` if that position has no reward entry.
        """
        allowed = self.actions.get((self.i, self.j), ())
        if action in allowed:
            # Labels outside U/D/L/R are accepted but do not move the agent.
            di, dj = self._DELTAS.get(action, (0, 0))
            self.i += di
            self.j += dj

        return self.rewards.get((self.i, self.j), 0)

    def all_states(self):
        """Return the set of every state that has actions or a reward."""
        return set(self.actions) | set(self.rewards)
    

In [19]:
def standard_grid():
    """Build the small gridworld used by this notebook.

    Layout (first index selects the row, second the column)::

        .  .  .   1
        .  x  .  -1
        s  .  .   .

    * ``1`` / ``-1`` -- reward attached to that cell
    * ``x``          -- cell that is not allowed (no state there)
    * ``s``          -- start state, ``(2, 0)``

    Returns the configured ``Grid`` instance.
    """
    terminal_rewards = {(0, 3): 1, (1, 3): -1}

    # Legal moves for every cell that has any; the two reward cells and the
    # blocked cell (1, 1) are intentionally absent.
    legal_moves = dict([
        ((0, 0), ('D', 'R')),
        ((0, 1), ('L', 'R')),
        ((0, 2), ('L', 'D', 'R')),
        ((1, 0), ('U', 'D')),
        ((1, 2), ('U', 'D', 'R')),
        ((2, 0), ('U', 'R')),
        ((2, 1), ('L', 'R')),
        ((2, 2), ('L', 'R', 'U')),
        ((2, 3), ('L', 'U')),
    ])

    world = Grid(3, 4, (2, 0))
    world.set(terminal_rewards, legal_moves)
    return world

In [20]:
# Policy-evaluation sweeps stop once the largest per-state value change
# observed in a sweep drops below this threshold.
convergence_threshold = 1e-3

In [21]:
def print_values(V, grid):
    """Print the value table ``V`` as a grid, one text row per first index.

    ``V`` maps ``(i, j)`` to a number; cells missing from ``V`` are shown as
    0. Non-negative numbers get a leading space so they line up with negative
    ones. ``grid.width`` bounds ``i`` and ``grid.height`` bounds ``j``.
    """
    divider = "---------------------------"
    for row in range(grid.width):
        print(divider)
        cells = []
        for col in range(grid.height):
            value = V.get((row, col), 0)
            template = " %.2f|" if value >= 0 else "%.2f|"
            cells.append(template % value)
        print("".join(cells))

In [22]:
def print_policy(P, grid):
    """Print the policy ``P`` as a grid of action labels.

    ``P`` maps ``(i, j)`` to the action chosen there; cells missing from
    ``P`` are rendered as a blank. ``grid.width`` bounds ``i`` and
    ``grid.height`` bounds ``j``.
    """
    divider = "---------------------------"
    for row in range(grid.width):
        print(divider)
        print("".join(
            "  %s  |" % P.get((row, col), ' ')
            for col in range(grid.height)
        ))

In [23]:
def _evaluate_policy(grid, states, action_probs, gamma, threshold):
    """Iterative policy evaluation using in-place sweeps.

    Values are updated in place, so a state updated earlier in a sweep is
    already visible to states updated later in the same sweep.

    Parameters
    ----------
    grid : Grid
        Environment used to simulate each transition. Its current position
        is overwritten while evaluating.
    states : collection of (i, j)
        Every state that gets a value entry; iterated once per sweep, so it
        must be re-iterable.
    action_probs : dict
        Maps a state to a sequence of ``(action, probability)`` pairs that
        describe the policy there. States without an entry keep the value 0.
    gamma : float
        Discount factor.
    threshold : float
        Evaluation stops after the first sweep whose largest per-state change
        is below this value.

    Returns
    -------
    dict
        Maps each state to its estimated value under the policy.
    """
    V = {s: 0 for s in states}
    while True:
        max_change = 0
        for s in states:
            if s not in action_probs:
                continue
            old_v = V[s]
            new_v = 0
            for a, p_a in action_probs[s]:
                # Transitions are deterministic: reset to s, apply a, and read
                # the successor state straight from the grid.
                grid.set_state(s)
                r = grid.move(a)
                new_v = new_v + p_a * (r + gamma * V[grid.current_state()])
            V[s] = new_v
            max_change = max(max_change, abs(old_v - new_v))
        if max_change < threshold:
            return V


if __name__ == '__main__':

    # Find the value function for a given policy.
    # p(s', r | s, a) is deterministic in this gridworld.

    grid = standard_grid()
    states = grid.all_states()
    print(states)

    # UNIFORM RANDOM POLICY: every legal action is equally likely.
    # discount factor
    gamma = 1.0
    uniform_policy = {
        s: [(a, 1.0 / len(moves)) for a in moves]
        for s, moves in grid.actions.items()
    }
    V = _evaluate_policy(grid, states, uniform_policy, gamma, convergence_threshold)

    print("Value function for uniform random action")
    print_values(V, grid)
    print("\n\n")

    # FIXED POLICY: exactly one action per state.
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }

    print_policy(policy, grid)

    # discount factor
    gamma = 0.9
    fixed_policy = {s: [(a, 1.0)] for s, a in policy.items()}
    V = _evaluate_policy(grid, states, fixed_policy, gamma, convergence_threshold)

    # Heading previously repeated the "uniform random action" label here.
    print("\n\nValue function for fixed policy")
    print_values(V, grid)
    print("\n\n")

{(0, 1), (1, 2), (0, 0), (1, 3), (2, 1), (2, 0), (2, 3), (2, 2), (1, 0), (0, 2), (0, 3)}
Value function for uniform random action
---------------------------
-0.03| 0.09| 0.22| 0.00|
---------------------------
-0.16| 0.00|-0.44| 0.00|
---------------------------
-0.29|-0.41|-0.54|-0.77|



---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


Value function for uniform random action
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00|-1.00| 0.00|
---------------------------
 0.66|-0.81|-0.90|-1.00|



