In [1]:
from __future__ import print_function, division
from builtins import range
import numpy as np

In [2]:
#Environment
class Grid:
    """Grid-world environment that tracks the position of a single agent.

    Positions are ``(i, j)`` tuples. 'U' decreases ``i``, 'D' increases ``i``,
    'L' decreases ``j`` and 'R' increases ``j``.

    ``set`` must be called before any method that reads ``self.rewards`` or
    ``self.actions`` (``move``, ``is_terminal``, ``game_over``, ``all_states``).
    """

    # (di, dj) offset applied to the position by each action letter.
    _DELTAS = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    def __init__(self, width, height, start):
        """Store the grid dimensions and place the agent at ``start``.

        Args:
            width: stored as ``self.width``; not used by the class itself.
            height: stored as ``self.height``; not used by the class itself.
            start: ``(i, j)`` tuple giving the initial position.
        """
        self.width = width
        self.height = height
        self.i = start[0]
        self.j = start[1]
        # Action applied by the most recent successful move(); None if there
        # is nothing to undo.
        self._last_action = None

    def set(self, rewards, actions):
        """Attach the reward table and the per-state table of allowed actions.

        Args:
            rewards: dict mapping ``(i, j)`` to the reward for entering it.
            actions: dict mapping ``(i, j)`` to the actions allowed there.
        """
        self.rewards = rewards
        self.actions = actions

    def set_state(self, state):
        """Move the agent directly to ``state`` (an ``(i, j)`` tuple)."""
        self.i = state[0]
        self.j = state[1]
        # A teleport is not a move, so there is nothing sensible to undo.
        self._last_action = None

    def current_state(self):
        """Return the agent's position as an ``(i, j)`` tuple."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in the actions table."""
        return s not in self.actions

    def move(self, action):
        """Apply ``action`` if it is allowed in the current state.

        A disallowed action leaves the position unchanged.

        Returns:
            The reward registered for the position the agent ends up in,
            or 0 when that position has no entry in the rewards table.
        """
        self._last_action = None
        if action in self.actions[(self.i, self.j)]:
            delta = self._DELTAS.get(action)
            if delta is not None:
                self.i += delta[0]
                self.j += delta[1]
                self._last_action = action

        return self.rewards.get((self.i, self.j), 0)

    def undo_move(self, action=None):
        """Revert a move.

        Args:
            action: the action to revert. When omitted, the action applied by
                the most recent successful ``move`` is reverted; if that move
                was disallowed (or none happened yet) the position is left
                unchanged.

        Raises:
            AssertionError: if the resulting position is not a known state.
        """
        if action is None:
            action = self._last_action
        di, dj = self._DELTAS.get(action, (0, 0))
        self.i -= di
        self.j -= dj
        self._last_action = None

        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True when the current position has no allowed actions."""
        return (self.i, self.j) not in self.actions

    def all_states(self):
        """Return the set of positions that have either actions or a reward."""
        return set().union(self.actions.keys(), self.rewards.keys())

    

In [3]:
def standard_grid():
    """Build the default grid world and return it as a configured ``Grid``.

    Layout (row index grows downward, column index grows to the right):

        .  .  .  1
        .  x  .  -1
        s  .  .  .

    ``1`` and ``-1`` are the rewards of the two cells that have no actions,
    ``x`` is a cell that cannot be entered, and ``s`` is the start cell (2, 0).
    """
    terminal_rewards = {(0, 3): 1, (1, 3): -1}

    # Allowed actions per cell; cells absent from this table have no actions.
    # Insertion order is kept identical on purpose, since callers may index
    # into the key list.
    allowed_moves = {
        # top row
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        # middle row
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        # bottom row
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
    }

    world = Grid(3, 4, (2, 0))
    world.set(terminal_rewards, allowed_moves)
    return world

In [4]:
def negative_grid(step_cost=-0.1):
    """Return the standard grid with ``step_cost`` as the reward of nine cells.

    The nine cells listed below get ``step_cost`` written into the rewards
    table; any other reward entries of the standard grid are left untouched.

    Args:
        step_cost: reward assigned to each listed cell (default -0.1).
    """
    penalised_cells = (
        (0, 0), (0, 1), (0, 2),
        (1, 0), (1, 2),
        (2, 0), (2, 1), (2, 2), (2, 3),
    )
    world = standard_grid()
    world.rewards.update(dict.fromkeys(penalised_cells, step_cost))
    return world

In [5]:
def print_values(V, grid):
    """Print the values in ``V`` as a table, one text line per grid row.

    Iterates ``grid.width`` rows and ``grid.height`` columns. Cells missing
    from ``V`` are shown as 0. Values that compare ``>= 0`` get a leading
    space so they line up with negative ones.

    Args:
        V: dict mapping ``(i, j)`` to a number.
        grid: object exposing ``width`` and ``height``.
    """
    for row in range(grid.width):
        print("---------------------------")
        cells = []
        for col in range(grid.height):
            value = V.get((row, col), 0)
            pad = " " if value >= 0 else ""
            cells.append(pad + "%.2f|" % value)
        print("".join(cells))

In [12]:
def print_policy(P, grid):
    """Print the policy ``P`` as a table, one text line per grid row.

    Iterates ``grid.width`` rows and ``grid.height`` columns. Cells without an
    entry in ``P`` are shown as a blank.

    Args:
        P: dict mapping ``(i, j)`` to the action chosen there.
        grid: object exposing ``width`` and ``height``.
    """
    for row in range(grid.width):
        print("---------------------------")
        line = "".join(
            "  %s  |" % P.get((row, col), ' ') for col in range(grid.height)
        )
        print(line)

In [13]:
# Presumably a tolerance for convergence checks; nothing in the visible code
# reads it.
# NOTE(review): 10e-4 evaluates to 1e-3, not 1e-4 -- confirm which was meant.
CONVERGENCE_THRESHOLD = 10e-4
# Discount factor applied to the running return at every step.
GAMMA = 0.9
# The four action letters: Up, Down, Left, Right.
POSSIBLE_ACTIONS = ('U','D','L','R') 

In [14]:
def random_action(a):
    """Return ``a`` with probability 0.5, otherwise a different random action.

    When ``a`` is not kept, the result is drawn uniformly from
    ``POSSIBLE_ACTIONS`` with ``a`` removed.

    Raises:
        ValueError: if ``a`` must be replaced but is not in ``POSSIBLE_ACTIONS``.
    """
    keep_original = np.random.random() < 0.5
    if keep_original:
        return a

    alternatives = list(POSSIBLE_ACTIONS)
    alternatives.remove(a)
    return np.random.choice(alternatives)

In [15]:
def play_grid_world(grid, policy):
    """Play one episode from a random start and return the observed returns.

    The start state is drawn uniformly from the keys of ``grid.actions``. At
    each step the policy's action is passed through ``random_action`` before
    being applied, and play continues until ``grid.game_over()`` is true.

    Args:
        grid: environment exposing ``actions``, ``set_state``,
            ``current_state``, ``game_over`` and ``move``.
        policy: dict mapping each state to the intended action.

    Returns:
        A list of ``(state, G)`` pairs in visiting order, where ``G`` is the
        discounted return that followed the visit. The final state of the
        episode is not included.
    """
    # Pick the start state at random.
    candidates = list(grid.actions.keys())
    grid.set_state(candidates[np.random.choice(len(candidates))])

    # Roll the episode out, recording each state with the reward for reaching it.
    state = grid.current_state()
    trajectory = [(state, 0)]
    while not grid.game_over():
        reward = grid.move(random_action(policy[state]))
        state = grid.current_state()
        trajectory.append((state, reward))

    # Walk backwards accumulating the discounted return. The last state gets
    # no entry of its own; it only seeds G with its reward.
    G = 0
    states_and_returns = []
    for step, (state, reward) in enumerate(reversed(trajectory)):
        if step > 0:
            states_and_returns.append((state, G))
        G = reward + GAMMA * G
    states_and_returns.reverse()
    return states_and_returns

In [16]:
if __name__ == '__main__':
    # First-visit Monte Carlo evaluation of a fixed policy on the standard grid.
    # NOTE: no random seed is set, so the estimated values vary between runs.

    # Number of episodes to sample.
    NUM_EPISODES = 6000

    grid = standard_grid()

    print("Rewards")
    print_values(grid.rewards, grid)

    # Policy under evaluation: one intended action per state that has actions.
    policy = {
        (0,0):'R',
        (1,0):'U',
        (2,0):'U',
        (0,1):'R',
        (1,2):'U',
        (2,1):'L',
        (0,2):'R',
        (2,2):'U',
        (2,3):'L',
    }

    # States with actions collect sampled returns; states without actions
    # are fixed at value 0.
    V = {}
    returns = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions:
            returns[s] = []
        else:
            V[s] = 0

    for e in range(NUM_EPISODES):
        # Play an episode and keep only the first return seen for each state.
        states_and_returns = play_grid_world(grid, policy)

        seen_states = set()
        for s, G in states_and_returns:
            if s not in seen_states:
                returns[s].append(G)
                seen_states.add(s)

    # Average once after sampling instead of re-averaging an ever-growing list
    # on every visit; the final estimates are identical. States that were
    # never visited get no entry, so they print as 0.
    for s, samples in returns.items():
        if samples:
            V[s] = np.mean(samples)

    print("\n\nvalues")
    print_values(V, grid)

    print("\n\npolicy")
    print_policy(policy, grid)

Rewards
---------------------------
 0.00| 0.00| 0.00| 1.00|
---------------------------
 0.00| 0.00| 0.00|-1.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|


values
---------------------------
 0.42| 0.55| 0.72| 0.00|
---------------------------
 0.32| 0.00| 0.19| 0.00|
---------------------------
 0.24| 0.18| 0.09|-0.19|


policy
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  L  |  U  |  L  |
