In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# The four moves an agent can attempt: Up, Down, Left, Right.
ACTION_SPACE = ('U', 'D', 'L', 'R')


class gridWorld:
    """A small deterministic grid-world environment.

    States are ``(row, col)`` tuples. ``actions`` maps every non-terminal
    state to the moves allowed there; any state missing from ``actions`` is
    treated as terminal. ``rewards`` maps a state to the reward received on
    entering it (states not listed give 0).
    """

    # (row delta, col delta) applied by each action; 'U' decreases the row.
    _DELTAS = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    def __init__(self, rows, cols, startPosition):
        """Create a ``rows`` x ``cols`` grid with the agent at ``startPosition``."""
        self.rows = rows
        self.cols = cols
        # Remember the start so reset() can return to it later.
        self.start = (startPosition[0], startPosition[1])
        self.i, self.j = self.start

    def setOnGrid(self, rewards, actions):
        """Attach the reward table and the per-state allowed-action table."""
        self.rewards = rewards
        self.actions = actions

    def setState(self, s):
        """Place the agent at state ``s`` (a ``(row, col)`` pair)."""
        self.i = s[0]
        self.j = s[1]

    def move(self, a):
        """Apply action ``a`` from the current state and return the reward.

        If ``a`` is not allowed in the current state (including when the
        current state is terminal) the agent stays where it is. The reward
        returned is the one associated with the resulting state.
        """
        self.i, self.j = self.getNextState((self.i, self.j), a)
        return self.rewards.get((self.i, self.j), 0)

    def getNextState(self, s, a):
        """Return the state reached from ``s`` by action ``a`` without moving.

        Disallowed actions, and any action taken from a terminal state,
        leave the state unchanged.
        """
        i, j = s[0], s[1]
        # .get(): terminal states have no entry, so no action is allowed there.
        if a in self.actions.get((i, j), ()):
            di, dj = self._DELTAS.get(a, (0, 0))
            i += di
            j += dj
        return i, j

    def allStates(self):
        """Return the set of every state that has actions or a reward."""
        return set(self.actions) | set(self.rewards)

    def allActions(self):
        """Return the full action space as a NumPy array of strings."""
        return np.array(ACTION_SPACE)

    def currentState(self):
        """Return the agent's position as a ``(row, col)`` tuple."""
        return (self.i, self.j)

    def isTerminal(self, s):
        """Return True when ``s`` has no allowed actions."""
        return s not in self.actions

    def reset(self):
        """Move the agent back to its start position and return that state."""
        self.i, self.j = self.start
        return (self.i, self.j)

    def gameOver(self):
        """Return True when the agent currently sits in a terminal state."""
        return self.isTerminal((self.i, self.j))
            
    
def standardGrid():
    """Build the 3x4 grid world with two terminal cells.

    The agent starts at (2, 0). Entering (0, 3) yields +1 and entering
    (1, 3) yields -1; neither has allowed actions, so both are terminal.
    Cell (1, 1) has no actions and no reward entry.
    """
    terminal_rewards = {(0, 3): +1, (1, 3): -1}
    # Moves allowed in each non-terminal cell.
    legal_moves = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
    }
    world = gridWorld(3, 4, (2, 0))
    world.setOnGrid(terminal_rewards, legal_moves)
    return world

def negativeGrid(step_cost=-0.1):
    """Return the standard grid with a per-step penalty on non-terminal cells.

    Args:
        step_cost: Reward assigned to every state that has allowed actions.
            Terminal rewards from ``standardGrid`` are left untouched.

    Returns:
        A ``gridWorld`` whose reward table includes ``step_cost`` for each
        non-terminal state.
    """
    g = standardGrid()
    # Non-terminal states are exactly the keys of the action table.
    g.rewards.update({state: step_cost for state in g.actions})
    return g

def printPolicy(P, g):
    """Print policy ``P`` as a grid, one text row per grid row.

    ``P`` maps ``(row, col)`` to an action; cells without an entry are
    shown as a blank. ``g`` supplies the grid size via ``rows`` and ``cols``.
    """
    separator = "---------------------------"
    for row in range(g.rows):
        print(separator)
        cells = ["  %s  |" % P.get((row, col), ' ') for col in range(g.cols)]
        print("".join(cells))

def printValues(V, g):
    """Print value table ``V`` as a grid with two decimals per cell.

    ``V`` maps ``(row, col)`` to a number; missing cells are shown as 0.
    Non-negative values get a leading space so columns line up with the
    minus sign of negative ones.
    """
    for row in range(g.rows):
        print("---------------------------")
        line = ""
        for col in range(g.cols):
            value = V.get((row, col), 0)
            template = " %.2f|" if value >= 0 else "%.2f|"
            line += template % value
        print(line)
        

In [3]:
def playGame(grid, policy, max_steps=20):
    """Run one episode under ``policy`` from a random non-terminal start.

    The start state is drawn uniformly from the keys of ``grid.actions``.
    The episode ends when the grid reports game over or after ``max_steps``
    moves, whichever comes first.

    Returns:
        ``(states, rewards)`` lists of equal length. ``states[0]`` is the
        start state and ``rewards[0]`` is a placeholder 0; ``rewards[k]`` is
        the reward received on arriving at ``states[k]``.
    """
    candidates = list(grid.actions.keys())
    grid.setState(candidates[np.random.choice(len(candidates))])

    current = grid.currentState()
    visited = [current]
    collected = [0]

    taken = 0
    while True:
        if grid.gameOver():
            break
        reward = grid.move(policy[current])
        current = grid.currentState()
        visited.append(current)
        collected.append(reward)

        taken += 1
        # Checked after the move, so at least one step is always attempted.
        if taken >= max_steps:
            break

    return visited, collected

In [4]:
# First-visit Monte Carlo evaluation of a fixed policy on the standard grid.
grid = standardGrid()

# Alternative policy kept as an inert string literal; it is never executed.
'''
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'U',
    (2, 1): 'R',
    (2, 2): 'U',
    (2, 3): 'L',
  }
'''
# Policy under evaluation: one action per non-terminal state.
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}

# V holds the current value estimate per state; returns holds every sampled
# return observed for that state.
V = {state: 0 for state in grid.allStates()}
returns = {state: [] for state in grid.allStates()}
printValues(V, grid)

# Discount factor applied to future rewards.
gamma = 0.9

for _ in range(100):
    states, rewards = playGame(grid, policy)

    # Walk the episode backwards, accumulating the discounted return G.
    # The final state is skipped: no reward follows it.
    G = 0
    T = len(states)
    for t in reversed(range(T - 1)):
        s = states[t]
        G = gamma * G + rewards[t + 1]
        # First-visit rule: only record G at the earliest occurrence of s.
        if s in states[:t]:
            continue
        returns[s].append(G)
        V[s] = np.mean(returns[s])


printValues(V, grid)
printPolicy(policy, grid)


---------------------------
 0.00| 0.00| 0.00| 0.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00|-1.00| 0.00|
---------------------------
 0.66|-0.81|-0.90|-1.00|
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |
