In [1]:
from __future__ import print_function, division
from builtins import range
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Base step size for the weight updates; the training loop divides it by `t`
# to obtain the per-episode `alpha`.
ALPHA = 0.1
# Discount factor multiplying the predicted value of the next state-action pair.
GAMMA = 0.9
# Action labels understood by Grid.move: 'U' decrements i, 'D' increments i,
# 'L' decrements j, 'R' increments j.
POSSIBLE_ACTIONS = ('U','D','L','R') 

In [3]:
#Environment
class Grid:
    """Grid-world environment tracking one agent position ``(i, j)``.

    ``rewards`` maps a state ``(i, j)`` to the reward received on entering it
    and ``actions`` maps each non-terminal state to the action labels allowed
    there.  Both are supplied through :meth:`set`.  A state that has no entry
    in ``actions`` is treated as terminal.

    NOTE(review): despite the names, the rest of this file uses ``width`` as
    the number of first-index (``i``) values and ``height`` as the number of
    second-index (``j``) values.
    """

    def __init__(self, width, height, start):
        """Store the grid dimensions and place the agent at ``start``.

        Args:
            width: number of ``i`` values (see the class-level note).
            height: number of ``j`` values.
            start: ``(i, j)`` pair giving the initial position.
        """
        self.width = width
        self.height = height
        self.i = start[0]
        self.j = start[1]
        # Last action that actually changed the position; lets undo_move()
        # work without being told which action to revert.
        self._last_action = None

    def set(self, rewards, actions):
        """Attach the reward table and the per-state allowed-action table."""
        self.rewards = rewards
        self.actions = actions

    def set_state(self, state):
        """Teleport the agent to ``state`` (an ``(i, j)`` pair)."""
        self.i = state[0]
        self.j = state[1]
        # A teleport invalidates whatever move was recorded before it.
        self._last_action = None

    def current_state(self):
        """Return the agent position as an ``(i, j)`` tuple."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in the allowed-action table."""
        return s not in self.actions

    def move(self, action):
        """Apply ``action`` if it is allowed in the current state.

        A disallowed action leaves the position unchanged.  In both cases the
        reward of the state the agent is in afterwards is returned (0 when
        that state has no entry in ``rewards``).

        Raises:
            KeyError: if the current state has no entry in ``actions``
                (i.e. the agent is already in a terminal state).
        """
        applied = None
        if action in self.actions[(self.i, self.j)]:
            if action == 'U':
                self.i -= 1
            elif action == 'D':
                self.i += 1
            elif action == 'R':
                self.j += 1
            elif action == 'L':
                self.j -= 1
            applied = action
        # Only moves that were accepted can later be undone implicitly.
        self._last_action = applied

        return self.rewards.get((self.i, self.j), 0)

    def undo_move(self, action=None):
        """Revert a move by stepping in the opposite direction of ``action``.

        The original implementation read an undefined name ``action`` and
        therefore always failed; the action is now an explicit, optional
        parameter.

        Args:
            action: the action label to revert.  When omitted, the last action
                that :meth:`move` actually applied is reverted; if there is
                none (no move yet, or the last move was rejected) the position
                is left unchanged.

        Raises:
            AssertionError: if the resulting position is not a known state.
        """
        if action is None:
            action = self._last_action

        if action == 'U':
            self.i += 1
        elif action == 'D':
            self.i -= 1
        elif action == 'R':
            self.j -= 1
        elif action == 'L':
            self.j += 1

        # The recorded move has been consumed; do not undo it twice.
        self._last_action = None

        # Sanity check: undoing must never leave the agent off the board.
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True when the agent currently sits in a terminal state."""
        return self.is_terminal(self.current_state())

    def all_states(self):
        """Return the set of every state that has actions and/or a reward."""
        return set().union(self.actions.keys(), self.rewards.keys())

    

In [4]:
def standard_grid():
    """Build the 3x4 grid world used throughout this notebook.

    Layout (first index runs top to bottom, second index left to right):

        .  .  .  1
        .  x  .  -1
        s  .  .  .

    ``1`` and ``-1`` are the rewards of the two terminal cells, ``x`` is a
    cell that cannot be entered, and ``s`` is the start cell ``(2, 0)``.

    Returns:
        A ``Grid`` positioned at the start cell with rewards and allowed
        actions already attached.
    """
    terminal_rewards = {(0, 3) : 1, (1, 3) : -1}

    # Allowed actions per non-terminal cell; (1, 1) is absent because it is
    # the blocked cell, and the two reward cells are absent because they end
    # the episode.
    allowed_moves = dict([
        ((0, 0), ('D', 'R')),
        ((0, 1), ('L', 'R')),
        ((0, 2), ('L', 'D', 'R')),
        ((1, 0), ('U', 'D')),
        ((1, 2), ('U', 'D', 'R')),
        ((2, 0), ('U', 'R')),
        ((2, 1), ('L', 'R')),
        ((2, 2), ('L', 'R', 'U')),
        ((2, 3), ('L', 'U')),
    ])

    world = Grid(3, 4, (2,0))
    world.set(terminal_rewards, allowed_moves)
    return world

In [5]:
def negative_grid(step_cost=-0.1):
    """Return the standard grid with ``step_cost`` as reward on every
    non-terminal cell.

    The rewards of the two terminal cells set by ``standard_grid`` are left
    untouched.
    """
    # Same cells, in the same order, that receive a step reward.
    penalised_cells = (
        (0, 0), (0, 1), (0, 2),
        (1, 0), (1, 2),
        (2, 0), (2, 1), (2, 2), (2, 3),
    )
    world = standard_grid()
    world.rewards.update(dict.fromkeys(penalised_cells, step_cost))
    return world

In [6]:
def print_values(V, grid):
    """Print the value table ``V`` as a text grid, one line per ``i`` index.

    Cells missing from ``V`` are shown as 0.  Non-negative values get a
    leading space so that they line up with negative ones.
    """
    for row in range(grid.width):
        print("---------------------------")
        cells = []
        for col in range(grid.height):
            value = V.get((row, col), 0)
            # The extra space stands in for the minus sign of negatives.
            template = " %.2f|" if value >= 0 else "%.2f|"
            cells.append(template % value)
        print("".join(cells))

In [7]:
def print_policy(P, grid):
    """Print the policy ``P`` as a text grid, one line per ``i`` index.

    Cells that have no entry in ``P`` are rendered as a blank.
    """
    for row in range(grid.width):
        print("---------------------------")
        rendered = "".join(
            "  %s  |" % P.get((row, col), ' ')
            for col in range(grid.height)
        )
        print(rendered)

In [8]:
def random_action(a, eps=0.1):
    """Epsilon-greedy wrapper around a chosen action.

    With probability ``eps`` a uniformly random entry of ``POSSIBLE_ACTIONS``
    is returned; otherwise ``a`` is returned unchanged.
    """
    explore = np.random.random() < eps
    return np.random.choice(POSSIBLE_ACTIONS) if explore else a

In [9]:
def max_dict(d):
    """Return ``(key, value)`` for the largest value in ``d``.

    Ties go to the key encountered first.  When no value is strictly greater
    than negative infinity (including the empty dict) the result is
    ``(None, float('-inf'))``.
    """
    best = (None, float('-inf'))
    for entry in d.items():
        # Strict comparison keeps the earliest key on ties.
        if entry[1] > best[1]:
            best = entry
    return best

In [10]:
class Model:
    """Linear approximator of the action-value function.

    The prediction for a state-action pair is the dot product of the weight
    vector ``theta`` with a 25-element feature vector.
    """

    def __init__(self):
        # One weight per feature: 4 actions x 6 features, plus a shared bias.
        self.theta = np.random.randn(25)

    def stateToFeatures(self, s, a):
        """Encode state ``s = (i, j)`` and action ``a`` as a feature vector.

        The vector holds one group of six entries per action in the order
        'U', 'D', 'L', 'R'.  The group belonging to ``a`` contains
        ``i, j, i*j, i*i, j*j, 1``; every other group is all zeros.  A final
        constant 1 is appended as a bias shared by all actions.
        """
        features = []
        for candidate in ('U', 'D', 'L', 'R'):
            if a == candidate:
                row, col = s[0], s[1]
                features += [row, col, row * col, row * row, col * col, 1]
            else:
                features += [0] * 6
        features.append(1)
        return np.array(features)

    def predict(self, s, a):
        """Return the approximated value of taking action ``a`` in ``s``."""
        return self.theta.dot(self.stateToFeatures(s, a))

    def grad(self, s, a):
        """Return the gradient of ``predict(s, a)`` with respect to ``theta``.

        For a linear model this is simply the feature vector.
        """
        return self.stateToFeatures(s, a)

In [11]:
def getQA(model, s):
    """Return a dict mapping every action in ``POSSIBLE_ACTIONS`` to the
    model's predicted value for taking that action in state ``s``."""
    return {action: model.predict(s, action) for action in POSSIBLE_ACTIONS}

In [None]:
if __name__ == '__main__':
    # Grid world in which every non-terminal state carries a -0.1 reward.
    grid = negative_grid(step_cost=-0.1)
    
    print("Rewards")
    print_values(grid.rewards, grid)
    
    model = Model()
    # Per-episode record of the largest summed absolute change in theta.
    deltas = []
    # Decay divisor shared by the step size and the exploration rate.
    t = 1.0
    for e in range(5000):
        # Every 100 episodes grow t slightly, shrinking alpha and eps below.
        if e % 100 == 0:
           t += 0.01
        
        alpha = ALPHA / t
    
        # Each episode starts from the fixed start state (2, 0).
        s = (2,0)
        grid.set_state(s)
        
        Qs = getQA(model, s)
        
        # Greedy action w.r.t. the current model, then randomised with
        # probability 0.5/t.
        a = max_dict(Qs)[0]
        a = random_action(a, eps=0.5/t) 
        

        max_change = 0        
        while not grid.game_over():
            r = grid.move(a)
            s_next = grid.current_state()
            
            # Snapshot of the weights, used to measure this step's change.
            old_theta = model.theta.copy()
            if grid.is_terminal(s_next):
                # Terminal transition: the target is the reward alone.
                model.theta += alpha*(r - model.predict(s, a)) * model.grad(s, a)
            else:
                # Pick the next action with the same epsilon-greedy rule and
                # bootstrap from its predicted value (on-policy TD target).
                Qs_next = getQA(model, s_next)
                a_next = max_dict(Qs_next)[0]
                a_next = random_action(a_next, eps=0.5/t) 
                model.theta += alpha*(r + GAMMA*model.predict(s_next, a_next) - model.predict(s, a))*model.grad(s, a)

            s = s_next
            # NOTE(review): on a terminal transition a_next is not assigned in
            # this iteration, so this reuses the value from an earlier
            # iteration (or would raise NameError if none exists yet). The
            # loop exits right afterwards because game_over() is then true.
            a = a_next
            max_change = max(max_change, np.abs(model.theta - old_theta).sum())
        deltas.append(max_change)
        
        
    # Largest per-step weight change of each episode, in episode order.
    plt.plot(deltas)
    plt.show()

    # Derive the greedy policy and its state values from the learned model
    # for every non-terminal state.
    policy = {}
    V = {}
    Q = {}
    for s in grid.actions.keys():
        Qs = getQA(model, s)
        Q[s] = Qs
        a, max_q = max_dict(Qs)
        policy[s] = a
        V[s] = max_q

    print("\n\n values")
    print_values(V, grid)

    print("\n\npolicy")
    print_policy(policy, grid)