In [1]:
from numpy import *
import numpy.matlib as matlib
import itertools
import sys

In [2]:
# Debug flag.
DEBUG = True

# Grid dimensions: number of rows (i) and number of columns (j).
GRID_HEIGHT, GRID_WIDTH = 4, 4

<img src="/files/4x4%20Grid%20-%20State%20Transistion%20Diagram.png"/>

In [3]:
class Action(object):
    """A movement on the grid, identified by a case-insensitive name.

    Attributes:
        value: lower-cased action name ('up', 'down', 'left' or 'right').
        index: integer looked up from ``value_map`` for that name.

    Raises:
        KeyError: if the lower-cased name is not a key of ``value_map``.
    """

    # Action name -> integer index of the action.
    value_map = {'up': 0, 'down': 1, 'left': 2, 'right': 3}

    def __init__(self, value):
        self.value = value.lower()
        self.index = self.value_map[self.value]

    def __eq__(self, other):
        # Returning NotImplemented (rather than raising AttributeError) lets
        # Python fall back to its default comparison for foreign types.
        if not isinstance(other, Action):
            return NotImplemented
        return self.index == other.index

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so it is defined explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Keeps hashing consistent with __eq__; Python 3 removes the default
        # __hash__ as soon as __eq__ is overridden.
        return hash(self.index)

    def __str__(self):
        return self.value
    
    
class State(object):
    """A grid cell addressed by row ``i`` and column ``j``.

    Attributes:
        i: row number; row 0 is the top edge.
        j: column number; column 0 is the left edge.
        index: row-major flat position, ``i * GRID_WIDTH + j``.
    """

    def __init__(self, i, j):
        self.i = i
        self.j = j
        # Row-major flattening: the stride between rows is the number of
        # columns, so GRID_WIDTH (not GRID_HEIGHT) is the multiplier. The two
        # only coincide on a square grid.
        self.index = self.i * GRID_WIDTH + self.j

    def left_of(self, other):
        """True if this cell is the immediate left neighbour of ``other``."""
        return self.i == other.i and self.j - other.j == -1

    def right_of(self, other):
        """True if this cell is the immediate right neighbour of ``other``."""
        return self.i == other.i and self.j - other.j == 1

    def above(self, other):
        """True if this cell is directly above ``other`` (one row up)."""
        return self.j == other.j and self.i - other.i == -1

    def below(self, other):
        """True if this cell is directly below ``other`` (one row down)."""
        return self.j == other.j and self.i - other.i == 1

    # The edge predicates return a real bool in both cases (previously they
    # fell off the end and returned None when the cell was not on the edge).

    def on_top_edge(self):
        """True if the cell is in the first row."""
        return self.i == 0

    def on_bottom_edge(self):
        """True if the cell is in the last row."""
        return self.i == GRID_HEIGHT - 1

    def on_left_edge(self):
        """True if the cell is in the first column."""
        return self.j == 0

    def on_right_edge(self):
        """True if the cell is in the last column."""
        return self.j == GRID_WIDTH - 1

    def __eq__(self, other):
        # NotImplemented lets Python handle comparisons with foreign types
        # instead of failing on a missing ``index`` attribute.
        if not isinstance(other, State):
            return NotImplemented
        return self.index == other.index

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so it is defined explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Consistent with __eq__; required on Python 3 once __eq__ is defined.
        return hash(self.index)

    def __str__(self):
        return 's{}{}'.format(self.i, self.j)

In [4]:
# One Action per direction, in the order of their value_map indices.
actions = [Action('up'), Action('down'), Action('left'), Action('right')]
# Every grid cell, generated row by row (i varies slowest).
states = [State(i,j) for i,j in itertools.product(range(GRID_HEIGHT), range(GRID_WIDTH))]

# terminal states: the top-left and bottom-right corners, derived from the
# grid constants so they stay correct if the grid size is changed.
s_term = [State(0, 0), State(GRID_HEIGHT - 1, GRID_WIDTH - 1)]

In [5]:
# |S| x |A|: every action is equally likely in every state. The probability
# is derived from the number of actions instead of being hard-coded as 0.25.
uni_random_policy = full(shape=(len(states), len(actions)), fill_value=1.0 / len(actions))

In [18]:
# Display the uniform random policy (one row per state, one column per action).
uni_random_policy

array([[ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25],
       [ 0.25,  0.25,  0.25,  0.25]])

In [6]:
def trans(s, a, s_p):
    """Return the probability of reaching ``s_p`` by taking ``a`` in ``s``.

    Moves are deterministic: the result is 1.0 when ``s_p`` is the neighbour
    of ``s`` in the direction of ``a``, or when ``s`` sits on the grid edge in
    that direction and ``s_p`` is ``s`` itself. Everything else is 0.0,
    including every transition out of a state listed in ``s_term``.
    """
    if s in s_term:
        return 0.0

    # (action name, "s_p is the neighbour of s" test, "s is against the wall" test)
    moves = (
        ('up', s_p.above, s.on_top_edge),
        ('down', s_p.below, s.on_bottom_edge),
        ('left', s_p.left_of, s.on_left_edge),
        ('right', s_p.right_of, s.on_right_edge),
    )
    for name, is_neighbour, against_wall in moves:
        if a == Action(name):
            if is_neighbour(s) or (s == s_p and against_wall()):
                return 1.0
            # The action matched but s_p is not where it leads.
            break

    return 0.0
    
# Tabulate trans() into a dense |S| x |A| x |S| array, addressed by the
# ``index`` attribute of each state and action.
p_trans = zeros((len(states), len(actions), len(states)))

for s in states:
    for a in actions:
        for s_p in states:
            p_trans[s.index, a.index, s_p.index] = trans(s, a, s_p)

In [7]:
# Rewards: r_term applies in a terminal state, r_step in any non-terminal state.
r_term, r_step = 0.0, -1.0

# Discount factor.
gamma = 1.0

In [8]:
def reward(state, action, next_state):
    """Return the reward associated with ``state``.

    Only ``state`` matters: the result is ``r_term`` when it is one of
    ``s_term`` and ``r_step`` otherwise. ``action`` and ``next_state`` are
    accepted but not used.
    """
    return r_term if state in s_term else r_step

# Tabulate reward() into a dense |S| x |A| x |S| array, addressed by the
# position of each state and action in their lists.
r = zeros((len(states), len(actions), len(states)))

for (s, state), (a, action), (s_p, next_state) in itertools.product(
        enumerate(states), enumerate(actions), enumerate(states)):
    r[s, a, s_p] = reward(state, action, next_state)

In [9]:
# TODO: accept the output array (vk_new) as a parameter so the caller can
# choose between the modified and the non-modified version of the update.
def policy_evaluation(policy, vk):
    """Apply one synchronous Bellman expectation backup to every state.

    Args:
        policy: |S| x |A| array of action probabilities.
        vk: current value estimate, indexed by state position.

    Returns:
        A new length-|S| array where entry ``s`` is the sum over ``a`` and
        ``s_p`` of ``policy[s, a] * p_trans[s, a, s_p] *
        (r[s, a, s_p] + gamma * vk[s_p])``. ``vk`` itself is not modified.
    """
    n_states = len(states)
    n_actions = len(actions)
    vk_new = zeros(n_states)
    for s in range(n_states):
        # Accumulate in the same (action, next state) order for every state.
        backup = 0.0
        for a in range(n_actions):
            for s_p in range(n_states):
                backup += policy[s, a] * p_trans[s, a, s_p] * (r[s, a, s_p] + gamma * vk[s_p])
        vk_new[s] = backup
    return vk_new

Evaluating Uniform Random Policy

In [16]:
# Iterative policy evaluation of the uniform random policy, starting from
# an all-zero value estimate.
vk = zeros(shape=(len(states)))

NUM_ITERS = 500
for k in range(NUM_ITERS):
    vk = policy_evaluation(uni_random_policy, vk)

vk_uni = copy(vk)
for s in states:
    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3; the bare print statement only parses on Python 2.
    print('{} = {}'.format(s, vk_uni[s.index]))

s00 = 0.0
s01 = -14.0
s02 = -20.0
s03 = -22.0
s10 = -14.0
s11 = -18.0
s12 = -20.0
s13 = -20.0
s20 = -20.0
s21 = -20.0
s22 = -18.0
s23 = -14.0
s30 = -22.0
s31 = -20.0
s32 = -14.0
s33 = 0.0


In [19]:
def policy_improvement(vk):
    """Return the deterministic policy that is greedy with respect to ``vk``.

    For each state the one-step look-ahead value
    ``sum over s_p of p_trans[s, a, s_p] * (r[s, a, s_p] + gamma * vk[s_p])``
    is computed for every action. The first action that attains the maximum
    receives probability 1.0; ties are not split.

    Args:
        vk: state values, indexed by state position.

    Returns:
        |S| x |A| array with a single 1.0 in each row and 0.0 elsewhere.
    """
    new_policy = zeros(shape=(len(states), len(actions)))
    for s, state in enumerate(states):
        max_a = None
        # float('-inf') replaces -sys.maxint - 1: sys.maxint does not exist on
        # Python 3, and a finite sentinel can be undercut by a real value.
        max_vk = float('-inf')
        for a, action in enumerate(actions):
            vk_cand = 0.0
            for s_p, next_state in enumerate(states):
                vk_cand += p_trans[s, a, s_p] * (r[s, a, s_p] + gamma * vk[s_p])

            # The first action is always accepted so max_a can never stay
            # None; indexing with None would set the whole row to 1.0.
            if max_a is None or vk_cand > max_vk:
                max_vk = vk_cand
                max_a = a

        new_policy[s, max_a] = 1.0

    return new_policy

Determine the optimal policy from the state-value function of the uniform random policy.

In [20]:
# Greedy improvement step using the values obtained for the uniform random policy.
optimal_policy = policy_improvement(vk_uni)

In [21]:
# Possible enhancement: split the probability evenly between actions whose
# resulting values are equal, instead of always picking the first one.
optimal_policy

array([[ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.]])

Evaluating State Value Function For Optimal Policy

In [22]:
# Iterative policy evaluation of the improved policy, starting from an
# all-zero value estimate.
vk_star = zeros(shape=(len(states)))

NUM_ITERS = 20
for k in range(NUM_ITERS):
    vk_star = policy_evaluation(optimal_policy, vk_star)

for s in states:
    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3; the bare print statement only parses on Python 2.
    print('{} = {}'.format(s, vk_star[s.index]))

s00 = 0.0
s01 = -1.0
s02 = -2.0
s03 = -3.0
s10 = -1.0
s11 = -2.0
s12 = -3.0
s13 = -2.0
s20 = -2.0
s21 = -3.0
s22 = -2.0
s23 = -1.0
s30 = -3.0
s31 = -2.0
s32 = -1.0
s33 = 0.0


In [27]:
def policy_iteration(policy, vk, k_iters=1, epsilon=1e-4):
    """Alternate partial policy evaluation and greedy improvement until stable.

    Args:
        policy: starting |S| x |A| policy.
        vk: starting value estimate.
        k_iters: number of ``policy_evaluation`` sweeps per round.
        epsilon: absolute tolerance passed to ``allclose`` when comparing the
            policy before and after improvement.

    Returns:
        ``(policy, vk)``: the greedy policy from the final round and the value
        estimate that round's improvement step was based on.
    """
    while True:
        # Policy evaluation (approximation of vk): k_iters sweeps.
        for _ in range(k_iters):
            vk = policy_evaluation(policy, vk)

        # Greedy choice of new policy.
        greedy = policy_improvement(vk)

        # Stop as soon as improvement leaves the policy unchanged
        # (within the epsilon tolerance).
        if allclose(policy, greedy, atol=epsilon):
            return greedy, vk

        policy = greedy

In [33]:
# Start from an all-zero float value estimate with one entry per state.
# zeros_like(states) would first convert the list of State objects into an
# object-dtype array, yielding integer zeros stored as Python objects.
vk = zeros(shape=(len(states)))
policy_star, vk_star = policy_iteration(uni_random_policy, vk, k_iters=3)

In [34]:
# Display the state values returned by policy_iteration.
vk_star

array([ 0., -1., -2., -3., -1., -2., -3., -2., -2., -3., -2., -1., -3.,
       -2., -1.,  0.])

In [35]:
# Display the policy returned by policy_iteration (one row per state, one column per action).
policy_star

array([[ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.]])