In [1]:
import numpy as np
import random
import copy

In [2]:
# Action codes; they double as indices into the per-state Q lists.
actions = [0, 1, 2, 3]  # 0 = up, 1 = down, 2 = left, 3 = right

# Cells are numbered 0..53, row by row with 9 cells per row
# (return_Q moves vertically in steps of 9 and horizontally in steps of 1).
states = list(range(54))

# Cells that cannot be entered: return_Q treats a move into one of them
# like a move off the grid (reward -1, the agent stays where it is).
barrier_states = [7, 11, 16, 20, 25, 29, 41]

# Cell whose entry is rewarded with +1 in return_Q.
goal_state = 8

In [3]:
# Sweep budget for valueIteration: its outer loop iterates over range(MAX_ITER).
MAX_ITER = 100
# return_Q multiplies the value of the state the agent ends up in by gamma
# before adding the immediate reward.
gamma = 0.9  # discount factor

In [4]:
def return_Q(s, a, values):
    """Return the one-step look-ahead value Q(s, a) on the grid world.

    The grid is stored row-major with 9 cells per row (states 0..53), so a
    vertical move changes the state index by 9 and a horizontal move by 1.
    All four actions share the same transition rule, so the goal reward is
    granted no matter from which side the goal is entered.

    Parameters
    ----------
    s : int
        Current state index.
    a : int
        Action code: 0 = up, 1 = down, 2 = left, 3 = right.
    values : sequence of float
        Current state-value estimates, indexed by state.

    Returns
    -------
    float or int
        * ``0`` if ``s`` is a barrier or the goal state;
        * ``-1 + gamma * values[s]`` if the move would leave the grid or
          enter a barrier (the agent stays in ``s``);
        * ``1 + gamma * values[ns]`` if the move enters the goal state;
        * ``gamma * values[ns]`` for any other move.

    Raises
    ------
    ValueError
        If ``s`` is an ordinary state and ``a`` is not one of the four
        known action codes (previously this silently returned ``None``).
    """
    n_cols = 9     # cells per grid row
    n_states = 54  # total number of cells (states 0..53)

    # Barriers and the goal have no outgoing moves worth evaluating.
    if (s in barrier_states) or (s == goal_state):
        return 0

    # Work out the candidate next state and whether the move leaves the grid.
    if a == 0:      # up
        ns = s - n_cols
        off_grid = ns < 0
    elif a == 1:    # down
        ns = s + n_cols
        off_grid = ns >= n_states
    elif a == 2:    # left: blocked when s sits in the first column
        ns = s - 1
        off_grid = s % n_cols == 0
    elif a == 3:    # right: blocked when s sits in the last column
        ns = s + 1
        off_grid = s % n_cols == n_cols - 1
    else:
        raise ValueError("unknown action: {!r}".format(a))

    # Bumping into a wall or a barrier costs -1 and leaves the agent in place.
    if off_grid or (ns in barrier_states):
        return -1 + gamma * values[s]
    # Entering the goal earns +1, whichever direction it is entered from.
    if ns == goal_state:
        return 1 + gamma * values[ns]
    return gamma * values[ns]

In [5]:
# Policy Improvement

def policyImprovement(values):
    """Build the greedy policy that corresponds to a value table.

    For every state in ``states`` the Q-value of each action in ``actions``
    is obtained from ``return_Q``; the index of the largest Q-value becomes
    the policy entry for that state (``np.argmax`` keeps the first index
    when several actions tie).

    Parameters
    ----------
    values : sequence of float
        State-value estimates, passed through to ``return_Q``.

    Returns
    -------
    numpy.ndarray
        Integer array of length 54 holding one action code per state.
    """
    greedy_actions = np.zeros(54, dtype=int)

    for state in states:
        # Q-values are stored at the position given by the action code.
        action_scores = [0.] * len(actions)
        for act in actions:
            action_scores[act] = return_Q(state, act, values)
        greedy_actions[state] = np.argmax(action_scores)

    return greedy_actions

## Value Iteration

In [6]:
def valueIteration(values):
    """Run value iteration on ``values`` and return the greedy policy.

    Each sweep visits every state in ``states`` and overwrites its entry
    with the largest Q-value that ``return_Q`` reports over ``actions``.
    Updates are written straight into ``values``, so later states in the
    same sweep already see the new numbers and the caller's array holds
    the final value table afterwards.

    At most ``MAX_ITER`` sweeps are performed. Iteration stops earlier as
    soon as a complete sweep leaves every stored value unchanged: such a
    table is a fixed point, so the remaining sweeps would reproduce it
    exactly and the result is the same as running all of them.

    Parameters
    ----------
    values : mutable sequence of float
        Initial state-value estimates, indexed by state; modified in place.
        Use a float container: a NumPy integer array would truncate the
        fractional Q-values on assignment.

    Returns
    -------
    The policy computed by ``policyImprovement`` from the final values.
    """
    for _ in range(MAX_ITER):
        changed = False
        for s in states:
            previous = values[s]
            values[s] = max(return_Q(s, a, values) for a in actions)
            # Compare what is actually stored so the check reflects the table.
            if values[s] != previous:
                changed = True
        if not changed:
            break

    return policyImprovement(values)

In [7]:
# Start from an all-zero value estimate, one entry per grid cell.
init_values = np.zeros(54)

# valueIteration writes its updates into init_values and returns the greedy
# policy derived from the resulting values.
final_policy = valueIteration(init_values)
print('Optimal policy:', final_policy)

Optimal policy: [3 3 3 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 1 0 3 3 3 3 3 0 3
 3 3 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]
