In [1]:
import numpy as np
import random
import copy

# Environment

In [2]:
# Grid world of 54 cells laid out as 6 rows x 9 columns, numbered row by row
# starting from the top-left corner (state 0).
# Action encoding: 0 = up, 1 = down, 2 = left, 3 = right
actions = [0, 1, 2, 3]
states = list(range(54))
# Cells the agent can never enter.
barrier_states = [7, 11, 16, 20, 25, 29, 41]
# Terminal cell (top-right corner of the grid).
goal_state = 8

In [3]:
def environment(s, a):
    """Apply action `a` in state `s` and return `(next_state, reward)`.

    The grid has 6 rows and 9 columns; cells are numbered row by row from
    the top-left corner, so moving up/down changes the state by 9 and
    moving left/right changes it by 1. Reads the module-level
    `goal_state` and `barrier_states`.

    Parameters
    ----------
    s : int
        Current state (cell index).
    a : int
        Action: 0 = up, 1 = down, 2 = left, 3 = right.

    Returns
    -------
    tuple
        * `(s, 0)` if `s` is the goal state (it is absorbing).
        * `(-10, -10)` if `s` is a barrier cell; this is a sentinel
          telling the caller that `s` is not a valid place to be.
        * `(s, -1)` if the move would leave the grid or enter a barrier
          (the agent stays where it is).
        * `(ns, 1)` if the move enters the goal state.
        * `(ns, 0)` for any other successful move.

    Raises
    ------
    ValueError
        If `a` is not one of the four known actions (and `s` is neither
        the goal nor a barrier).
    """
    n_cols = 9
    n_cells = 54

    if s == goal_state:
        return (s, 0)
    if s in barrier_states:
        return (-10, -10)

    # Work out the candidate cell and whether the move crosses a grid edge.
    if a == 0:      # up
        ns = s - n_cols
        off_grid = ns < 0
    elif a == 1:    # down
        ns = s + n_cols
        off_grid = ns >= n_cells
    elif a == 2:    # left
        ns = s - 1
        # Landing in the last column means we wrapped to the previous row.
        off_grid = ns < 0 or ns % n_cols == n_cols - 1
    elif a == 3:    # right
        ns = s + 1
        # Landing in the first column means we wrapped to the next row.
        off_grid = ns % n_cols == 0
    else:
        raise ValueError("unknown action: %r (expected 0, 1, 2 or 3)" % (a,))

    # The outcome rules are shared by all four directions, so the goal
    # reward is granted regardless of the side the goal is entered from.
    if off_grid or ns in barrier_states:
        return (s, -1)
    if ns == goal_state:
        return (ns, 1)
    return (ns, 0)

# Policy

In [4]:
def action_selection(state, Q):
    """Return a greedy action for `state` under the action-value table `Q`.

    All actions whose value equals the row maximum are candidates; one of
    them is drawn uniformly at random with `np.random.choice`, so ties are
    broken randomly rather than always favouring the lowest index.
    """
    action_values = Q[state]
    best_value = action_values.max()
    greedy_actions = np.flatnonzero(action_values == best_value)
    return np.random.choice(greedy_actions)

# SARSA

In [5]:
# SARSA hyper-parameters
episodes = 100  # number of training episodes
alpha = 1       # learning rate (step size of each Q update)
gamma = 0.9     # discount factor applied to the next state-action value

In [6]:
def sarsa(Q):
    """Train the action-value table `Q` in place with SARSA and return the greedy policy.

    Uses the module-level `episodes`, `alpha`, `gamma`, `states` and
    `goal_state`, together with `environment` and `action_selection`.
    The number of cells is taken from `states`, so the table and the
    returned policy always match the configured grid size.

    Parameters
    ----------
    Q : indexable as Q[state][action]
        Action-value table with one row per state. It is updated in place.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per state holding `argmax(Q[state])`.
    """
    n_states = len(states)
    last_state = n_states - 1

    for _ in range(episodes):
        # Every episode starts from a uniformly random cell.
        s = random.randint(0, last_state)
        a = action_selection(s, Q)

        while s != goal_state:
            ns, r = environment(s, a)  # next state and reward

            # environment answers (-10, -10) when `s` is a barrier cell.
            # Draw a fresh start (and action) until we stand on a valid cell.
            while ns == -10 and r == -10:
                s = random.randint(0, last_state)
                a = action_selection(s, Q)
                ns, r = environment(s, a)

            na = action_selection(ns, Q)  # next action (on-policy)
            # SARSA update: move Q(s, a) towards r + gamma * Q(s', a').
            Q[s][a] = Q[s][a] + alpha * (r + gamma * Q[ns][na] - Q[s][a])
            s, a = ns, na

    # Read the greedy action of every state off the learned table.
    policy = np.array([0] * n_states)
    for state in states:
        policy[state] = np.argmax(Q[state])

    return policy

In [7]:
# Seed both generators: episode start states are drawn with `random`,
# while the greedy tie-breaking in action_selection uses `np.random`.
# Seeding only one of them leaves the learned policy non-reproducible.
random.seed(10)
np.random.seed(10)

# One row per state, one column per action, all values start at zero.
Q = np.zeros((len(states), len(actions)))

policy = sarsa(Q)
print('Optimal policy:', policy)

Optimal policy: [1 1 3 1 2 1 2 0 0 1 1 0 1 0 1 1 0 0 3 1 0 3 3 3 1 0 0 3 1 0 0 3 3 3 1 0 0
 3 3 3 0 0 0 3 0 0 0 0 0 0 2 0 3 0]
