In [1]:
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Problem specification
# States are [x_index, y_index] pairs on a GRID_N x GRID_N lattice of points.
GRID_N = 5  # Number of points on either axis of the grid
GRID_SPACING = 1.0 # Distance between neighbouring grid points (float)

# unsafe_state() flags a state whose grid coordinates lie strictly inside the
# circle of radius UNSAFE_RADIUS centred on UNSAFE_CENTER.
UNSAFE_CENTER = np.array([0.0, 0.0])
UNSAFE_RADIUS = 1.5  # Radius of the 'dangerous area' in the middle
# get_reward() adds SOFT_CONST_REWARD for entering an unsafe state only when
# this is "SOFT"; no other value is handled by the code visible here.
SAFETY_CONSTRAINT_TYPE = "SOFT"
SOFT_CONST_REWARD = -20.0

# NOTE(review): in the visible cells only len(ACTIONS) is read (to size the
# Q-table). get_next_state() takes action indices 0 (increment state[1]) and
# 1 (increment state[0]), not these -1/+1 values -- confirm what they are for.
ACTIONS = {"UP": -1, "RIGHT": +1}
GOAL_STATE = [GRID_N - 1, GRID_N - 1]  # Last index on both axes

# Learning specifications
GAMMA = 1.0  # Discount applied to the next state's max Q-value in update_q()
LEARNING_RATE = 0.5  # Step size in update_q(); reassigned to 0.1 by the training cell
GOAL_REWARD = +10.0  # Added by get_reward() when the next state equals GOAL_STATE
TIMESTEP_REWARD = -1.0  # Added by get_reward() for every other transition
EPSILON = 0.3 #  Exploration probability for epsilon-greedy learning; reassigned to 0.9 by the training cell

In [3]:
# Physical coordinates of the grid points: GRID_N evenly spaced values per
# axis, symmetric about zero, with the same layout on both axes.
grid_side = GRID_SPACING * (GRID_N - 1)
grid_x = np.linspace(-grid_side / 2, grid_side / 2, num=GRID_N)
grid_y = np.copy(grid_x)

In [4]:
def plot_states(states):
    """Plot a collection of states as red circle markers.

    Each element of ``states`` is indexed with ``[0]`` for the horizontal
    value and ``[1]`` for the vertical value. Drawing goes through the pyplot
    state machine, so the points land on the current axes. Returns ``None``.
    """
    plt.plot(
        [point[0] for point in states],
        [point[1] for point in states],
        'ro',
    )
    # Called without arguments, so this only queries the current axis limits.
    plt.axis()

In [5]:
def get_next_state(state, action):
    """Return the state reached by taking ``action`` from ``state``.

    ``state`` is indexed as ``[x_index, y_index]``. Action ``0`` increments
    the second index and action ``1`` increments the first; both are clamped
    at ``GRID_N - 1`` so the agent cannot leave the grid. Any other action
    value leaves the position unchanged. A new two-element list is returned
    and ``state`` itself is not modified.
    """
    x_idx, y_idx = state[0], state[1]
    last_idx = GRID_N - 1

    if action == 0:
        y_idx = np.min([y_idx + 1, last_idx])
    elif action == 1:
        x_idx = np.min([x_idx + 1, last_idx])

    return [x_idx, y_idx]

def get_reward(next_state):
    """Return the reward for arriving in ``next_state``.

    The result is the sum of two parts:

    * a safety penalty of ``SOFT_CONST_REWARD`` when the constraint type is
      ``"SOFT"`` and ``unsafe_state(next_state)`` is true, otherwise ``0.0``;
    * ``GOAL_REWARD`` when ``next_state`` equals ``GOAL_STATE``, otherwise
      ``TIMESTEP_REWARD``.

    The goal test uses ``==`` against the ``GOAL_STATE`` list.
    """
    soft_violation = SAFETY_CONSTRAINT_TYPE == "SOFT" and unsafe_state(next_state)
    penalty = SOFT_CONST_REWARD if soft_violation else 0.0

    reached_goal = np.all(next_state == GOAL_STATE)
    step_reward = GOAL_REWARD if reached_goal else TIMESTEP_REWARD

    return penalty + step_reward

def update_q(state, action, next_state, reward, q_table,
             gamma=None, learning_rate=None):
    """Apply one tabular Q-learning update to ``q_table`` in place.

    The entry for ``(state, action)`` is moved towards the target
    ``reward + gamma * max_a Q(next_state, a)`` by a fraction
    ``learning_rate`` of the temporal-difference error.

    Parameters
    ----------
    state, next_state : sequence of two ints
        ``[x_index, y_index]`` of the state before and after the transition.
    action : int
        Index of the action taken in ``state``.
    reward : float
        Reward received for the transition.
    q_table : array indexed as ``q_table[x][y][action]``
        Modified in place.
    gamma : float, optional
        Discount factor. Defaults to the module-level ``GAMMA``, looked up at
        call time.
    learning_rate : float, optional
        Step size. Defaults to the module-level ``LEARNING_RATE``, looked up
        at call time.

    Returns
    -------
    None
    """
    # Resolve defaults at call time so that reassigning the module-level
    # constants (as the training cell does) keeps working as before.
    if gamma is None:
        gamma = GAMMA
    if learning_rate is None:
        learning_rate = LEARNING_RATE

    next_q_max = np.max(q_table[next_state[0]][next_state[1]])
    current_q = q_table[state[0]][state[1]][action]

    td_error = reward + gamma * next_q_max - current_q
    q_table[state[0]][state[1]][action] += learning_rate * td_error

def get_action(state, q_table, epsilon=None):
    """Choose an action index for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action index is returned;
    otherwise the index of the largest Q-value for ``state`` is returned
    (ties resolve to the lowest index, as ``np.argmax`` does).

    Parameters
    ----------
    state : sequence of two ints
        ``[x_index, y_index]`` of the current state.
    q_table : array indexed as ``q_table[x][y][action]``
        Current Q-value estimates.
    epsilon : float, optional
        Exploration probability. Defaults to the module-level ``EPSILON``,
        looked up at call time. Pass ``0.0`` for a purely greedy choice.

    Returns
    -------
    int
        Index of the selected action.
    """
    if epsilon is None:
        epsilon = EPSILON

    q_values = q_table[state[0]][state[1]]

    # The uniform draw always happens first, so the sequence of random
    # numbers consumed matches the previous implementation.
    if np.random.uniform(0, 1) < epsilon:
        # Sample over however many actions the table holds rather than a
        # hard-coded [0, 1].
        return int(np.random.choice(len(q_values)))

    return int(np.argmax(q_values))

def unsafe_state(state):
    """Return ``True`` when ``state`` lies inside the unsafe circle.

    The ``[x_index, y_index]`` pair is mapped to grid coordinates through
    ``grid_x`` / ``grid_y``. The state is unsafe when its squared distance to
    ``UNSAFE_CENTER`` is strictly less than ``UNSAFE_RADIUS`` squared.
    """
    position = np.array([grid_x[state[0]], grid_y[state[1]]])
    offset = position - UNSAFE_CENTER
    return bool(np.dot(offset, offset) < UNSAFE_RADIUS**2)

In [6]:
# Training run.
# EPSILON and LEARNING_RATE are module-level names that get_action() and
# update_q() read at call time, so assigning them here overrides the values
# from the configuration cell for this run.
SEED = 0
EPSILON = 0.9
DECAY = 0.9999
N_EPISODES = 10000
LEARNING_RATE = 0.1

# get_action() draws from NumPy's global random state; seeding it makes a
# Restart & Run All reproduce the same Q-table.
np.random.seed(SEED)

q_table = np.zeros([GRID_N, GRID_N, len(ACTIONS)])
safe_episode = []  # one flag per episode: True if no unsafe state was entered
for episode in range(N_EPISODES):
    safe_episode.append(True)
    state = [0, 0]
    while True:
        action = get_action(state, q_table)
        next_state = get_next_state(state, action)
        reward = get_reward(next_state)
        update_q(state, action, next_state, reward, q_table)
        state = next_state

        if unsafe_state(state):
            safe_episode[-1] = False

        if np.all(next_state == GOAL_STATE):
            break

        # Exploration decays once per non-terminal step (not per episode);
        # the step that reaches the goal leaves EPSILON unchanged.
        EPSILON = EPSILON*DECAY

# Learned greedy action for each x-index along the row y-index = GRID_N // 2.
# argmax is used directly so the report does not depend on the residual
# exploration probability left in EPSILON.
print([int(np.argmax(q_table[x][GRID_N // 2])) for x in range(GRID_N)])

[0, 0, 0, 1, 0]


In [7]:
# State-value table: for every grid cell keep the largest Q-value over all
# actions, stored as a list of rows indexed q_max[x][y].
q_max = []
for x in range(GRID_N):
    row = []
    for y in range(GRID_N):
        row.append(np.max(q_table[x][y]))
    q_max.append(row)

0.0006479335975503439

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]