# In [1]:
from __future__ import print_function, division
from builtins import range
# !sudo pip install -U future

import numpy as np


class Grid: # Environment
  """Grid-world environment whose moves may be stochastic.

  States are (i, j) tuples: i is the vertical axis (row), j is the horizontal
  axis (column). The rewards, legal actions and obey probability are not
  available until set() has been called.
  """

  # (row delta, column delta) applied by each recognised action name
  _DELTAS = {
    'UP': (-1, 0),
    'DOWN': (1, 0),
    'RT': (0, 1),
    'LT': (0, -1),
  }

  # the two perpendicular actions that may be taken when a command is disobeyed
  _SLIPS = {
    'UP': ('LT', 'RT'),
    'DOWN': ('LT', 'RT'),
    'LT': ('UP', 'DOWN'),
    'RT': ('UP', 'DOWN'),
  }

  def __init__(self, width, height, start):
    """Store the grid dimensions and place the agent at start, an (i, j) pair."""
    self.width = width
    self.height = height
    self.i = start[0]
    self.j = start[1]

  def set(self, rewards, actions, obey_prob):
    """Configure the environment.

    rewards: dict of (i, j) -> reward for arriving at that state
    actions: dict of (i, j) -> collection of actions that are legal there
    obey_prob: probability that the requested action is the one carried out
    """
    self.rewards = rewards
    self.actions = actions
    self.obey_prob = obey_prob

  def non_terminal_states(self):
    """Return the states that have at least an entry in the actions dict."""
    return self.actions.keys()

  def set_state(self, s):
    """Move the agent to state s, an (i, j) pair."""
    self.i = s[0]
    self.j = s[1]

  def current_state(self):
    """Return the agent's position as an (i, j) tuple."""
    return (self.i, self.j)

  def is_terminal(self, s):
    """Return True if s has no entry in the actions dict."""
    return s not in self.actions

  def check_move(self, action):
    """Return ((i, j), reward) for taking action, without moving the agent.

    An action that is not legal in the current state leaves the position
    unchanged. The reward is looked up for the resulting position and
    defaults to 0.
    """
    i, j = self.i, self.j
    # .get() rather than indexing: a terminal state has no entry in
    # self.actions and is treated as having no legal moves, instead of
    # raising KeyError
    if action in self.actions.get((i, j), ()):
      di, dj = self._DELTAS.get(action, (0, 0))
      i += di
      j += dj
    return ((i, j), self.rewards.get((i, j), 0))

  def get_transition_probs(self, action):
    """Return a list of (probability, reward, s') transition tuples.

    The first tuple is the requested action with probability obey_prob. If
    there is any probability of disobeying, it is split evenly between the
    two perpendicular actions.
    """
    state, reward = self.check_move(action)
    probs = [(self.obey_prob, reward, state)]
    disobey_prob = 1 - self.obey_prob
    if not (disobey_prob > 0.0):
      return probs
    for slip in self._SLIPS.get(action, ()):
      state, reward = self.check_move(slip)
      probs.append((disobey_prob / 2, reward, state))
    return probs

  def game_over(self):
    """Return True if the current state has no entry in the actions dict."""
    return (self.i, self.j) not in self.actions

  def all_states(self):
    """Return the set of states that have possible actions or yield a reward.

    Possibly buggy but simple: a state that appears in neither dict is not
    reported.
    """
    return set(self.actions.keys()) | set(self.rewards.keys())


def standard_grid(obey_prob=1.0, step_cost=None):
  """Build the standard 3-row by 4-column grid world.

  The grid describes the reward for arriving at each state and the actions
  possible at each state. Layout (x = cell you can't enter, s = start
  position, number = reward at that state):

      .  .  .  1
      .  x  . -1
      s  .  .  .

  obey_prob (float): the probability of obeying the command
  step_cost (float): optional penalty (e.g. -0.1) set as the reward of every
    state that has actions, applied each step to minimize the number of moves

  Returns the configured Grid with the agent at the start cell (2, 0).
  """
  arrival_rewards = {(0, 3): 1, (1, 3): -1}
  legal_moves = {
    (0, 0): ('DOWN', 'RT'),
    (0, 1): ('LT', 'RT'),
    (0, 2): ('LT', 'DOWN', 'RT'),
    (1, 0): ('UP', 'DOWN'),
    (1, 2): ('UP', 'DOWN', 'RT'),
    (2, 0): ('UP', 'RT'),
    (2, 1): ('LT', 'RT'),
    (2, 2): ('LT', 'RT', 'UP'),
    (2, 3): ('LT', 'UP'),
  }

  # Grid's first two parameters are named (width, height); they get 3 and 4 here
  env = Grid(3, 4, (2, 0))
  env.set(arrival_rewards, legal_moves, obey_prob)
  if step_cost is None:
    return env

  # every state that has actions receives the same per-step reward
  env.rewards.update(dict.fromkeys(legal_moves, step_cost))
  return env

# Requirement already up-to-date: future in /usr/local/lib/python3.6/dist-packages (0.16.0)
