In [9]:
import numpy as np

In [33]:
# TD(0) policy evaluation (estimating v_pi) on a 7-state random-walk chain.
# States 0 and 6 end an episode; every other state moves one step down or up.
alpha = 0.5  # TD step size

V = np.zeros(7)  # value estimate per state, initialised to zero
p = np.full((7,2), 0.5)  # p[s, a]: probability of choosing action a in state s
rewards = np.array([0,0,0,0,0,0,1])  # rewards[s]: reward for arriving in state s
# sp[s, a]: successor of state s under action a (0 -> s-1, 1 -> s+1);
# the two end states map onto themselves.
sp = np.array(
    [[0,0]]
    + [[i-1, i+1] for i in range(1, 6)]
    + [[6,6]]
)

for t in range(100):
  s_old = 4  # every episode starts from state 4
  while s_old not in (0, 6):
    # Sample an action index according to the policy row of the current state.
    a = np.random.choice(len(p[s_old]), p=p[s_old])
    s = sp[s_old,a]
    r = rewards[s]

    # TD(0) backup toward the one-step target r + V[s] (no discounting).
    V[s_old] = V[s_old] + alpha * (r + V[s] - V[s_old])
    s_old = s

In [31]:
# Display the current state-value estimates V.
# NOTE(review): the saved output was produced at In [31], i.e. before the most
# recent run of the TD(0) cell (In [33]), so it may be stale -- re-run to confirm.
V

array([0. , 0. , 0. , 0. , 0. , 0.5, 0. ])

In [34]:
class Env:
  """Bare environment interface; concrete environments subclass it and override step."""

  def step(self):
    """Placeholder transition: does nothing and returns None."""

class Agent:
  """Bare agent interface; concrete agents subclass it and override action/update."""

  def action(self):
    """Placeholder action selection: does nothing and returns None."""

  def update(self):
    """Placeholder learning step: does nothing and returns None."""

In [89]:
class CliffEnv(Env):
  """Cliff-walking gridworld on a 4x12 grid.

  States are (row, col) tuples. The start cell is (3,0) and the cells
  (3,1)..(3,10) form the cliff. Every step yields reward -1, except a step
  onto the cliff, which yields -100 and puts the agent back on the start cell.
  The environment never signals termination; the caller decides when an
  episode is over.

  Attributes:
    s:    state reached by the most recent step (or the start after reset).
    path: 4x12 float array; each cell holds the step number of the latest
          visit in the current episode (0 = not reached by any step).
    t:    number of steps taken since the last reset.
  """

  N_ROWS = 4
  N_COLS = 12
  START = (3,0)
  # (d_row, d_col) displacement for each action id.
  MOVES = {
      0: (0,-1),  # col - 1 (left in the printed grid)
      1: (-1,0),  # row - 1 (up in the printed grid)
      2: (0,1),   # col + 1 (right)
      3: (1,0),   # row + 1 (down)
  }

  def __init__(self):
    # A new environment is simply a reset one; keeps the setup in one place.
    self.reset()

  def reset(self):
    """Begin a new episode: agent on the start cell, path cleared, step counter zeroed.

    Returns:
      The start state (3,0).
    """
    self.s = self.START
    self.path = np.zeros((self.N_ROWS, self.N_COLS))
    self.t = 0
    return self.s

  def step(self, s, a):
    """Apply action `a` from state `s` and record the resulting cell in `path`.

    The move starts from the `s` argument (not from `self.s`) and is clamped
    to the grid, so walking into a wall leaves the agent where it is.

    Args:
      s: current state as a (row, col) pair.
      a: action id, one of 0, 1, 2, 3 (see MOVES).

    Returns:
      (s, a, r, s_next): the inputs echoed back, the reward, and the new state.

    Raises:
      ValueError: if `a` is not a known action id.
    """
    try:
      d_row, d_col = self.MOVES[a]
    except (KeyError, TypeError):
      # Previously an unknown action silently returned the stale self.s.
      raise ValueError(
          "unknown action {!r}; expected one of {}".format(a, sorted(self.MOVES))
      ) from None

    # Built-in min/max keep the coordinates as plain scalars.
    row = min(max(s[0] + d_row, 0), self.N_ROWS - 1)
    col = min(max(s[1] + d_col, 0), self.N_COLS - 1)
    self.s = (row, col)
    r = -1

    # Bottom row strictly between the first and last column is the cliff.
    if row == self.N_ROWS - 1 and 1 <= col <= self.N_COLS - 2:
      self.s = self.START
      r = -100

    self.t += 1
    # After a fall this stamps the start cell, not the cliff cell.
    self.path[self.s[0], self.s[1]] = self.t

    return s, a, r, self.s

  def get_path(self):
    """Return the visit-order array for the current episode."""
    return self.path

In [90]:
class CliffAgent(Agent):
  """Tabular epsilon-greedy agent for a 4x12 grid with 4 actions.

  Learns an action-value table Q[row, col, action] with either the SARSA or
  the Q-learning one-step update.

  Args:
    eps:   probability of taking a uniformly random action instead of the
           greedy one.
    alpha: step size of the TD update.
    delta: discount factor applied to the bootstrapped next-state value.
    update_method: "sarsa" or "q"; selects the rule used by `update`.

  Raises:
    ValueError: if `update_method` is not one of the supported names.
  """

  def __init__(self, eps=0.1, alpha=0.9, delta=1, update_method = "q"):
    self.eps = eps
    self.alpha = alpha
    self.delta = delta
    self.Q = np.zeros((4,12,4))

    update_method_map = {
        "sarsa": self.sarsa_update,
        "q": self.q_update
    }
    # Fail fast: an unknown name used to be stored as None and only blew up
    # later with "'NoneType' object is not callable" inside update().
    if update_method not in update_method_map:
      raise ValueError(
          "unknown update_method {!r}; expected one of {}".format(
              update_method, sorted(update_method_map))
      )
    self.update_method = update_method_map[update_method]

  def action(self, s):
    """Pick an epsilon-greedy action for state `s` (a (row, col) pair).

    With probability 1 - eps returns the argmax action of Q[s] (ties go to the
    lowest action id); otherwise returns a uniformly random action id.
    """
    if np.random.random() > self.eps:
      return np.argmax(self.Q[s[0],s[1]])
    else:
      return np.random.choice(len(self.Q[s[0],s[1]]))

  def update(self, s, a, r, sp):
    """Apply the configured update rule to the transition (s, a, r, sp).

    Returns:
      (sp, ap): the next state and an epsilon-greedy action sampled for it.
    """
    return self.update_method(s, a, r, sp)

  def sarsa_update(self, s, a, r, sp):
    """SARSA: bootstrap from Q[sp, ap], where ap is the action actually sampled for sp.

    The caller must execute the returned `ap` next for the update to stay
    on-policy.
    """
    ap = self.action(sp)
    # Use the agent's own step size; the bare name `alpha` resolved to an
    # unrelated notebook global.
    td_target = r + self.delta*self.Q[sp[0], sp[1], ap]
    self.Q[s[0], s[1], a] += self.alpha * (td_target - self.Q[s[0], s[1], a])
    return sp, ap

  def q_update(self, s, a, r, sp):
    """Q-learning: bootstrap from max_a' Q[sp, a'], independent of the next action taken.

    The returned `ap` is sampled after the update so both rules share the same
    return contract; callers are free to ignore it and sample again.
    """
    td_target = r + self.delta*np.max(self.Q[sp[0], sp[1]])
    self.Q[s[0], s[1], a] += self.alpha * (td_target - self.Q[s[0], s[1], a])
    # `ap` was previously undefined here and only resolved through a leaked
    # notebook global; sample it explicitly instead.
    ap = self.action(sp)
    return sp, ap

  def getQ(self):
    """Return the action-value table (the live array, not a copy)."""
    return self.Q

In [111]:
# SARSA (on-policy TD control) on the cliff grid
env = CliffEnv()
sarsa_agent = CliffAgent(update_method="sarsa")
T = 1000  # number of training episodes
for t in range(T):
  # Each episode: reset, choose the first action, then alternate step / update.
  s = env.reset()
  a = sarsa_agent.action(s)
  while not s == (3,11):  # the episode ends once the agent reaches (3,11)
    s, a, r, sp = env.step(s, a)
    # update() hands back the next state together with the action it sampled
    # for it; that action is the one executed on the next iteration.
    sp, ap = sarsa_agent.update(s, a, r, sp)
    s, a = sp, ap
# Only the path of the final episode is shown.
if T > 0:
  print(env.get_path())

[[ 3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14.]
 [ 2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 15.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 16.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 17.]]


In [112]:
# Q-learning (off-policy TD control) on the cliff grid
env = CliffEnv()
q_agent = CliffAgent(update_method="q")
T = 1000  # number of training episodes
for t in range(T):
  s = env.reset()
  while not s == (3,11):  # the episode ends once the agent reaches (3,11)
    # A fresh epsilon-greedy action is drawn every step.
    a = q_agent.action(s)
    s, a, r, sp = env.step(s, a)
    # The action returned by update() is not used by this loop.
    sp, ap = q_agent.update(s, a, r, sp)
    s = sp
# Only the path of the final episode is shown.
if T > 0:
  print(env.get_path())

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 13.]]
