<a href="https://colab.research.google.com/github/romenlaw/RL-playground/blob/main/rl_playground3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [None]:
!pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk &>/dev/null
!pip install git+https://github.com/mimoralea/gym-aima#egg=gym-aima &>/dev/null

In [None]:
import gym
import gym_walk, gym_aima
import numpy as np
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

from itertools import cycle, count
import itertools
from tabulate import tabulate

import random
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
SEEDS = (12, 34, 56, 78, 90)

%matplotlib inline
import warnings

warnings.filterwarnings('ignore')

# Utilities

In [None]:
"""exponentially decaying schedule
this function allows you to calculate all the values for alpha for the full training process
"""
def decay_schedule(init_value, min_value,
                   decay_ratio, # determines how many episodes to use for decay
                   max_steps, # i.e. n_episodes in previous chapters
                   log_start=-2, log_base=10):
  """Return an exponentially decaying schedule of length ``max_steps``.

  The first ``max(int(max_steps * decay_ratio), 1)`` entries follow a reversed
  log-spaced curve rescaled to run from ``init_value`` down to ``min_value``;
  the remaining entries repeat the last decayed value.

  Args:
    init_value: value of the first entry of the schedule.
    min_value: value the schedule decays to; must be <= ``init_value``.
    decay_ratio: fraction of ``max_steps`` spent decaying.
    max_steps: total number of entries to generate.
    log_start: exponent of the smallest sample of the log-spaced curve.
    log_base: base of the log-spaced curve.

  Returns:
    1-D numpy array with ``max_steps`` entries.

  Raises:
    AssertionError: if ``min_value > init_value``.
  """
  assert min_value<=init_value, "min_value must be <= init_value"
  decay_steps = max(int(max_steps*decay_ratio), 1)
  rem_steps = max_steps - decay_steps # remaining steps (i.e. not used for decay)

  # samples of an inverse log curve ([::-1] reverses the order so it decays)
  values = np.logspace(start=log_start, stop=0,
                       num=decay_steps, # number of samples to generate
                       base=log_base,
                       endpoint=True # samples are inclusive of 'stop'
                       )[::-1]

  # normalise to between 0 and 1
  span = values.max() - values.min()
  if span > 0:
    values = (values - values.min()) / span
  else:
    # A single sample (decay_steps == 1) or a flat curve (log_start == 0) has
    # zero span; dividing would yield NaN. Treat it as "no decay yet", so the
    # schedule starts at init_value like every other schedule does.
    values = np.ones_like(values)

  # transform the points to lie between init_value and min_value
  values = min_value + (init_value - min_value) * values
  # hold the last decayed value for the steps that are not used for decay
  values = np.pad(values, (0, rem_steps), 'edge')
  return values

"""Generate full trajectory
Running a policy and extracting the collection of experience tuples
(the trajectories) for offline processing.
"""
def generate_trajectory(pi, env, max_steps=200):
  """Run policy ``pi`` in ``env`` and return one complete episode.

  An episode that has not terminated after ``max_steps`` steps is discarded
  and a fresh one is started, so only trajectories that reached a terminal
  step are returned. If the policy never terminates within ``max_steps``
  this function keeps retrying.

  Args:
    pi: callable mapping a state to an action.
    env: environment whose ``reset()`` returns the initial state and whose
      ``step(action)`` returns ``(next_state, reward, done, info)``.
    max_steps: maximum number of steps allowed per episode attempt.

  Returns:
    numpy object array with one row per step:
    ``(state, action, reward, next_state, done)``.

  Raises:
    ValueError: if ``max_steps`` is smaller than 1 (no episode could ever
      complete, so the retry loop would never end).
  """
  if max_steps < 1:
    raise ValueError("max_steps must be >= 1")

  done, trajectory = False, []
  while not done:
    state = env.reset()
    trajectory = [] # drop whatever a truncated attempt collected
    for _ in range(max_steps): # bounds the length of a single attempt
      action = pi(state)
      next_state, reward, done, _info = env.step(action)
      trajectory.append((state, action, reward, next_state, done))
      if done:
        break
      state = next_state

  return np.array(trajectory, np.object_)

In the replace-trace strategy, a trace is set to 1 whenever its state-action pair is visited, and it then decays according to the $\lambda$ value, just like in the accumulate-trace strategy (chapter 5).

Differences between replace-trace and accumulate-trace:
* accumulate-trace tracks the visited states; the eligibility trace is unbounded
* replace-trace tracks the visited state-action pairs; the eligibility trace is also clipped to 1 to avoid dead loops

# Sarsa($\lambda$)

This is a mix of the Sarsa and TD($\lambda$) methods.

In [None]:
def sarsa_lambda(env, gamma=1.0,
                 init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
                 init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
                 lambda_=0.5, # lambda is reserved word in python, so add _
                 replacing_traces=True,
                 n_episodes=3000):
  """On-policy control with Sarsa and eligibility traces over state-action pairs.

  Args:
    env: environment exposing ``observation_space.n``, ``action_space.n``,
      ``reset()`` returning a state and ``step(action)`` returning
      ``(next_state, reward, done, info)``.
    gamma: discount factor.
    init_alpha, min_alpha, alpha_decay_ratio: learning-rate schedule,
      passed to ``decay_schedule``.
    init_epsilon, min_epsilon, epsilon_decay_ratio: exploration schedule,
      passed to ``decay_schedule``.
    lambda_: trace decay factor; traces are multiplied by ``gamma * lambda_``
      after every step.
    replacing_traces: if True, traces are clipped to [0, 1] after being
      incremented; otherwise they accumulate without bound.
    n_episodes: number of episodes to run.

  Returns:
    Tuple ``(Q, V, pi, Q_track, pi_track)``: the final action-value table,
    its row-wise maximum, a callable returning the greedy action of a state,
    the Q table after every episode, and the greedy action of every state
    after every episode.
  """
  nS = env.observation_space.n
  nA = env.action_space.n
  pi_track = []
  Q = np.zeros((nS, nA), dtype=np.float32)
  Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
  # eligibility trace keeps track of state-action pairs
  E = np.zeros((nS, nA), dtype=np.float32)

  def select_action(state, Q, epsilon):
    # epsilon-greedy: exploit with probability 1 - epsilon, else act randomly
    if np.random.random() > epsilon:
      return np.argmax(Q[state])
    return np.random.randint(len(Q[state])) # random action of the state

  alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
  epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

  for e in tqdm(range(n_episodes), leave=False):
    E.fill(0) # at every episode, reset the eligibility of every state-action pair
    state = env.reset()
    done = False
    action = select_action(state, Q, epsilons[e])

    while not done:
      # collect experience
      next_state, reward, done, _ = env.step(action)
      next_action = select_action(next_state, Q, epsilons[e])

      # same as original sarsa; (not done) zeroes the bootstrap on terminal steps
      td_target = reward + gamma * Q[next_state][next_action] * (not done)
      td_error = td_target - Q[state][action]

      # increment state-action pair trace, clip to 1 if it's replacing trace
      E[state][action] += 1
      if replacing_traces: E.clip(0, 1, out=E)

      # notice we update the entire Q table for all eligible state-action pairs
      Q = Q + alphas[e] * td_error * E
      E = gamma * lambda_ * E # decay E

      state, action = next_state, next_action

    Q_track[e] = Q
    # greedy action of every state (axis 1 is action)
    pi_track.append(np.argmax(Q, axis=1))

  V = np.max(Q, axis=1)
  # build the state -> greedy action table once instead of on every call
  greedy_actions = dict(enumerate(np.argmax(Q, axis=1)))
  pi = lambda s: greedy_actions[s]

  return Q, V, pi, Q_track, pi_track

## Watkins's Q($\lambda$)

It is an off-policy control version of the $\lambda$ algorithms. Q($\lambda$) is Q-learning using the $\lambda$-return for the policy-evaluation step of the GPI pattern.

In [None]:
def q_lambda(env, gamma=1.0,
             init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
             init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
             lambda_=0.5,
             replacing_traces=True,
             n_episodes=3000):
  """Off-policy control with Q-learning targets and eligibility traces.

  Traces decay by ``gamma * lambda_`` while the behaviour policy keeps picking
  greedy actions and are wiped as soon as it picks a non-greedy one.

  Args:
    env: environment exposing ``observation_space.n``, ``action_space.n``,
      ``reset()`` returning a state and ``step(action)`` returning
      ``(next_state, reward, done, info)``.
    gamma: discount factor.
    init_alpha, min_alpha, alpha_decay_ratio: learning-rate schedule,
      passed to ``decay_schedule``.
    init_epsilon, min_epsilon, epsilon_decay_ratio: exploration schedule,
      passed to ``decay_schedule``.
    lambda_: trace decay factor.
    replacing_traces: if True, all traces of the current state are zeroed
      before the taken action's trace is incremented.
    n_episodes: number of episodes to run.

  Returns:
    Tuple ``(Q, V, pi, Q_track, pi_track)``: the final action-value table,
    its row-wise maximum, a callable returning the greedy action of a state,
    the Q table after every episode, and the greedy action of every state
    after every episode.
  """
  nS = env.observation_space.n
  nA = env.action_space.n
  pi_track = []
  Q = np.zeros((nS, nA), dtype=np.float32)
  Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float32)
  E = np.zeros((nS, nA), dtype=np.float32)

  def select_action(state, Q, epsilon):
    # epsilon-greedy: exploit with probability 1 - epsilon, else act randomly
    if np.random.random() > epsilon:
      return np.argmax(Q[state])
    return np.random.randint(len(Q[state])) # index of a random action

  alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
  epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

  for e in tqdm(range(n_episodes), leave=False):
    E.fill(0)
    state = env.reset()
    done = False
    action = select_action(state, Q, epsilons[e])
    while not done:
      next_state, reward, done, _ = env.step(action)
      next_action = select_action(next_state, Q, epsilons[e])
      # verify that action on next step is still from greedy policy
      next_action_is_greedy = Q[next_state][next_action] == Q[next_state].max()
      # Q-learning target: bootstrap from the best next action, not the chosen one
      td_target = reward + gamma * Q[next_state].max() * (not done)
      td_error = td_target - Q[state][action] # note: use current state

      if replacing_traces: E[state].fill(0)

      E[state][action] += 1
      # scale the error by each pair's eligibility (product, not sum)
      Q = Q + alphas[e] * td_error * E

      if next_action_is_greedy:
        E *= gamma * lambda_ # decay E as usual
      else:
        E.fill(0) # reset E because we want to learn greedy policy

      state, action = next_state, next_action

    Q_track[e] = Q
    # greedy action of every state (axis 1 is action)
    pi_track.append(np.argmax(Q, axis=1))

  V = np.max(Q, axis=1)
  # build the state -> greedy action table once instead of on every call
  greedy_actions = dict(enumerate(np.argmax(Q, axis=1)))
  pi = lambda s: greedy_actions[s]
  return Q, V, pi, Q_track, pi_track

# Dyna-Q
Unifying model-free and model-based/planning methods by interleaving a model-free method (Q-learning) with a planning method (similar to Value Iteration).