Comparing standard TD with Emphatic TD in a simple "conveyor belt" environment.

## Problem Setup

- Undiscounted case, with $\gamma = 1$ for all non-terminal states.
- A single feature whose value increases with the state index.
- Reward increasing as agent moves towards terminal state.
- Interest uniform for all non-terminal states

In [1]:
import numpy as np
from collections import defaultdict
from copy import copy
from itertools import chain

import algorithms, environments,features, policy
from algorithms import EmphaticTD, TD
from environments.misc import ConveyorBelt
from features import Bias, Combination, Int2Unary, Wrapper
from policy import RandomPolicy

In [2]:
# Attribute logging class
class AttributeLogger:
    """Record snapshots of selected attributes of an object over time.

    Each call to :meth:`log` reads every tracked attribute from the wrapped
    object and appends a shallow copy of it to that attribute's history.

    Parameters
    ----------
    obj : object
        The object whose attributes are read on every :meth:`log` call.
    names : iterable of str
        Attribute names to track. May be any iterable, including a
        generator; it is materialized exactly once.

    Attributes
    ----------
    dct : dict
        Maps each tracked name to the list of values recorded so far.
    """
    def __init__(self, obj, names):
        self._obj = obj
        # Materialize once: iterating `names` twice would leave `dct` empty
        # when a one-shot iterable (e.g. a generator) is passed in.
        self._names = list(names)
        self.dct = {name: [] for name in self._names}

    def log(self):
        """Append a shallow copy of each tracked attribute's current value."""
        for name in self._names:
            # Copy so that later in-place mutation of the attribute does not
            # rewrite the recorded history.
            self.dct[name].append(copy(getattr(self._obj, name)))

    def __getitem__(self, key):
        """Return the list of values recorded for attribute `key`."""
        return self.dct[key]

In [9]:
# For calculating errors in this domain
def mse(a, b):
    """Return the mean of the squared element-wise difference ``a - b``."""
    residual = a - b
    return np.mean(residual ** 2)

def calc_values(states, phi, weights):
    """Map each state to the dot product of `weights` with its features.

    `phi` is called once per state to obtain that state's feature vector.
    """
    values = {}
    for state in states:
        values[state] = np.dot(weights, phi(state))
    return values

def geometric(val, n):
    """Sum of the first `n` terms of a geometric series.

    Computes ``1 + val + val**2 + ... + val**(n-1)`` using the closed form
    ``(1 - val**n) / (1 - val)``. Note that the last term is ``val**(n-1)``,
    not ``val**n``: the sum has exactly `n` terms.

    For a scalar ratio of exactly 1 the closed form is 0/0, so the sum is
    returned directly as `n` (each of the `n` terms equals 1).
    """
    # Only special-case scalars; array-valued `val` keeps element-wise
    # behavior from the closed form.
    if np.ndim(val) == 0 and val == 1:
        return 1.0 * n
    return (1 - val**n) / (1 - val)

def triangle(n):
    """Return ``n*(n+1)/2``, the n-th triangular number (true division)."""
    successor = n + 1
    return n * successor / 2

def pyramid(n):
    """Return ``n*(n+1)*(n+2)/6``, the sum of the first n triangular numbers."""
    numerator = n * (n + 1) * (n + 2)
    return numerator / 6

In [5]:
class Int2Int:
    """Length-1 feature that passes its input through an optional function.

    When no function is supplied the input is passed through unchanged.
    The result is always wrapped with ``np.array``.
    """
    def __init__(self, func=None):
        self.length = 1
        # Fall back to the identity mapping when no function is supplied
        self._func = func if func is not None else (lambda x: x)

    def __call__(self, x):
        """Apply the stored function to `x` and return it as an ndarray."""
        return np.array(self._func(x))

In [6]:
def make_episodes(num_episodes, env, policy):
    """Generate `num_episodes` episodes by repeatedly calling `make_episode`.

    Returns a list with one trajectory per episode, in generation order.
    """
    collected = []
    for _ in range(num_episodes):
        collected.append(make_episode(env, policy))
    return collected
    

def make_episode(env, policy):
    """Run `policy` in `env` from a reset until the environment is terminal.

    Returns the trajectory as a list of ``(s, a, r, sp)`` tuples: the
    observation before acting, the action chosen by `policy`, the value
    returned by ``env.do``, and the observation after acting.
    """
    env.reset()
    trajectory = []
    while True:
        if env.is_terminal():
            return trajectory

        # One interaction step: observe, act, then observe the outcome
        state = env.observe()
        action = policy(state)
        reward = env.do(action)
        next_state = env.observe()

        trajectory.append((state, action, reward, next_state))

def apply_fa(episodes, phi_func):
    """Apply function approximation to a series of episodes.

    Each ``(s, a, r, sp)`` step is converted to ``(phi(s), a, r, phi(sp))``.
    For the final step of every episode the next-state features are replaced
    by a zero vector of length ``phi_func.length`` (``phi_func`` is not
    called on the final ``sp``).

    Parameters
    ----------
    episodes : iterable of sequences of ``(s, a, r, sp)`` tuples
    phi_func : callable with a ``length`` attribute
        Maps a state to its feature vector.

    Returns
    -------
    list of lists of ``(fvec, a, r, fvec_p)`` tuples, one inner list per
    input episode. An empty episode yields an empty inner list.
    """
    ret = []
    for episode in episodes:
        # An episode with no steps has no final transition to special-case
        if not episode:
            ret.append([])
            continue

        *body, last = episode
        tmp = [(phi_func(s), a, r, phi_func(sp)) for s, a, r, sp in body]

        # Account for final step of the episode. The builtin `float` is used
        # because the `np.float` alias was removed from NumPy.
        s, a, r, _ = last
        fvec_p = np.zeros(phi_func.length, dtype=float)
        tmp.append((phi_func(s), a, r, fvec_p))
        ret.append(tmp)
    return ret

In [57]:
# Generate some episodes
# (the same seed is passed to both the environment and the policy below)
random_seed = 101
num_episodes = 1000
num_states = 2

# Setup environment
env = ConveyorBelt(num_states, random_seed=random_seed)
# Override the reward function so that every transition yields a reward of 1
env.reward = lambda s, a, sp: 1

# Set policy
pol = RandomPolicy(env, random_seed=random_seed)

# Get the raw episode data: a list of trajectories of (s, a, r, sp) tuples
raw_episodes = make_episodes(num_episodes, env, pol)

# Convert to features for function approximation: a single feature equal to
# the state index plus one. The wrapper is told which states are terminal
# (presumably to treat their features specially -- verify against `Wrapper`).
phi = Wrapper(Int2Int(lambda x: x+1), terminals=env.terminals)

# Apply function approximation
episodes = apply_fa(raw_episodes, phi)

## Expected Return

In [58]:
class ExpectedReturn:
    """Estimate expected returns (for tabular case).

    Calling the instance on a collection of episodes averages, for every
    state, the discounted return observed from each visit to that state
    (every-visit averaging). The result is stored in ``self.values`` and also
    returned; ``self.values`` does not exist until the instance has been
    called at least once.

    Parameters
    ----------
    gamma : float
        Discount factor applied when accumulating returns backwards through
        an episode.
    """
    def __init__(self, gamma=1.0):
        self._gamma = gamma

    @staticmethod
    def _as_key(s):
        """Convert a state into a hashable dictionary key."""
        # A 0-d array cannot be iterated by `tuple()`; unwrap it to a scalar
        if isinstance(s, np.ndarray) and s.ndim == 0:
            return s.item()
        # Arrays and other iterables become tuples so they can be dict keys
        if hasattr(s, '__iter__'):
            return tuple(s)
        return s

    def __call__(self, episodes):
        """Return ``{state: mean observed return}`` over all `episodes`.

        Each episode is a sequence of ``(s, a, r, sp)`` tuples.
        """
        returns = defaultdict(list)
        for episode in episodes:
            ret = 0
            # Walk backwards so the return can be accumulated incrementally
            for s, a, r, sp in reversed(episode):
                ret = ret*self._gamma + r
                returns[self._as_key(s)].append(ret)
        self.values = {s: sum(rets)/len(rets) for s, rets in returns.items()}
        return self.values

    def calc_mse(self, valdct):
        """Mean squared difference between stored values and `valdct`.

        `valdct` must contain an entry for every state in ``self.values``;
        the instance must have been called beforehand.
        """
        diff = np.array([v - valdct[k] for k, v in self.values.items()])
        return np.mean(diff**2)
    
# Average the observed returns per raw state, using the default gamma of 1.0
ER = ExpectedReturn()
exp_ret = ER(raw_episodes)
print(exp_ret)

{0: 2.0, 1: 1.0}


## Monte Carlo

In [59]:
class EveryVisitMC:
    """Feature-weighted every-visit Monte Carlo estimate.

    For every step of every episode, the step's feature vector and the
    feature vector scaled by the return from that step are accumulated;
    the result is the element-wise ratio of the two sums.

    Parameters
    ----------
    n : int
        Length of the feature vectors.
    gamma : float
        Discount factor used when accumulating returns.
    """
    def __init__(self, n, gamma=1.0):
        self.n = n
        self._gamma = gamma

    def __call__(self, episodes):
        """Return the per-feature ratio of return-weighted to raw feature sums.

        Each episode is a sequence of ``(features, a, r, next_features)``.
        """
        weighted_returns = np.zeros(self.n)
        feature_mass = np.zeros(self.n)
        for trajectory in episodes:
            g = 0
            # Traverse backwards so the return builds up incrementally
            for feats, _action, reward, _next_feats in reversed(trajectory):
                feats = np.array(feats)
                g = g*self._gamma + reward
                feature_mass += feats
                weighted_returns += feats*g
        return weighted_returns/feature_mass

In [60]:
# Fit the weight vector by every-visit Monte Carlo on the featurized episodes
MC = EveryVisitMC(phi.length, gamma=1.0)
mc_theta = MC(episodes)
# Approximate value of each non-terminal state under the fitted weights
mc_values = calc_values(env.nonterminals, phi, mc_theta)
# Mean squared error against the per-state returns stored in ER
mc_mse = ER.calc_mse(mc_values)

print(mc_theta)
print(mc_values)
print(mc_mse)

[ 1.33333333]
{0: array([ 1.33333333]), 1: array([ 2.66666667])}
1.61111111111


## TD

In [51]:
# Specify the parameters for the episode (held constant for every update)
fixed_params = {'alpha': 0.01, 
                'gamma': 1.0, 
                'lmbda': 0.0}

# Parameters which are functions (e.g., of state); maps name -> callable
param_funcs = {}

# Setup the agent
agent = TD(phi.length)

# Keep track of information over the episodes
log_names = ['theta', 'z']
logger = AttributeLogger(agent, log_names)

# Perform learning
for episode in episodes:
    agent.reset()
    for step in episode:
        s, a, r, sp = step
        
        # Parameters for update: evaluate the state-dependent ones first,
        # keyed by their own name, then let the fixed values take precedence.
        params = {}
        for name, func in param_funcs.items():
            params[name] = func(s)
        params.update(**fixed_params)
        
        # Update the agent
        agent.update(s, r, sp, params)
        
        # Record information
        logger.log()
        
# Snapshot the learned weights before `agent` is rebound elsewhere
td_theta = np.copy(agent.theta)
print(agent.theta)
print(logger['z'][:5])

[ 0.62529744]
[array([ 1.]), array([ 2.]), array([ 3.]), array([ 4.]), array([ 5.])]


In [34]:
# Approximate value of each non-terminal state under the TD weights
td_values = calc_values(env.nonterminals, phi, td_theta)
# Mean squared error against the per-state returns stored in ER
td_mse = ER.calc_mse(td_values)

print(td_theta)
print(td_values)
print(td_mse)

[ 1.50750184]
{0: array([ 1.50750184]), 1: array([ 3.01500368]), 2: array([ 4.52250552])}
10.86360332


## Emphatic TD

In [28]:
# Specify the parameters for the episode (held constant for every update)
fixed_params = {'alpha': 0.01, 
                'gamma': 1.0, 
                'lmbda': 0.0, 
                'interest': 1.0,}

# Parameters which are functions (e.g., of state); maps name -> callable
param_funcs = {}

# Setup the agent
agent = EmphaticTD(phi.length)

# Keep track of information over the episodes
log_names = ['theta', 'z', 'F', 'M']
logger = AttributeLogger(agent, log_names)

# Perform learning
for episode in episodes:
    agent.reset()
    for step in episode:
        s, a, r, sp = step
        
        # Parameters for update: evaluate the state-dependent ones first,
        # keyed by their own name, then let the fixed values take precedence.
        params = {}
        for name, func in param_funcs.items():
            params[name] = func(s)
        params.update(**fixed_params)
        
        # Update the agent
        agent.update(s, r, sp, params)
        
        # Record information
        logger.log()
        
        
# Snapshot the learned weights before `agent` is rebound elsewhere
etd_theta = np.copy(agent.theta)
print(agent.theta)
print(logger['F'][:5])
print(logger['M'][:5])
print(logger['z'][:5])

[ 0.73400569]
[1.0, 2.0, 3.0, 1.0, 2.0]
[1.0, 2.0, 3.0, 1.0, 2.0]
[array([ 1.]), array([ 4.]), array([ 9.]), array([ 1.]), array([ 4.])]


In [35]:
# Approximate value of each non-terminal state under the Emphatic TD weights
etd_values = calc_values(env.nonterminals, phi, etd_theta)
# Mean squared error against the per-state returns stored in ER
etd_mse = ER.calc_mse(etd_values)

print(etd_theta)
print(etd_values)
print(etd_mse)

[ 0.73400569]
{0: array([ 0.73400569]), 1: array([ 1.46801139]), 2: array([ 2.20201708])}
10.5075100663


In [36]:
# Display the episode still bound to the training loop's `episode` variable
episode

[(array(1), 0, 3.0, array(2)),
 (array(2), 0, 2.0, array(3)),
 (array(3), 0, 1.0, array([ 0.]))]