In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import pinv

from agents import EmphaticTD, TD
from environments import Chain, RandomPolicy
from empirical import make_episodes, apply_fa, apply_rfunc, expected_return, learn
from features import *
from mdptools import *

from IPython.display import display

In [2]:
# Set pandas display options.
# Use the fully qualified option name: the bare 'precision' pattern is
# ambiguous in newer pandas releases (it also matches
# 'styler.format.precision') and raises OptionError there.
pd.set_option('display.precision', 4)

# Set numpy display options (same number of digits for printed arrays)
np.set_printoptions(precision=4)

We create a series of episodes for a random walk.

In [3]:
# Size of the chain handed to the environment constructor.
# NOTE(review): the analytic section of this notebook talks about 5
# non-terminal states, so presumably 2 of these 7 are terminal -- confirm
# against Chain.
num_states = 7
# How many episodes to generate.
num_episodes = 10000
env = Chain(num_states)
# The policy gets a fixed seed (presumably so the generated episodes are
# reproducible across runs -- confirm in RandomPolicy).
pol = RandomPolicy(env, random_seed=1010101)
# Generate the episodes once; the experiment cells below derive their
# reward/feature variants from `raw_episodes` via apply_rfunc / apply_fa.
raw_episodes = make_episodes(num_episodes, env, pol)

## Constant Feature, Constant Reward

* r(s) = 1 for s non-terminal
* x(s) = 1 for s non-terminal

With constant rewards, we effectively calculate the expected number of steps to termination.

In [None]:
# Adjust the rewards, apply function approximation.
# Features: a single bias feature (x(s) = 1), wrapped with the environment's
# terminal states.
phi = Wrap(Bias(), terminals=env.terminals)


def _rfunc(s, a, sp):
    """Reward of 1 for every transition, regardless of (s, a, sp)."""
    return 1


rfunc = Parameter(_rfunc, terminals=env.terminals)

# Attach rewards first, then features, to the raw episodes
episodes = apply_rfunc(raw_episodes, rfunc)
episodes = apply_fa(episodes, phi)

# We can check the expected returns for each state
# (computed from the reward-adjusted episodes, before feature mapping)
print("Approximate expected returns for each state")
display(expected_return(apply_rfunc(raw_episodes, rfunc)))

# Test the algorithms empirically
num_repeats = 10
dct = {}

# Fixed parameters (constant for each algorithm)
fixed_params = {'gamma': 1.0, 'lmbda': 0.0, 'interest': 1.0}

# The TD solution
# NOTE(review): TD is trained with a constant step size (0.001) while ETD
# below uses Decay(0.99999); an earlier version of this cell used the same
# decay schedule for TD. Confirm the asymmetry is intended before comparing
# the two weight estimates.
param_funcs = {'alpha': Constant(0.001)}
td = TD(len(phi))
learn(td, episodes, fixed_params, param_funcs, repeats=num_repeats)

# Store information
dct['TD'] = {'weights': td.theta}

# The ETD solution
param_funcs = {'alpha': Decay(0.99999)}
etd = EmphaticTD(len(phi))
learn(etd, episodes, fixed_params, param_funcs, repeats=num_repeats)

# Store information
dct['ETD'] = {'weights': etd.theta}

# Summarize and print information (one column per algorithm)
df = pd.DataFrame(dct)
display(df)

Approximate expected returns for each state


{(1,): 4.979989893885801,
 (2,): 7.969806399839502,
 (3,): 8.96994327041104,
 (4,): 7.9439075420942515,
 (5,): 4.884900741929015}

Unnamed: 0,ETD,TD
weights,[7.6269300441],[8.9307697443]


## Constant Reward, Increasing Feature

* r(s) = 1 for s non-terminal
* x(s) = s+1 for s non-terminal

In this case, the analytic solution suggests that ETD will perform worse than TD. 
We check that actually running the algorithms in this case behaves approximately like the analytic solution.

### Analytic Solution

We analyze a random walk with 5 non-terminal states, starting in state 2 (the middle of the chain), and with parameters defined by

$$\gamma(s) = \lambda(s) = i(s) = 0 \quad \forall s \text{ terminal}$$

For $s$ non-terminal, 

* $\gamma(s) = 1$
* $\lambda(s) = 0$
* $i(s) = 1$

We get the following (asymptotic) solutions:


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>weights</th>
      <th>MSE</th>
      <th>values</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Least-Squares</th>
      <td> [1.90909090909]</td>
      <td> 11.709</td>
      <td> [1.90909090909, 3.81818181818, 5.72727272727, ...</td>
    </tr>
    <tr>
      <th>TD</th>
      <td>           [1.8]</td>
      <td> 11.840</td>
      <td>               [1.8, 3.6, 5.4, 7.2, 9.0, 0.0, 0.0]</td>
    </tr>
    <tr>
      <th>ETD</th>
      <td> [1.53333333333]</td>
      <td> 13.262</td>
      <td> [1.53333333333, 3.06666666667, 4.6, 6.13333333...</td>
    </tr>
  </tbody>
</table>

The true values are:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>states</th>
      <th>true values</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td> 1</td>
      <td> 5</td>
    </tr>
    <tr>
      <td> 2</td>
      <td> 8</td>
    </tr>
    <tr>
      <td> 3</td>
      <td> 9</td>
    </tr>
    <tr>
      <td> 4</td>
      <td> 8</td>
    </tr>
    <tr>
      <td> 5</td>
      <td> 5</td>
    </tr>
    <tr>
      <td> 0</td>
      <td> 0</td>
    </tr>
    <tr>
      <td> 6</td>
      <td> 0</td>
    </tr>
  </tbody>
</table>

In [None]:
# Set up the experiment.
# Features come from Identity(1), wrapped with the environment's terminal
# states; the loop further down prints phi(s) for every state so the mapping
# can be checked by eye.
phi = Wrap(Identity(1), terminals=env.terminals)


def _rfunc(s, a, sp):
    """Reward of 1 for every transition, regardless of (s, a, sp)."""
    return 1


rfunc = Parameter(_rfunc, terminals=env.terminals)

# Attach rewards first, then features, to the raw episodes
episodes = apply_rfunc(raw_episodes, rfunc)
episodes = apply_fa(episodes, phi)

# We can check the expected returns for each state
# (computed from the reward-adjusted episodes, before feature mapping)
print("Approximate expected returns for each state")
display(expected_return(apply_rfunc(raw_episodes, rfunc)))

# We check that the feature function behaves as expected
for s in env.states:
    print(s, phi(s))

# Test the algorithms empirically
num_repeats = 10
dct = {}

# Fixed parameters (constant for each algorithm)
fixed_params = {'gamma': 1.0, 'lmbda': 0.0, 'interest': 1.0}

# The TD solution
param_funcs = {'alpha': Decay(0.99999)}
td = TD(len(phi))
learn(td, episodes, fixed_params, param_funcs, repeats=num_repeats)

# Store information
dct['TD'] = {'weights': td.theta}

# The ETD solution (same step-size schedule as TD)
param_funcs = {'alpha': Decay(0.99999)}
etd = EmphaticTD(len(phi))
learn(etd, episodes, fixed_params, param_funcs, repeats=num_repeats)

# Store information
dct['ETD'] = {'weights': etd.theta}

# Summarize and print information (one column per algorithm)
df = pd.DataFrame(dct)
display(df)

Approximate expected returns for each state


{(1,): 4.979989893885801,
 (2,): 7.969806399839502,
 (3,): 8.96994327041104,
 (4,): 7.9439075420942515,
 (5,): 4.884900741929015}

This is approximately what we derived through the analytic method, so unless I have managed to make the same kind of mistake in both implementations, TD indeed outperforms ETD in this particular setting.