# TD/ETD Comparison on "Conveyor Belt"

This notebook contains some comparisons between TD and Emphatic TD on a simple problem under function approximation. 

We identify the solutions each algorithm converges to using the matrix operator equations for each algorithm, and compare them to the optimal approximation (as found by the least squares solution).

In [30]:
import numpy as np
import pandas as pd
from functools import reduce
from numpy import dot
from numpy.linalg import pinv

from features import *
from mdptools import *
from solvers import td_solution, etd_solution, exact_solution
from report import report

In [31]:
from IPython.display import display

# Conveyor Belt

An environment resembling a conveyor belt moving to the "right".

* Here, states are indexed from 0 to N-1, with state N-1 being the terminal state and state 0 being the initial state.
* In terminal states, the feature vector $x(s)$ is the zero vector, the reward $r(s)$ is zero, and $\gamma(s) = \lambda(s) = i(s) = 0$.
* The environment is undiscounted, so $\gamma(s) = 1$ for $s$ non-terminal.
* For these experiments, $\lambda(s) = 0$ for all states $s$.
* Interest is constant and uniform for each state; $i(s) = 1$ for $s$ non-terminal.

In [32]:
def conveyor_belt_matrix(ns):
    """Build the transition matrix of a conveyor belt with `ns` states.

    Row `i` is the next-state distribution of state `i`. Every state moves
    deterministically to state `i + 1`, except the last state (index
    `ns - 1`), which is terminal and transitions back onto itself.

    Returns a float array of shape `(ns, ns)`.
    """
    # Ones on the first superdiagonal encode "move one step to the right".
    belt = np.eye(ns, k=1)
    # The last row is still all zeros here; make the terminal state absorbing.
    belt[-1, -1] = 1
    return belt

Here we examine the case with two nonterminal states (N=3).

In [33]:
# Common parts of problem specification
num_states = 3
s0 = int2basis(0, num_states)
pmat = conveyor_belt_matrix(num_states)
states = state_vectors(pmat)
indices = state_indices(pmat)
terminals = [as_tuple(s) for s in find_terminals(pmat)]
gmfunc = Constant(1.0, terminals=terminals)
lmfunc = Constant(0.0, terminals=terminals)
ifunc = Constant(1.0, terminals=terminals)

## Constant Reward, Constant Feature

* r(s) = 1 for s non-terminal
* x(s) = 1 for s non-terminal

In [35]:
# Reward of 1 in every non-terminal state.
rfunc =  Constant(1.0, terminals=terminals)
rvec = np.array([rfunc(s) for s in states])  # reward evaluated at each entry of `states`
# Single constant (bias) feature, wrapped together with the terminal-state list.
phi = Wrap(Bias(), terminals=terminals)

# Build the comparison table, then keep only the columns used for the LaTeX export.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 1.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 0.]]
True Values:
[ 2.  1.  0.]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[1.5],0.25,"[1.5, 1.5, 0.0]"
TD,[2.0],0.5,"[2.0, 2.0, 0.0]"
ETD,[1.5],0.25,"[1.5, 1.5, 0.0]"


In [36]:
# Print the weights/MSE table as LaTeX source.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} & weights &   MSE \\
\midrule
Least-Squares &   [1.5] &  0.25 \\
TD            &   [2.0] &  0.50 \\
ETD           &   [1.5] &  0.25 \\
\bottomrule
\end{tabular}



## Increasing Reward, Constant Feature

* r(s) = s+1 for s non-terminal
* x(s) = 1 for s non-terminal

In [37]:
def _rfunc(x):
    """Reward for state `x`: its integer index plus one, i.e. r(s) = s + 1."""
    return basis2int(x) + 1


# `Parameter` wraps the raw reward function together with the terminal-state list.
rfunc = Parameter(_rfunc, terminals=terminals)
rvec = np.array([rfunc(s) for s in states])  # reward evaluated at each entry of `states`
# Single constant (bias) feature, wrapped together with the terminal-state list.
phi = Wrap(Bias(), terminals=terminals)

# Build the comparison table, then keep only the columns used for the LaTeX export.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 1.  2.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 0.]]
True Values:
[ 3.  2.  0.]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[2.5],0.25,"[2.5, 2.5, 0.0]"
TD,[3.0],0.5,"[3.0, 3.0, 0.0]"
ETD,[2.5],0.25,"[2.5, 2.5, 0.0]"


In [38]:
# Print the weights/MSE table as LaTeX source.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} & weights &   MSE \\
\midrule
Least-Squares &   [2.5] &  0.25 \\
TD            &   [3.0] &  0.50 \\
ETD           &   [2.5] &  0.25 \\
\bottomrule
\end{tabular}



## Decreasing Reward, Constant Feature

* r(s) = N-1-s for s non-terminal
* x(s) = 1 for s non-terminal

In [44]:
def _rfunc(x):
    """Reward for state `x`: r(s) = N - 1 - s, where N is the global `num_states`."""
    return num_states - basis2int(x) - 1


# `Parameter` wraps the raw reward function together with the terminal-state list.
rfunc = Parameter(_rfunc, terminals=terminals)
rvec = np.array([rfunc(s) for s in states])  # reward evaluated at each entry of `states`
# Single constant (bias) feature, wrapped together with the terminal-state list.
phi = Wrap(Bias(), terminals=terminals)

# Build the comparison table, then keep only the columns used for the LaTeX export.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 2.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 0.]]
True Values:
[ 3.  1.  0.]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[2.0],1,"[2.0, 2.0, 0.0]"
TD,[3.0],2,"[3.0, 3.0, 0.0]"
ETD,[2.0],1,"[2.0, 2.0, 0.0]"


In [45]:
# Print the weights/MSE table as LaTeX source.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} & weights &  MSE \\
\midrule
Least-Squares &   [2.0] &    1 \\
TD            &   [3.0] &    2 \\
ETD           &   [2.0] &    1 \\
\bottomrule
\end{tabular}



## Constant Reward, Increasing Feature

* r(s) = 1 for s non-terminal
* x(s) = s+1 for s non-terminal

In [46]:
# Reward of 1 in every non-terminal state.
rfunc =  Constant(1.0, terminals=terminals)
rvec = np.array([rfunc(s) for s in states])  # reward evaluated at each entry of `states`
# Single scalar feature that grows with the state index (x(s) = s + 1 per the
# section notes), wrapped together with the terminal-state list.
phi = Wrap(Unary2Int(num_states), terminals=terminals)

# Build the comparison table, then keep only the weights/MSE columns.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 1.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 2.]
 [ 0.]]
True Values:
[ 2.  1.  0.]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[0.8],0.9,"[0.8, 1.6, 0.0]"
TD,[1.0],1.0,"[1.0, 2.0, 0.0]"
ETD,[0.714285714286],0.918367,"[0.714285714286, 1.42857142857, 0.0]"


## Decreasing Reward, Increasing Feature

* r(s) = N-1-s for s non-terminal
* x(s) = s+1 for s non-terminal

In [47]:
def _rfunc(x):
    """Reward for state `x`: r(s) = N - 1 - s, where N is the global `num_states`."""
    return num_states - basis2int(x) - 1


# `Parameter` wraps the raw reward function together with the terminal-state list.
rfunc = Parameter(_rfunc, terminals=terminals)
rvec = np.array([rfunc(s) for s in states])  # reward evaluated at each entry of `states`
# Single scalar feature that grows with the state index (x(s) = s + 1 per the
# section notes), wrapped together with the terminal-state list.
phi = Wrap(Unary2Int(num_states), terminals=terminals)

# Build the comparison table, then keep only the columns used for the LaTeX export.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 2.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 2.]
 [ 0.]]
True Values:
[ 3.  1.  0.]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[1.0],2.5,"[1.0, 2.0, 0.0]"
TD,[1.33333333333],2.777778,"[1.33333333333, 2.66666666667, 0.0]"
ETD,[0.857142857143],2.55102,"[0.857142857143, 1.71428571429, 0.0]"


In [48]:
# Print the weights/MSE table as LaTeX source.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} &           weights &       MSE \\
\midrule
Least-Squares &             [1.0] &  2.500000 \\
TD            &   [1.33333333333] &  2.777778 \\
ETD           &  [0.857142857143] &  2.551020 \\
\bottomrule
\end{tabular}

