# TD/ETD Comparison on "Conveyor Belt"

This notebook contains some comparisons between TD and Emphatic TD on a simple problem under function approximation. 

We identify the solutions each algorithm converges to using the matrix operator equations for each algorithm, and compare them to the optimal approximation (as found by the least squares solution).

In [1]:
import numpy as np
import pandas as pd
from functools import reduce
from numpy import dot
from numpy.linalg import pinv

from features import *
from mdptools import *
from solvers import td_solution, etd_solution, exact_solution

In [2]:
from IPython.display import display

In [3]:
def report(P, R, phi_func, gm_func, lm_func, i_func):
    """Collate and compare the least-squares, TD, and ETD solutions.

    Parameters
    ----------
    P : array
        Transition matrix of the problem. It is used to enumerate the states
        and non-terminal states and is forwarded to the solvers.
    R : array
        Expected-reward vector; printed and forwarded to the solvers.
    phi_func : callable
        Feature function used to build the feature matrix.
    gm_func, lm_func, i_func : callable
        Forwarded unchanged to the solvers (presumably the discount, lambda
        and interest functions -- see the problem description in the notebook).

    Returns
    -------
    pandas.DataFrame
        Rows ``"weights"`` and ``"MSE"``; columns ``"Least-Squares"``,
        ``"TD"`` and ``"ETD"``.

    Notes
    -----
    Also prints the reward vector, the feature matrix, the true values and
    the three approximate value vectors.
    """
    # Enumerate states from the matrix that was passed in. The previous version
    # read the notebook-level `pmat` here, which silently tied the function to
    # whichever problem happened to be defined in the global namespace.
    states = state_vectors(P)
    nn = len(find_nonterminals(P))
    X = feature_matrix(states, phi_func)
    V = exact_solution(P, R, gm_func)

    def _mse(V_hat):
        # Squared error summed over all states, averaged over the number of
        # non-terminal states.
        return np.sum((V - V_hat)**2)/nn

    # Best approximation (least squares)
    w_approx, *_ = np.linalg.lstsq(X, V)
    V_approx = np.dot(X, w_approx)
    E_approx = _mse(V_approx)

    # TD fixed point
    w_td = td_solution(P, R, phi_func, gm_func, lm_func)
    V_td = np.dot(X, w_td)
    E_td = _mse(V_td)

    # Emphatic TD fixed point
    w_etd = etd_solution(P, R, phi_func, gm_func, lm_func, i_func)
    V_etd = np.dot(X, w_etd)
    E_etd = _mse(V_etd)

    dct = {"TD": {"weights": w_td, "MSE": E_td},
           "ETD": {"weights": w_etd, "MSE": E_etd},
           "Least-Squares": {"weights": w_approx, "MSE": E_approx},
           }

    # `index` and `columns` fix the display order of rows and columns.
    df = pd.DataFrame(dct,
                      index=["weights", "MSE"],
                      columns=["Least-Squares", "TD", "ETD"])

    # Additional Information
    print("Expected Reward:")
    print(R)

    print("Feature Matrix:")
    print(X)

    print("True Values:")
    print(V)

    print("Best Approximation:")
    print(V_approx)

    print("TD Approximation:")
    print(V_td)

    print("ETD Approximation:")
    print(V_etd)

    return df

# Conveyor Belt

An environment resembling a conveyor belt moving to the "right".

* Here, states are indexed from 0 to N-1, with state N-1 being the terminal state and state 0 being the initial state.
* In terminal states, the feature vector $x(s)$ is the zero vector, the reward $r(s)$ is zero, and $\gamma(s) = \lambda(s) = i(s) = 0$.
* The environment is undiscounted, so $\gamma(s) = 1$ for $s$ non-terminal.
* For these experiments, $\lambda(s) = 0$ for all states $s$.
* Interest is constant and uniform for each state; $i(s) = 1$ for $s$ non-terminal.

In [4]:
def conveyor_belt_matrix(ns):
    """Build the transition matrix of a conveyor belt with `ns` states.

    Each state ``i < ns - 1`` moves to state ``i + 1`` with probability one;
    the last state is absorbing (it transitions to itself).

    Returns an ``(ns, ns)`` float array whose row ``i`` is the transition
    distribution out of state ``i``.
    """
    # Ones on the first superdiagonal: state i -> state i + 1.
    belt = np.eye(ns, k=1)
    # Terminal state at the end of the conveyor belt loops onto itself.
    belt[-1, -1] = 1
    return belt

Here we examine the case with two nonterminal states (N=3).

In [5]:
# Problem specification shared by every experiment below
num_states = 3
pmat = conveyor_belt_matrix(num_states)
states = state_vectors(pmat)
indices = state_indices(pmat)
terminals = list(map(as_tuple, find_terminals(pmat)))

# State-dependent parameters; the `terminals` argument lets each one treat
# terminal states specially (per the problem description they are zero there
# -- confirm against the `Constant` implementation).
gmfunc = Constant(1.0, terminals=terminals)  # gamma: undiscounted
lmfunc = Constant(0.0, terminals=terminals)  # lambda: 0 for all states
ifunc = Constant(1.0, terminals=terminals)   # interest: uniform

## Constant Reward, Constant Feature

* r(s) = 1 for s non-terminal
* x(s) = 1 for s non-terminal

In [6]:
# Step-by-step version of what `report` does, for the constant-reward /
# constant-feature problem.
rfunc = Constant(1.0, terminals=terminals)
rvec = np.array([rfunc(s) for s in states])
phi = Wrap(Bias(), terminals=terminals)

# Exact solution for returns
V = exact_solution(pmat, rvec, gmfunc)

# Feature matrix: maps a weight vector to per-state value estimates
fmat = feature_matrix(states, phi)

# Best approximation (least squares).
# `lstsq` returns the weight vector, not the value estimates, so the values
# are recovered by multiplying with the feature matrix.
w_approx, *_ = np.linalg.lstsq(fmat, V)
V_approx = np.dot(fmat, w_approx)

# TD fixed point (the solver returns weights, as in `report`)
w_td = td_solution(pmat, rvec, phi, gmfunc, lmfunc)
V_td = np.dot(fmat, w_td)

# Emphatic TD fixed point (the solver returns weights, as in `report`)
w_etd = etd_solution(pmat, rvec, phi, gmfunc, lmfunc, ifunc)
V_etd = np.dot(fmat, w_etd)

In [7]:
# Reward of 1 in every non-terminal state
rfunc = Constant(1.0, terminals=terminals)
# Expected reward per state, in the same order as `states`
rvec = np.array(list(map(rfunc, states)))
# Single constant (bias) feature
phi = Wrap(Bias(), terminals=terminals)

df = report(pmat, rvec, phi, gmfunc, lmfunc, ifunc)
display(df)

Expected Reward:
[ 1.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 0.]]
True Values:
[ 2.  1.  0.]
Best Approximation:
[ 1.5  1.5  0. ]
TD Approximation:
[ 2.  2.  0.]
ETD Approximation:
[ 1.5  1.5  0. ]


Unnamed: 0,Least-Squares,TD,ETD
weights,[1.5],[2.0],[1.5]
MSE,0.25,0.5,0.25


In [8]:
# Render the comparison table from the previous cell as a LaTeX tabular.
print(df.to_latex())

\begin{tabular}{llll}
\toprule
{} & Least-Squares &     TD &    ETD \\
\midrule
weights &         [1.5] &  [2.0] &  [1.5] \\
MSE     &          0.25 &    0.5 &   0.25 \\
\bottomrule
\end{tabular}



## Increasing Reward, Constant Feature

* r(s) = s+1 for s non-terminal
* x(s) = 1 for s non-terminal

In [9]:
def _rfunc(x):
    """Reward of s + 1, where s is the integer `basis2int` gives for state `x`."""
    return basis2int(x) + 1


rfunc = Parameter(_rfunc, terminals=terminals)
# Expected reward per state, in the same order as `states`
rvec = np.array(list(map(rfunc, states)))
# Single constant (bias) feature
phi = Wrap(Bias(), terminals=terminals)

df = report(pmat, rvec, phi, gmfunc, lmfunc, ifunc)
display(df)

Expected Reward:
[ 1.  2.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 0.]]
True Values:
[ 3.  2.  0.]
Best Approximation:
[ 2.5  2.5  0. ]
TD Approximation:
[ 3.  3.  0.]
ETD Approximation:
[ 2.5  2.5  0. ]


Unnamed: 0,Least-Squares,TD,ETD
weights,[2.5],[3.0],[2.5]
MSE,0.25,0.5,0.25


In [10]:
# Render the comparison table from the previous cell as a LaTeX tabular.
print(df.to_latex())

\begin{tabular}{llll}
\toprule
{} & Least-Squares &     TD &    ETD \\
\midrule
weights &         [2.5] &  [3.0] &  [2.5] \\
MSE     &          0.25 &    0.5 &   0.25 \\
\bottomrule
\end{tabular}



## Decreasing Reward, Constant Feature

* r(s) = N-1-s for s non-terminal
* x(s) = 1 for s non-terminal

In [11]:
def _rfunc(x):
    """Reward of N - 1 - s, where s is the integer `basis2int` gives for state `x`."""
    return (num_states - basis2int(x) - 1)


rfunc = Parameter(_rfunc, terminals=terminals)
# Expected reward per state, in the same order as `states`
rvec = np.array(list(map(rfunc, states)))
# Single constant (bias) feature
phi = Wrap(Bias(), terminals=terminals)

df = report(pmat, rvec, phi, gmfunc, lmfunc, ifunc)
display(df)

Expected Reward:
[ 2.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 0.]]
True Values:
[ 3.  1.  0.]
Best Approximation:
[ 2.  2.  0.]
TD Approximation:
[ 3.  3.  0.]
ETD Approximation:
[ 2.  2.  0.]


Unnamed: 0,Least-Squares,TD,ETD
weights,[2.0],[3.0],[2.0]
MSE,1,2,1


In [12]:
# Render the comparison table from the previous cell as a LaTeX tabular.
print(df.to_latex())

\begin{tabular}{llll}
\toprule
{} & Least-Squares &     TD &    ETD \\
\midrule
weights &         [2.0] &  [3.0] &  [2.0] \\
MSE     &             1 &      2 &      1 \\
\bottomrule
\end{tabular}



## Constant Reward, Increasing Feature

* r(s) = 1 for s non-terminal
* x(s) = s+1 for s non-terminal

In [13]:
# Reward of 1 in every non-terminal state
rfunc = Constant(1.0, terminals=terminals)
# Expected reward per state, in the same order as `states`
rvec = np.array(list(map(rfunc, states)))
# Single scalar feature that increases with the state index (x(s) = s + 1)
phi = Wrap(Unary2Int(num_states), terminals=terminals)

df = report(pmat, rvec, phi, gmfunc, lmfunc, ifunc)
display(df)

Expected Reward:
[ 1.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 2.]
 [ 0.]]
True Values:
[ 2.  1.  0.]
Best Approximation:
[ 0.8  1.6  0. ]
TD Approximation:
[ 1.  2.  0.]
ETD Approximation:
[ 0.71428571  1.42857143  0.        ]


Unnamed: 0,Least-Squares,TD,ETD
weights,[0.8],[1.0],[0.714285714286]
MSE,0.9,1,0.9183673


## Decreasing Reward, Increasing Feature

* r(s) = N-1-s for s non-terminal
* x(s) = s+1 for s non-terminal

In [14]:
def _rfunc(x):
    """Reward of N - 1 - s, where s is the integer `basis2int` gives for state `x`."""
    return (num_states - basis2int(x) - 1)


rfunc = Parameter(_rfunc, terminals=terminals)
# Expected reward per state, in the same order as `states`
rvec = np.array(list(map(rfunc, states)))
# Single scalar feature that increases with the state index (x(s) = s + 1)
phi = Wrap(Unary2Int(num_states), terminals=terminals)

df = report(pmat, rvec, phi, gmfunc, lmfunc, ifunc)
display(df)

Expected Reward:
[ 2.  1.  0.]
Feature Matrix:
[[ 1.]
 [ 2.]
 [ 0.]]
True Values:
[ 3.  1.  0.]
Best Approximation:
[ 1.  2.  0.]
TD Approximation:
[ 1.33333333  2.66666667  0.        ]
ETD Approximation:
[ 0.85714286  1.71428571  0.        ]


Unnamed: 0,Least-Squares,TD,ETD
weights,[1.0],[1.33333333333],[0.857142857143]
MSE,2.5,2.777778,2.55102


In [15]:
# Render the comparison table from the previous cell as a LaTeX tabular.
print(df.to_latex())

\begin{tabular}{llll}
\toprule
{} & Least-Squares &               TD &               ETD \\
\midrule
weights &         [1.0] &  [1.33333333333] &  [0.857142857143] \\
MSE     &           2.5 &         2.777778 &           2.55102 \\
\bottomrule
\end{tabular}

