# TD/ETD Comparison on "Random Walk"

This notebook contains some comparisons between TD and Emphatic TD on a simple problem under function approximation. 

We identify the solutions each algorithm converges to using the matrix operator equations for each algorithm, and compare them to the optimal approximation (as found by the least squares solution).

In [1]:
import numpy as np
import pandas as pd
from functools import reduce
from numpy import dot
from numpy.linalg import pinv

from features import *
from mdptools import *
from solvers import td_solution, etd_solution, exact_solution
from report import report

from IPython.display import display

In [2]:
# Set pandas display options.
# The fully-qualified key is used on purpose: the bare 'precision' pattern is
# ambiguous on newer pandas releases (it also matches the Styler precision
# option) and then raises an OptionError.
pd.set_option('display.precision', 4)

# Set numpy display options
np.set_printoptions(precision=4)

# Random Walk

An environment which represents a random walk along a 1-D chain.

* Here, states are indexed from 0 to N-1, with states N-2 and N-1 being the terminal states.
    * We refer to state `N-1` as the "rightmost" state, and state `N-2` as the "leftmost" state.
* In terminal states, the feature vector $x(s)$ is the zero vector, the reward $r(s)$ is zero, and $\gamma(s) = \lambda(s) = i(s) = 0$.
* The environment is undiscounted, so $\gamma(s) = 1$ for $s$ non-terminal.
* For these experiments, $\lambda(s) = 0$ for all states $s$.
* Interest is constant and uniform for each state; $i(s) = 1$ for $s$ non-terminal.

In [3]:
def random_walk_matrix(n, p=0.5):
    """
    Build the transition matrix for a random walk along a 1-D chain.

    The chain has `n` states in total. States `0 .. n-3` are transient and
    laid out left to right; state `n-2` is the left terminal state and state
    `n-1` is the right terminal state. Both terminal states are absorbing.

    Parameters
    ----------
    n : int
        Total number of states, including the two terminal states.
        Must be at least 2 (`n == 2` yields a chain with no transient states).
    p : float, optional
        Probability of stepping left from a transient state; a step to the
        right has probability `1 - p`. Must lie in the interval [0, 1].

    Returns
    -------
    numpy.ndarray
        An `(n, n)` array whose entry `[i, j]` is the probability of moving
        from state `i` to state `j`.

    Raises
    ------
    ValueError
        If `n < 2` or `p` is not a probability.
    """
    if n < 2:
        raise ValueError("n must be at least 2 (the two terminal states)")
    if not 0.0 <= p <= 1.0:
        raise ValueError("p must be a probability in [0, 1]")

    num_transient = n - 2
    left_terminal, right_terminal = n - 2, n - 1

    ret = np.zeros((n, n))
    # terminal states are absorbing
    ret[left_terminal, left_terminal] = 1.0
    ret[right_terminal, right_terminal] = 1.0

    # A single uniform loop handles every transient state, including the
    # degenerate chain with one transient state, where that state borders
    # both terminals at once.
    for i in range(num_transient):
        left = i - 1 if i > 0 else left_terminal
        right = i + 1 if i < num_transient - 1 else right_terminal
        ret[i, left] = p
        ret[i, right] = (1 - p)
    return ret

In [4]:
# Common parts of problem specification
num_states = 7
pleft = 0.5
pmat = random_walk_matrix(num_states, p=pleft)
s0 = int2basis(2, 7)
states = state_vectors(pmat)
indices = state_indices(pmat)
terminals = [as_tuple(s) for s in find_terminals(pmat)]
gmfunc = Constant(1.0, terminals=terminals)
lmfunc = Constant(0.0, terminals=terminals)
ifunc = Constant(1.0, terminals=terminals)

In [5]:
# Show the transition matrix; each row holds one state's probabilities of
# moving to every other state, with the two terminal states in the last rows.
display(pmat)

array([[ 0. ,  0.5,  0. ,  0. ,  0. ,  0.5,  0. ],
       [ 0.5,  0. ,  0.5,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0.5,  0. ,  0.5,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0.5,  0. ,  0.5,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0.5,  0. ,  0. ,  0.5],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  1. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ]])

## Constant Feature, Unit Reward on Right-Termination

* reward for terminating in the rightmost state, `N-1`
    * r(s) = 1 for $(s, a, s') = (N-3, right, N-1)$
    * r(s) = 0 for all other transitions.
* x(s) = 1 for s non-terminal

In [6]:
# Expected one-step reward per state: only the third-from-last state can step
# into the rightmost terminal state, which it does with probability
# (1 - pleft), earning a reward of 1.
rvec = np.zeros(num_states)
rvec[-3] = (1-pleft)
# Constant (bias) feature. Judging by the printed feature matrix, `Wrap`
# zeroes the features of terminal states -- confirm in `features`.
phi = Wrap(Bias(), terminals=terminals)

# `report` is a project helper; from the displayed output it prints the
# problem data and returns a table comparing the least-squares, TD and ETD
# solutions.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
# Summary view with only the weight vector and error columns.
df = full_df[["weights", "MSE"]]
display(full_df)

Expected Reward:
[ 0.   0.   0.   0.   0.5  0.   0. ]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]]
True Values:
[ 0.1667  0.3333  0.5     0.6667  0.8333  0.      0.    ]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[0.5],0.056,"[0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0]"
TD,[0.5],0.056,"[0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0]"
ETD,[0.5],0.056,"[0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0]"


## Increasing Feature, Unit Reward on Left-Termination

* reward for terminating in the leftmost state, `N-2`
    * r(s) = 1 for $(s, a, s') = (0, left, N-2)$
    * r(s) = 0 for all other transitions.
* x(s) = s+1 for s non-terminal

In [7]:
# Expected one-step reward per state: only state 0 can step into the left
# terminal state, which it does with probability `pleft`, earning a reward
# of 1.
rvec = np.zeros(num_states)
rvec[0] = pleft
# Single scalar feature per state; the printed feature matrix shows the
# values 1..5 for the non-terminal states and 0 for the terminals.
phi = Wrap(Unary2Int(num_states), terminals=terminals)

# `report` is a project helper; from the displayed output it prints the
# problem data and returns a table comparing the least-squares, TD and ETD
# solutions.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
# Summary view with only the weight vector and error columns.
df = full_df[["weights", "MSE"]]

# Reduce precision of `values`
# (rounds each stored value array to 3 decimals so the table stays readable)
full_df['values'] = full_df['values'].apply(lambda a: np.around(a, decimals=3))
display(full_df)

Expected Reward:
[ 0.5  0.   0.   0.   0.   0.   0. ]
Feature Matrix:
[[ 1.]
 [ 2.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 0.]
 [ 0.]]
True Values:
[ 0.8333  0.6667  0.5     0.3333  0.1667  0.      0.    ]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[0.106060606061],0.182,"[0.106, 0.212, 0.318, 0.424, 0.53, 0.0, 0.0]"
TD,[0.0333333333333],0.24,"[0.033, 0.067, 0.1, 0.133, 0.167, 0.0, 0.0]"
ETD,[0.0333333333333],0.24,"[0.033, 0.067, 0.1, 0.133, 0.167, 0.0, 0.0]"


## Constant Reward, Constant Feature

* r(s) = 1 for s non-terminal
* x(s) = 1 for s non-terminal

This effectively calculates the expected number of steps to termination.

In [8]:
# Reward of 1 in every non-terminal state (the printed expected reward is 0
# for the two terminal states).
rfunc =  Constant(1.0, terminals=terminals)
# Evaluate the reward function on every state vector to get the reward vector.
rvec = np.array([rfunc(s) for s in states])
# Constant (bias) feature, zero in terminal states per the printed matrix.
phi = Wrap(Bias(), terminals=terminals)

# `report` is a project helper; from the displayed output it prints the
# problem data and returns a table comparing the least-squares, TD and ETD
# solutions.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
# Summary view (weights and MSE only); the next cell exports it to LaTeX.
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 1.  1.  1.  1.  1.  0.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]]
True Values:
[ 5.  8.  9.  8.  5.  0.  0.]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[7.0],2.8,"[7.0, 7.0, 7.0, 7.0, 7.0, 0.0, 0.0]"
TD,[9.0],6.8,"[9.0, 9.0, 9.0, 9.0, 9.0, 0.0, 0.0]"
ETD,[7.66666666667],3.244,"[7.66666666667, 7.66666666667, 7.66666666667, ..."


In [9]:
# Emit the weights/MSE summary of the preceding experiment as LaTeX source.
# `print` is intentional: it yields raw LaTeX that can be copied verbatim.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} &          weights &    MSE \\
\midrule
Least-Squares &            [7.0] &  2.800 \\
TD            &            [9.0] &  6.800 \\
ETD           &  [7.66666666667] &  3.244 \\
\bottomrule
\end{tabular}



## Increasing Reward, Constant Feature

* r(s) = s+1 for s non-terminal
* x(s) = 1 for s non-terminal

In [10]:
def _rfunc(x):
    """
    Reward for the state encoded by `x`: one more than its integer index.

    `basis2int` is a project helper; the printed expected reward
    (1, 2, ..., 5 over the non-terminal states) indicates it maps a state
    vector to that state's index.
    """
    return basis2int(x) + 1


# Wrap the raw reward so terminal states are handled separately (the printed
# expected reward is 0 for the two terminal states).
rfunc = Parameter(_rfunc, terminals=terminals)
# Evaluate the reward function on every state vector to get the reward vector.
rvec = np.array([rfunc(s) for s in states])
# Constant (bias) feature, zero in terminal states per the printed matrix.
phi = Wrap(Bias(), terminals=terminals)

# `report` is a project helper; from the displayed output it prints the
# problem data and returns a table comparing the least-squares, TD and ETD
# solutions.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
# Summary view (weights and MSE only); the next cell exports it to LaTeX.
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 1.  2.  3.  4.  5.  0.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]]
True Values:
[ 11.6667  21.3333  27.      26.6667  18.3333   0.       0.    ]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[21.0],32.489,"[21.0, 21.0, 21.0, 21.0, 21.0, 0.0, 0.0]"
TD,[27.0],68.489,"[27.0, 27.0, 27.0, 27.0, 27.0, 0.0, 0.0]"
ETD,[23.0],36.489,"[23.0, 23.0, 23.0, 23.0, 23.0, 0.0, 0.0]"


In [11]:
# Emit the weights/MSE summary of the preceding experiment as LaTeX source.
# `print` is intentional: it yields raw LaTeX that can be copied verbatim.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} & weights &     MSE \\
\midrule
Least-Squares &  [21.0] &  32.489 \\
TD            &  [27.0] &  68.489 \\
ETD           &  [23.0] &  36.489 \\
\bottomrule
\end{tabular}



## Decreasing Reward, Constant Feature

* r(s) = N-1-s for s non-terminal
* x(s) = 1 for s non-terminal

In [12]:
def _rfunc(x):
    """
    Reward for the state encoded by `x`: `num_states - index - 1`.

    `basis2int` is a project helper; the printed expected reward
    (6, 5, ..., 2 over the non-terminal states) indicates it maps a state
    vector to that state's index.
    """
    return num_states - basis2int(x) - 1


# Wrap the raw reward so terminal states are handled separately (the printed
# expected reward is 0 for the two terminal states).
rfunc = Parameter(_rfunc, terminals=terminals)
# Evaluate the reward function on every state vector to get the reward vector.
rvec = np.array([rfunc(s) for s in states])
# Constant (bias) feature, zero in terminal states per the printed matrix.
phi = Wrap(Bias(), terminals=terminals)

# `report` is a project helper; from the displayed output it prints the
# problem data and returns a table comparing the least-squares, TD and ETD
# solutions.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
# Summary view (weights and MSE only); the next cell exports it to LaTeX.
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 6.  5.  4.  3.  2.  0.  0.]
Feature Matrix:
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]]
True Values:
[ 23.3333  34.6667  36.      29.3333  16.6667   0.       0.    ]
Emphasis As Good or Better?: True


Unnamed: 0,weights,MSE,values
Least-Squares,[28.0],52.089,"[28.0, 28.0, 28.0, 28.0, 28.0, 0.0, 0.0]"
TD,[36.0],116.089,"[36.0, 36.0, 36.0, 36.0, 36.0, 0.0, 0.0]"
ETD,[30.6666666667],59.2,"[30.6666666667, 30.6666666667, 30.6666666667, ..."


In [13]:
# Emit the weights/MSE summary of the preceding experiment as LaTeX source.
# `print` is intentional: it yields raw LaTeX that can be copied verbatim.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} &          weights &      MSE \\
\midrule
Least-Squares &           [28.0] &   52.089 \\
TD            &           [36.0] &  116.089 \\
ETD           &  [30.6666666667] &   59.200 \\
\bottomrule
\end{tabular}



## Constant Reward, Increasing Feature

* r(s) = 1 for s non-terminal
* x(s) = s+1 for s non-terminal

In [14]:
# Reward of 1 in every non-terminal state (the printed expected reward is 0
# for the two terminal states).
rfunc =  Constant(1.0, terminals=terminals)
# Evaluate the reward function on every state vector to get the reward vector.
rvec = np.array([rfunc(s) for s in states])
# Single scalar feature per state; the printed feature matrix shows the
# values 1..5 for the non-terminal states and 0 for the terminals.
phi = Wrap(Unary2Int(num_states), terminals=terminals)

# `report` is a project helper; from the displayed output it prints the
# problem data and returns a table comparing the least-squares, TD and ETD
# solutions.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
# Summary view with only the weight vector and error columns.
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 1.  1.  1.  1.  1.  0.  0.]
Feature Matrix:
[[ 1.]
 [ 2.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 0.]
 [ 0.]]
True Values:
[ 5.  8.  9.  8.  5.  0.  0.]
Emphasis As Good or Better?: False


Unnamed: 0,weights,MSE,values
Least-Squares,[1.90909090909],11.709,"[1.90909090909, 3.81818181818, 5.72727272727, ..."
TD,[1.8],11.84,"[1.8, 3.6, 5.4, 7.2, 9.0, 0.0, 0.0]"
ETD,[1.53333333333],13.262,"[1.53333333333, 3.06666666667, 4.6, 6.13333333..."


## Decreasing Reward, Increasing Feature

* r(s) = N-1-s for s non-terminal
* x(s) = s+1 for s non-terminal

In [15]:
def _rfunc(x):
    """
    Reward for the state encoded by `x`: `num_states - index - 1`.

    `basis2int` is a project helper; the printed expected reward
    (6, 5, ..., 2 over the non-terminal states) indicates it maps a state
    vector to that state's index.
    """
    return num_states - basis2int(x) - 1


# Wrap the raw reward so terminal states are handled separately (the printed
# expected reward is 0 for the two terminal states).
rfunc = Parameter(_rfunc, terminals=terminals)
# Evaluate the reward function on every state vector to get the reward vector.
rvec = np.array([rfunc(s) for s in states])
# Single scalar feature per state; the printed feature matrix shows the
# values 1..5 for the non-terminal states and 0 for the terminals.
phi = Wrap(Unary2Int(num_states), terminals=terminals)

# `report` is a project helper; from the displayed output it prints the
# problem data and returns a table comparing the least-squares, TD and ETD
# solutions.
full_df = report(pmat, rvec, s0, phi, gmfunc, lmfunc, ifunc)
# Summary view (weights and MSE only); the next cell exports it to LaTeX.
df = full_df[["weights", "MSE"]]

display(full_df)

Expected Reward:
[ 6.  5.  4.  3.  2.  0.  0.]
Feature Matrix:
[[ 1.]
 [ 2.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 0.]
 [ 0.]]
True Values:
[ 23.3333  34.6667  36.      29.3333  16.6667   0.       0.    ]
Emphasis As Good or Better?: False


Unnamed: 0,weights,MSE,values
Least-Squares,[7.29696969697],250.385,"[7.29696969697, 14.5939393939, 21.8909090909, ..."
TD,[6.4],259.236,"[6.4, 12.8, 19.2, 25.6, 32.0, 0.0, 0.0]"
ETD,[5.36296296296],291.53,"[5.36296296296, 10.7259259259, 16.0888888889, ..."


In [16]:
# Emit the weights/MSE summary of the preceding experiment as LaTeX source.
# `print` is intentional: it yields raw LaTeX that can be copied verbatim.
print(df.to_latex())

\begin{tabular}{llr}
\toprule
{} &          weights &      MSE \\
\midrule
Least-Squares &  [7.29696969697] &  250.385 \\
TD            &            [6.4] &  259.236 \\
ETD           &  [5.36296296296] &  291.530 \\
\bottomrule
\end{tabular}

