In [52]:
import numpy as np
import numpy.linalg as la
from numpy import matrix
from numpy.linalg import pinv

from functools import reduce

# General Analytic Solution

* Need to ensure consistency of all indices
* May need to modify to handle the episodic case

In [98]:
# Find (non)terminal states
def get_terminals(pmat):
    """Return the rows of `pmat` that belong to terminal (absorbing) states.

    State ``i`` is treated as terminal when it transitions to itself with
    probability one, i.e. ``pmat[i, i] == 1``.  Testing for *any* unit entry
    in the row is not enough: it would also flag states that move
    deterministically to a *different* state (every state of a conveyor
    belt, for example).

    Parameters
    ----------
    pmat : array_like, shape (n, n)
        State-transition probability matrix (row = current state).

    Returns
    -------
    list
        The rows of `pmat` for the terminal states, in state order.  When the
        rows are probability distributions, each returned row is the one-hot
        vector of its own state.
    """
    # isclose rather than == so that round-off in a computed matrix does not
    # hide an absorbing state.
    return [row for i, row in enumerate(pmat) if np.isclose(row[i], 1.0)]

def get_nonterminals(pmat):
    """Return the rows of `pmat` that belong to non-terminal states.

    State ``i`` is non-terminal unless it transitions to itself with
    probability one (``pmat[i, i] == 1``).  A state that moves
    deterministically to a *different* state has a unit entry in its row but
    is still non-terminal, so only the diagonal entry is inspected.

    Parameters
    ----------
    pmat : array_like, shape (n, n)
        State-transition probability matrix (row = current state).

    Returns
    -------
    list
        The rows of `pmat` for the non-terminal states, in state order.
    """
    # isclose rather than == so that round-off in a computed matrix does not
    # turn an absorbing state into a non-terminal one.
    return [row for i, row in enumerate(pmat) if not np.isclose(row[i], 1.0)]

In [5]:
def gm_discount(mat, gm=1.0):
    """Compute ``I - mat . Gamma`` for a diagonal discount matrix ``Gamma``.

    Parameters
    ----------
    mat : array_like
        Matrix to be discounted; its diagonal length sets the size ``n``.
    gm : scalar or array_like of length n
        Value(s) placed on the diagonal of ``Gamma``.

    Returns
    -------
    numpy.ndarray
        ``eye(n) - dot(mat, Gamma)``.
    """
    size = np.diag(mat).shape[0]
    diag_idx = np.arange(size)

    # Build Gamma by writing `gm` (broadcast if scalar) onto the diagonal.
    gamma = np.zeros((size, size))
    gamma[diag_idx, diag_idx] = gm

    discounted = np.dot(mat, gamma)
    return np.identity(size) - discounted

def lmgm_pmat(pmat, gm=1.0, lm=0.0):
    r"""Compute ``I - P_\pi \Gamma \Lambda``.

    The docstring is a raw string because ``\p``, ``\G`` and ``\L`` are not
    valid escape sequences in an ordinary string literal.

    Parameters
    ----------
    pmat : array_like, shape (n, n)
        Transition matrix ``P_\pi``.
    gm : scalar or array_like of length n
        Value(s) placed on the diagonal of ``\Gamma``.
    lm : scalar or array_like of length n
        Value(s) placed on the diagonal of ``\Lambda``.

    Returns
    -------
    numpy.ndarray
        ``eye(n) - dot(pmat, dot(Gamma, Lambda))``.  With the default
        ``lm=0.0`` this is the identity matrix.
    """
    n = len(np.diag(pmat))
    di = np.diag_indices(n, ndim=2)
    gmat = np.zeros((n, n))
    lmat = np.zeros((n, n))
    # Assigning through the diagonal indices broadcasts a scalar and accepts
    # a per-state vector alike.
    gmat[di] = gm
    lmat[di] = lm
    return np.eye(n) - np.dot(pmat, np.dot(gmat, lmat))

def discounted_pmat(pmat, gm=1.0, lm=0.0):
    """Compute ``I - inv(I - P.G.L) . (I - P.G)``.

    ``G`` and ``L`` are diagonal matrices carrying `gm` and `lm` respectively.

    Parameters
    ----------
    pmat : array_like, shape (n, n)
        Transition matrix ``P``.
    gm : scalar or array_like of length n
        Diagonal of ``G``.
    lm : scalar or array_like of length n
        Diagonal of ``L``.

    Returns
    -------
    numpy.ndarray
        The (n, n) matrix described above.  ``numpy.linalg.inv`` raises
        ``LinAlgError`` if ``I - P.G.L`` is singular.
    """
    size = len(np.diag(pmat))
    diag_idx = np.arange(size)
    identity = np.eye(size)

    gamma = np.zeros((size, size))
    gamma[diag_idx, diag_idx] = gm
    lam = np.zeros((size, size))
    lam[diag_idx, diag_idx] = lm

    # inv(I - P G L)
    resolvent = np.linalg.inv(identity - np.dot(pmat, np.dot(gamma, lam)))
    # I - P G
    one_step = identity - np.dot(pmat, gamma)

    return identity - np.dot(resolvent, one_step)

def emphasis_mat(pmat, ivec, gm=1.0, lm=0.0):
    """Return the diagonal emphasis matrix ``M = diag(m)``.

    The emphasis vector solves ``m = i + P_lambda^T m``, i.e.
    ``m = pinv(I - P_lambda^T) . i``, where ``P_lambda`` is the matrix
    returned by ``discounted_pmat``.  The transpose matters: emphasis
    accumulates along transitions *into* a state, so using ``P_lambda``
    untransposed only gives the right answer when ``P_lambda`` happens to be
    symmetric.

    Parameters
    ----------
    pmat : array_like, shape (n, n)
        Transition matrix.
    ivec : array_like, shape (n,)
        Interest assigned to each state.
    gm, lm : scalar or array_like of length n
        Forwarded unchanged to ``discounted_pmat``.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        Diagonal matrix with the emphasis vector on its diagonal.

    Notes
    -----
    `ivec` is used as-is; no state-distribution weighting is applied here.
    NOTE(review): confirm that callers intend uniform state weighting.
    """
    n = len(np.diag(pmat))
    dpmat = discounted_pmat(pmat, gm=gm, lm=lm)
    # pinv (not inv) so a singular system still yields a solution.
    mvec = np.dot(np.linalg.pinv(np.eye(n) - dpmat.T), ivec)
    return np.diag(mvec)

def feature_mat(states, phi):
    """Stack the feature vectors ``phi(s)`` of all `states` into one array.

    Parameters
    ----------
    states : iterable
        States, in the order the rows should appear.
    phi : callable
        Feature function applied to each state.

    Returns
    -------
    numpy.ndarray
        Array whose ``k``-th entry along the first axis is ``phi(states[k])``.
    """
    rows = []
    for state in states:
        rows.append(phi(state))
    return np.array(rows)

def mprod(*arrays):
    """Left-to-right chained ``np.dot`` of all the given arrays.

    ``mprod(a, b, c)`` is ``np.dot(np.dot(a, b), c)``.  A single argument is
    returned unchanged.
    """
    if len(arrays) < 2:
        # Zero or one operand: let reduce decide (error / pass-through).
        return reduce(np.dot, arrays)

    product = np.dot(arrays[0], arrays[1])
    for factor in arrays[2:]:
        product = np.dot(product, factor)
    return product

# Testing 

## Conveyor Belt

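A conveyor belt environment: each state moves deterministically to the next one until the final, absorbing state is reached, and every non-terminal state yields a reward of 1.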

In [89]:
# Conveyor belt: 0 -> 1 -> 2, where state 2 loops onto itself.
pmat = np.array([
    [0, 1, 0],
    [0, 0, 1],
    [0, 0, 1],
])
s0 = np.array([1, 0, 0])     # presumably the start-state indicator
rvec = np.array([1, 1, 0])   # per-state reward vector
gmvec = np.array([1, 1, 0])  # per-state discount (0 in the last state)
ivec = np.array([1, 1, 0])   # per-state interest (0 in the last state)

## Random Walk

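A random walk over three interior states with an absorbing terminal state at each end: terminating on the right-hand side gives a reward of 1, and terminating on the left gives a reward of 0.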

In [106]:
# States 0-2 are the interior of the walk; states 3 and 4 are absorbing
# (state 0 can fall into 3, state 2 can fall into 4).
pmat = np.array([
    [0.0, 0.5, 0.0, 0.5, 0.0],
    [0.5, 0.0, 0.5, 0.0, 0.0],
    [0.0, 0.5, 0.0, 0.0, 0.5],
    [0.0, 0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 1.0],
])
ns = pmat.shape[0]

# One-hot vector per state, and the terminal ones as hashable tuples.
states = list(np.eye(ns))
terminals = [tuple(row) for row in get_terminals(pmat)]

s0 = np.array([0, 1, 0, 0, 0])
rvec = np.array([0, 0, 0.5, 0, 0])
gmvec = np.array([1, 1, 1, 0, 0])
lmvec = np.array([0, 0, 0, 0, 0])
ivec = np.array([1, 1, 1, 0, 0])
# Alternative with interest in every state:
# ivec = np.array([1,1,1,1,1])

In [110]:
# Feature function: a single shared feature that is 1 for every non-terminal
# state and 0 for terminal ones.
# Tabular alternative: phi = lambda s : np.array(s)
def phi(s):
    return np.array([1]) if tuple(s) not in terminals else np.array([0])

fmat = feature_mat(states, phi)

# Pass lmvec explicitly so these matrices use the same lambda as `L` below;
# relying on the default lm=0.0 only agrees while lmvec is all zeros.
mmat = emphasis_mat(pmat, ivec, gm=gmvec, lm=lmvec)

lgpmat = lmgm_pmat(pmat, gm=gmvec, lm=lmvec)
dpmat = discounted_pmat(pmat, gm=gmvec, lm=lmvec)

I = np.eye(ns)
P = np.copy(pmat)
M = np.copy(mmat)
# M = np.eye(ns)
G = np.diag(gmvec)
L = np.diag(lmvec)
X = np.copy(fmat)
R = np.copy(rvec)

# LSTD solution: solve X^T (I - P G) X theta = X^T R
bb = np.dot(X.T, R)
AA = mprod(X.T, (I - np.dot(P, G)), X)
td_theta = np.dot(np.linalg.pinv(AA), bb)
print("TD Solution")
print(td_theta)

# ETD solution: solve X^T M (I - P G L)^+ (I - P G) X theta = X^T M (I - P G L)^+ R
P_gl = pinv(I - mprod(P, G, L))
EA = mprod(X.T, M, P_gl, (I - np.dot(P, G)), X)
Eb = mprod(X.T, M, P_gl, R)
etd_theta = np.dot(pinv(EA), Eb)
print("ETD Solution")
print(etd_theta)

print("ETD Values:")
# np.dot(etd_theta, X.T)
# np.dot(P_gl, rvec) +
# print(EA)
# print(AA)
# print("Difference:")
# print(EA - AA)

#TODO: Verify that each is a fixed point

TD Solution
[ 0.5]
ETD Solution
[ 0.5]
ETD Values:


In [109]:
# Inspect the emphasis matrix `mmat` (diagonal, one entry per state).
mmat

array([[ 3.,  0.,  0.,  0.,  0.],
       [ 0.,  4.,  0.,  0.,  0.],
       [ 0.,  0.,  3.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [70]:
# Inspect the common left factor X^T M (I - P G L)^+ of the ETD system.
# NOTE(review): the recorded output is 5x5, whereas the single-feature `phi`
# used in the analysis cell makes X.T a 1x5 matrix -- presumably the output
# comes from an earlier run with tabular features; re-run to refresh.
mprod(X.T, M, P_gl)

array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])

In [44]:
# NOTE(review): `lstd_theta` is not defined in any visible cell (the LSTD
# solution is stored as `td_theta`); this cell presumably relies on leftover
# kernel state and would fail on Restart & Run All -- confirm and update.
lstd_theta

array([  2.50000000e-01,   5.00000000e-01,   7.50000000e-01,
        -2.35922393e-16,   1.87350135e-16])

In [41]:
# Inspect the LSTD system matrix AA = X^T (I - P G) X.
# NOTE(review): the recorded 5x5 output presumably comes from an earlier run
# with tabular features; with the single-feature `phi`, AA is 1x1.
AA

array([[ 1. , -0.5,  0. , -0.5,  0. ],
       [-0.5,  1. , -0.5,  0. ,  0. ],
       [ 0. , -0.5,  1. ,  0. , -0.5],
       [ 0. ,  0. ,  0. ,  1. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  1. ]])

In [42]:
# NOTE(review): `A` is not defined in any visible cell; this cell presumably
# relies on leftover kernel state -- confirm, or replace with AA / EA.
A

array([[ 0. ,  0.5,  0. ,  0.5,  0. ],
       [ 1. ,  0. ,  1. ,  0. ,  0. ],
       [ 0. ,  0.5,  0. ,  0. ,  0.5],
       [ 0. ,  0. ,  0. , -2. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. , -2. ]])

In [39]:
# NOTE(review): neither `b` nor `A` is defined in any visible cell (the
# analysis cell uses bb/AA and Eb/EA); presumably leftover kernel state.
print(b)
print(A)

[ 0.   0.   0.5  0.   0. ]
[[ 0.   0.5  0.   0.5  0. ]
 [ 1.   0.   1.   0.   0. ]
 [ 0.   0.5  0.   0.   0.5]
 [ 0.   0.   0.  -2.   0. ]
 [ 0.   0.   0.   0.  -2. ]]


In [38]:
# Dump the problem definition and the derived matrices for inspection:
# transition matrix, reward vector, feature matrix, emphasis matrix,
# (I - P G L) and the lambda-discounted transition matrix.
# NOTE(review): the recorded output below does not line up one-to-one with
# these six prints, so it is presumably stale -- re-run to refresh.
print(pmat)
print(rvec)
print(fmat)
print(mmat)
print(lgpmat)
print(dpmat)


[[ 0.   0.5  0.   0.5  0. ]
 [ 0.5  0.   0.5  0.   0. ]
 [ 0.   0.5  0.   0.   0.5]
 [ 0.   0.   0.   1.   0. ]
 [ 0.   0.   0.   0.   1. ]]
[ 0.   0.   0.5  0.   0. ]
[[ 1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 1.  0.  0.  0.  0.]
 [ 0.  2.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0. -2.  0.]
 [ 0.  0.  0.  0. -2.]]
[[ 1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.   0.5  0.   0.5  0. ]
 [ 0.5  0.   0.5  0.   0. ]
 [ 0.   0.5  0.   0.   0.5]
 [ 0.   0.   0.   1.   0. ]
 [ 0.   0.   0.   0.   1. ]]
[ 0.   0.   0.5  0.   0. ]
[[ 0.   0.5  0.   0.5  0. ]
 [ 1.   0.   1.   0.   0. ]
 [ 0.   0.5  0.   0.   0.5]
 [ 0.   0.   0.  -2.   0. ]
 [ 0.   0.   0.   0.  -2. ]]
