In [2]:
# Notebook setup: autoreload, inline plotting, and the imports shared by every cell.
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import pinv
import pandas as pd

# Print arrays with 4 digits of precision, in fixed-point rather than scientific notation
np.set_printoptions(precision=4, suppress=True)

# Project-local helpers; `mdp.stationary` is used below for the stationary distribution
import mdpy as mdp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Overview

We provide code for a few different ways of estimating variance and second moments of various cumulants in a discrete MDP.
In this notebook we focus on the case where linear function approximation is used.

# Problem Setup

We define and solve an MDP for its value function and for the variance of its return (using Sobel's method).

In [35]:
# MDP solved analytically
ns = 6
I = np.eye(ns)

# Probability of transitioning from state s_i --> s_j = P[i,j]
# Every state except the last either advances (s_i --> s_{i+1}) or resets to
# s_0, each with probability 0.5; the last state always moves to s_0.
P = np.diag(np.ones(ns-1), 1) * 0.5
P[:,0] = 0.5
P[-1, 0] = 1
# Sanity check: every row of P must be a probability distribution
assert np.all(P >= 0) and np.allclose(P.sum(axis=1), 1), "P is not row-stochastic"

# Expected reward for transitioning from s_i --> s_j = R[i,j]
R = np.zeros((ns, ns))
# -1 Reward for non-terminal transitions
R[:,:] = -1
# Reaching edge has zero reward
R[-2, -1] = 0
# Transitions from terminal state have zero reward
R[-1,:] = 0
# Expected one-step reward from each state
r = np.sum(P*R, axis=1)

# State-dependent discount (γ = 0 on entering s_0, 0.9 elsewhere)
gvec = np.ones(ns)*0.9
gvec[0] = 0
G = np.diag(gvec)

# State-dependent bootstrapping (λ = 0 everywhere)
lvec = np.ones(ns)*0.0
L = np.diag(lvec)

# Value function (expected Monte Carlo return)
v_pi = pinv(I - P @ G) @ r

# Compute stationary distribution for transition matrix
d_pi = mdp.stationary(P)
D = np.diag(d_pi)


# From Sobel, setting up variance Bellman equation.
# Explicit element-wise form, kept as a readable reference implementation:
#   T[i] = sum_j P[i,j] * (R[i,j] + γ_j v_pi[j])^2 - v_pi[i]^2
T_loop = -v_pi**2
for i in range(ns):
    for j in range(ns):
        T_loop[i] += P[i,j] * (R[i,j] + gvec[j]*v_pi[j])**2

# Equivalent vectorized form (G @ v_pi broadcasts over the next state j)
T = (P * (R + G @ v_pi)**2) @ np.ones(ns) - v_pi**2

# The two forms must agree; fail loudly instead of silently overwriting one
np.testing.assert_allclose(T, T_loop, atol=1e-12)

# Solve Bellman equation for variance of return
v_var = pinv(I - P @ G @ G) @ T

print('v_pi:\n', v_pi)
print('per-state variance:\n', v_var)

v_pi:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
per-state variance:
 [ 0.8412  0.6353  0.3654  0.1519  0.25    0.    ]


## Second Moment of Return

We calculate the second moment of the return following the approach in the VTD paper (White & White, 2016).

In [14]:
# Using the VTD paper to calculate second moments of the return.
# Note that here we are using the most accurate values for everything
# (v_pi, P, R) in order to check the equations.

# Specify parameters
# NOTE: this rebinds the notebook-level `lvec` to λ = 1 everywhere; the matrix
# `L` that was built from the earlier `lvec` is left unchanged.
lvec = np.ones(ns)

# R-bar transition matrix:
#   Rbar[i,j] = R[i,j]^2 + 2 γ_j λ_j R[i,j] v_pi[j]
# The 1-D arrays broadcast along the last axis, i.e. over the next state j.
Rbar = R**2 + 2*gvec*lvec*R*v_pi

# r-bar vector: expectation of R-bar under the transition probabilities
rbar = np.sum(P*Rbar, axis=1)

# P-bar: Pbar[i,j] = P[i,j] γ_j^2 λ_j^2
Pbar = P * gvec**2 * lvec**2

# Calculate second moment of return
r_second = pinv(I - Pbar) @ rbar

# Print the results
print("Second moment of return:\n", r_second)
print("Estimated variance via second moment of return:\n", r_second - v_pi**2)
print("Sobel variance:\n", v_var)

# With exact quantities the two variance computations must coincide
np.testing.assert_allclose(r_second - v_pi**2, v_var, atol=1e-8)


# An alternative approach, which is somewhat more concise
# (note `2*P @ G * R` parses as `((2*P) @ G) * R`)
# Second moment of return
# rr = (P*R**2) @ np.ones(ns) + (2*P @ G * R) @ v_pi
# vv = pinv(I - P @ G @ G)@(rr)

Second moment of return:
 [ 3.9533  3.5187  2.7718  1.6525  0.5     0.    ]
Estimated variance via second moment of return:
 [ 0.8412  0.6353  0.3654  0.1519  0.25    0.    ]
Sobel variance:
 [ 0.8412  0.6353  0.3654  0.1519  0.25    0.    ]


# Linear Function Approximation 

Linear function approximation is an important test case when analyzing the utility of a learning algorithm.

Here we use a relatively simple scheme where we take the binary representation of the state's number as its feature representation, with a bias unit.

So state \#2 becomes `[1, 0, 1, 0]` and state \#3 becomes `[1, 0, 1, 1]`.

In [6]:
# Number of binary digits used to encode a state number
n_bits = int(np.ceil(np.log2(ns)))
# Zero-padded binary string for every state number
blst = [np.binary_repr(state, width=n_bits) for state in range(ns)]
# Turn each string into an integer array with a leading bias unit
flst = [np.append(1, np.array([int(bit) for bit in code])) for code in blst]
# Feature matrix: one row per state
X = np.array(flst)
# The terminal (last) state is represented by the zero vector
X[-1, :] = 0

In [7]:
# Display the feature matrix (one row per state; first column is the bias unit)
X

array([[1, 0, 0, 0],
       [1, 0, 0, 1],
       [1, 0, 1, 0],
       [1, 0, 1, 1],
       [1, 1, 0, 0],
       [0, 0, 0, 0]])

## TD(λ) fixed point

We compute the weights that TD(λ) converges to asymptotically, and the associated value function.

In [43]:
# TD(λ) fixed point for the bootstrapping matrix `L` currently in scope.
# The inverse below appears on both sides of the fixed-point equation,
# so compute it once.
K_lam = pinv(I - P @ G @ L)
A_td = X.T @ D @ K_lam @ (I - P @ G) @ X
w_hat = pinv(A_td) @ X.T @ D @ K_lam @ r
v_hat = X @ w_hat

# Per-state gap between the true and the approximate value function
td_gap = v_pi - v_hat

print('TD(λ) fixed point weights:\n', w_hat)
print('TD(λ) approximate value function:\n', v_hat)
print('v_pi - v_hat:\n', td_gap)
print('Error due to bias:\n', np.sum(td_gap**2))

TD(λ) fixed point weights:
 [-1.7646  1.2646  0.2232  0.12  ]
TD(λ) approximate value function:
 [-1.7646 -1.6445 -1.5414 -1.4214 -0.5     0.    ]
v_pi - v_hat:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.     -0.    ]
Error due to bias:
 0.0415271656424


## Least-Squares / Monte Carlo Approximation

We can also calculate the fixed point for when $\lambda=1$ and we are approximating the Monte Carlo return. 

This is also called the least-squares approximation, because we are essentially performing a weighted least-squares regression on the value function.

In [44]:
# Least-squares / TD(1) fixed point: regression of the Monte Carlo value
# pinv(I - P @ G) @ r onto the features, weighted by D
gram = X.T @ D @ X
w_lsq = pinv(gram) @ X.T @ D @ pinv(I - P @ G) @ r
v_lsq = X @ w_lsq

# Per-state gap between the true and the least-squares value function
lsq_gap = v_pi - v_lsq

print('Least-squares weights:\n', w_lsq)
print('Least-squares approximate value function:\n', v_lsq)
print('v_pi - v_lsq:\n', lsq_gap)
print('Error due to bias (least-squares):\n', np.sum(lsq_gap**2))

Least-squares weights:
 [-1.7815  1.2815  0.2996  0.1181]
Least-squares approximate value function:
 [-1.7815 -1.6634 -1.4819 -1.3638 -0.5     0.    ]
v_pi - v_lsq:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.     -0.    ]
Error due to bias (least-squares):
 0.0255740100645


# Second Moment Calculation


In a previous notebook analyzing the tabular case, we found that the second moment of the $\delta$-return provides us with the mean squared error for a given function approximator, and that in some situations the $\delta^2$-return is a good approximation to this quantity.

## Remarks and Review

Recall that the $\delta$-return is defined as 

$$
G^{\delta, \lambda}_{t} = \sum_{n=0}^{\infty} \delta_{t+n} \prod_{k=1}^{n} \gamma_{t+k} \lambda_{t+k}
$$

Furthermore, we have the following identity for what the $\delta$-return represents:

$$
G^{\lambda}_{t} - \hat{v}(S_t) = \sum_{n=0}^{\infty} \delta_{t+n} \prod_{k=1}^{n} \gamma_{t+k} \lambda_{t+k}
$$

So the second moment of the $\delta$-return (setting $\lambda=1$) gives us $\mathbb{E}[(G_{t} - \hat{v}(S_t))^2]$.
Although a more careful analysis is required, it was enough to conjecture that this quantity is really the MSE, incorporating both the error introduced via bias in the value function and the variance in the MDP. 

The $\delta^2$-return, defined as

$$
G^{\delta^{2}, \lambda}_{t} = \sum_{n=0}^{\infty} \delta_{t+n}^2 \prod_{k=1}^{n} \gamma_{t+k}^2 \lambda_{t+k}^2
$$

is well-defined, and likely more amenable to approximation via LFA, but except when the bias is small it doesn't really capture either the variance or the bias.
This is because it is missing the 'cross-terms' of the full second moment expansion.

$$
\begin{aligned}
(G^{\lambda}_{t} - \hat{v}(S_{t}))^2 
&= \Bigg( \sum_{n=0}^{\infty} \delta_{t+n} \prod_{k=1}^{n} \gamma_{t+k} \lambda_{t+k}\Bigg)^2
\\
&= \Bigg( \delta_{t} + \gamma_{t+1} \lambda_{t+1}
\sum_{m=1}^{\infty} \delta_{t+m} \prod_{j=2}^{m} \gamma_{t+j} \lambda_{t+j}
\Bigg)^2
\\
&= \Big( \delta_{t} + \gamma_{t+1} \lambda_{t+1} \big( G^{\lambda}_{t+1} - \hat{v}(S_{t+1}) \big) \Big)^2
\\
&= \delta_{t}^{2} + \gamma_{t+1}^{2} \lambda_{t+1}^{2} (G^{\lambda}_{t+1} - \hat{v}(S_{t+1}))^2 
+ \underbrace{2 \delta_{t} \gamma_{t+1} \lambda_{t+1} \big( G^{\lambda}_{t+1} - \hat{v}(S_{t+1}) \big)}_{\text{'cross terms'}}
\\
&= G_{t}^{\delta^{2}, \lambda} 
+ 2 \sum_{n=0}^{\infty} \Bigg( \prod_{k=1}^{n} \gamma_{t+k}^{2} \lambda_{t+k}^{2} \Bigg)
\delta_{t+n} \gamma_{t+n+1} \lambda_{t+n+1} \big( G^{\lambda}_{t+n+1} - \hat{v}(S_{t+n+1}) \big)
\end{aligned}
$$

When the bias is zero, these cross terms disappear (because in expectation $\delta_{t}$ is zero for all times $t$).
When the bias is sufficiently small, the combination of discounting and randomness tends to diminish their contribution as well.
As the bias gets larger, or if it is correlated (persistently over/underestimating the value function), the difference between the $\delta^2$-return and the second moment of the $\delta$-return grows rapidly.

## Checking results for LFA

We check what happens when we use the value function derived under LFA with our simple representation.

Note that this assumes we are able to estimate the $\delta$-return, its second moment, and the $\delta^2$-return exactly, which is likely not the case in the 'real-world' given that we had to make an approximation in order to get the value function.

### TD(λ) Fixed Point

We first examine the various quantities using the value function that TD(λ) converges to.

In [53]:
# TD(λ) fixed point for the bootstrapping matrix `L` currently in scope
K_lam = pinv(I - P @ G @ L)
w_hat = pinv(X.T @ D @ K_lam @ (I - P @ G) @ X) @ X.T @ D @ K_lam @ r
v_hat = X @ w_hat

# How far the approximation is from the true value function
bias = v_pi - v_hat

# TD error for each transition i-->j:
#   Δ[i,j] = R[i,j] + γ_j * v_hat[j] - v_hat[i]
# (row vector broadcasts over j, column vector over i)
Δ = R + gvec*v_hat - v_hat[:, np.newaxis]

ones = np.ones(ns)

# Expected TD error and expected squared TD error, per starting state
δ = (P*Δ) @ ones
δ_sq = (P * Δ**2) @ ones

# Inverse shared by the two squared-discount Bellman equations below
K_sq = pinv(I - P @ G @ G)

# δ-return and δ^2-return
gd = pinv(I - P @ G) @ δ
gd_sq = K_sq @ δ_sq

# Second moment of the δ-return: the expected squared TD error plus the
# cross term with the next state's δ-return
dd = δ_sq + (2*P @ G * Δ) @ gd
gd_second = K_sq @ dd

# Print summary of results
summary = [
    ("v_π:\n", v_pi),
    ("v_hat:\n", v_hat),
    ("bias:\n", bias),
    ("δ-return:\n", gd),
    ("δ^2-return:\n", gd_sq),
    ("Second moment expected 'reward' (r-bar):\n", dd),
    ("Second moment of delta-return:\n", gd_second),
    ("Variance of return:\n", v_var),
    ("Expected squared error:\n", bias**2 + v_var),
]
for heading, values in summary:
    print(heading, values)

v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7646 -1.6445 -1.5414 -1.4214 -0.5     0.    ]
bias:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.     -0.    ]
δ-return:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.      0.    ]
δ^2-return:
 [ 0.8254  0.6844  0.4959  0.1904  0.25    0.    ]
Second moment expected 'reward' (r-bar):
 [ 0.5827  0.4901  0.2884  0.0892  0.25    0.    ]
Second moment of delta-return:
 [ 0.8412  0.6381  0.3655  0.1904  0.25    0.    ]
Variance of return:
 [ 0.8412  0.6353  0.3654  0.1519  0.25    0.    ]
Expected squared error:
 [ 0.8412  0.6381  0.3655  0.1904  0.25    0.    ]


### Least-Squares Fixed Point

We now check the results using the least-squares approximation as our value function.
This will be more accurate (less biased) almost by definition.

In [42]:
# Use the least-squares weights (computed in an earlier cell) as the
# approximate value function for this analysis
w_hat = w_lsq
v_hat = X @ w_hat

# How far the approximation is from the true value function
bias = v_pi - v_hat

# TD error for each transition i-->j:
#   Δ[i,j] = R[i,j] + γ_j * v_hat[j] - v_hat[i]
Δ = R + gvec*v_hat - v_hat[:, np.newaxis]

ones = np.ones(ns)

# Expected TD error and expected squared TD error, per starting state
δ = (P*Δ) @ ones
δ_sq = (P * Δ**2) @ ones

# Inverse shared by the squared-discount Bellman equations below
K_sq = pinv(I - P @ G @ G)

# δ-return and δ^2-return
gd = pinv(I - P @ G) @ δ
gd_sq = K_sq @ δ_sq

# Second moment of the δ-return
dd = δ_sq + (2*P @ G * Δ) @ gd
gd_second = K_sq @ dd

# Sobel's variance equation with v_hat substituted for v_pi
T_hat = (P * (R + G @ v_hat)**2) @ ones - v_hat**2
var_hat = K_sq @ T_hat

# Print summary of results
summary = [
    ("v_π:\n", v_pi),
    ("v_hat:\n", v_hat),
    ("bias:\n", bias),
    ("δ-return:\n", gd),
    ("δ^2-return:\n", gd_sq),
    ("Variance of return using v_hat:\n", var_hat),
    ("Second moment expected 'reward' (r-bar):\n", dd),
    ("Second moment of delta-return:\n", gd_second),
    ("Variance of return:\n", v_var),
    ("Expected squared error:\n", bias**2 + v_var),
]
for heading, values in summary:
    print(heading, values)

v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7815 -1.6634 -1.4819 -1.3638 -0.5     0.    ]
bias:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.     -0.    ]
δ-return:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.      0.    ]
δ^2-return:
 [ 0.8175  0.6323  0.4633  0.1711  0.25    0.    ]
Variance of return using v_hat:
 [ 0.7436  0.74    0.7007 -0.2074  0.25    0.    ]
Second moment expected 'reward' (r-bar):
 [ 0.5837  0.4865  0.3009  0.0699  0.25    0.    ]
Second moment of delta-return:
 [ 0.8415  0.6365  0.3702  0.1711  0.25    0.    ]
Variance of return:
 [ 0.8412  0.6353  0.3654  0.1519  0.25    0.    ]
Expected squared error:
 [ 0.8415  0.6365  0.3702  0.1711  0.25    0.    ]


## Can we approximate the δ-return with the same features?

In short, the answer is 'no, not really'.
This can be shown from some linear algebra, but in essence the problem is that if you were able to estimate how biased your predictor was with certainty, then you wouldn't need to be biased at all.

We are still not computing these quantities online. 
The current approach is similar to a batch setting, where for some reason we have access to the full problem, but elect to approximate the value function and then also approximate these quantities as well.

We use the value function from the TD(0) fixed point, and estimate the δ- and δ^2-returns using the TD(1) fixed point.

This allows us to arrive at a nonzero approximation of the δ-return.

In [76]:
# Compute TD(0) fixed point for the value function.
# The λ matrices are kept in cell-local names instead of rebinding the
# notebook-level `L`, so cells that read `L` give the same answer regardless
# of whether (or when) this cell has been run.
L_value = np.zeros((ns, ns))
K_value = pinv(I - P @ G @ L_value)
A_inv = pinv(X.T @ D @ K_value @ (I - P @ G) @ X) @ X.T @ D
w_hat = A_inv @ K_value @ r
v_hat = X @ w_hat

# Bias of approximate value function
bias = v_pi - v_hat

# TD error matrix, for error given transition i-->j:
#   Δ[i,j] = R[i,j] + γ_j * v_hat[j] - v_hat[i]
Δ = R + gvec*v_hat - v_hat[:, np.newaxis]

# Expected TD-error
δ = (P*Δ) @ np.ones(ns)

# Expected δ^2
δ_sq = (P * Δ**2) @ np.ones(ns)

# δ-return
gd = pinv(I - P @ G) @ δ

# δ^2-return
gd_sq = pinv(I - P @ G @ G) @ δ_sq

# Second moment of δ-return
dd = (P * Δ**2) @ np.ones(ns) + (2*P @ G * Δ) @ gd
gd_second = pinv(I - P @ G @ G)@(dd)

################################################################################
# LFA for δ-return, δ^2-return, and moments

# TD(1) is used to approximate the δ- and δ^2-returns
L_delta = np.eye(ns)
K_delta = pinv(I - P @ G @ L_delta)
wd_hat = pinv(X.T @ D @ K_delta @ (I - P @ G) @ X) @ X.T @ D @ K_delta @ δ
vd_hat = X @ wd_hat

# NOTE (carried over from the original): this construction may not be right
Gbar = G @ G
Lbar = np.eye(ns)
K_sq = pinv(I - P @ Gbar @ Lbar)
A_sq_inv = pinv(X.T @ D @ K_sq @ (I - P @ Gbar) @ X) @ X.T @ D
wdsq_hat = A_sq_inv @ K_sq @ δ_sq
vdsq_hat = X @ wdsq_hat

print("v_π:\n", v_pi)
print("v_hat:\n", v_hat)
print("bias:\n", v_pi-v_hat)
print("δ-return:\n", gd)
print("δ^2-return:\n", gd_sq)
print("Approximate δ-return:\n", vd_hat)
print("Approximate δ^2-return:\n", vdsq_hat)

v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7646 -1.6445 -1.5414 -1.4214 -0.5     0.    ]
bias:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.     -0.    ]
δ-return:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.      0.    ]
δ^2-return:
 [ 0.8254  0.6844  0.4959  0.1904  0.25    0.    ]
Approximate δ-return:
 [-0.0169 -0.0188  0.0596  0.0576 -0.      0.    ]
Approximate δ^2-return:
 [ 0.8364  0.6625  0.4521  0.2781  0.25    0.    ]


## Variations on Approximation Approach

We note that it is not necessary to use the same bootstrapping parameter (λ) when approximating the various quantities of interest.

Using a different $\lambda$ *might* allow us to approximate the $\delta$-return with the same representation, although intuitively it seems like it still won't be possible to capture everything.

In [74]:
# Value function: TD(λ) fixed point with its own bootstrapping matrix L1
L1 = np.eye(ns)
K1 = pinv(I - P @ G @ L1)
A_inv = pinv(X.T @ D @ K1 @ (I - P @ G) @ X) @ X.T @ D
w_hat = A_inv @ K1 @ r
v_hat = X @ w_hat

# How far the approximation is from the true value function
bias = v_pi - v_hat

# TD error for each transition i-->j:
#   Δ[i,j] = R[i,j] + γ_j * v_hat[j] - v_hat[i]
Δ = R + gvec*v_hat - v_hat[:, np.newaxis]

ones = np.ones(ns)

# Expected TD error and expected squared TD error, per starting state
δ = (P*Δ) @ ones
δ_sq = (P * Δ**2) @ ones

# Inverse shared by the squared-discount Bellman equations below
K_gg = pinv(I - P @ G @ G)

# δ-return and δ^2-return
gd = pinv(I - P @ G) @ δ
gd_sq = K_gg @ δ_sq

# Second moment of the δ-return
dd = δ_sq + (2*P @ G * Δ) @ gd
gd_second = K_gg @ dd

################################################################################
# LFA for the δ-return and δ^2-return, each with its own (possibly
# different) bootstrapping matrix

# δ-return: bootstrapping matrix L2
L2 = np.zeros((ns, ns))
K2 = pinv(I - P @ G @ L2)
wd_hat = pinv(X.T @ D @ K2 @ (I - P @ G) @ X) @ X.T @ D @ K2 @ δ
vd_hat = X @ wd_hat

# δ^2-return: bootstrapping matrix L3, folded into the squared discount.
# This may not be right
L3 = np.zeros((ns, ns))
Gbar = G @ G @ L3 @ L3
Lbar = np.eye(ns)
K_bar = pinv(I - P @ Gbar @ Lbar)
A_sq_inv = pinv(X.T @ D @ K_bar @ (I - P @ Gbar) @ X) @ X.T @ D
wdsq_hat = A_sq_inv @ K_bar @ δ_sq
vdsq_hat = X @ wdsq_hat

# Print the λ settings followed by the exact and approximate quantities
summary = [
    ("λ for v_hat:\n", np.diag(L1)),
    ("λ for δ-return:\n", np.diag(L2)),
    ("λ for δ^2-return:\n", np.diag(L3)),
    ("v_π:\n", v_pi),
    ("v_hat:\n", v_hat),
    ("bias:\n", bias),
    ("δ-return:\n", gd),
    ("δ^2-return:\n", gd_sq),
    ("Approximate δ-return:\n", vd_hat),
    ("Approximate δ^2-return:\n", vdsq_hat),
]
for heading, values in summary:
    print(heading, values)

λ for v_hat:
 [ 1.  1.  1.  1.  1.  1.]
λ for δ-return:
 [ 0.  0.  0.  0.  0.  0.]
λ for δ^2-return:
 [ 0.  0.  0.  0.  0.  0.]
v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7815 -1.6634 -1.4819 -1.3638 -0.5     0.    ]
bias:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.     -0.    ]
δ-return:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.      0.    ]
δ^2-return:
 [ 0.8175  0.6323  0.4633  0.1711  0.25    0.    ]
Approximate δ-return:
 [ 0.0169  0.0188 -0.0596 -0.0576  0.      0.    ]
Approximate δ^2-return:
 [ 0.5752  0.417   0.3387  0.1805  0.25    0.    ]


### Results

Here we present some selected results for using different values of $\lambda$.

#### TD(0) for value function, TD(0) for δ- and δ^2-returns

```
λ for v_hat:
 [ 0.  0.  0.  0.  0.  0.]
λ for δ-return:
 [ 0.  0.  0.  0.  0.  0.]
λ for δ^2-return:
 [ 0.  0.  0.  0.  0.  0.]
v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7646 -1.6445 -1.5414 -1.4214 -0.5     0.    ]
bias:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.     -0.    ]
δ-return:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.      0.    ]
δ^2-return:
 [ 0.8254  0.6844  0.4959  0.1904  0.25    0.    ]
Approximate δ-return:
 [-0. -0.  0.  0. -0.  0.]
Approximate δ^2-return:
 [ 0.5659  0.4482  0.3481  0.2304  0.25    0.    ]
```

The δ-return is not approximable in this case, as expected.

The δ^2-return is not particularly well approximated in this case.
It appears that the errors in the early states are higher because they incorporate both the inaccuracies in the approximate value function *and* bootstrap from states which have errors themselves.

#### TD(1) for value function, TD(1) for δ- and δ^2-returns

```
λ for v_hat:
 [ 1.  1.  1.  1.  1.  1.]
λ for δ-return:
 [ 1.  1.  1.  1.  1.  1.]
λ for δ^2-return:
 [ 1.  1.  1.  1.  1.  1.]
v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7815 -1.6634 -1.4819 -1.3638 -0.5     0.    ]
bias:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.     -0.    ]
δ-return:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.      0.    ]
δ^2-return:
 [ 0.8175  0.6323  0.4633  0.1711  0.25    0.    ]
Approximate δ-return:
 [ 0.  0.  0.  0.  0.  0.]
Approximate δ^2-return:
 [ 0.8246  0.6181  0.4348  0.2282  0.25    0.    ]
```

As expected, the δ-return is not approximable in this case where all the parameters are the same.

The δ^2-return is approximated rather well in this case.

#### TD(0) for value function, TD(1) for δ- and δ^2-returns

```
λ for v_hat:
 [ 0.  0.  0.  0.  0.  0.]
λ for δ-return:
 [ 1.  1.  1.  1.  1.  1.]
λ for δ^2-return:
 [ 1.  1.  1.  1.  1.  1.]
v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7646 -1.6445 -1.5414 -1.4214 -0.5     0.    ]
bias:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.     -0.    ]
δ-return:
 [ 0.0005 -0.0535 -0.0098  0.1964 -0.      0.    ]
δ^2-return:
 [ 0.8254  0.6844  0.4959  0.1904  0.25    0.    ]
Approximate δ-return:
 [-0.0169 -0.0188  0.0596  0.0576 -0.      0.    ]
Approximate δ^2-return:
 [ 0.8364  0.6625  0.4521  0.2781  0.25    0.    ]
```

The approximation for the δ-return is nonzero, but it's not particularly robust. 

The δ^2-return is approximated rather well.

#### TD(1) for value function, TD(0) for δ- and δ^2-returns

```
λ for v_hat:
 [ 1.  1.  1.  1.  1.  1.]
λ for δ-return:
 [ 0.  0.  0.  0.  0.  0.]
λ for δ^2-return:
 [ 0.  0.  0.  0.  0.  0.]
v_π:
 [-1.7641 -1.6981 -1.5513 -1.225  -0.5    -0.    ]
v_hat:
 [-1.7815 -1.6634 -1.4819 -1.3638 -0.5     0.    ]
bias:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.     -0.    ]
δ-return:
 [ 0.0173 -0.0347 -0.0694  0.1388  0.      0.    ]
δ^2-return:
 [ 0.8175  0.6323  0.4633  0.1711  0.25    0.    ]
Approximate δ-return:
 [ 0.0169  0.0188 -0.0596 -0.0576  0.      0.    ]
Approximate δ^2-return:
 [ 0.5752  0.417   0.3387  0.1805  0.25    0.    ]
```

The δ-return is approximable, and apparently it's not too far off from the true δ-return.
More analysis would be necessary to say whether this is the case in general.

The δ^2-return is not well approximated, as might be expected given that TD(0) relies on having a good estimate of subsequent states in order to bootstrap properly.
Interestingly, the approximate value is very similar to the approximation for the δ^2-return with TD(0) for the value function and TD(0) for the δ^2-return listed above.

# Variance Computations With Approximate Value Function

We have some unfinished work examining whether it's either possible or useful to compute the variance with an approximate value function in lieu of $v_{\pi}$.

From a cursory examination that does not appear to be the case, because we can easily end up with negative values and rather large deviations from the 'true' variance of the return.

What this means is unclear at present.

In [None]:
# TD(λ)
# NOTE(review): `L` is not set in this cell; confirm which λ it currently
# holds before interpreting the results.
K_lam = pinv(I - P @ G @ L)
w_hat = pinv(X.T @ D @ K_lam @ (I - P @ G) @ X) @ X.T @ D @ K_lam @ r
v_hat = X @ w_hat

# Least-squares (alternative choice of value function)
# w_hat = w_lsq
# v_hat = X @ w_hat

# Using the VTD paper to calculate second moments of the return,
# with the approximate value function v_hat in place of v_pi.

# Specify parameters (this rebinds the notebook-level `lvec`)
lvec = np.ones(ns)
# lvec = np.zeros(ns)

# R-bar transition matrix:
#   Rbar[i,j] = R[i,j]^2 + 2 γ_j λ_j R[i,j] v_hat[j]
# The 1-D arrays broadcast along the last axis, i.e. over the next state j.
Rbar = R**2 + 2*gvec*lvec*R*v_hat

# r-bar vector: expectation of R-bar under the transition probabilities
rbar = np.sum(P*Rbar, axis=1)

# P-bar: Pbar[i,j] = P[i,j] γ_j^2 λ_j^2
Pbar = P * gvec**2 * lvec**2

# Calculate second moment of return
r_second = pinv(I - Pbar) @ rbar

# Variance using Sobel's method and approximate value function
T_hat = (P * (R + G @ v_hat)**2) @ np.ones(ns) - v_hat**2

# Solve Bellman equation for variance of return
var_hat = pinv(I - P @ G @ G) @ T_hat

# Print the results
print("Second moment of return:\n", r_second)
print("Estimated variance via second moment of return:\n", r_second - v_hat**2)
print("Sobel variance (using v_hat):\n", var_hat)
print("Sobel variance:\n", v_var)