In [4]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import pinv
import pandas as pd

np.set_printoptions(precision=4, suppress=True)

import mdpy as mdp
import mdpy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Analysis Functions

Here we attempt to encapsulate the process of approximating various quantities (e.g., the return, least-squares return, λ-return (under FA), and second moment) in discrete functions in order to speed up analysis and avoid errors from copy-pasting chunks of code that must be modified to work properly.

In [2]:
mdp.td_solution

['Number',
 '__all__',
 '__author__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_whitelist',
 'approx_stationary',
 'as_array',
 'bellman',
 'cols',
 'colsum',
 'det',
 'distribution_matrix',
 'etd_solution',
 'etd_weights',
 'find_nonterminals',
 'find_terminal_indices',
 'find_terminals',
 'followon',
 'get_all_stationary',
 'get_period',
 'inspect',
 'is_absorbing',
 'is_diagonal',
 'is_distribution',
 'is_ergodic',
 'is_matrix',
 'is_nonnegative',
 'is_periodic',
 'is_pvec',
 'is_reducible',
 'is_square',
 'is_stochastic',
 'is_substochastic',
 'least_squares',
 'linalg',
 'matrix_rank',
 'norm',
 'normalize',
 'np',
 'pinv',
 'potential',
 'propagator',
 'rand_p',
 'random_binary',
 'random_mdp',
 'reduce',
 'rows',
 'rowsum',
 'scipy',
 'solve',
 'someones',
 'somezeros',
 'stationary',
 'td_solution',
 'td_weights',
 'unit',
 'util',
 'warp']

# Example MDP

In [64]:
# MDP solved analytically
ns = 6
I = np.eye(ns)

# Probability of transitioning from state s_i --> s_j = P[i,j]
P = np.diag(np.ones(ns-1), 1) * 0.5
P[:,0] = 0.5
P[-1, 0] = 1

# Expected reward for transitioning from s_i --> s_j = R[i,j]
R = np.zeros((ns, ns))
# -1 Reward for non-terminal transitions
R[:,:] = -1
# Reaching edge has zero reward
R[-2, -1] = 0
# Transitions from terminal state have zero reward
R[-1,:] = 0
r = np.sum(P*R, axis=1)

# State-dependent discount
gvec = np.ones(ns)*0.9
gvec[0] = 0
G = np.diag(gvec)

# State-dependent bootstrapping
lvec = np.ones(ns)*0.0
L = np.diag(lvec)

# Value function (expected Monte Carlo return)
v_pi = pinv(I - P @ G) @ r

# Compute stationary distribution for transition matrix
d_pi = mdp.stationary(P)
D = np.diag(d_pi)

Y = np.array([
        [1, 0, 0, 0],
        [1, 0, 0, 1],
        [1, 0, 1, 0],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [0, 0, 0, 0]
    ])

In [50]:
def mc_return(P, r, Γ):
    assert(mdpy.is_stochastic(P))
    I = np.eye(len(P))
    return np.linalg.pinv(I - P @ Γ) @ r

def ls_weights(P, r, Γ, X):
    assert(mdpy.is_stochastic(P))
    assert(X.ndim == 2)
    assert(len(X) == len(P))
    value = mc_return(P, r, Γ)
    dist  = mdpy.stationary(P)
    D     = np.diag(dist)
    return np.linalg.pinv(X.T @ D @ X) @ X.T @ D @ value

def ls_values(P, r, Γ, X):
    weights = ls_weights(P, r, Γ, X)
    return X @ weights

def td_weights(P, r, Γ, Λ, X):
    assert(mdpy.is_stochastic(P))
    assert(X.ndim == 2)
    assert(len(X) == len(P))
    assert(mdp.is_diagonal(Γ))
    assert(mdp.is_diagonal(Λ))
    r_lm = (I - P @ Γ @ Λ) @ r
    P_lm = I - pinv(I - P @ Γ @ Λ) @ (I - P @ Γ)
    A = X.T @ D @ (I - P_lm) @ X
    b = X.T @ D @ r_lm
    return np.linalg.pinv(A) @ b

def td_values(P, r, Γ, Λ, X):
    return X @ td_weights(P, r, Γ, Λ, X)
    
def lambda_return(P, r, Γ, Λ, v_hat):
    # Incorporate next-state's value into expected reward
    r_hat = r + P @ Γ @ (I - Λ) @ v_hat
    # Solve the Bellman equation
    return np.linalg.pinv(I - P @ Γ @ Λ) @ r_hat

def sobel_variance(P, R, Γ):
    assert(mdpy.is_stochastic(P))
    assert(P.shape == R.shape)
    assert(mdp.is_diagonal(Γ))
    ns = len(P)
    r = (P * R) @ np.ones(ns)
    v_pi = mc_return(P, r, Γ)
    
    # Set up Bellman equation
    q = -v_pi**2
    for i in range(ns):
        for j in range(ns):
            q[i] += P[i,j]*(R[i,j] + Γ[j,j]*v_pi[j])**2
    # Solve Bellman equation
    return np.linalg.pinv(I - P @ Γ @ Γ) @ q

def second_moment(P, R, Γ, Λ):
    assert(mdpy.is_stochastic(P))
    assert(P.shape == R.shape)
    assert(mdp.is_diagonal(Γ))
    assert(mdp.is_diagonal(Λ))
    ns = len(P)
    # Here the MC-return is both the lambda return and its approximation
    v_lm = mc_return(P, r, Γ)
    γ = np.diag(Γ)
    λ = np.diag(Λ)
    
    # Compute reward-like transition matrix
    R_bar = np.zeros((ns, ns))
    for i in range(ns):
        for j in range(ns):
            R_bar[i,j] = R[i,j]**2 \
                + (γ[j] * (1-λ[j])*v_lm[j])**2 \
                + 2*( γ[j] * (1 - λ[j]) * R[i,j] * v_lm[j] ) \
                + 2*( γ[j] * λ[j] * R[i,j] * v_lm[j]) \
                + 2*( (γ[j]**2)*λ[j]*(1-λ[j]) * (v_lm[j]**2) )
    # Set up Bellman equation for second moment
    r_bar = (P * R_bar) @ np.ones(ns)
    
    # Solve the Bellman equation
    return np.linalg.pinv(I - P @ Γ @ Γ @ Λ @ Λ) @ r_bar

def lambda_second_moment(P, R, Γ, Λ, v_hat):
    assert(mdpy.is_stochastic(P))
    assert(P.shape == R.shape)
    assert(mdp.is_diagonal(Γ))
    assert(mdp.is_diagonal(Λ))
    ns = len(P)
    # Expected immediate reward
    r = (P * R) @ np.ones(ns)
    # Lambda return may be different from approximate lambda return
    v_lm = lambda_return(P, r, Γ, Λ, v_hat)
    
    # Get per-state discount and bootstrapping
    γ = np.diag(Γ)
    λ = np.diag(Λ)
    
    # Compute reward-like transition matrix
    R_bar = np.zeros((ns, ns))
    for i in range(ns):
        for j in range(ns):
            R_bar[i,j] = R[i,j]**2 \
                + (γ[j] * (1-λ[j])*v_lm[j])**2 \
                + 2*( γ[j] * (1 - λ[j]) * R[i,j] * v_hat[j] ) \
                + 2*( γ[j] * λ[j] * R[i,j] * v_lm[j]) \
                + 2*( (γ[j]**2)*λ[j]*(1-λ[j]) * (v_hat[j]*v_lm[j]) )
    # Set up Bellman equation for second moment
    r_bar = (P * R_bar) @ np.ones(ns)
    
    # Solve the Bellman equation
    return pinv(I - P @ Γ @ Γ @ Λ @ Λ) @ r_bar

In [53]:
sobel_variance(P, R, G)

array([ 0.8412,  0.6353,  0.3654,  0.1519,  0.25  ,  0.    ])

In [55]:
lambda_second_moment(P, R, G, np.eye(ns)*1, v_pi) - v_pi**2

array([ 0.8412,  0.6353,  0.3654,  0.1519,  0.25  ,  0.    ])

In [73]:
v_hat = v_pi.copy()
v_hat[np.abs(v_hat) > 1e-6] += 0.5
L_hat = np.eye(ns)*0
lambda_second_moment(P, R, G, L_hat, v_hat) - v_hat**2

array([ 1.3591,  1.2231,  0.9524,  0.5756,  0.5   , -0.    ])

In [None]:
v_hat = v_pi.copy()
v_hat[np.abs(v_hat) > 1e-6] += 0.5
L_hat = np.eye(ns)*0
lambda_second_moment(P, R, G, L_hat, v_hat) - v_hat**2
# TODO: ccompare w/ computed lambda return instead of vhat

In [74]:
second_moment(P, R, G, L) - mc_return(P, r, G)**2

array([ 0.5839,  0.4873,  0.3039,  0.0506,  0.25  , -0.    ])

In [23]:
td_values(P, r, G, L, Y)

array([-1.7646, -1.6445, -1.5414, -1.4214, -0.5   ,  0.    ])

In [14]:
Y.shape

(6, 4)

In [8]:
mc_return(P, r, G)

array([-1.7641, -1.6981, -1.5513, -1.225 , -0.5   , -0.    ])

In [9]:
v_pi

array([-1.7641, -1.6981, -1.5513, -1.225 , -0.5   , -0.    ])

In [25]:
pinv(I - P @ G @ L) @ (r + P @ G @ (I - L) @ v_pi)

array([-1.7641, -1.6981, -1.5513, -1.225 , -0.5   ,  0.    ])

In [29]:
lambda_return(P, r, G, L, v_pi + 0.5)

array([-1.5391, -1.4731, -1.3263, -1.    , -0.275 ,  0.    ])

In [32]:
v_pi + 0.5

array([-1.2641, -1.1981, -1.0513, -0.725 , -0.    ,  0.5   ])