In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import pinv
import pandas as pd

np.set_printoptions(precision=4, suppress=True)

import mdpy as mdp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Chicken Example

In [2]:
# Closed-form solution of the chicken problem under the fixed 50/50 policy
ns = 6
I = np.eye(ns)

# P[i, j] is the probability of moving from state s_i to state s_j:
# every state steps forward with probability 0.5 or drops back to s_0 with
# probability 0.5, except the last state, which always goes to s_0.
P = 0.5 * np.diag(np.ones(ns - 1), 1)
P[:, 0] = 0.5
P[-1, 0] = 1

# R[i, j] is the expected reward on the transition s_i --> s_j;
# r[i] is the expected one-step reward from s_i.
R = np.zeros((ns, ns))
R[-2, -1] = 1.0
r = (P * R).sum(axis=1)

# State-dependent discount (applied on entering a state); entering s_0 cuts the return
gvec = np.full(ns, 0.9)
gvec[0] = 0
# gvec[-1] = 0
G = np.diag(gvec)

# Bootstrapping (all zeros here)
L = np.zeros((ns, ns))

# Function approximation: tabular (identity) features
X = np.eye(ns)


# Expected return of each state
v_pi = pinv(I - P @ G) @ r


# From Sobel: per-state one-step contribution to the variance of the return,
# T[i] = sum_j P[i,j] * (R[i,j] + gvec[j]*v_pi[j])**2 - v_pi[i]**2
T = -(v_pi ** 2)
for i, j in np.ndindex(ns, ns):
    T[i] += P[i, j] * (R[i, j] + gvec[j] * v_pi[j]) ** 2

# Alternatively,
# T = np.sum(P * (R + G @ v_pi)**2, axis=1) - v_pi**2

# Variance of the return (again from Sobel); the discount enters squared
v_var = pinv(I - P @ G @ G) @ T

print(v_pi)
print(T)
print(v_var)

[ 0.0205  0.0456  0.1013  0.225   0.5    -0.    ]
[ 0.0004  0.0021  0.0103  0.0506  0.25   -0.    ]
[ 0.013   0.0311  0.0718  0.1519  0.25   -0.    ]


In [13]:
class Chicken:
    """Chain environment for the chicken example.

    States are the integers ``0 .. ns - 1``. The agent starts in
    ``initial_state`` and at each step either advances one state along the
    chain or returns to the start. Advancing from the last state yields a
    reward of 1 and wraps back to the start; every other transition yields 0.
    """
    # Action name -> integer code accepted by `do`
    ACTIONS = {'advance': 0, 'return': 1}
    initial_state = 0

    def __init__(self, ns):
        """Create a chain with `ns` states.

        Raises:
            ValueError: if `ns` is smaller than 1 (the chain would have no
                last state, so the reward could never be collected).
        """
        if ns < 1:
            raise ValueError("ns must be at least 1, got {!r}".format(ns))
        # Index of the last state in the chain
        self.length = ns - 1
        self.reset()

    def do(self, action):
        """Apply `action` and return ``(reward, next_state)``.

        Raises:
            ValueError: if `action` is not one of the codes in `ACTIONS`.
        """
        if action == self.ACTIONS['advance']:
            # Advancing past the end of the chain wraps around to the start
            if self._state == self.length:
                sp = self.initial_state
            else:
                sp = self._state + 1
        elif action == self.ACTIONS['return']:
            sp = self.initial_state
        else:
            raise ValueError("Invalid action: {!r}".format(action))
        # The reward depends on the state *before* the transition
        r = self.reward(self._state, action, sp)
        self._state = sp
        return r, sp

    def reward(self, s, a, sp):
        """Return 1 when advancing from the last state, otherwise 0."""
        if s == self.length and a == self.ACTIONS['advance']:
            return 1
        else:
            return 0

    def reset(self):
        """Put the environment back into `initial_state`."""
        self._state = self.initial_state

    def observe(self):
        """Return the current state."""
        return self._state

In [14]:
def generate_steps(env, pol, nsteps):
    """Roll out `pol` in `env` for `nsteps` steps.

    Args:
        env: environment exposing ``reset()``, ``observe()`` and
            ``do(action)``, where ``do`` returns ``(reward, next_state)``.
        pol: callable mapping an observed state to an action.
        nsteps: number of transitions to generate.

    Returns:
        A list of ``(s, a, r, sp)`` tuples, one per step, in the order taken.
    """
    ret = []
    env.reset()
    for _ in range(nsteps):
        s = env.observe()
        # Use the policy that was passed in, not whatever global `policy` exists
        a = pol(s)
        r, sp = env.do(a)

        ret.append((s, a, r, sp))
    return ret

def create_episodes(steps, *terminals):
    """Split a flat list of steps into episodes.

    An episode ends with the first step whose next state `sp` is one of
    `terminals`. Steps after the last terminal transition do not form a
    complete episode and are not returned.

    Args:
        steps: iterable of ``(s, a, r, sp)`` tuples.
        *terminals: states that end an episode when entered.

    Returns:
        A list of episodes, each a list of ``(s, a, r, sp)`` tuples.
    """
    terminal_states = set(terminals)
    episodes, current = [], []
    for s, a, r, sp in steps:
        current.append((s, a, r, sp))
        if sp not in terminal_states:
            continue
        # Entered a terminal state: close this episode and start a new one
        episodes.append(current)
        current = []
    return episodes

def compute_return(steps, gmfunc):
    """Replace each step's reward with the discounted return from that step.

    The return is accumulated backwards through `steps` as
    ``g = r + gmfunc(sp) * g``, starting from ``g = 0`` after the last step.

    Args:
        steps: sequence of ``(s, a, r, sp)`` tuples.
        gmfunc: callable giving the discount applied on entering state `sp`.

    Returns:
        A list of ``(s, a, g, sp)`` tuples in the same order as `steps`.
    """
    count = len(steps)
    out = [None] * count
    acc = 0
    # Walk from the final step to the first, filling results in place
    for idx in range(count - 1, -1, -1):
        s, a, r, sp = steps[idx]
        acc = r + gmfunc(sp) * acc
        out[idx] = (s, a, acc, sp)
    return out

def compute_lambda_return(steps, gmfunc, lmfunc, vfunc):
    """Replace each step's reward with the lambda-return from that step.

    The lambda-return is accumulated backwards through `steps` using the
    state-dependent recursion

        g = r + gmfunc(sp) * ((1 - lmfunc(sp)) * vfunc(sp) + lmfunc(sp) * g)

    starting from ``g = 0`` after the last step, which is the same
    truncation convention `compute_return` uses. With ``lmfunc`` identically
    1 this reduces to the plain discounted return; with ``lmfunc``
    identically 0 it gives the one-step bootstrapped target
    ``r + gmfunc(sp) * vfunc(sp)``.

    Args:
        steps: sequence of ``(s, a, r, sp)`` tuples.
        gmfunc: callable giving the discount applied on entering state `sp`.
        lmfunc: callable giving the bootstrapping parameter lambda for `sp`.
        vfunc: callable giving the value estimate of state `sp`.

    Returns:
        A list of ``(s, a, g, sp)`` tuples in the same order as `steps`.
    """
    ret = []
    g = 0
    for s, a, r, sp in reversed(steps):
        lm = lmfunc(sp)
        # Mix the bootstrapped value with the continuing return
        g = r + gmfunc(sp) * ((1 - lm) * vfunc(sp) + lm * g)
        ret.append((s, a, g, sp))
    ret.reverse()
    return ret

In [15]:
# Uniform random policy over the two actions
def policy(s):
    """Return 0 or 1 with equal probability; the state `s` is not used."""
    action = np.random.binomial(1, 0.5)
    return action

In [16]:
# Generate some steps
# Seed NumPy's global RNG (the one `policy` draws from) so that the sampled
# trajectory, and every statistic computed from it, is reproducible.
np.random.seed(0)
num_states = 5
slst = generate_steps(Chicken(num_states), policy, 100000)

In [17]:
def gamma(x):
    """Discount applied on entering state `x`: 0 for state 0, otherwise 0.9."""
    return 0.9 if x != 0 else 0

# Per-step returns, tabulated with one row per timestep
glst = compute_return(slst, gamma)
df = pd.DataFrame(data=glst, columns=['s', 'a', 'g', 'sp'])

In [20]:
# Group the per-step returns by the state they were observed from.
# (The top-level `pd.groupby` function no longer exists; use the method.)
grouped = df.groupby('s')

In [21]:
# Empirical mean return per state (string name avoids the deprecated NumPy-callable path)
grouped.aggregate({'g': 'mean'})

Unnamed: 0_level_0,g
s,Unnamed: 1_level_1
0,0.019572
1,0.043651
2,0.097591
3,0.219467
4,0.495833


In [22]:
# Empirical variance of the return per state. 'var' is pandas' groupby
# variance, which is what passing `np.var` has resolved to here.
grouped.aggregate({'g': 'var'})

Unnamed: 0_level_0,g
s,Unnamed: 1_level_1
0,0.012458
1,0.029917
2,0.06953
3,0.149378
4,0.250063


In [23]:
# Experimental variance as a NumPy array (one entry per observed state).
# `Series.as_matrix()` has been removed from pandas; `.values` is the
# equivalent that works across pandas versions.
var_exp = grouped['g'].var().values

In [35]:
# Tabular (one-hot) features: row t is the indicator vector of the state
# the agent was in at step t
I = np.eye(ns)
X = np.array([I[step[0]] for step in slst])
# Return observed at each timestep, aligned row-for-row with X
g = np.array([step[2] for step in glst])

In [54]:
# Compute the best approximate weights via least-squares.
# `rcond=None` selects NumPy's current default singular-value cutoff
# explicitly, which avoids the FutureWarning raised when it is omitted.
w_hat, *_ = np.linalg.lstsq(X, g, rcond=None)

In [55]:
# Display the fitted weights (one entry per column of X)
w_hat

array([ 0.0196,  0.0437,  0.0976,  0.2195,  0.4958,  0.    ])

In [50]:
# Squared deviation of each observed return from its fitted value,
# used as the per-timestep variance sample
g_hat = X @ w_hat
var_g = np.square(g - g_hat)

In [56]:
# Compute weights for per-feature variance.
# `rcond=None` selects NumPy's current default singular-value cutoff
# explicitly, which avoids the FutureWarning raised when it is omitted.
w_var, *_ = np.linalg.lstsq(X, var_g, rcond=None)

In [57]:
# Display the least-squares weights fitted to the squared residuals var_g
w_var

array([ 0.0125,  0.0299,  0.0695,  0.1494,  0.25  ,  0.    ])

In [58]:
# Display the per-state variance taken from the grouped returns
var_exp

array([ 0.0125,  0.0299,  0.0695,  0.1494,  0.2501])

In [59]:
# Display the analytically computed variance for comparison
v_var

array([ 0.013 ,  0.0311,  0.0718,  0.1519,  0.25  , -0.    ])