In [41]:
import numpy as np
import pandas as pd
from numpy.linalg import pinv

from agents import EmphaticTD, TD
from environments import ConveyorBelt, RandomPolicy
from empirical import make_episodes, apply_fa, apply_rfunc, expected_return, learn
from features import *
from mdptools import *

from IPython.display import display

We first generate a set of episodes in the conveyor belt environment by following a random policy.

In [24]:
# Environment size, defined once and passed to the environment so the two
# values cannot drift apart (previously the literal 2 was repeated).
# NOTE(review): assumes ConveyorBelt's positional argument is the number of
# states — confirm against environments.ConveyorBelt.
num_states = 2
env = ConveyorBelt(num_states)

# The policy is given an explicit random_seed (101).
pol = RandomPolicy(env, random_seed=101)

# Build the raw episodes from the environment and policy.
# (The first argument is presumably the number of episodes — confirm.)
raw_episodes = make_episodes(10, env, pol)

## Constant Feature, Constant Reward

In [35]:
# Feature function: a Bias() feature wrapped with the environment's terminals.
phi = Wrap(Bias(), terminals=env.terminals)

# Reward function: returns 1 for every (s, a, sp) transition it is called on.
_rfunc = lambda s, a, sp: 1
rfunc = Parameter(_rfunc, terminals=env.terminals)

# Apply the reward function to the raw episodes, then the feature function
# to the result.
episodes = apply_fa(apply_rfunc(raw_episodes, rfunc), phi)

In [37]:
# Expected return is computed from the raw episodes with only the reward
# function applied (no feature function).
rewarded_episodes = apply_rfunc(raw_episodes, rfunc)
expected_return(rewarded_episodes)

{(1,): 1.0, (2,): 2.0}

In [43]:
# Construct the TD agent, sized by the length of the feature function.
num_features = len(phi)
td = TD(num_features)

In [46]:
# Take the first step of the first episode and unpack it into
# (state, action, reward, next state).
ep = episodes[0]
step = ep[0]
s, a, r, sp = step

# Perform a single TD update by hand; the action `a` is not passed to it.
single_step_kwargs = dict(s=s, r=r, sp=sp, alpha=0.01, gamma=1.0, lmbda=0.0)
td.update(**single_step_kwargs)

1.0

In [55]:
# Hyperparameters passed unchanged to every call made by learn().
fixed_params = dict(gamma=1.0, lmbda=0.0, alpha=0.01)

# Train the existing `td` agent on the episodes, then print its parameters.
learn(td, episodes, fixed_params, repeats=1000)
print(td.theta)

[ 1.99]


In [54]:
# Display the TD agent's current `theta` array.
# NOTE(review): the saved output of this cell carries execution count 54,
# which is lower than the learn cell's 55, so it appears to predate that
# training run — re-run the notebook top to bottom to refresh it.
td.theta

array([ 0.21741113])