In [1]:
# install/import lspi package
try:
    import lspi
except ImportError:
    !pip install git+https://github.com/qdevpsi3/rl-lspi.git
    import lspi

In [2]:
import numpy as np

# Experiment 1 : LSPI on the *Chain-Walk* environment
---------- 
First, we create a function that computes the *Hamming* distance between the optimal policy and the agent policy.

In [3]:
def hamming(agent, optimal_policy):
    """Count the states in which the agent's action differs from the reference.

    Args:
        agent: object exposing ``env.observation_space.n`` (the number of
            states) and ``predict(state)`` (the action picked in a state).
        optimal_policy: array holding one reference action per state,
            indexed by state.

    Returns:
        The number of states whose predicted action is not the reference
        action (the Hamming distance between the two policies).
    """
    n_states = agent.env.observation_space.n
    # Query the agent once per state, in increasing state order.
    predicted_actions = np.array(list(map(agent.predict, range(n_states))))
    differs = optimal_policy != predicted_actions
    return np.sum(differs)

## Experiment 1.1 : *Chain-Walk(4)* with Polynomial Features 
For the first experiment, we perform *LSPI* on 4 states with polynomial features of the form $(1,s,s^2)$ for each action.
We also preprocess the states since they are numbered $\{0,1,2,3\}$ instead of $\{1,2,3,4\}$.

In [4]:
# environment: chain of 4 states
nS = 4
env = lspi.envs.ChainWalkEnv(nS)

# agent: polynomial features up to degree 2, fed states shifted by +1
degree = 2
preprocess_obs = lambda x: x + 1
agent = lspi.agents.PolynomialAgent(env, degree, preprocess_obs)

# trainer settings
gamma = 0.9
memory_size = 500
memory_type = 'sample'
eval_type = 'sherman_morrison'
baseline = lspi.baselines.LSPolicyIteration(
    env, agent, gamma, memory_size, memory_type, eval_type)

# reference policy, one action per state: [1, 1, 0, 0]
optimal_policy = np.repeat([1, 0], 2)

# the memory is built once, before the first training step
baseline.init_memory()

# run the algorithm; iteration 0 reports the agent before any training step
n_iter = 5
for it in range(n_iter + 1):
    if it > 0:
        baseline.train_step()
    dist = hamming(agent, optimal_policy)
    print('iteration = {:02d} - distance to optimal policy : {}'.format(
        it, dist))

iteration = 00 - distance to optimal policy : 2
iteration = 01 - distance to optimal policy : 1
iteration = 02 - distance to optimal policy : 1
iteration = 03 - distance to optimal policy : 0
iteration = 04 - distance to optimal policy : 0
iteration = 05 - distance to optimal policy : 0


## Experiment 1.2 : *Chain-Walk(20)* with Polynomial Features 
For the second experiment, we perform *LSPI* on 20 states with polynomial features of the form $(1,s,s^2,s^3,s^4)$ for each action. A reward of +1 is given only at the boundaries.

In [5]:
def reward_function(s):
    """Return 1 when ``s + 1`` is a boundary state (1 or 20), otherwise 0.

    The state is shifted by one before the comparison, so the 0-based
    states 0 and 19 are the rewarded ones.
    """
    boundary_states = (1, 20)
    return 1 if (s + 1) in boundary_states else 0

In [6]:
# environment: chain of 20 states with the custom reward
nS = 20
env = lspi.envs.ChainWalkEnv(nS, reward_function=reward_function)

# agent: polynomial features up to degree 4, fed states shifted by +1
degree = 4
preprocess_obs = lambda x: x + 1
agent = lspi.agents.PolynomialAgent(env, degree, preprocess_obs)

# trainer settings
gamma = 0.9
memory_size = 5000
memory_type = 'sample'
eval_type = 'batch'
baseline = lspi.baselines.LSPolicyIteration(
    env, agent, gamma, memory_size, memory_type, eval_type)

# reference policy, one action per state: ten 0s followed by ten 1s
optimal_policy = np.repeat([0, 1], 10)

# the memory is built once, before the first training step
baseline.init_memory()

# run the algorithm; iteration 0 reports the agent before any training step
n_iter = 10
for it in range(n_iter + 1):
    if it > 0:
        baseline.train_step()
    dist = hamming(agent, optimal_policy)
    print('iteration = {:02d} - distance to optimal policy : {:02d}'.format(
        it, dist))

iteration = 00 - distance to optimal policy : 10
iteration = 01 - distance to optimal policy : 07
iteration = 02 - distance to optimal policy : 15
iteration = 03 - distance to optimal policy : 04
iteration = 04 - distance to optimal policy : 00
iteration = 05 - distance to optimal policy : 00
iteration = 06 - distance to optimal policy : 00
iteration = 07 - distance to optimal policy : 00
iteration = 08 - distance to optimal policy : 00
iteration = 09 - distance to optimal policy : 00
iteration = 10 - distance to optimal policy : 00


## Experiment 1.3 : *Chain-Walk(50)* with Polynomial Features 
For the third experiment, we perform *LSPI* on 50 states with polynomial features of the form $(1,s,s^2,s^3,s^4)$ for each action. A reward of +1 is given only at states $10$ and $41$. 

In [7]:
def reward_function(s):
    """Return 1 when ``s + 1`` is one of the rewarded states (10 or 41), else 0.

    The state is shifted by one before the comparison, so the 0-based
    states 9 and 40 are the rewarded ones.
    """
    rewarded_states = (10, 41)
    state_number = s + 1
    return int(state_number in rewarded_states)

In [8]:
# environment: chain of 50 states with the custom reward
nS = 50
env = lspi.envs.ChainWalkEnv(nS, reward_function=reward_function)

# agent: polynomial features up to degree 4, fed states shifted by +1
degree = 4
preprocess_obs = lambda x: x + 1
agent = lspi.agents.PolynomialAgent(env, degree, preprocess_obs)

# trainer settings
gamma = 0.9
memory_size = 10000
memory_type = 'sample'
eval_type = 'batch'
baseline = lspi.baselines.LSPolicyIteration(
    env, agent, gamma, memory_size, memory_type, eval_type)

# reference policy, one action per state:
# 9 x action 1, 16 x action 0, 16 x action 1, 9 x action 0
optimal_policy = np.repeat([1, 0, 1, 0], [9, 16, 16, 9])

# the memory is built once, before the first training step
baseline.init_memory()

# run the algorithm; iteration 0 reports the agent before any training step
n_iter = 10
for it in range(n_iter + 1):
    if it > 0:
        baseline.train_step()
    dist = hamming(agent, optimal_policy)
    print('iteration = {:02d} - distance to optimal policy : {:02d}'.format(
        it, dist))

iteration = 00 - distance to optimal policy : 23
iteration = 01 - distance to optimal policy : 10
iteration = 02 - distance to optimal policy : 08
iteration = 03 - distance to optimal policy : 08
iteration = 04 - distance to optimal policy : 08
iteration = 05 - distance to optimal policy : 08
iteration = 06 - distance to optimal policy : 08
iteration = 07 - distance to optimal policy : 08
iteration = 08 - distance to optimal policy : 08
iteration = 09 - distance to optimal policy : 08
iteration = 10 - distance to optimal policy : 08


## Experiment 1.4 : *Chain-Walk(50)* with RBF Features 
For the fourth experiment, we perform *LSPI* on 50 states using RBF features with $10$ means spread uniformly over the state space for each action and variance $\sigma=4$. A reward of +1 is given only at states $10$ and $41$. 

In [9]:
def reward_function(s):
    """Return 1 when ``s + 1`` equals 10 or 41, otherwise 0.

    The state is shifted by one before the comparison, so the 0-based
    states 9 and 40 are the rewarded ones.
    """
    state_number = s + 1
    if state_number == 10 or state_number == 41:
        return 1
    return 0

In [10]:
# environment: chain of 50 states with the custom reward
nS = 50
env = lspi.envs.ChainWalkEnv(nS, reward_function=reward_function)

# agent: radial features with 10 centers evenly spaced from 1 to 50
# (stored as a column, shape (10, 1)), fed states shifted by +1
centers = np.expand_dims(np.linspace(1, 50, 10), 1)
sigma = 4
preprocess_obs = lambda x: x + 1
agent = lspi.agents.RadialAgent(env, centers, sigma, preprocess_obs)

# trainer settings
gamma = 0.9
memory_size = 10000
memory_type = 'sample'
eval_type = 'batch'
baseline = lspi.baselines.LSPolicyIteration(
    env, agent, gamma, memory_size, memory_type, eval_type)

# reference policy, one action per state:
# 9 x action 1, 16 x action 0, 16 x action 1, 9 x action 0
optimal_policy = np.repeat([1, 0, 1, 0], [9, 16, 16, 9])

# the memory is built once, before the first training step
baseline.init_memory()

# run the algorithm; iteration 0 reports the agent before any training step
n_iter = 10
for it in range(n_iter + 1):
    if it > 0:
        baseline.train_step()
    dist = hamming(agent, optimal_policy)
    print('iteration = {:02d} - distance to optimal policy : {:02d}'.format(
        it, dist))

iteration = 00 - distance to optimal policy : 25
iteration = 01 - distance to optimal policy : 22
iteration = 02 - distance to optimal policy : 13
iteration = 03 - distance to optimal policy : 23
iteration = 04 - distance to optimal policy : 15
iteration = 05 - distance to optimal policy : 05
iteration = 06 - distance to optimal policy : 03
iteration = 07 - distance to optimal policy : 04
iteration = 08 - distance to optimal policy : 04
iteration = 09 - distance to optimal policy : 04
iteration = 10 - distance to optimal policy : 04
