In [1]:
import tensorflow as tf

import numpy as np

from pilco.environments import Environment
from pilco.policies import RBFPolicy
from pilco.costs import EQCost
from pilco.agents import EQGPAgent

from tqdm.notebook import trange

In [10]:
dtype = tf.float64

# Create pendulum environment from Gym
env = Environment(name='Pendulum-v0')
env.reset()

# Create stuff for our controller
# Upright position, stationary and zero action position for pendulum is [0, 0, 0]
target_loc = tf.zeros([1, 2])
target_scale = 1.

eq_cost = EQCost(target_loc=target_loc,
                 target_scale=target_scale,
                 dtype=dtype)

# create EQ policy
eq_policy = RBFPolicy(state_dim=2,
                      action_dim=1,
                      num_rbf_features=5,
                      dtype=dtype)

# create agent
eq_agent = EQGPAgent(state_dim=2,
                     action_dim=1,
                     policy=eq_policy,
                     cost=eq_cost,
                     dtype=dtype)

In [11]:
num_episodes = 1
num_steps = 10

eq_agent.policy.reset()

for episode in range(num_episodes):
    
    print(f"Episode {episode + 1}")
    
    state = env.reset()
    
    for step in trange(num_steps):
        
        action = eq_agent.act(state)
        state, action, next_state = env.step(action[None].numpy())
        eq_agent.observe(state, action, next_state)
        
        #env.env.render()
        
        print(f"state, action, next_state: {state}, {action}, {next_state}")
        
    

Episode 1


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

state, action, next_state: [-2.01967675 -0.72846017], [-0.1438447], [-2.0909636  -1.42573701]
state, action, next_state: [-2.0909636  -1.42573701], [-0.1438447], [-2.19586939 -2.09811575]
state, action, next_state: [-2.19586939 -2.09811575], [0.06729627], [-2.33067997 -2.69621159]
state, action, next_state: [-2.33067997 -2.69621159], [0.12072712], [-2.49176945 -3.22178965]
state, action, next_state: [-2.49176945 -3.22178965], [0.09452371], [-2.67483922 -3.66139533]
state, action, next_state: [-2.67483922 -3.66139533], [0.05734963], [-2.87435346 -3.99028491]
state, action, next_state: [-2.87435346 -3.99028491], [0.03175944], [-3.08353212 -4.18357323]
state, action, next_state: [-3.08353212 -4.18357323], [0.01818783], [-3.29475042 -4.22436599]
state, action, next_state: [-3.29475042 -4.22436599], [0.01200873], [-3.50015767 -4.10814491]
state, action, next_state: [-3.50015767 -4.10814491], [0.00982572], [-3.69233131 -3.8434729 ]



In [12]:
horizon = 10

init_state = tf.constant([[-np.pi, 0.]], dtype=tf.float64)
init_cov = tf.eye(2, dtype=tf.float64)

loc = init_state
cov = init_cov

cost = 0.

for t in range(horizon):

    mean_full, cov_full = eq_agent.policy.match_moments(loc, cov)

    loc, cov = eq_agent.match_moments(mean_full, cov_full)

    cost = cost + eq_agent.cost.expected_cost(loc[None, :], cov)

In [13]:
cost

<tf.Tensor: shape=(), dtype=float64, numpy=6.448761337840721>