In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import random
import tempfile
import tensorflow as tf
import time

from collections import defaultdict

In [3]:
from tf_rl.controller import ContinuousDeepQ
from tf_rl.models     import MLP
from tf_rl.simulation import DoublePendulum
from tf_rl            import simulate

In [4]:
LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)

/tmp/tmp7xgacf8j


In [5]:
DOUBLE_PENDULUM_PARAMS = {
    'g_ms2': 9.8, # acceleration due to gravity, in m/s^2
    'l1_m': 1.0, # length of pendulum 1 in m
    'l2_m': 2.0, # length of pendulum 2 in m
    'm1_kg': 1.0, # mass of pendulum 1 in kg
    'm2_kg': 1.0, # mass of pendulum 2 in kg
    'damping': 0.4,
    'max_control_input': 20.0
}

In [8]:
# Tensorflow business - it is always good to reset a graph before creating a new controller.
if 'session' in globals():
    session.close()
tf.ops.reset_default_graph()
session = tf.InteractiveSession()

# This little guy will let us run tensorboard
#      tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)

# Brain maps from observation to Q values for different actions.
# Here it is a done using a multi layer perceptron with 2 hidden
# layers
actor  = MLP([DoublePendulum.observation_size,], [30, 30, 1], 
            [tf.tanh, tf.tanh, tf.tanh], scope="actor")
critic = MLP([DoublePendulum.observation_size, DoublePendulum.action_size], [30, 30, 1], 
            [tf.tanh, tf.tanh, tf.identity], scope="critic")

# The optimizer to use. Here we use RMSProp as recommended
# by the publication
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

# DiscreteDeepQ object
current_controller = ContinuousDeepQ(DoublePendulum.observation_size, DoublePendulum.action_size,
                                     actor, critic, optimizer, session,
                                     discount_rate=0.99, exploration_period=5000, max_experience=10000, 
                                     store_every_nth=4, train_every_nth=4,
                                     exploration_sigma=0.01,
                                     summary_writer=journalist)

session.run(tf.initialize_all_variables())
session.run(current_controller.update_all_targets)
# graph was not available when journalist was created  
journalist.add_graph(session.graph_def)

In [9]:
fast_mode = False

if fast_mode:
    FPS, SPEED, RES = 5, 20.0, 0.03
else:
    FPS, SPEED, RES = 30, 1., 0.03

try:
    while True:
        d = DoublePendulum(DOUBLE_PENDULUM_PARAMS)
        simulate(d, current_controller, fps = FPS,
                 simulation_resultion=RES,
                 actions_per_simulation_second=10,
                 disable_training=False,
                 speed=SPEED,
                 run_for=10.0 / SPEED)
except KeyboardInterrupt:
    print("Interrupted")

TypeError: simulate() got an unexpected keyword argument 'simulation_resultion'

In [17]:
len(current_controller.experience)

10000

In [18]:
current_controller.iteration

143557

In [12]:
d = DoublePendulum(DOUBLE_PENDULUM_PARAMS)
d.joint_positions()

((0.0, 1.0), (0.0, 3.0))

In [13]:
d.collect_reward()

-1.0

In [14]:
d.observe()

array([ 0.,  0.,  0.,  0.])

In [15]:
current_controller.action(d.observe(), disable_exploration=False)

array([-0.9758904], dtype=float32)

In [36]:
current_controller.actions_executed_so_far

83667

In [21]:
noise_sigma = ContinuousDeepQ.linear_annealing(current_controller.actions_executed_so_far,
                                               current_controller.exploration_period,
                                               1.0,
                                               current_controller.exploration_sigma)

In [22]:
noise_sigma

0.01

In [39]:
current_controller.__class__ = ContinuousDeepQ