In [None]:
from __future__ import division
import pickle
import os
import types
import random
import uuid
import math
from copy import deepcopy as copy
import logging

import gym
from gym import spaces
from gym.envs.classic_control import rendering
import numpy as np
import tensorflow as tf
from scipy.misc import logsumexp
from baselines import deepq
import baselines.common.tf_util as U

In [None]:
from matplotlib import pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
%matplotlib inline

In [None]:
import matplotlib as mpl
mpl.rc('savefig', dpi=300)
mpl.rc('text', usetex=True)

In [None]:
logger = logging.getLogger()
assert len(logger.handlers) == 1
handler = logger.handlers[0]
handler.setLevel(logging.WARNING)

In [None]:
data_dir = os.path.join('data', '5.0-lander-ime')

In [None]:
sess = tf.Session()

create envs, pilot policies

In [None]:
throttle_mag = 0.75
def disc_to_cont(action):
  """Translate a discrete action index into a continuous [main, steer] command.

  Indices below 3 map to a main-engine value of -throttle_mag and indices
  below 6 to +throttle_mag; the steering value is -throttle_mag, 0 or
  +throttle_mag depending on action % 3. An np.ndarray is treated as an
  already-continuous command and handed back unchanged.

  Raises:
    ValueError: if action is not below 6.
  """
  if type(action) == np.ndarray:
    return action
  main_is_negative = action < 3
  if not (main_is_negative or action < 6):
    raise ValueError
  main = -throttle_mag if main_is_negative else throttle_mag
  # Remainder 0 steers negative, 1 is neutral, anything else steers positive.
  steer = {0: -throttle_mag, 1: 0}.get(action % 3, throttle_mag)
  return np.array([main, steer])

In [None]:
n_act_dim = 6
n_obs_dim = 9

In [None]:
max_ep_len = 1000

In [None]:
slow_fps = 60
fast_fps = 40

grid search

In [None]:
fan_fpses = np.arange(fast_fps - (slow_fps - fast_fps), slow_fps + 2, 2)

fan_confs = [{'fps': fps} for fps in fan_fpses]
n_ims = len(fan_confs)

In [None]:
aristotle_conf_idxes = [i for i, conf in enumerate(fan_confs) if conf['fps'] == 60]

In [None]:
n_ims, fan_fpses

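...or random search (alternative to the grid search above: these cells overwrite `fan_confs` and `n_ims`, so run only one of the two)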

In [None]:
conf_bounds = {
  'fps': (20, 60),
  'mep': (6.5, 19.5),
  'sep': (0.3, 0.9),
  'sea': (6, 18),
  'seh': (7, 21),
  'scale': (15, 45),
  'leg_down': (9, 27)
}
n_ims = 20

In [None]:
fan_confs = [{k: (v[0] + np.random.random() * (v[1] - v[0])) for k, v in conf_bounds.items()} for _ in range(n_ims)]

In [None]:
train_goals = np.arange(1, 10, 1).astype(int)
n_train_tasks = train_goals.size

In [None]:
def make_lander_env(fps=slow_fps, goal=None):
  """Create a LunarLanderContinuous-v2 env that accepts discrete actions.

  The unwrapped env's original `_step` is kept as `_step_orig` and replaced by
  a bound method that converts the discrete index with `disc_to_cont` first.
  `goal` and `fps` are stored as attributes on the unwrapped env.
  # NOTE(review): how the env consumes `goal` / `fps` is not visible here;
  # presumably a patched LunarLander reads them - confirm.
  """
  env = gym.make('LunarLanderContinuous-v2')
  lander = env.unwrapped
  lander.goal = goal
  env.action_space = spaces.Discrete(n_act_dim)
  # Keep a handle on the continuous-action step before overriding it.
  lander._step_orig = lander._step
  def _step(self, action):
    continuous_action = disc_to_cont(action)
    obs, r, done, info = self._step_orig(continuous_action)
    return obs, r, done, info
  lander._step = types.MethodType(_step, lander)
  lander.fps = fps
  return env

In [None]:
train_newton_envs = [make_lander_env(fps=fast_fps, goal=goal) for goal in train_goals]
train_aristotle_envs = [make_lander_env(fps=slow_fps, goal=goal) for goal in train_goals]

In [None]:
def run_ep(policy, env, max_ep_len=max_ep_len, render=False, task_idx=None):
  """Roll out `policy` in `env` for one episode and return the transitions.

  Args:
    policy: callable mapping an observation to an action.
    env: environment exposing `reset()`, `step(action)` and `render()`.
    max_ep_len: step budget; the loop bound is `max_ep_len + 1`, so at most
      that many transitions are recorded.
    render: when truthy, `env.render()` is called after every step.
    task_idx: opaque tag copied into every transition tuple.

  Returns:
    A list of `(obs_t, action, reward, obs_tp1, float(done), task_idx)` tuples,
    one per executed step.
  """
  obs = env.reset()
  done = False
  rollout = []
  for _ in range(max_ep_len+1):
    if done:
      break
    prev_obs = obs
    action = policy(obs)
    obs, r, done, _ = env.step(action)
    rollout.append((prev_obs, action, r, obs, float(done), task_idx))
    if render:
      env.render()
  return rollout

train agent with soft dqn

In [None]:
train_aristotle_env = make_lander_env(fps=slow_fps)

In [None]:
n_training_episodes = 800
load_pretrained_pilot = True

In [None]:
make_q_func = lambda: deepq.models.mlp([64, 64])
dqn_learn_kwargs = {
  'lr': 1e-3,
  'target_network_update_freq': 3000,
  'print_freq': 100,
  'max_timesteps': max_ep_len * (1 if load_pretrained_pilot else n_training_episodes)
}

In [None]:
with open(os.path.join(data_dir, 'aristotle_dqn_pilot_scope.pkl'), 'rb') as f:
  aristotle_dqn_pilot_scope = pickle.load(f)

In [None]:
aristotle_dqn_pilot_scope = str(uuid.uuid4())

In [None]:
raw_aristotle_dqn_pilot_policy, _ = deepq.learn(
  train_aristotle_env,
  q_func=make_q_func(),
  scope=aristotle_dqn_pilot_scope,
  **dqn_learn_kwargs
)

In [None]:
with open(os.path.join(data_dir, 'aristotle_dqn_pilot_scope.pkl'), 'wb') as f:
  pickle.dump(aristotle_dqn_pilot_scope, f, pickle.HIGHEST_PROTOCOL)

In [None]:
aristotle_dqn_pilot_path = os.path.join(data_dir, 'aristotle_dqn_pilot.tf')

In [None]:
def save_tf_vars(sess, scope, path):
  """Checkpoint to `path` every global variable whose name starts with `scope + '/'`."""
  prefix = scope + '/'
  scoped_vars = []
  for var in tf.global_variables():
    if var.name.startswith(prefix):
      scoped_vars.append(var)
  tf.train.Saver(scoped_vars).save(sess, save_path=path)

In [None]:
def load_tf_vars(sess, scope, path):
  """Restore from `path` every global variable whose name starts with `scope + '/'`."""
  prefix = scope + '/'
  scoped_vars = []
  for var in tf.global_variables():
    if var.name.startswith(prefix):
      scoped_vars.append(var)
  tf.train.Saver(scoped_vars).restore(sess, path)

In [None]:
save_tf_vars(U.get_session(), aristotle_dqn_pilot_scope, aristotle_dqn_pilot_path)

In [None]:
load_tf_vars(U.get_session(), aristotle_dqn_pilot_scope, aristotle_dqn_pilot_path)

In [None]:
VIEWPORT_W = 600
VIEWPORT_H = 400
SCALE = 30.0
W = VIEWPORT_W/SCALE
H = VIEWPORT_H/SCALE
CHUNKS = 11
chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)]
helipad_xs = [(chunk_x[goal-1]+chunk_x[goal+1])/2 for goal in train_goals]
train_goal_obses = [(helipad_x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2) for helipad_x in helipad_xs]

In [None]:
temperature = 1
def make_aristotle_pilot_policy(train_task_idx):
  """Build a policy closure for one training task.

  The returned callable copies the observation, overwrites element 8 with the
  task's entry from `train_goal_obses`, and queries the raw DQN pilot's `_act`
  on a batch of one using the module-level `temperature`.
  """
  goal_obs = train_goal_obses[train_task_idx]
  def aristotle_pilot_policy(obs):
    goal_conditioned_obs = copy(obs)
    goal_conditioned_obs[8] = goal_obs
    with tf.variable_scope(aristotle_dqn_pilot_scope, reuse=None):
      batched_obs = goal_conditioned_obs[None, :]
      batched_actions = raw_aristotle_dqn_pilot_policy._act(
        batched_obs, temperature=temperature)
      return batched_actions[0]
  return aristotle_pilot_policy

In [None]:
aristotle_pilot_policies = [make_aristotle_pilot_policy(train_task_idx) for train_task_idx in range(n_train_tasks)]

In [None]:
def make_aristotle_pilot_policy(train_task_idx):
  """Return the already-built policy for `train_task_idx`.

  This definition shadows the closure-building factory of the same name from
  an earlier cell; from here on the name is a lookup into
  `aristotle_pilot_policies`.
  """
  cached_policy = aristotle_pilot_policies[train_task_idx]
  return cached_policy

sanity-check envs, agents

In [None]:
train_task_idx = 0

In [None]:
run_ep(aristotle_pilot_policies[train_task_idx], train_aristotle_envs[train_task_idx], render=True)

In [None]:
train_aristotle_envs[train_task_idx].close()

In [None]:
run_ep(aristotle_pilot_policies[train_task_idx], train_newton_envs[train_task_idx], render=True)

In [None]:
train_newton_envs[train_task_idx].close()

fit internal dynamics model

In [None]:
n_train_rollouts_per_env = 1000

In [None]:
demo_rollouts = [[run_ep(aristotle_pilot_policies[train_task_idx], newton_env, render=False, task_idx=train_task_idx)
                  for _ in range(n_train_rollouts_per_env)]
                 for train_task_idx, newton_env in enumerate(train_newton_envs)]

In [None]:
with open(os.path.join(data_dir, 'aristotle_pilot_policy_demo_rollouts.pkl'), 'wb') as f:
  pickle.dump(demo_rollouts, f, pickle.HIGHEST_PROTOCOL)

In [None]:
demo_rollouts_path = os.path.join('data', '5.1-lander-newton', 'sid_pilot_policy_demo_rollouts.pkl')
#demo_rollouts_path = os.path.join(data_dir, 'aristotle_pilot_policy_demo_rollouts.pkl')

In [None]:
with open(demo_rollouts_path, 'rb') as f:
  demo_rollouts = pickle.load(f)

In [None]:
def build_mlp(
    input_placeholder,
    output_size,
    scope,
    n_layers=1,
    size=256,
    activation=tf.nn.relu,
    output_activation=None,
    reuse=False
  ):
  """Stack dense layers on `input_placeholder` inside variable scope `scope`.

  Creates `n_layers` hidden layers of width `size` using `activation`, then a
  final layer of width `output_size` using `output_activation`. `reuse` is
  forwarded to `tf.variable_scope`. Returns the output tensor of the last layer.
  """
  with tf.variable_scope(scope, reuse=reuse):
    # (units, activation) for every layer, hidden layers first, output last.
    layer_specs = [(size, activation)] * n_layers
    layer_specs.append((output_size, output_activation))
    hidden = input_placeholder
    for units, layer_activation in layer_specs:
      hidden = tf.layers.dense(hidden, units, activation=layer_activation)
  return hidden

In [None]:
n_obs_feats = n_obs_dim
featurize_obs = lambda s: s

In [None]:
def vectorize_rollouts(rollouts):
  """Turn per-task rollouts into equally sized (obs, actions) arrays.

  `rollouts[task_idx]` is a list of rollouts whose steps start with
  `(obs, action, ...)`. Observations go through `featurize_obs`. Each task is
  then randomly subsampled (without replacement) down to the smallest task's
  number of steps, and the per-task arrays are stacked along axis 0.

  Note: a later cell in this notebook defines another `vectorize_rollouts`
  with a different return value, which replaces this one once executed.
  """
  per_task_obs = [[] for _ in range(n_train_tasks)]
  per_task_actions = [[] for _ in range(n_train_tasks)]
  for task_idx, task_rollouts in enumerate(rollouts):
    for task_rollout in task_rollouts:
      columns = list(zip(*task_rollout))
      more_obs, more_actions = columns[:2]
      per_task_obs[task_idx].extend([featurize_obs(s) for s in more_obs])
      per_task_actions[task_idx].extend(more_actions)

  n_keep = min(len(task_obs) for task_obs in per_task_obs)
  # One index draw per task, shared by that task's observations and actions.
  keep_idxes = [
    random.sample(list(range(len(task_obs))), n_keep) for task_obs in per_task_obs]

  def subsample(per_task_values):
    return np.array([
      np.array(values)[keep] for values, keep in zip(per_task_values, keep_idxes)])

  return subsample(per_task_obs), subsample(per_task_actions)

In [None]:
demo_obs = None
demo_actions = None
demo_next_obs = None
demo_task_idxes = None
train_demo_example_idxes = None
val_demo_batch = None

In [None]:
def process_demo_rollouts(demo_rollouts):
  """Vectorize the demos and split their example indices 90/10 into train/val.

  Writes the module-level `demo_obs`, `demo_actions`,
  `train_demo_example_idxes` and `val_demo_batch`. Example indices run along
  axis 1 of `demo_obs` and are shuffled with `random.shuffle` before the split.
  """
  global demo_obs, demo_actions, demo_next_obs, demo_task_idxes
  global train_demo_example_idxes, val_demo_batch

  demo_obs, demo_actions = vectorize_rollouts(demo_rollouts)

  shuffled_idxes = list(range(demo_obs.shape[1]))
  random.shuffle(shuffled_idxes)

  split_at = int(0.9 * len(shuffled_idxes))
  train_demo_example_idxes = shuffled_idxes[:split_at]
  held_out_idxes = shuffled_idxes[split_at:]
  val_demo_batch = demo_obs[:, held_out_idxes], demo_actions[:, held_out_idxes]

In [None]:
process_demo_rollouts(demo_rollouts)

In [None]:
def sample_batch(size):
  """Draw `size` distinct training example indices and slice the demo arrays.

  Returns `(demo_obs[:, idxes], demo_actions[:, idxes])`, i.e. the same
  examples are taken for every task along axis 0.
  """
  picked = random.sample(train_demo_example_idxes, size)
  return demo_obs[:, picked], demo_actions[:, picked]

In [None]:
gamma = 0.99
iterations = 100000
learning_rate = 1e-3
batch_size = 512 // n_train_tasks
sq_td_err_penalty = 1e-3

q_n_layers = 1
q_layer_size = 32
q_activation = tf.nn.relu
q_output_activation = None

constraint_sampling_freq = 100000
constraint_batch_size = batch_size
n_constraint_rollouts_per_env = 100

val_update_freq = 100

In [None]:
im_scope = str(uuid.uuid4())
q_scope = str(uuid.uuid4())

In [None]:
demo_obs_t_ph = tf.placeholder(tf.float32, [n_train_tasks, None, n_obs_feats])
demo_act_t_ph = tf.placeholder(tf.int32, [n_train_tasks, None])
demo_batch_size_ph = tf.placeholder(tf.int32)

constraint_obs_t_ph = tf.placeholder(tf.float32, [n_train_tasks, None, n_obs_feats])
constraint_act_t_ph = tf.placeholder(tf.int32, [n_train_tasks, None])
constraint_obs_tp1_ph = tf.placeholder(tf.float32, [n_train_tasks, None, n_ims, n_obs_feats])
constraint_rew_t_ph = tf.placeholder(tf.float32, [n_train_tasks, None, n_ims])
constraint_batch_size_ph = tf.placeholder(tf.int32)

In [None]:
# Column vector [0, 1, ..., batch-1]; paired with action ids below to form
# (row, action) index pairs for tf.gather_nd.
demo_batch_idxes = tf.reshape(
  tf.range(0, demo_batch_size_ph, 1), 
  [demo_batch_size_ph, 1])

# Select slice i along axis 0 of x and drop that axis.
extract_task = lambda x, i: tf.squeeze(tf.gather(x, tf.convert_to_tensor(
  [i], dtype=tf.int32)), axis=[0]) 

# Q-value of the demonstrated action, one Q-network per task (scope
# q_scope-<task>); stacked to shape [n_train_tasks, batch]. This first call
# creates each task's network variables.
demo_q_t = tf.stack([tf.gather_nd(
  build_mlp(
    extract_task(demo_obs_t_ph, train_task_idx),
    n_act_dim, q_scope+'-'+str(train_task_idx), 
    n_layers=q_n_layers, size=q_layer_size,
    activation=q_activation, output_activation=q_output_activation
  ), 
  tf.concat([
    demo_batch_idxes, 
    tf.expand_dims(extract_task(demo_act_t_ph, train_task_idx), 1)], axis=1)
) for train_task_idx in range(n_train_tasks)], axis=0)

# Log-sum-exp of the Q-values over the action axis, from the same per-task
# networks (reuse=True shares the variables created above).
demo_v_t = tf.reduce_logsumexp(
  tf.stack([build_mlp(
    extract_task(demo_obs_t_ph, train_task_idx),
    n_act_dim, q_scope+'-'+str(train_task_idx), 
    n_layers=q_n_layers, size=q_layer_size,
    activation=q_activation, output_activation=q_output_activation,
    reuse=True
  ) for train_task_idx in range(n_train_tasks)], axis=0),
  axis=2)

# Q(s, a) - logsumexp_a' Q(s, a'): log-probability of the demonstrated action
# under a softmax over the Q-values.
act_log_likelihoods = demo_q_t - demo_v_t

In [None]:
neg_avg_log_likelihood = -tf.reduce_mean(act_log_likelihoods)

In [None]:
q_tp1 = tf.stack([tf.reshape(
  build_mlp(
    tf.reshape(
      extract_task(constraint_obs_tp1_ph, train_task_idx),
      [constraint_batch_size_ph*n_ims, n_obs_feats]), 
    n_act_dim, q_scope+'-'+str(train_task_idx),
    n_layers=q_n_layers, size=q_layer_size,
    activation=q_activation, output_activation=q_output_activation, 
    reuse=True
  ), 
  [constraint_batch_size_ph, n_ims, n_act_dim]
) for train_task_idx in range(n_train_tasks)], axis=0)
v_tp1 = tf.reduce_logsumexp(q_tp1, axis=3)

In [None]:
# Unnormalized log-weights, one per candidate internal-dynamics configuration.
im_logits = tf.get_variable(im_scope, [n_ims], initializer=tf.random_normal_initializer)
# Softmax over the candidates. tf.nn.softmax computes exp(x) / sum(exp(x)) like
# the previous hand-written expression, without exponentiating raw logits in
# the graph (which overflows to inf/nan once a logit grows large).
im_probs = tf.nn.softmax(im_logits)

In [None]:
exp_v_tp1 = tf.reduce_sum(im_probs * v_tp1, axis=2)
exp_rew_t = tf.reduce_sum(im_probs * constraint_rew_t_ph, axis=2)
target_t = exp_rew_t + gamma * exp_v_tp1

In [None]:
constraint_batch_idxes = tf.reshape(
  tf.range(0, constraint_batch_size_ph, 1), 
  [constraint_batch_size_ph, 1])

q_t = tf.stack([tf.gather_nd(
  build_mlp(
    extract_task(constraint_obs_t_ph, train_task_idx), 
    n_act_dim, q_scope+'-'+str(train_task_idx), 
    n_layers=q_n_layers, size=q_layer_size,
    activation=q_activation, output_activation=q_output_activation, 
    reuse=True
  ), 
  tf.concat([
    constraint_batch_idxes, 
    tf.expand_dims(extract_task(constraint_act_t_ph, train_task_idx), 1)], axis=1)
) for train_task_idx in range(n_train_tasks)], axis=0)

In [None]:
td_err = q_t - target_t

In [None]:
sq_td_err = tf.reduce_mean(td_err**2)

In [None]:
loss = neg_avg_log_likelihood + sq_td_err_penalty * sq_td_err

In [None]:
update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [None]:
def compute_int_dyn_nll():
  """Evaluate `im_probs` in `sess` and report its largest entry.

  NOTE(review): despite the name and the 'int_dyn_nll' key, the value returned
  is the maximum probability, not a negative log-likelihood. The alternative
  kept below is the NLL of the configurations in `aristotle_conf_idxes`.
  Confirm which one downstream plots expect.
  """
  #int_dyn_nll = -np.log(1e-9+im_probs_eval[aristotle_conf_idxes]).sum()
  return {'int_dyn_nll': np.max(sess.run(im_probs))}

In [None]:
def sample_constraints(_):
  """Collect random-action transitions and cache them in module-level arrays.

  For every training task, `n_constraint_rollouts_per_env` fresh envs are
  stepped in lockstep with uniformly random discrete actions until each is
  done (or the step budget `max_ep_len + 1` runs out). Every step records
  `(obs_t, action, info['rews'], info['obses'])`.
  # NOTE(review): `info['rews']` / `info['obses']` are presumably the
  # per-configuration rewards and next observations produced by a patched
  # LunarLander when `fan` / `fan_confs` are set - confirm against the env.

  Each task is then subsampled without replacement to the smallest task's
  transition count, and a shuffled 90/10 split of the example indices is made.

  Writes the globals `constraint_obs_t`, `constraint_act_t`,
  `constraint_obs_tp1`, `constraint_rew_t`, `train_constraint_example_idxes`
  and `val_constraint_batch`. The argument is ignored.
  """
  global train_constraint_example_idxes
  global val_constraint_batch
  global constraint_obs_t
  global constraint_act_t
  global constraint_obs_tp1
  global constraint_rew_t

  constraint_rollouts = [[] for _ in range(n_train_tasks)]

  for train_task_idx in range(n_train_tasks):
    rollouts = [[] for _ in range(n_constraint_rollouts_per_env)]
    envs = [make_lander_env(
      fps=fast_fps, goal=train_goals[train_task_idx]) for _ in range(
      n_constraint_rollouts_per_env)]
    obses = np.array([env.reset() for env in envs])
    dones = [False for _ in envs]
    # Snapshot rather than alias: `obses` is overwritten row by row below, so
    # sharing the array would make the first recorded obs_t of every rollout a
    # view of the post-step (and later re-overwritten) observation.
    prev_obses = copy(obses)
    for step_idx in range(max_ep_len+1):
      not_done_idxes = [i for i, done in enumerate(dones) if not done]
      n_active = len(not_done_idxes)
      if n_active == 0:
        break
      actions = np.random.choice(n_act_dim, n_active)
      for i, env_idx in enumerate(not_done_idxes):
        env = envs[env_idx]
        action = actions[i]
        env.unwrapped.fan = True
        env.unwrapped.fan_confs = fan_confs
        obs, r, done, info = env.step(action)
        obses[env_idx] = obs
        dones[env_idx] = done
        rollouts[env_idx].append((
          prev_obses[env_idx], action, info['rews'], info['obses']))
      # Fresh copy each step so previously recorded rows are never mutated.
      prev_obses = copy(obses)
    constraint_rollouts[train_task_idx].extend(
      [rollout for rollout in rollouts if rollout != []])

  # Smallest per-task transition count; every task is cut down to this.
  size = min(sum(len(r) for r in rollouts) for rollouts in constraint_rollouts)

  constraint_obs_t = np.zeros((n_train_tasks, size, n_obs_feats))
  constraint_act_t = np.zeros((n_train_tasks, size))
  constraint_obs_tp1 = np.zeros((n_train_tasks, size, n_ims, n_obs_feats))
  constraint_rew_t = np.zeros((n_train_tasks, size, n_ims))

  for train_task_idx in range(n_train_tasks):
    # Flatten this task's rollouts into one list of steps, then split columns.
    unfeat_obses, actions, rews, next_obses = list(zip(*sum(
      constraint_rollouts[train_task_idx], [])))
    obses = [featurize_obs(s) for s in unfeat_obses]
    next_obses = [[featurize_obs(s) for s in fan_s] for fan_s in next_obses]
    idxes = random.sample(list(range(len(obses))), size)
    constraint_obs_t[train_task_idx, :, :] = np.array(obses)[idxes, :]
    constraint_act_t[train_task_idx, :] = np.array(actions)[idxes]
    constraint_obs_tp1[train_task_idx, :, :, :] = np.array(next_obses)[idxes, :, :]
    constraint_rew_t[train_task_idx, :, :] = np.array(rews)[idxes, :]

  constraint_example_idxes = list(range(size))
  random.shuffle(constraint_example_idxes)
  n_train_constraint_examples = int(0.9 * size)

  train_constraint_example_idxes = constraint_example_idxes[:n_train_constraint_examples]
  val_constraint_example_idxes = constraint_example_idxes[n_train_constraint_examples:]
  val_constraint_batch = (
    constraint_obs_t[:, val_constraint_example_idxes],
    constraint_act_t[:, val_constraint_example_idxes],
    constraint_rew_t[:, val_constraint_example_idxes],
    constraint_obs_tp1[:, val_constraint_example_idxes])

In [None]:
def sample_constraint_batch(size):
  """Return a random training batch of cached constraint transitions.

  Whenever the call counter is a multiple of `constraint_sampling_freq`
  (including its initial 0) the cache is first rebuilt via
  `sample_constraints` and the counter restarts. The batch is
  `(obs_t, act_t, rew_t, obs_tp1)`, each sliced along axis 1 with the same
  `size` distinct indices.
  """
  global n_iters_since_prev_constraint_sample
  resample_due = n_iters_since_prev_constraint_sample % constraint_sampling_freq == 0
  if resample_due:
    sample_constraints(size)
    n_iters_since_prev_constraint_sample = 0
  n_iters_since_prev_constraint_sample += 1

  picked = random.sample(train_constraint_example_idxes, size)
  return (
    constraint_obs_t[:, picked],
    constraint_act_t[:, picked],
    constraint_rew_t[:, picked],
    constraint_obs_tp1[:, picked])

In [None]:
train_constraint_example_idxes = None
val_constraint_batch = None
constraint_obs_t = None
constraint_act_t = None
constraint_obs_tp1 = None
constraint_rew_t = None
n_iters_since_prev_constraint_sample = 0

In [None]:
with open(os.path.join(data_dir, 'constraint_samples.pkl'), 'wb') as f:
  pickle.dump((
    train_constraint_example_idxes, 
    val_constraint_batch,
    constraint_obs_t,
    constraint_act_t,
    constraint_obs_tp1,
    constraint_rew_t,
    n_iters_since_prev_constraint_sample), f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open(os.path.join(data_dir, 'constraint_samples.pkl'), 'rb') as f:
  (
    train_constraint_example_idxes, 
    val_constraint_batch,
    constraint_obs_t,
    constraint_act_t,
    constraint_obs_tp1,
    constraint_rew_t,
    n_iters_since_prev_constraint_sample) = pickle.load(f)

In [None]:
tf.global_variables_initializer().run(session=sess)

In [None]:
n_iters = iterations * demo_obs.shape[1] // batch_size
train_logs = {
  'loss_evals': [],
  'nll_evals': [],
  'ste_evals': [],
  'val_loss_evals': [],
  'val_nll_evals': [],
  'val_ste_evals': [],
  'int_dyn_nll_evals': []
}

In [None]:
def compute_batch_loss(demo_batch, constraint_batch, step=False, t=None):
  """Evaluate the loss terms on one batch and optionally take an optimizer step.

  Args:
    demo_batch: `(obs_t, act_t)` arrays for the demonstration likelihood term.
    constraint_batch: `(obs_t, act_t, rew_t, obs_tp1)` arrays for the TD term.
    step: when truthy, `update_op` is run after the losses are evaluated;
      otherwise the result also carries the entries of `compute_int_dyn_nll()`.
    t: unused.

  Returns:
    Dict with keys 'loss', 'nll', 'ste' (plus 'int_dyn_nll' when not stepping).
  """
  demo_obs_batch, demo_act_batch = demo_batch
  (con_obs_batch, con_act_batch,
   con_rew_batch, con_next_obs_batch) = constraint_batch

  feed_dict = {
    demo_obs_t_ph: demo_obs_batch,
    demo_act_t_ph: demo_act_batch,
    demo_batch_size_ph: demo_obs_batch.shape[1],
    constraint_obs_t_ph: con_obs_batch,
    constraint_act_t_ph: con_act_batch,
    constraint_obs_tp1_ph: con_next_obs_batch,
    constraint_rew_t_ph: con_rew_batch,
    constraint_batch_size_ph: con_obs_batch.shape[1],
  }

  loss_eval, nll_eval, ste_eval = sess.run(
    [loss, neg_avg_log_likelihood, sq_td_err], feed_dict=feed_dict)
  metrics = {'loss': loss_eval, 'nll': nll_eval, 'ste': ste_eval}

  if step:
    # Losses above were evaluated before this update is applied.
    sess.run(update_op, feed_dict=feed_dict)
  else:
    metrics.update(compute_int_dyn_nll())
  return metrics

In [None]:
# Main optimization loop. Progress is tracked by the length of
# train_logs['loss_evals'], so re-running this cell resumes where it stopped.
val_log = None
while len(train_logs['loss_evals']) < n_iters:
  demo_batch = sample_batch(batch_size)
  constraint_batch = sample_constraint_batch(constraint_batch_size)
  
  t = len(train_logs['loss_evals'])
  train_log = compute_batch_loss(demo_batch, constraint_batch, step=True, t=t)
  # Validation metrics are refreshed on the first pass and then every
  # val_update_freq iterations; in between the last values are carried over.
  if val_log is None or len(train_logs['loss_evals']) % val_update_freq == 0:
    val_log = compute_batch_loss(val_demo_batch, val_constraint_batch, step=False, t=t)
  
  print('%d %d %f %f %f %f %f %f %f' % (
    t, n_iters, train_log['loss'],
    train_log['nll'], train_log['ste'], val_log['loss'],
    val_log['nll'], val_log['ste'], val_log['int_dyn_nll'])
  )
  
  for k, v in train_log.items():
    train_logs['%s_evals' % k].append(v)
  # The (possibly carried-over) validation values are appended every
  # iteration, so the val_* series stay the same length as the train series.
  # Only loss/nll/ste get the 'val_' prefix; other keys keep their own name.
  for k, v in val_log.items():
    train_logs['%s%s_evals' % ('val_' if k in ['loss', 'nll', 'ste'] else '', k)].append(v)

In [None]:
for k in ['val_nll_evals', 'val_ste_evals']:
  plt.xlabel('Iterations')
  plt.ylabel(k.split('_')[1])
  plt.plot(train_logs[k])
  plt.show()

In [None]:
plt.xlabel('Iterations')
plt.ylabel('Negative Log-Likelihood')
plt.plot(train_logs['int_dyn_nll_evals'], color='orange')
plt.axhline(y=-np.log(1/n_ims), linestyle='--', color='gray', label='Uniform')
plt.ylim([-0.05, None])
plt.legend(loc='best')
plt.show()

In [None]:
im_probs_eval = sess.run(im_probs)
newton_fps = 40

In [None]:
fpses = [conf['fps'] for conf in fan_confs]

In [None]:
mpl.rcParams.update({'font.size': 20})

In [None]:
plt.xlabel('Game Speed')
plt.ylabel('Likelihood')
plt.title('Lunar Lander User Study')
speeds = 1/np.array(fan_fpses[::-1])
width = [y-x for x, y in zip(speeds[:-1], speeds[1:])]
width.append(width[-1])
plt.bar(
  speeds, im_probs_eval[::-1], linewidth=0, color='orange', 
  width=width,
  label='Internal Dynamics')
plt.axvline(x=1/newton_fps, linestyle='--', label='Real Dynamics', color='gray')
plt.legend(loc='best')
plt.savefig(os.path.join(data_dir, 'human-speed-distrn.pdf'), bbox_inches='tight')
plt.show()

fit an inverse model of the real dynamics

In [None]:
def init_unitialized_tf_vars():
  """Initialize only the global variables that have no value yet in `sess`.

  Each variable is probed by reading it in the module-level session; a
  FailedPreconditionError marks it as uninitialized. Variables that already
  hold values (e.g. trained weights) are left untouched.
  """
  uninitialized_vars = []
  # tf.global_variables / tf.variables_initializer replace the deprecated
  # tf.all_variables / tf.initialize_variables aliases.
  for var in tf.global_variables():
    try:
      sess.run(var)
    except tf.errors.FailedPreconditionError:
      uninitialized_vars.append(var)
  tf.variables_initializer(uninitialized_vars).run(session=sess)

In [None]:
class NNInvDynamicsModel():
  """MLP classifier that predicts the discrete action from (obs, obs delta).

  The network takes the normalized observation concatenated with the
  normalized change in observation and outputs `n_act_dim` logits. It is
  trained with sparse softmax cross-entropy using Adam.
  """
  
  def __init__(self,
      n_layers,
      size,
      activation,
      normalization,
      batch_size,
      iterations,
      learning_rate,
      sess,
      invdyn_scope
    ):
    """Build the graph and initialize any not-yet-initialized variables.

    Args:
      n_layers, size, activation: hidden-layer settings forwarded to build_mlp.
      normalization: `(mean_obs, std_obs, mean_deltas, std_deltas)` tuple.
      batch_size: minibatch size used by `fit`.
      iterations: multiplier in `fit`'s step count
        (`iterations * len(obs) // batch_size`).
      learning_rate: Adam learning rate.
      sess: session used by `fit` and `predict`.
      invdyn_scope: variable-scope name for the model.
    """
    self.scope = invdyn_scope
    with tf.variable_scope(self.scope, reuse=None):
      self.obs_t_ph = tf.placeholder(tf.float32, [None, n_obs_dim])
      self.obs_delta_t_ph = tf.placeholder(tf.float32, [None, n_obs_dim])
      self.act_t_ph = tf.placeholder(tf.int32, [None])
      obs_cat_delta_t = tf.concat([self.obs_t_ph, self.obs_delta_t_ph], axis=1)
      # build_mlp opens a variable scope named invdyn_scope again, so the
      # scope name is entered twice (here and inside build_mlp).
      self.act_logits = build_mlp(
        obs_cat_delta_t, n_act_dim, invdyn_scope, n_layers=n_layers, size=size,
        activation=activation
      )
      self.act_preds = tf.argmax(self.act_logits, axis=1)
      self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.act_t_ph,
        logits=self.act_logits,
      ))

      self.update_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

      # NOTE(review): this helper takes no session argument, so it may not
      # act on the `sess` passed to this constructor - confirm they match.
      init_unitialized_tf_vars()

    self.sess = sess
    self.iterations = iterations
    self.batch_size = batch_size
    self.normalization = normalization

  def fit(self, data):
    """Train on `data = (obs, actions, rewards, next_obs, dones)`.

    Observations and deltas (`next_obs - obs`) are normalized with the stored
    statistics. Runs `iterations * len(obs) // batch_size` Adam steps on
    minibatches drawn without replacement, printing the loss at every step.
    """
    obs, actions, rewards, next_obs, dones = data
    mean_obs, std_obs, mean_deltas, std_deltas = self.normalization
    normed_obs = normalize(obs, mean_obs, std_obs)
    deltas = next_obs - obs
    normed_deltas = normalize(deltas, mean_deltas, std_deltas)

    example_idxes = range(len(obs))
    # Local helper; unrelated to the module-level sample_batch.
    def sample_batch(size):
      idxes = random.sample(example_idxes, size)
      return normed_obs[idxes], actions[idxes], normed_deltas[idxes]

    n_iters = self.iterations * len(obs) // self.batch_size
    with tf.variable_scope(self.scope, reuse=None):
      for i in range(n_iters):
        batch_obs_t, batch_act_t, batch_obs_delta = sample_batch(self.batch_size)
        feed_dict = {
          self.obs_t_ph: batch_obs_t,
          self.act_t_ph: batch_act_t,
          self.obs_delta_t_ph: batch_obs_delta
        }
        [loss, _] = self.sess.run([self.loss, self.update_op], feed_dict=feed_dict)
        print('%d %d %f' % (i, n_iters, loss))

  def predict(self, states, next_states):
    """Return the argmax action index for each (state, next_state) pair.

    Inputs are normalized with the stored statistics before being fed in.
    """
    mean_obs, std_obs, mean_deltas, std_deltas = self.normalization
    normed_states = normalize(states, mean_obs, std_obs)
    normed_deltas = normalize(next_states - states, mean_deltas, std_deltas)
    with tf.variable_scope(self.scope, reuse=None):
      feed_dict = {
        self.obs_t_ph: normed_states,
        self.obs_delta_t_ph: normed_deltas
      }
      return self.sess.run(self.act_preds, feed_dict=feed_dict)

In [None]:
def vectorize_rollouts(rollouts):
  """Flatten rollouts from all tasks into five aligned arrays.

  `rollouts` is a list (per task) of rollouts, each a sequence of 6-tuples
  `(obs, action, reward, next_obs, done, task_idx)`. The task index column is
  dropped. Returns `(obs, actions, rewards, next_obs, dones)` as np.arrays
  whose first axis runs over all steps in order.

  Note: this replaces the earlier per-task `vectorize_rollouts`, which returns
  a different structure.
  """
  columns = ([], [], [], [], [])
  for task_rollouts in rollouts:
    for rollout in task_rollouts:
      (more_obs, more_actions, more_rewards,
       more_next_obs, more_dones, _) = list(zip(*rollout))
      step_fields = (more_obs, more_actions, more_rewards, more_next_obs, more_dones)
      for column, values in zip(columns, step_fields):
        column.extend(values)
  return tuple(np.array(column) for column in columns)

In [None]:
vectorized_demo_rollouts = vectorize_rollouts(demo_rollouts)

In [None]:
def compute_normalization(vectorized_rollouts):
  """Compute per-feature mean/std of observations and of observation deltas.

  `vectorized_rollouts` is `(obs, actions, rewards, next_obs, dones)`; only
  `obs` and `next_obs` are used. Statistics are taken over axis 0.

  Returns:
    `(mean_obs, std_obs, mean_deltas, std_deltas)` where deltas are
    `next_obs - obs`.
  """
  obs, _actions, _rewards, next_obs, _dones = vectorized_rollouts

  def column_stats(data):
    return np.mean(data, axis=0), np.std(data, axis=0)

  return column_stats(obs) + column_stats(next_obs - obs)

In [None]:
def normalize(data, mean, std, eps=1e-9):
  """Center `data` by `mean` and scale by `std + eps` (eps guards zero std)."""
  centered = data - mean
  scale = std + eps
  return centered / scale

def unnormalize(data, mean, std, eps=1e-9):
  """Invert `normalize`: rescale by `std + eps`, then add `mean` back."""
  scale = std + eps
  rescaled = data * scale
  return rescaled + mean

In [None]:
normalization = compute_normalization(vectorized_demo_rollouts)

In [None]:
n_layers = 2
layer_size = 64
activation = tf.nn.relu
learning_rate = 1e-4
batch_size = 64
iterations = 10

In [None]:
with open(os.path.join(data_dir, 'invdyn_scope.pkl'), 'rb') as f:
  invdyn_scope = pickle.load(f)

In [None]:
invdyn_scope = str(uuid.uuid4())

In [None]:
true_invdyn_model = NNInvDynamicsModel(
  n_layers=n_layers,
  size=layer_size,
  activation=activation,
  normalization=normalization,
  batch_size=batch_size,
  iterations=iterations,
  learning_rate=learning_rate,
  sess=sess,
  invdyn_scope=invdyn_scope
)

In [None]:
true_invdyn_model.fit(vectorized_demo_rollouts)

In [None]:
with open(os.path.join(data_dir, 'invdyn_scope.pkl'), 'wb') as f:
  pickle.dump(invdyn_scope, f, pickle.HIGHEST_PROTOCOL)

In [None]:
invdyn_path = os.path.join(data_dir, 'invdyn.tf')

In [None]:
save_tf_vars(sess, invdyn_scope, invdyn_path)

In [None]:
load_tf_vars(sess, invdyn_scope, invdyn_path)

In [None]:
with open(os.path.join(data_dir, 'invdyn_normalization.pkl'), 'wb') as f:
  pickle.dump(normalization, f, pickle.HIGHEST_PROTOCOL)

internal-to-real dynamics transfer

In [None]:
assisted_conf = fan_confs[np.argmax(sess.run(im_probs))]

In [None]:
def make_assisted_env():
  """Create a fast-fps lander env whose discrete actions are re-mapped.

  For a discrete action, the patched step simulates the action under
  `assisted_conf` via the env's `sim_step`, asks `true_invdyn_model` which
  action produces that intended next state, and executes the predicted action
  instead. When `curr_obs` is None the action is executed as given.
  Anything that is not an np.int64 or a length-1 sequence is passed straight
  to the original step.
  # NOTE(review): `curr_obs` and `sim_step` are not defined in this notebook;
  # presumably provided by a patched LunarLander - confirm.

  Returns:
    `(policy, env)` for a uniformly random training task.
  """
  env = gym.make('LunarLanderContinuous-v2')
  env.action_space = spaces.Discrete(n_act_dim)
  env.unwrapped._step_orig = env.unwrapped._step
  def _step(self, action):
    is_discrete = type(action) == np.int64 or len(action) == 1
    if not is_discrete:
      return self._step_orig(action)

    if type(action) == np.ndarray:
      action = action[0]

    if self.curr_obs is None:
      intended_action = action
    else:
      # Where the pilot expects to end up under the assumed dynamics ...
      intended_state = self.sim_step(disc_to_cont(action), **assisted_conf)[0]
      # ... and the action the inverse model predicts for that transition.
      intended_action = true_invdyn_model.predict(
        np.array([self.curr_obs]), np.array([intended_state]))[0]

    obs, r, done, info = self._step_orig(disc_to_cont(intended_action))
    return obs, r, done, info
  env.unwrapped._step = types.MethodType(_step, env.unwrapped)
  env.unwrapped.fps = fast_fps

  test_task_idx = np.random.choice(n_train_tasks)
  test_aristotle_pilot_policy = make_aristotle_pilot_policy(test_task_idx)
  env.unwrapped.goal = train_goals[test_task_idx]

  return test_aristotle_pilot_policy, env

In [None]:
def make_env_without_dyn_transfer(using_slow_fps):
  """Pick a random training task and return `(policy, env)` without assistance.

  The env comes from `train_aristotle_envs` when `using_slow_fps` is truthy
  and from `train_newton_envs` otherwise.
  """
  test_task_idx = np.random.choice(n_train_tasks)
  test_aristotle_pilot_policy = make_aristotle_pilot_policy(test_task_idx)
  if using_slow_fps:
    env_pool = train_aristotle_envs
  else:
    env_pool = train_newton_envs
  return test_aristotle_pilot_policy, env_pool[test_task_idx]

In [None]:
make_unassisted_env = lambda: make_env_without_dyn_transfer(using_slow_fps=False)
make_ideal_env = lambda: make_env_without_dyn_transfer(using_slow_fps=True)

In [None]:
n_eval_rollouts = 100

In [None]:
assisted_rollouts = [run_ep(*make_assisted_env(), render=False) for _ in range(n_eval_rollouts)]

In [None]:
with open(os.path.join(data_dir, 'aristotle_pilot_policy_assisted_rollouts.pkl'), 'wb') as f:
  pickle.dump(assisted_rollouts, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open(os.path.join(data_dir, 'aristotle_pilot_policy_assisted_rollouts.pkl'), 'rb') as f:
  assisted_rollouts = pickle.load(f)

In [None]:
unassisted_rollouts = [run_ep(*make_unassisted_env(), render=False) for _ in range(n_eval_rollouts)]

In [None]:
with open(os.path.join(data_dir, 'aristotle_pilot_policy_unassisted_rollouts.pkl'), 'wb') as f:
  pickle.dump(unassisted_rollouts, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open(os.path.join(data_dir, 'aristotle_pilot_policy_unassisted_rollouts.pkl'), 'rb') as f:
  unassisted_rollouts = pickle.load(f)

In [None]:
ideal_rollouts = [run_ep(*make_ideal_env(), render=False) for _ in range(n_eval_rollouts)]

In [None]:
with open(os.path.join(data_dir, 'aristotle_pilot_policy_ideal_rollouts.pkl'), 'wb') as f:
  pickle.dump(ideal_rollouts, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open(os.path.join(data_dir, 'aristotle_pilot_policy_ideal_rollouts.pkl'), 'rb') as f:
  ideal_rollouts = pickle.load(f)

In [None]:
unassisted_rew = [sum(x[2] for x in r) for r in unassisted_rollouts]
ideal_rew = [sum(x[2] for x in r) for r in ideal_rollouts]

In [None]:
assisted_rew = [sum(x[2] for x in r) for r in assisted_rollouts]

In [None]:
np.mean(unassisted_rew), np.mean(ideal_rew)

In [None]:
np.mean(assisted_rew)

In [None]:
succ_rew_bonus = 100
crash_rew_penalty = -100
is_succ = lambda r: r[-1][2] > succ_rew_bonus / 2
is_crash = lambda r: r[-1][2] < crash_rew_penalty / 2

In [None]:
unassisted_succ = [1 if is_succ(r) else 0 for r in unassisted_rollouts]
ideal_succ = [1 if is_succ(r) else 0 for r in ideal_rollouts]

In [None]:
assisted_succ = [1 if is_succ(r) else 0 for r in assisted_rollouts]

In [None]:
np.mean(unassisted_succ), np.mean(ideal_succ)

In [None]:
np.mean(assisted_succ) 

In [None]:
unassisted_crash = [1 if is_crash(r) else 0 for r in unassisted_rollouts]
ideal_crash = [1 if is_crash(r) else 0 for r in ideal_rollouts]

In [None]:
assisted_crash = [1 if is_crash(r) else 0 for r in assisted_rollouts]

In [None]:
np.mean(unassisted_crash), np.mean(ideal_crash)

In [None]:
np.mean(assisted_crash)

In [None]:
run_ep(*make_assisted_env(), render=True)