In [1]:
# Record the wall-clock start time; the last cell prints the seconds elapsed since this point.
import time

start = time.time()

In [2]:
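# Before running: comment out `import tensorflow as tf` in `mimi/utils.py` and `mimi/models.py`;
# otherwise the hand-tracking setup will hang. TensorFlow is imported (and injected into those
# two modules) in a later cell, after the hand-tracking user model has been created.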

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib

Using matplotlib backend: TkAgg


In [4]:
from __future__ import division

from copy import deepcopy
import pickle
import os
import uuid

import scipy
import numpy as np
from PIL import Image

from matplotlib import pyplot as plt
import matplotlib as mpl

from mimi import envs
from mimi import utils
import sys
# Add the `learning-hands-interface` directory (under utils.home_dir) to sys.path
# before importing `hand_modle` below — presumably that is where the module lives.
learning_hand_interface=os.path.join(utils.home_dir, 'learning-hands-interface')
sys.path.append(learning_hand_interface)

# NOTE(review): the module name is spelled `hand_modle`; confirm it matches the
# actual file name in learning-hands-interface before "fixing" the spelling.
import hand_modle

from mimi import opt
from mimi import reward_models
from mimi import models



In [5]:
# Root directory for this experiment: per-user result pickles and figures are saved under it.
data_dir = os.path.join(utils.data_dir, 'lander')

In [None]:
# Create the hand-tracking user model BEFORE TensorFlow is imported: per the note
# near the top of the notebook, hand-tracking setup hangs if TensorFlow is loaded first.
user_model = hand_modle.HumanHandUser()
# Only now import TensorFlow and inject it into the mimi modules whose own
# `import tensorflow as tf` lines were commented out.
import tensorflow as tf
utils.tf = tf
models.tf = tf

In [None]:
# TensorFlow session (created with gpu_mode=False) shared by the environment and the MI model.
sess = utils.make_tf_session(gpu_mode=False)

In [None]:
# Episode settings passed to LanderEnv.
# Maximum length of each landing episode (presumably counted in environment steps — TODO confirm).
max_ep_len = 100
# Delays applied by the environment per step and on reset (presumably seconds — TODO confirm).
step_delay = 0.02
reset_delay = 1

In [None]:
# NOTE(review): this re-creates the user model that an earlier cell already built,
# and it now runs AFTER TensorFlow has been imported — the situation the note near
# the top of the notebook says can make hand-tracking setup hang. Confirm whether
# this cell is still needed or is a leftover duplicate.
user_model = hand_modle.HumanHandUser()

In [None]:
# Lander environment driven by the hand-tracking user model, configured with the
# episode length and delay settings defined above.
env = envs.LanderEnv(
  sess,
  user_model,
  max_ep_len=max_ep_len,
  reset_delay=reset_delay,
  step_delay=step_delay
)

In [None]:
# Positional and keyword arguments handed to MIRewardModel for building the MI model.
mi_model_init_args = [sess]
mi_model_init_kwargs = dict(
  n_env_obs_dim=env.n_min_env_obs_dim,
  n_user_obs_dim=env.n_user_obs_dim,
  n_act_dim=env.n_act_dim,
  n_layers=2,
  layer_size=64
)

# Training settings for the MI model; also passed to gp_optimizer.run as
# `reward_model_train_kwargs`.
mi_model_train_kwargs = dict(
  iterations=1000,
  ftol=1e-6,
  learning_rate=1e-3,
  batch_size=64,
  val_update_freq=None,
  verbose=False,
  warm_start=False
)

In [None]:
# Mutual-information reward model for `env`, built from the MI model arguments above.
# `use_min_env_obs=True` presumably pairs with the `n_min_env_obs_dim` used for
# 'n_env_obs_dim' — TODO confirm.
reward_model = reward_models.MIRewardModel(
  env,
  mi_model_init_args,
  mi_model_init_kwargs,
  mi_model_train_kwargs,
  use_min_env_obs=True
)

In [None]:
def true_rew_of_rollout(rollout):
  """Return 1 if the rollout ended successfully, else 0.

  Success is read from the 'succ' entry of the last element of the rollout's
  final step (`rollout[-1][-1]['succ']`).
  """
  return 1 if rollout[-1][-1]['succ'] else 0


def true_reward_model(rollouts):
  """Return the success rate (mean of the 0/1 success flags) over `rollouts`."""
  return np.mean([true_rew_of_rollout(rollout) for rollout in rollouts])

In [None]:
# Optimizer (`opt.GP`) that searches over policy parameters using `reward_model`
# as the objective; `param_bounds` is set to (-1, 1).
gp_optimizer = opt.GP(
  env,
  reward_model,
  param_bounds=(-1., 1.)
)

In [None]:
# Keyword arguments forwarded to gp_optimizer.run in the next cell.
gp_min_kwargs = {'n_initial_points': 3}
ep_kwargs = {'init_delay': 1, 'render': True}
# Number of episodes (runs) executed for each policy.
n_eps_per_pol = 1

In [None]:
# Run the optimization loop; the per-policy evaluation data is read back from
# gp_optimizer.eval_data_of_pol afterwards.
gp_policy, res = gp_optimizer.run(
  # Number of policies to evaluate; each policy corresponds to one interface.
  n_pols=3,
  n_eps_per_pol=n_eps_per_pol,
  gp_min_kwargs=gp_min_kwargs,
  ep_kwargs=ep_kwargs,
  reward_model_train_kwargs=mi_model_train_kwargs
)

In [None]:
#eval_data_of_pol.extend(gp_optimizer.eval_data_of_pol)

In [None]:
# Per-policy evaluation data collected by the optimizer. The analysis cell below
# reads index [1] of each entry as that policy's rollouts and index [2] as its reward.
eval_data_of_pol = gp_optimizer.eval_data_of_pol

In [None]:
# Participant identifier; names the sub-directory of data_dir where this run's results are saved.
user_id = '0'

In [None]:
# Pickle file holding this participant's optimization results.
gp_results_path = os.path.join(data_dir, user_id, 'gp_results.pkl')

In [None]:
# Save this participant's evaluation data. The per-user directory may not exist
# yet (e.g. first run for a new user_id), so create it before opening the file.
results_dir = os.path.dirname(gp_results_path)
if results_dir and not os.path.isdir(results_dir):
  os.makedirs(results_dir)
with open(gp_results_path, 'wb') as f:
  pickle.dump(eval_data_of_pol, f, pickle.HIGHEST_PROTOCOL)

In [None]:
#with open(gp_results_path, 'rb') as f:
#  eval_data_of_pol = pickle.load(f)

In [None]:
#x0, _, y0 = [list(z) for z in zip(*eval_data_of_pol)]
#gp_min_kwargs.update({'x0': x0, 'y0': y0})

In [None]:
# Summarize each participant's saved optimization results for the plots below.
#
# Loop-local names (`uid`, `user_results_path`, `user_eval_data`) are used on purpose:
# this cell must not overwrite the notebook-level `user_id`, `gp_results_path` and
# `eval_data_of_pol`, otherwise re-running the save cell afterwards would write
# to / from the wrong participant.
n_users = 3  # participant ids 0 .. n_users-1 are looked up under data_dir
# The first policies evaluated for each user are treated as the random-interface
# baseline. NOTE(review): presumably this should match
# gp_min_kwargs['n_initial_points'] — confirm.
n_baseline_pols = 3

perf_evals = []
for uid in range(n_users):
  user_results_path = os.path.join(data_dir, str(uid), 'gp_results.pkl')
  # Skip participants with no saved results (missing directory or missing file).
  if not os.path.isfile(user_results_path):
    continue
  # NOTE: pickle.load can execute arbitrary code; only load result files you trust.
  with open(user_results_path, 'rb') as f:
    user_eval_data = pickle.load(f)

  # Each entry is indexed as: [1] -> rollouts of that policy, [2] -> its reward.
  rollouts_of_pol = [entry[1] for entry in user_eval_data]
  rew_of_pol = [entry[2] for entry in user_eval_data]
  # Success rate of each policy, and the 0/1 outcome of every single episode
  # flattened in evaluation order.
  true_rew_of_pol = [true_reward_model(rollouts) for rollouts in rollouts_of_pol]
  true_rews = [
    true_reward_model([rollout])
    for rollouts in rollouts_of_pol
    for rollout in rollouts
  ]

  perf_evals.append({
    'true_rews': true_rews,
    'true_rew': true_rew_of_pol,
    'rews': rew_of_pol,
    # Cumulative number of episodes after each policy evaluation.
    'rew_xs': np.cumsum([len(rollouts) for rollouts in rollouts_of_pol]),
    # Episode indices 0, 1, 2, ... (as floats) for the per-episode outcomes.
    'true_xs': np.arange(len(true_rews), dtype=float),
    'baseline_true_reward': np.mean(true_rew_of_pol[:n_baseline_pols]),
  })

In [None]:
# Use a 12pt default font for all figures created below.
mpl.rcParams.update({'font.size': 12})

In [None]:
# Figure: per-episode success rate, with the mean baseline success rate across
# participants drawn as a dashed reference line.
baseline_success_rate = np.mean(
  [perf['baseline_true_reward'] for perf in perf_evals]
)

plt.xlabel('Number of Online Training Episodes')
plt.ylabel('Success Rate')
plt.title('Lunar Lander with Hand Gestures')
utils.plot_perf_evals(
  perf_evals,
  'true_xs',
  'true_rews',
  label='MIMI (Ours)',
  smooth_win=10,
  color='orange'
)
plt.axhline(
  y=baseline_success_rate,
  linestyle='--',
  color='gray',
  label='Random Interfaces (Baseline)'
)
plt.legend(loc='upper left')
#plt.savefig(os.path.join(data_dir, 'lander-study-truerew-vs-eps.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Figure: mutual-information reward after each policy evaluation; saved as a PDF under data_dir.
mi_fig_path = os.path.join(data_dir, 'lander-study-rew-vs-eps.pdf')

plt.xlabel('Number of Online Training Episodes')
plt.ylabel(r"$\mathcal{I}(\mathbf{x}_t, (\mathbf{s}_t, \mathbf{s}_{t+1}))$")
plt.title('Mutual Information Reward')
utils.plot_perf_evals(
  perf_evals,
  'rew_xs',
  'rews',
  label='MIMI (Ours)',
  smooth_win=1,
  color='teal'
)
plt.legend(loc='upper left')
plt.savefig(mi_fig_path, bbox_inches='tight')
plt.show()

In [None]:
# Figure: per-policy success rate vs. mutual-information reward, pooled over all
# participants, with the Spearman rank correlation shown in the title.

# `import scipy` alone does not guarantee that the `scipy.stats` submodule is
# loaded, so import it explicitly before calling spearmanr.
import scipy.stats

# Concatenate the per-policy values of every participant into flat lists.
xs = sum([perf_eval['true_rew'] for perf_eval in perf_evals], [])
ys = sum([perf_eval['rews'] for perf_eval in perf_evals], [])
rho = scipy.stats.spearmanr(xs, ys)[0]
plt.title(r'Success Rate vs. Mutual Information Reward ($\rho = %0.2f$)' % rho)
plt.xlabel('Success Rate')
plt.ylabel(r"$\mathcal{I}(\mathbf{x}_t, (\mathbf{s}_t, \mathbf{s}_{t+1}))$")
plt.scatter(
  xs, 
  ys,
  color='orange',
  s=50,
  alpha=0.5
)
plt.xticks(fontsize=10)
plt.savefig(os.path.join(data_dir, 'lander-study-truerew-vs-mi.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Print the total wall-clock seconds elapsed since the first cell set `start`.
end = time.time()
print(end - start)