In [1]:
# Load the autoreload extension and set mode 2 so edited modules are re-imported
# before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from __future__ import division

import os
import pickle
import time

import matplotlib as mpl
import numpy as np
import scipy
import scipy.stats
from matplotlib import pyplot as plt

from mimi import envs
from mimi import opt
from mimi import reward_models
from mimi import user_models
from mimi import utils



In [3]:
# Session object shared by the environment and the reward model constructed below
# (created with GPU mode disabled).
sess = utils.make_tf_session(gpu_mode=False)













In [4]:
# Identifier of the current participant/session; used as the name of the
# sub-directory that this session's data is written to.
user_id = 'pilot'

In [5]:
# Output directory for this session: <utils.data_dir>/cursor/<user_id>.
# It is created here, before any data has been recorded.
data_dir = os.path.join(utils.data_dir, 'cursor', user_id)
if not os.path.exists(data_dir):
  os.makedirs(data_dir)

In [6]:
# Task settings; each value is forwarded to HumanMouseUser and/or CursorEnv below.
# win_dims holds two equal entries of 1000 (presumably window width/height in
# pixels -- TODO confirm against CursorEnv).
win_dims = np.full(2, 1000)
max_ep_len = 300          # passed as CursorEnv(max_ep_len=...)
speed = 0.02              # passed as CursorEnv(speed=...)
goal_dist_thresh = 0.05   # passed as CursorEnv(goal_dist_thresh=...)
reset_delay = 0           # passed as CursorEnv(reset_delay=...)
step_delay = 0.075        # passed as HumanMouseUser(step_delay=...)

In [7]:
# User model for a human operating the mouse; it is handed to CursorEnv and to
# utils.prep_env_for_human_user below.
user_model = user_models.HumanMouseUser(win_dims=win_dims, step_delay=step_delay)

In [8]:
# Build the cursor-control environment from the session, the human user model,
# and the task settings defined above.
env = envs.CursorEnv(
  sess,
  user_model,
  win_dims=win_dims,
  speed=speed,
  goal_dist_thresh=goal_dist_thresh,
  max_ep_len=max_ep_len,
  reset_delay=reset_delay,
)

# Project helper that wires the environment and the human user model together
# before any episode is run.
utils.prep_env_for_human_user(env, user_model)

In [9]:
# Positional and keyword arguments used to construct the mutual-information
# model; the dimensionalities are read from the environment.
mi_model_init_args = [sess]
mi_model_init_kwargs = dict(
  n_env_obs_dim=env.n_env_obs_dim,
  n_user_obs_dim=env.n_user_obs_dim,
  n_act_dim=env.n_act_dim,
  n_layers=2,
  layer_size=64,
)

# Training settings for the mutual-information model. The same dict is reused
# later as `reward_model_train_kwargs` when running the GP optimizer.
mi_model_train_kwargs = dict(
  iterations=1000,
  ftol=1e-6,
  learning_rate=1e-3,
  batch_size=64,
  val_update_freq=None,
  verbose=False,
  warm_start=False,
)

# Reward model that scores interfaces using the mutual-information model.
reward_model = reward_models.MIRewardModel(
  env, mi_model_init_args, mi_model_init_kwargs, mi_model_train_kwargs
)



















Instructions for updating:
Use keras.layers.Dense instead.


Instructions for updating:
Use keras.layers.Dense instead.


Instructions for updating:
Please use `layer.__call__` method instead.


Instructions for updating:
Please use `layer.__call__` method instead.


In [10]:
def true_rew_of_rollout(rollout, max_ep_len=None):
  """Ground-truth reward of one rollout: negative distance to the goal, padded to a full episode.

  Each step `x` of `rollout` contributes `-||x[0][:2] - goal||`, where `goal` is
  read from the `'goal'` entry of the last element of the final step
  (`x[0][:2]` is presumably the cursor position -- confirm against CursorEnv).
  A rollout shorter than `max_ep_len` is treated as if its final per-step
  reward were repeated for the remaining steps.

  Args:
    rollout: non-empty sequence of steps as produced by `utils.run_ep`.
    max_ep_len: episode horizon used for the padding weight. Defaults to
      `env.max_ep_len`, so existing callers are unaffected; passing it
      explicitly lets the function be used without a live environment.

  Returns:
    Scalar reward (<= 0; larger is better).

  Raises:
    IndexError: if `rollout` is empty.
  """
  if max_ep_len is None:
    max_ep_len = env.max_ep_len
  goal = rollout[-1][-1]['goal']
  rews = [-np.linalg.norm(x[0][:2]-goal) for x in rollout]
  # Fraction of the horizon that was actually played; float() keeps this a true
  # division even without `from __future__ import division`.
  p = len(rollout) / float(max_ep_len)
  return p * np.mean(rews) + (1-p) * rews[-1]
def true_reward_model(rollouts):
  """Return the mean of `true_rew_of_rollout` over the given rollouts."""
  per_rollout_rewards = [true_rew_of_rollout(rollout) for rollout in rollouts]
  return np.mean(per_rollout_rewards)

**Pause here for instructions**

In [11]:
def make_rand_policy():
  """Create a policy that rotates the user's input by a fixed random angle.

  The angle is drawn once, uniformly from [0, 2*pi), when the policy is
  created. The returned callable takes a single observation, extracts its
  user-observation part via `env.extract_user_obses`, and returns that vector
  rotated by the angle via `utils.rotate_vec`.
  """
  theta = np.random.random() * 2*np.pi

  def rotate_user_obs(obs):
    # extract_user_obses is called on a batch of one, hence the added axis and [0].
    user_obs = env.extract_user_obses(obs[np.newaxis])[0]
    return utils.rotate_vec(user_obs, theta)

  return rotate_user_obs

# Size of the random-interface baseline: number of random policies to try and
# number of episodes recorded for each of them.
n_policies = 10
n_rollouts_per_policy = 1

In [12]:
# One (initially empty) list of rollouts per random policy. Kept in its own cell
# so the collection loop below can be re-run without discarding finished rollouts.
baseline_rollouts = [[] for _ in range(n_policies)]

In [13]:
# Record baseline episodes: for every slot, draw a fresh random policy, pause
# five seconds, then run episodes until the slot holds the requested number.
for i in range(n_policies):
  policy = make_rand_policy()
  time.sleep(5)
  collected = baseline_rollouts[i]
  while len(collected) < n_rollouts_per_policy:
    rollout = utils.run_ep(policy, env, render=True, init_delay=1)
    collected.append(rollout)

KeyboardInterrupt: 

In [None]:
# File that stores this session's baseline rollouts.
baseline_path = os.path.join(data_dir, 'baseline_rollouts.pkl')

In [None]:
# Persist the baseline rollouts with the highest pickle protocol available.
with open(baseline_path, 'wb') as f:
  pickle.dump(baseline_rollouts, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Mean ground-truth reward over all baseline rollouts (the per-policy lists are
# concatenated into one flat list first).
true_reward_model(sum(baseline_rollouts, []))

**Pause here for instructions**

In [None]:
def _rotation_from_angle(w):
  """Turn a one-element parameter vector (an angle in radians) into a 2x2 rotation matrix."""
  c, s = np.cos(w[0]), np.sin(w[0])
  return np.array([[c, -s], [s, c]])

# Optimizer over a single policy parameter in [0, 2*pi]; the parameter is
# mapped to a 2x2 matrix by the helper above.
gp_optimizer = opt.GP(
  env,
  reward_model,
  W_from_w=_rotation_from_angle,
  n_policy_params=1,
  param_bounds=(0, 2*np.pi),
)

In [None]:
# Keyword arguments forwarded to the GP optimizer run below:
# `gp_min_kwargs` and `ep_kwargs` are passed through under those same names.
gp_min_kwargs = dict(n_initial_points=5)
ep_kwargs = dict(init_delay=1, render=True)
# Number of episodes played with each candidate policy.
n_eps_per_pol = 10

In [None]:
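# Optional warm start (disabled): seed the optimizer with the parameters (x0) and
# rewards (y0) recorded in an earlier session. Requires `eval_data_of_pol` to be
# loaded already; uncomment both lines to enable.
#x0, _, y0 = zip(*eval_data_of_pol)
#gp_min_kwargs.update({'x0': list(x0), 'y0': list(y0)})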

**Pause here for instructions**

In [None]:
# Run the interactive optimization: 50 candidate policies, each evaluated for
# `n_eps_per_pol` episodes, with the reward model trained using the settings
# defined earlier.
gp_policy, res = gp_optimizer.run(
  n_pols=50,
  n_eps_per_pol=n_eps_per_pol,
  ep_kwargs=ep_kwargs,
  gp_min_kwargs=gp_min_kwargs,
  reward_model_train_kwargs=mi_model_train_kwargs,
)

**Pause here for instructions**

In [None]:
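# Optional (disabled): when the run above was warm-started from an earlier
# session, append the new evaluations to the earlier ones instead of replacing
# them in the next cell.
#eval_data_of_pol.extend(gp_optimizer.eval_data_of_pol)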

In [None]:
# Per-policy evaluation records collected by the optimizer. The analysis below
# reads element [0] as the policy parameters, [1] as the rollouts, and [2] as
# the reward assigned by the reward model.
eval_data_of_pol = gp_optimizer.eval_data_of_pol

In [None]:
# File that stores this session's optimizer evaluation records.
gp_results_path = os.path.join(data_dir, 'gp_results.pkl')

In [None]:
# Persist the evaluation records with the highest pickle protocol available.
with open(gp_results_path, 'wb') as f:
  pickle.dump(eval_data_of_pol, f, protocol=pickle.HIGHEST_PROTOCOL)

**Pause here for instructions**

In [None]:
# NOTE: from here on `data_dir` is rebound from the per-user directory to the
# study root that contains every user's sub-directory. Re-running the data
# collection/saving cells above after this cell would compute paths from the
# wrong directory unless `data_dir` is set again first.
data_dir = os.path.join(utils.data_dir, 'cursor')

In [None]:
# Aggregated per-user results: one dict of performance series per user, and the
# parameters of each user's last evaluated policy.
perf_evals = []
pols = []

# Users are stored in sub-directories named '0' .. '11'. The loop variable is
# deliberately not called `user_id`, so the session identifier defined at the
# top of the notebook is not overwritten.
for study_user_id in range(12):
  user_path = os.path.join(data_dir, str(study_user_id))
  if os.path.exists(user_path):
    baseline_path = os.path.join(user_path, 'baseline_rollouts.pkl')
    gp_results_path = os.path.join(user_path, 'gp_results.pkl')
    # A user directory is created before any data is recorded, so its existence
    # does not guarantee that both result files were written (e.g. after an
    # interrupted session). Skip incomplete users instead of crashing.
    if not (os.path.exists(baseline_path) and os.path.exists(gp_results_path)):
      continue

    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by this notebook.
    with open(baseline_path, 'rb') as f:
      baseline_rollouts = pickle.load(f)
    baseline_rollouts = sum(baseline_rollouts, [])
    baseline_true_rewards = true_reward_model(baseline_rollouts)

    with open(gp_results_path, 'rb') as f:
      eval_data_of_pol = pickle.load(f)
    if not eval_data_of_pol:
      # Nothing was evaluated for this user; there is no final policy to record.
      continue

    # eval_data layout used here: [0] policy parameters, [1] rollouts,
    # [2] reward assigned by the reward model.
    true_rews_of_pol = [[true_reward_model([rollout]) for rollout in eval_data[1]] for eval_data in eval_data_of_pol]
    true_rew_of_pol = [true_reward_model(eval_data[1]) for eval_data in eval_data_of_pol]
    rew_of_pol = [eval_data[2] for eval_data in eval_data_of_pol]
    n_eps_of_pol = [len(eval_data[1]) for eval_data in eval_data_of_pol]

    true_rews = sum(true_rews_of_pol, [])
    perf_evals.append({
      'true_rews': true_rews,  # ground-truth reward of every episode, in order
      'true_rew': true_rew_of_pol,  # mean ground-truth reward per policy
      'rew': rew_of_pol,  # reward-model value per policy
      'n_eps': np.cumsum(n_eps_of_pol),  # cumulative episode count after each policy
      'xs': np.arange(len(true_rews), dtype=float),  # episode index 0, 1, 2, ...
      'baseline_true_reward': np.mean(baseline_true_rewards)
    })
    pols.append(eval_data_of_pol[-1][0])

In [None]:
# Use a 12pt default font for all figures produced below.
mpl.rcParams.update({'font.size': 12})

In [None]:
# Rotation angle (first policy parameter) of each user's final policy.
pol_angs = [[pol[0] for pol in pols]]

N = 20  # number of angular bins

# Bin the angles with np.histogram rather than plt.hist, so no Cartesian
# histogram is drawn on the figure before the polar axes are created. The bins
# span the optimizer's full parameter range [0, 2*pi], which makes every bin
# exactly as wide as the bars drawn below.
radii, bin_edges = np.histogram(np.ravel(pol_angs), bins=N, range=(0, 2*np.pi))
theta = (bin_edges[:-1] + bin_edges[1:]) / 2  # bin centres
width = (2*np.pi) / N

ax = plt.subplot(111, polar=True)
ax.bar(theta, radii, width=width, color='orange')

plt.title('Emergent Interfaces')
plt.savefig(os.path.join(data_dir, 'user-study-learned-int.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Pool the per-policy ground-truth rewards (xs) and reward-model values (ys)
# across all users. A flat comprehension avoids the repeated list copying of
# sum(list_of_lists, []).
xs = [rew for perf_eval in perf_evals for rew in perf_eval['true_rew']]
ys = [rew for perf_eval in perf_evals for rew in perf_eval['rew']]
# Spearman rank correlation between the two. This needs the `scipy.stats`
# submodule, which is imported explicitly in the imports cell (a bare
# `import scipy` does not guarantee that it is loaded).
rho = scipy.stats.spearmanr(xs, ys)[0]
plt.title(r'True Reward vs. Mutual Information Reward ($\rho = %0.2f$)' % rho)
plt.xlabel('True Reward (Avg. Distance to Target)')
plt.ylabel(r"$\mathcal{I}(\mathbf{x}_t, (\mathbf{s}_t, \mathbf{s}_{t+1}))$")
plt.scatter(
  xs,
  ys,
  color='orange',
  s=50,
  alpha=0.5
)
plt.xticks(fontsize=10)
plt.savefig(os.path.join(data_dir, 'user-study-truerew-vs-mi.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Reward-model value of each evaluated policy against the cumulative number of
# training episodes ('n_eps'), across users; saved as a PDF in the study directory.
plt.title('Mutual Information Reward')
plt.xlabel('Number of Online Training Episodes')
plt.ylabel(r"$\mathcal{I}(\mathbf{x}_t, (\mathbf{s}_t, \mathbf{s}_{t+1}))$")
utils.plot_perf_evals(perf_evals, 'n_eps', 'rew', label='MIMI (Ours)', smooth_win=1, color='teal')
plt.legend(loc='lower right')
plt.savefig(os.path.join(data_dir, 'user-study-mi-vs-eps.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Ground-truth reward of every episode against its episode index ('xs'), across
# users, with two horizontal reference lines; saved as a PDF in the study directory.
plt.title('User Study: 2D Cursor Control with Perturbed Mouse')
plt.xlabel('Number of Online Training Episodes')
plt.ylabel('True Reward (Avg. Distance to Target)')
utils.plot_perf_evals(perf_evals, 'xs', 'true_rews', label='MIMI (Ours)', smooth_win=10, color='orange')
# Baseline reference: mean over users of the random-interface baseline reward.
plt.axhline(y=np.mean([perf['baseline_true_reward'] for perf in perf_evals]), linestyle='--', color='gray', label='Random Interfaces (Baseline)')
# NOTE(review): the oracle value is hard-coded; how it was obtained is not shown
# in this notebook -- record its provenance.
plt.axhline(y=-0.0726392950518139, linestyle='--', color='green', label='Oracle')
plt.legend(loc='lower right')
# Fix only the lower y-limit; the upper limit is left to matplotlib.
plt.ylim([-0.14, None])
plt.savefig(os.path.join(data_dir, 'user-study-truerew-vs-eps.pdf'), bbox_inches='tight')
plt.show()

In [None]:
def plot_trajs(eval_data_of_pol):
  """Scatter every visited 2-D point of every rollout in the given evaluation records.

  For each record, the rollouts are read from element [1]; for each step of a
  rollout, the first two entries of the step's first element are plotted
  (presumably the cursor position -- confirm against CursorEnv). Axes are
  drawn with equal aspect ratio and without ticks.
  """
  points = []
  for eval_data in eval_data_of_pol:
    for rollout in eval_data[1]:
      for step in rollout:
        points.append(step[0][:2])
  xy = np.array(points)

  plt.scatter(xy[:, 0], xy[:, 1], alpha=0.25, linewidth=0, color='gray')
  plt.gca().set_aspect('equal', adjustable='box')
  plt.xticks([])
  plt.yticks([])
  
def plot_before_trajs(eval_data_of_pol):
  """Plot the trajectories of the first five evaluated policies.

  The title's "<50" corresponds to five policies at ten episodes per policy.
  """
  plt.title('<50 Training Episodes')
  early_records = eval_data_of_pol[:5]
  plot_trajs(early_records)
  
def plot_after_trajs(eval_data_of_pol):
  """Plot the trajectories of every evaluated policy from the sixteenth onward.

  The title's ">150" corresponds to fifteen earlier policies at ten episodes per policy.
  """
  plt.title('>150 Training Episodes')
  late_records = eval_data_of_pol[15:]
  plot_trajs(late_records)

In [None]:
# Load the evaluation records of one example user (sub-directory '7') for the
# trajectory figures below.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(os.path.join(data_dir, '7', 'gp_results.pkl'), 'rb') as f:
  eval_data_of_pol = pickle.load(f)

In [None]:
# Early-training trajectories of the loaded user, saved as a PDF in the study directory.
plot_before_trajs(eval_data_of_pol)
plt.savefig(os.path.join(data_dir, 'before-trajs.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Late-training trajectories of the loaded user, saved as a PDF in the study directory.
plot_after_trajs(eval_data_of_pol)
plt.savefig(os.path.join(data_dir, 'after-trajs.pdf'), bbox_inches='tight')
plt.show()