In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from __future__ import division

from copy import deepcopy
import pickle
import time
import os
import uuid

import scipy
import scipy.stats
import numpy as np

from matplotlib import pyplot as plt
import matplotlib as mpl

from mimi import envs
from mimi import utils
from mimi import user_models
from mimi import opt
from mimi import reward_models
from mimi import models

In [None]:
# Root folder for this notebook's inputs and outputs: the raw logs live in a
# sub-folder of it, and the cached pickles and saved figures are written here.
data_dir = os.path.join(utils.data_dir, 'deepassist')

In [None]:
# Session object passed to the MI models as their only positional init argument.
# NOTE(review): gpu_mode=False presumably keeps computation on the CPU - confirm in utils.
sess = utils.make_tf_session(gpu_mode=False)

In [None]:
# Environment instance. This notebook only reads its dimension attributes
# (n_act_dim, n_obs_dim, n_env_obs_dim, n_user_obs_dim) and passes the object
# to the reward models.
env = envs.DeepAssistEnv()

In [None]:
# Identifiers of the pilots whose log files are loaded by make_dataset.
pilot_ids = [
  'spike', 'jet', 'faye', 'vicious', 'ed', 'ein',
  'julia', 'punch', 'judy', 'lin', 'grencia', 'laughingbull',
]

In [None]:
# Folder holding the raw log pickles, one per condition, named '<pilot_id>_<method>.pkl'.
deepassist_dir = os.path.join(data_dir, 'raw')

In [None]:
def prune_state(state):
  """Return only the leading entry of `state`, as a length-1 slice."""
  return state[:1]

# How many of the most recent episodes of each log are kept by format_eps.
n_eval_eps = 30

def format_eps(data, method, delta=1):
  """Turn raw logged episodes into lists of transition tuples.

  Args:
    data: either a list/tuple `(all_rewards, outcomes, trajs, all_actions)`,
      or a mapping with keys 'rewards', 'actions' and 'trajectories' whose
      element 0 holds the per-episode sequence.
    method: 'pilot_eval' builds each observation from the pruned state
      followed by the one-hot action; any other value uses the pruned state
      followed by the last six entries of the raw state.
    delta: time offset between `state` and `next_state`. The index is capped
      at T (the number of actions of the episode), so `np.inf` always selects
      `traj[T]`.

  Returns:
    A list with one entry per episode (only the last `n_eval_eps` episodes are
    kept). Each entry is a list of
    `(state, action, reward, next_state, False, {})` tuples; the reward is 0 on
    every step except the last one, which carries the episode's logged reward.

  Raises:
    AssertionError: if a formatted observation does not have `env.n_obs_dim`
      entries.
  """
  if isinstance(data, (list, tuple)):
    # The outcomes entry is part of the logged layout but is not used here.
    all_rewards, _outcomes, trajs, all_actions = data
  else:
    all_rewards = data['rewards'][0]
    all_actions = data['actions'][0]
    trajs = data['trajectories'][0]

  # Keep only the most recent episodes.
  trajs = trajs[-n_eval_eps:]
  all_actions = all_actions[-n_eval_eps:]
  all_rewards = all_rewards[-n_eval_eps:]

  # The observation layout depends only on `method`, so pick it once up front
  # instead of rebuilding the closure on every time step.
  if method == 'pilot_eval':
    def format_state(state, action):
      return np.concatenate((prune_state(state), action))
  else:
    def format_state(state, action):
      return np.concatenate((prune_state(state), state[-6:]))

  eps = []
  for i, traj in enumerate(trajs):
    actions = all_actions[i]
    T = len(actions)
    ep = []
    for t in range(T):
      action = utils.onehot_encode(actions[t], env.n_act_dim)
      reward = 0 if t < T - 1 else all_rewards[i]
      # min() returns the int T whenever t + delta overshoots (including inf).
      # Both observations are formatted with the action taken at step t.
      state = format_state(traj[t], action)
      next_state = format_state(traj[min(T, t + delta)], action)
      assert state.size == env.n_obs_dim
      assert next_state.size == env.n_obs_dim
      ep.append((state, action, reward, next_state, False, {}))
    eps.append(ep)
  return eps

def load_eps(filename, method, pilot_id, delta=1):
  """Read one pickled log file and return its episodes via format_eps.

  For method 'pilot_eval' the pickle is indexed by `pilot_id`; for any other
  method the first value stored in the pickled mapping is used.

  NOTE(review): pickle.load can execute arbitrary code, so only point this at
  trusted log files.
  """
  with open(filename, 'rb') as f:
    logged = pickle.load(f)
  if method == 'pilot_eval':
    selected = logged[pilot_id]
  else:
    selected = list(logged.values())[0]
  return format_eps(selected, method, delta=delta)

In [None]:
methods = ['pilot_eval', 'reward_logs']
def make_dataset(delta=1):
  """Load the episodes of every (method, pilot) condition.

  Conditions are enumerated method-major (all pilots of the first method, then
  all pilots of the second), and each one is read from
  '<pilot_id>_<method>.pkl' inside deepassist_dir.

  Returns:
    Three parallel lists: the episodes, the method and the pilot id of each
    condition.
  """
  conditions = [(m, p) for m in methods for p in pilot_ids]
  rollouts_of_pol = [
    load_eps(os.path.join(deepassist_dir, '%s_%s.pkl' % (p, m)), m, p, delta=delta)
    for m, p in conditions
  ]
  method_of_pol = [m for m, _ in conditions]
  pilot_of_pol = [p for _, p in conditions]
  return rollouts_of_pol, method_of_pol, pilot_of_pol

In [None]:
# With delta=np.inf every transition's next_state is built from traj[T],
# because format_eps caps the next-state index at T.
rollouts_of_pol, method_of_pol, pilot_of_pol = make_dataset(delta=np.inf)

In [None]:
# Dataset size: total transitions, number of conditions, transitions per condition.
n_conds = len(rollouts_of_pol)
n_steps = sum(sum(len(episode) for episode in episodes) for episodes in rollouts_of_pol)
(n_steps, n_conds, n_steps / n_conds)

In [None]:
# Positional constructor arguments for the MI models: just the session.
mi_model_init_args = [sess]

# Constructor keyword arguments; the three dimensions are read from the environment.
mi_model_init_kwargs = dict(
  n_env_obs_dim=env.n_env_obs_dim,
  n_user_obs_dim=env.n_user_obs_dim,
  n_act_dim=env.n_act_dim,
  n_layers=2,
  layer_size=64,
)

# Keyword arguments forwarded for training.
# NOTE(review): the exact meaning of each option is defined by the mimi models - confirm there.
mi_model_train_kwargs = dict(
  iterations=1000,
  ftol=1e-6,
  learning_rate=1e-4,
  batch_size=64,
  val_update_freq=None,
  verbose=False,
  warm_start=False,
)

In [None]:
# MI-based reward model built from the shared init/train settings with the
# class's default options. Its estimate is column 1 of the rewards arrays.
reward_model = reward_models.MIRewardModel(
  env,
  mi_model_init_args,
  mi_model_init_kwargs,
  mi_model_train_kwargs
)

In [None]:
# Second MI reward model, identical to reward_model except for
# use_next_env_obs=False. Its estimate (column 2) is later subtracted from
# reward_model's (column 1) to form the extra column of mean_rewards_of_pol.
# NOTE(review): presumably this drops the next-state observation from the MI
# estimate - confirm in reward_models.MIRewardModel.
ixs_reward_model = reward_models.MIRewardModel(
  env,
  mi_model_init_args,
  mi_model_init_kwargs,
  mi_model_train_kwargs,
  use_next_env_obs=False
)

In [None]:
def true_rew_of_rollout(rollout):
  """Mean of the reward field (tuple index 2) over the transitions of one episode."""
  step_rewards = [transition[2] for transition in rollout]
  return np.mean(step_rewards)

def true_reward_model(rollouts):
  """Average of the per-episode mean rewards over a collection of episodes."""
  return np.mean([true_rew_of_rollout(episode) for episode in rollouts])

In [None]:
# The length of this list sets the size of the last axis of the rewards arrays.
# Presumably the columns follow this order (0 = true reward, 1 = reward_model,
# 2 = ixs_reward_model), which is how the later cells index them.
offline_reward_models = [true_reward_model, reward_model, ixs_reward_model]

In [None]:
n_seeds = 10
def compute_rewards(rollouts_of_pol):
  """Evaluate every offline reward model on every condition, n_seeds times.

  Returns:
    Array of shape (n_seeds, number of conditions, number of reward models);
    slice k holds the result of the k-th call to
    utils.compute_rews_of_rollouts.
  """
  out_shape = (n_seeds, len(rollouts_of_pol), len(offline_reward_models))
  rewards = np.zeros(out_shape)
  for seed_idx in range(n_seeds):
    rewards[seed_idx] = utils.compute_rews_of_rollouts(
      rollouts_of_pol, offline_reward_models, verbose=True)
  return rewards

In [None]:
# Runs n_seeds evaluation passes over the delta = inf dataset; the result is
# cached to disk in the following cells.
rewards_of_pol = compute_rewards(rollouts_of_pol)

In [None]:
# Cache file for the per-seed rewards array of the delta = inf dataset.
rewards_of_pol_path = os.path.join(data_dir, 'rewards_of_pol.pkl')

In [None]:
# Write the per-seed rewards array to the cache file.
with open(rewards_of_pol_path, 'wb') as f:
  pickle.dump(rewards_of_pol, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cached array; this replaces the in-memory rewards_of_pol.
# NOTE(review): pickle.load can execute arbitrary code - only load files
# written by this notebook.
with open(rewards_of_pol_path, 'rb') as f:
  rewards_of_pol = pickle.load(f)

In [None]:
# Floor every column from index 1 onward at zero, in place; column 0 is left untouched.
rewards_of_pol[:, :, 1:] = np.maximum(rewards_of_pol[:, :, 1:], 0)

In [None]:
# Average over axis 0 (the seed axis), leaving one row per condition and one
# column per reward model.
mean_rewards_of_pol = np.mean(rewards_of_pol, axis=0)
mean_rewards_of_pol

In [None]:
# Post-process the seed-averaged rewards and append a difference column.
# Start from the reward-model columns only, so that re-running this cell does
# not keep stacking additional difference columns onto the array.
mean_rewards_of_pol = mean_rewards_of_pol[:, :rewards_of_pol.shape[-1]]
# Cap column 2 at column 1, then floor it at 0.
mean_rewards_of_pol[:, 2] = np.minimum(mean_rewards_of_pol[:, 2], mean_rewards_of_pol[:, 1])
mean_rewards_of_pol[:, 2] = np.maximum(mean_rewards_of_pol[:, 2], 0)
# New last column: column 1 minus column 2.
mean_rewards_of_pol = np.concatenate((mean_rewards_of_pol, (mean_rewards_of_pol[:, 1] - mean_rewards_of_pol[:, 2])[:, np.newaxis]), axis=1)
mean_rewards_of_pol

In [None]:
# Legend label for each method (the method is the suffix of the log file name).
label_of_method = {
  'pilot_eval': 'Solo Human',
  'reward_logs': 'With Copilot',
}
# Scatter-plot color for each method.
color_of_method = {
  'pilot_eval': 'gray',
  'reward_logs': 'orange',
}
# One color per condition, aligned with method_of_pol.
color_of_pol = [color_of_method[m] for m in method_of_pol]

In [None]:
# Group condition indices (positions in method_of_pol) by method; these index
# the rows of mean_rewards_of_pol in the scatter plot below.
idxes_of_method = {m: [] for m in methods}
for i, method in enumerate(method_of_pol):
  idxes_of_method[method].append(i)

In [None]:
# Set the global matplotlib font size used by the figures below.
mpl.rcParams.update({'font.size': 12})

In [None]:
# Spearman rank correlation across the columns of mean_rewards_of_pol. Given a
# 2-D array, scipy treats each column as one variable and returns a
# correlation matrix together with the matching p-values.
corr = scipy.stats.spearmanr(mean_rewards_of_pol)
corr

In [None]:
# Entry [0, 1] of the correlation matrix: column 0 (plotted as 'True Reward')
# against column 1 (the MI estimate plotted on the y-axis).
rho = corr[0][0, 1]
rho

In [None]:
# Scatter of column 0 (true reward) against column 1 (MI estimate), one point
# per condition, colored by method.
fig, ax = plt.subplots()
ax.set_title(r'Shared Autonomy via Deep RL ($\rho$ = %0.2f)' % rho)
ax.set_xlabel('True Reward')
ax.set_ylabel(r"$\mathcal{I}(\mathbf{x}, (\mathbf{s}_t, \mathbf{s}_T))$")
for method, idxes in idxes_of_method.items():
  ax.scatter(
    mean_rewards_of_pol[idxes, 0],
    mean_rewards_of_pol[idxes, 1],
    s=50,
    color=color_of_method[method],
    label=label_of_method[method]
  )
ax.legend(loc='upper center')
fig.savefig(os.path.join(data_dir, 'deep-assist-offline-eval-truerew-vs-mi-per-poluser.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Average each column of mean_rewards_of_pol over the conditions that share a
# method, giving one row per method (in the order of `methods`).
idx_of_method = {name: pos for pos, name in enumerate(methods)}
rewards_of_method = np.array([
  [
    np.mean([mean_rewards_of_pol[i, j] for i, m in enumerate(method_of_pol) if m == method])
    for j in range(mean_rewards_of_pol.shape[1])
  ]
  for method in methods
])

In [None]:
# One point per method: method-averaged true reward (column 0) against the
# method-averaged MI estimate (column 1).
fig, ax = plt.subplots()
ax.set_title('Shared Autonomy via Deep RL')
ax.set_xlabel('True Reward')
ax.set_ylabel(r"$\mathcal{I}(\mathbf{x}, (\mathbf{s}_t, \mathbf{s}_T))$")
for i, method in enumerate(methods):
  ax.scatter(
    rewards_of_method[i, 0],
    rewards_of_method[i, 1],
    s=50,
    color=color_of_method[method],
    label=label_of_method[method]
  )
ax.legend(loc='best')
fig.savefig(os.path.join(data_dir, 'deep-assist-offline-eval-truerew-vs-mi-per-pol.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Finite time offsets to evaluate; the delta = inf case is seeded from
# rewards_of_pol in the next cell.
deltas = [1, 10, 20, 100, 200]

In [None]:
# Per-offset cache of rewards arrays, keyed by delta. The delta = inf entry
# reuses the array already held in rewards_of_pol (same object, not a copy).
rewards_of_pol_of_delta = {}
rewards_of_pol_of_delta[np.inf] = rewards_of_pol

In [None]:
# Evaluate each offset once; offsets already present in the cache dict are skipped.
for delta in deltas:
  if delta in rewards_of_pol_of_delta:
    continue
  rewards_of_pol_of_delta[delta] = compute_rewards(make_dataset(delta=delta)[0])

In [None]:
# Cache file for the per-offset dictionary of rewards arrays.
rewards_of_pol_of_delta_path = os.path.join(data_dir, 'rewards_of_pol_of_delta.pkl')

In [None]:
# Write the per-offset rewards dictionary to the cache file.
with open(rewards_of_pol_of_delta_path, 'wb') as f:
  pickle.dump(rewards_of_pol_of_delta, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cached dictionary; this replaces the in-memory rewards_of_pol_of_delta.
# NOTE(review): pickle.load can execute arbitrary code - only load files
# written by this notebook.
with open(rewards_of_pol_of_delta_path, 'rb') as f:
  rewards_of_pol_of_delta = pickle.load(f)

In [None]:
# Spearman correlation between column 0 (true reward) and column 1 (MI
# estimate) for every cached offset, in increasing order of delta.
# Loop-local names and a copy of each cached array are used so that the
# notebook-level rewards_of_pol, mean_rewards_of_pol and corr from the earlier
# cells are not overwritten and the cache itself is left unmodified.
deltas = sorted(rewards_of_pol_of_delta.keys())
corrs = []
for delta in deltas:
  rews_for_delta = rewards_of_pol_of_delta[delta].copy()
  # Floor the model-based columns at zero, as done for the delta = inf analysis.
  rews_for_delta[:, :, 1:] = np.maximum(rews_for_delta[:, :, 1:], 0)
  # Average over axis 0 (the seed axis).
  mean_rews_for_delta = np.mean(rews_for_delta, axis=0)
  rho_for_delta, _ = scipy.stats.spearmanr(mean_rews_for_delta[:, 0], mean_rews_for_delta[:, 1])
  corrs.append(rho_for_delta)

In [None]:
# Longest episode (in transitions) in rollouts_of_pol; used below as the finite
# x-position of the delta = inf point on the log-scaled axis.
max_ep_len = max(len(r) for rs in rollouts_of_pol for r in rs)

In [None]:
# Rank correlation as a function of the time offset; the delta = inf point is
# drawn at x = max_ep_len so it fits on the log-scaled axis.
fig, ax = plt.subplots()
ax.set_title('Shared Autonomy via Deep RL\n' + r'$\mathcal{I}(\mathbf{x}_t, (\mathbf{s}_t, \mathbf{s}_{t+\Delta}))$ vs. True Reward')
ax.set_xlabel(r'Time Offset $\Delta$')
ax.set_ylabel(r"Spearman's Rank Correlation $\rho$")
ax.plot([max_ep_len if d == np.inf else d for d in deltas], corrs, color='orange', marker='o')
ax.axhline(y=0, linestyle='--', color='gray')
ax.set_xscale('log')
fig.savefig(os.path.join(data_dir, 'deep-assist-offline-eval-corr-vs-delta.pdf'), bbox_inches='tight')
plt.show()