In [4]:
!pip install -r requirements.txt
!pip install pyglet==1.5.11
%load_ext autoreload
%autoreload 2
%matplotlib inline

Collecting pyglet==1.5.11
  Downloading pyglet-1.5.11-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyglet
  Attempting uninstall: pyglet
    Found existing installation: pyglet 2.0.3
    Uninstalling pyglet-2.0.3:
      Successfully uninstalled pyglet-2.0.3
Successfully installed pyglet-1.5.11
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from __future__ import division

from copy import deepcopy
import pickle
import time
import os
import uuid

import scipy
import numpy as np
import sys
sys.path.append('/home/jovyan/work')

from matplotlib import pyplot as plt
import matplotlib as mpl
import sys
print(sys.path)

from mimi import envs
from mimi import utils
from mimi import user_models
from mimi import opt
from mimi import reward_models
from mimi import models

['/home/jovyan/work/notebooks', '/opt/conda/lib/python37.zip', '/opt/conda/lib/python3.7', '/opt/conda/lib/python3.7/lib-dynload', '', '/opt/conda/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages/IPython/extensions', '/home/jovyan/.ipython', '/home/jovyan/work', '/home/jovyan/work']


ImportError: 
    Error occurred while running `from pyglet.gl import *`
    HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'.
    If you're running on a server, you may need a virtual frame buffer; something like this should work:
    'xvfb-run -s "-screen 0 1400x900x24" python <your_script.py>'
    

In [None]:
# Directory for this experiment's data: the 'asha-bottle' subfolder of utils.data_dir.
data_dir = os.path.join(utils.data_dir, 'asha-bottle')

In [None]:
# Session handle; it is passed to the MI models through mi_model_init_args.
# NOTE(review): presumably a CPU-only TensorFlow session (gpu_mode=False) — confirm in mimi.utils.
sess = utils.make_tf_session(gpu_mode=False)

In [None]:
# Environment object; its n_env_obs_dim / n_user_obs_dim / n_act_dim attributes
# size the MI models, and the object itself is handed to MIRewardModel.
env = envs.ASHABottleEnv()

In [None]:
# Root of the raw recordings, laid out as <asha_dir>/<method>/<user_id>/data.pkl.
asha_dir = os.path.join(data_dir, 'raw')

In [None]:
def format_obses(raw_obses):
  """Turn raw observation dicts into one flat feature vector per entry.

  For every entry, the 'raw_obs' array is concatenated with the
  'gaze_features' array (in that order).

  Returns:
    A list with one concatenated array per entry of `raw_obses`.
  """
  return [
    np.concatenate((entry['raw_obs'], entry['gaze_features']))
    for entry in raw_obses
  ]

def format_eps(data):
  """Convert nested raw recordings into a flat list of episodes.

  `data` is iterated as blocks, each block as raw episodes. Every raw episode
  becomes a list of transition tuples
  `(state, action, reward, next_state, False, {})`, one per timestep, where
  the number of timesteps is the length of the episode's 'rewards' array.

  The `next_state` of every transition is the LAST entry of the episode's
  formatted 'next_observations' (not the per-step successor), and the fifth
  field is always False.

  Returns:
    A list of episodes in block order, then episode order within each block.
  """
  episodes = []
  for block in data:
    for raw_ep in block:
      n_steps = raw_ep['rewards'].shape[0]
      states = format_obses(raw_ep['observations'])
      acts = raw_ep['actions']
      # Only the first reward column is kept.
      rews = raw_ep['rewards'][:, 0]
      succ_states = format_obses(raw_ep['next_observations'])
      # succ_states[-1] is looked up per step, so an episode with zero
      # timesteps never touches it.
      transitions = [
        (states[t], acts[t], rews[t], succ_states[-1], False, {})
        for t in range(n_steps)
      ]
      episodes.append(transitions)
  return episodes

def load_eps(filename):
  """Read a pickled recording from `filename` and return it as formatted episodes.

  The file is unpickled and the resulting object is passed to `format_eps`.
  """
  with open(filename, 'rb') as pkl_file:
    raw_blocks = pickle.load(pkl_file)
  episodes = format_eps(raw_blocks)
  return episodes

In [None]:
# Load every available recording. Each (method, user) pair is treated as one
# policy: rollouts_of_pol[k] holds its episodes and method_of_pol[k] its method.
methods = ['A', 'B', 'C']
rollouts_of_pol = []
method_of_pol = []
for method in methods:
  method_path = os.path.join(asha_dir, method)
  # os.listdir returns entries in arbitrary order; sort them so that policy
  # indices (and therefore the rows of any cached rewards_of_pol) are the same
  # on every run.
  user_ids = sorted(os.listdir(method_path))
  for user_id in user_ids:
    user_path = os.path.join(method_path, user_id, 'data.pkl')
    # Entries without a data.pkl are skipped silently.
    if os.path.exists(user_path):
      eps = load_eps(user_path)
      rollouts_of_pol.append(eps)
      method_of_pol.append(method)

In [None]:
# Total number of timesteps over every episode of every policy, and the number
# of policies (conditions). Displays (total steps, conditions, steps per condition).
n_steps = sum(len(episode) for pol_eps in rollouts_of_pol for episode in pol_eps)
n_conds = len(rollouts_of_pol)
n_steps, n_conds, n_steps / n_conds

In [None]:
# Sanity check: shapes of the state and the action in the first transition of
# the first episode of the first policy.
rollouts_of_pol[0][0][0][0].shape, rollouts_of_pol[0][0][0][1].shape

In [None]:
# Positional constructor arguments that are handed to MIRewardModel.
mi_model_init_args = [sess]
# Keyword constructor arguments. The three dimensions are read from the
# environment; n_layers / layer_size presumably describe the network
# architecture — confirm in mimi.models.
mi_model_init_kwargs = {
  'n_env_obs_dim': env.n_env_obs_dim,
  'n_user_obs_dim': env.n_user_obs_dim,
  'n_act_dim': env.n_act_dim,
  'n_layers': 2,
  'layer_size': 64
}
# Training settings handed to MIRewardModel.
# NOTE(review): the exact meaning of each key (e.g. 'ftol', 'val_update_freq',
# 'warm_start') is defined inside mimi and is not visible from this notebook.
mi_model_train_kwargs = {
  'iterations': 1000,
  'ftol': 1e-6,
  'learning_rate': 1e-4,
  'batch_size': 64,
  'val_update_freq': None,
  'verbose': False,
  'warm_start': False
}

In [None]:
# MI-based reward model built from the shared init/train settings, with all
# other MIRewardModel options left at their defaults.
reward_model = reward_models.MIRewardModel(
  env,
  mi_model_init_args,
  mi_model_init_kwargs,
  mi_model_train_kwargs
)

In [None]:
# Second MI-based reward model: same settings, but with use_next_env_obs=False.
# NOTE(review): presumably this leaves the next/final state out of the MI
# estimate — confirm in mimi.reward_models.
ixs_reward_model = reward_models.MIRewardModel(
  env,
  mi_model_init_args,
  mi_model_init_kwargs,
  mi_model_train_kwargs,
  use_next_env_obs=False
)

In [None]:
def true_rew_of_rollout(rollout):
  """Mean of the reward field (tuple index 2) over the transitions of one rollout."""
  step_rewards = [transition[2] for transition in rollout]
  return np.mean(step_rewards)


def true_reward_model(rollouts):
  """Mean over `rollouts` of each rollout's mean reward."""
  per_rollout = [true_rew_of_rollout(rollout) for rollout in rollouts]
  return np.mean(per_rollout)

In [None]:
# Reward models to evaluate. The result array has one slot per entry on its last
# axis — presumably in this order (0: true reward, 1: reward_model,
# 2: ixs_reward_model); confirm against utils.compute_rews_of_rollouts.
offline_reward_models = [true_reward_model, reward_model, ixs_reward_model]

In [None]:
# Number of repeated evaluations, and the result buffer of shape
# (n_seeds, number of policies, number of reward models).
n_seeds = 10
rewards_of_pol = np.zeros((n_seeds, len(rollouts_of_pol), len(offline_reward_models)))

In [None]:
# Repeat the offline evaluation n_seeds times; run i fills rewards_of_pol[i]
# with one value per (policy, reward model).
# NOTE(review): no random seed is set in this loop, so any run-to-run variation
# comes from inside compute_rews_of_rollouts — confirm that this is intended.
for i in range(n_seeds):
  rewards_of_pol[i, :, :] = utils.compute_rews_of_rollouts(
    rollouts_of_pol,
    offline_reward_models,
    verbose=True
  )

In [None]:
# Cache file for the evaluation results.
rewards_of_pol_path = os.path.join(data_dir, 'rewards_of_pol.pkl')

In [None]:
# Persist the evaluation results to the cache file.
with open(rewards_of_pol_path, 'wb') as f:
  pickle.dump(rewards_of_pol, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cached evaluation results, replacing rewards_of_pol in memory.
with open(rewards_of_pol_path, 'rb') as f:
  rewards_of_pol = pickle.load(f)

In [None]:
# Clamp negative values to zero in every reward-model column except column 0.
# This writes into rewards_of_pol in place.
rewards_of_pol[:, :, 1:] = np.maximum(rewards_of_pol[:, :, 1:], 0)

In [None]:
# Average over the seed axis: one row per policy, one column per reward model.
mean_rewards_of_pol = np.mean(rewards_of_pol, axis=0)
mean_rewards_of_pol

In [None]:
# Start from freshly computed per-policy means so that this cell is idempotent;
# without this, every re-run would append one more column to mean_rewards_of_pol.
mean_rewards_of_pol = np.mean(rewards_of_pol, axis=0)
# Cap column 2 at column 1, then floor it at zero.
mean_rewards_of_pol[:, 2] = np.minimum(mean_rewards_of_pol[:, 2], mean_rewards_of_pol[:, 1])
mean_rewards_of_pol[:, 2] = np.maximum(mean_rewards_of_pol[:, 2], 0)
# Append a final column holding (column 1 - column 2).
mean_rewards_of_pol = np.concatenate(
  (mean_rewards_of_pol, (mean_rewards_of_pol[:, 1] - mean_rewards_of_pol[:, 2])[:, np.newaxis]),
  axis=1
)
mean_rewards_of_pol

In [None]:
# Scatter colour for each method code.
color_of_method = {
  'A': 'gray',
  'B': 'orange',
  'C': 'teal'
}
# Legend label for each method code.
label_of_method = {
  'A': 'Non-Adaptive',
  'B': 'ASHA',
  'C': 'ASHA (With Distribution Shift)'
}
# One colour per policy, in the same order as method_of_pol.
color_of_pol = [color_of_method[m] for m in method_of_pol]

In [None]:
# Map each method to the indices of its policies; these indices are used to
# select rows of mean_rewards_of_pol when plotting.
idxes_of_method = {m: [] for m in methods}
for i, method in enumerate(method_of_pol):
  idxes_of_method[method].append(i)

In [None]:
# Set the global matplotlib font size.
mpl.rcParams.update({'font.size': 12})

In [None]:
# Spearman rank correlation computed over mean_rewards_of_pol; the first element
# of the result is later indexed as a 2-D matrix (corr[0][0, 1]).
corr = scipy.stats.spearmanr(mean_rewards_of_pol)
corr

In [None]:
# Entry [0, 1] of the correlation matrix, i.e. the pairing of column 0 with
# column 1 of mean_rewards_of_pol (the two columns plotted against each other).
rho = corr[0][0, 1]
rho

In [None]:
# Per-policy scatter: x is column 0 and y is column 1 of mean_rewards_of_pol,
# one point per policy, coloured and labelled by method. rho is shown in the title.
plt.title(r'ASHA Bottle ($\rho$ = %0.2f)' % rho)
plt.xlabel('True Reward')
plt.ylabel(r"$\mathcal{I}(\mathbf{x}_t, (\mathbf{s}_t, \mathbf{s}_T))$")
for method, idxes in idxes_of_method.items():
  plt.scatter(
    mean_rewards_of_pol[idxes, 0], 
    mean_rewards_of_pol[idxes, 1],
    color=color_of_method[method],
    label=label_of_method[method],
    s=50
  )
plt.legend(loc='best')
# Save the figure as a PDF in data_dir before showing it.
plt.savefig(os.path.join(data_dir, 'asha-bottle-offline-eval-truerew-vs-mi-per-poluser.pdf'), bbox_inches='tight')
plt.show()

In [None]:
# Average the per-policy rows of mean_rewards_of_pol within each method. The
# result has one row per entry of `methods` and one column per column of
# mean_rewards_of_pol.
idx_of_method = {x: i for i, x in enumerate(methods)}
rewards_of_method = np.array([
  [
    np.mean([
      mean_rewards_of_pol[pol_idx, col]
      for pol_idx, pol_method in enumerate(method_of_pol)
      if pol_method == row_method
    ])
    for col in range(mean_rewards_of_pol.shape[1])
  ]
  for row_method in methods
])

In [None]:
# Per-method scatter: x is column 0 and y is column 1 of rewards_of_method,
# one point per method, coloured and labelled by method.
plt.title('ASHA Bottle')
plt.xlabel('True Reward')
plt.ylabel(r"$\mathcal{I}(\mathbf{x}_t, (\mathbf{s}_t, \mathbf{s}_T))$")
for i, method in enumerate(methods):
  plt.scatter(
    rewards_of_method[i, 0], 
    rewards_of_method[i, 1],
    color=color_of_method[method],
    label=label_of_method[method],
    s=50
  )
# Use a smaller font for the x tick labels than the global font size.
plt.xticks(fontsize=10)
plt.legend(loc='best')
# Save the figure as a PDF in data_dir before showing it.
plt.savefig(os.path.join(data_dir, 'asha-bottle-offline-eval-truerew-vs-mi-per-pol.pdf'), bbox_inches='tight')
plt.show()