# Bias model bootstrapping notebook

Eventually this notebook will (hopefully?) contain all the code necessary to run our final experiments.
For now it just contains a demo of maximum causal entropy inverse reinforcement learning (MCE IRL) on some gridworlds from the "learning biases" paper.

In [None]:
%matplotlib inline

# These are useful for debugging, but make code slower:
# %load_ext autoreload
# %autoreload 2

import logging

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import jax.experimental.optimizers as jopt
import numpy as np

from pref_bootstrap.envs import gridworld, mdp_interface
from pref_bootstrap.algos import mce_irl

# Notebook-wide presentation settings: seaborn plot theme, INFO-level logging
# through the root logger, and compact numpy array printing.
sns.set(style='darkgrid', context='notebook')
logging.basicConfig(level=logging.INFO)
np.set_printoptions(linewidth=100, precision=4)

In [None]:
def alternating_reward_learning(algos=(), reward_parameters=None, step_size=1.0, max_iter=100):
    """Sketch of shared reward learning across several feedback algorithms.

    On every iteration the reward gradients reported by all algorithms are
    summed and added (scaled by ``step_size``) to one shared vector of reward
    parameters.

    Args:
        algos: iterable of objects exposing ``grad_wrt_reward(params)``. Each
            call is expected to return an array that can be accumulated into
            an array shaped like ``reward_parameters``.
        reward_parameters: array-like initial reward parameters. Required;
            the input is copied and never modified in place.
        step_size: multiplier applied to the summed gradient. The default of
            1.0 reproduces the unscaled ``+=`` of the original sketch.
        max_iter: number of update iterations to run.

    Returns:
        numpy.ndarray holding the updated reward parameters.

    Raises:
        ValueError: if ``reward_parameters`` is not supplied.
    """
    # QUESTIONS:
    #
    # - What if I want to share rationality models over environments with different reward functions?
    # - What if I want to share rationality models between different feedback modalities?
    #   (like I'm sharing reward models)
    # - What if I want to create hierarchical reward or rationality models? (e.g. have a pseudo-prior
    #   that is shared between different rationality models, to keep them tied together)
    #
    # Seems like it at least makes sense to have the same treatment for rationality model params &
    # reward model params, so we can share them between models.
    if reward_parameters is None:
        raise ValueError("reward_parameters must be provided")

    # Materialise once so that generators can be iterated on every pass.
    algos = list(algos)
    # Work on a float copy so the caller's array is left untouched.
    params = np.array(reward_parameters, dtype=float)

    for _ in range(max_iter):
        # do reward update
        grad_accum = np.zeros_like(params)
        for algo in algos:
            grad_accum += algo.grad_wrt_reward(params)
        # The original sketch added the summed gradient to an undefined name
        # (`parameters`); the shared reward parameters are the evident target.
        params += step_size * grad_accum
        # TODO: the original had a second, unreachable loop after `while True`
        # that called grad_wrt_reward and discarded the result. Presumably a
        # placeholder for the rationality/bias-model update -- confirm and
        # implement here.

    return params

# REMARK: these all seem heavily tied to modality. From an implementation
# perspective, it probably makes sense to group a reward model, a bias model,
# and their associated priors together for each modality.
#
# NOTE(review): this is a design sketch rather than runnable code. The factory
# names (reward_prior1, bias_model1, collect_dataset_1, ...) and the indices
# i, j, k, l are not defined in the visible part of this notebook, and each
# trailing `...` is a literal Ellipsis standing in for "more entries".
reward_model_priors = [reward_prior1(), reward_prior2(), ...]
reward_models = [
    reward_model_model1(reward_model_priors[i]),
    reward_model_model2(reward_model_priors[j]),
    ...,
]
bias_model_priors = [bias_prior1(), bias_prior2(), ...]
# Spelled `bias_models` (the original identifier was misspelled `bias_mdoels`).
bias_models = [
    bias_model1(bias_model_priors[k]),
    bias_model2(bias_model_priors[l]),
    ...,
]
behaviour_datasets = [collect_dataset_1(), collect_dataset_2(), ...]

# Now do GD on log likelihood of behaviour datasets!
# Just need some way of associating a bias model with each of them.
# Maybe just pass in a dict? IDK.
# Other things I might need:
# - Computing the actual value of each term in the log posterior. This will be
#   useful for line search, and is probably not too much effort. Not sure
#   whether line search really matters for this task, though, since values are
#   almost as expensive to compute as gradients in IRL (both need planning).
# - Evaluating the joint Hessian for all parameters. Unfortunately doing so is
#   going to be rather complicated, so it might make sense to skip it for this
#   project.

In [None]:
# Build a random gridworld and wrap it in the environment interface.
# NOTE(review): the meaning of the positional arguments (5, 5, 0.2, 0.1) is
# not visible here -- presumably grid dimensions followed by two
# probabilities; confirm against GridworldMdp.generate_random.
random_gridworld = gridworld.GridworldMdp.generate_random(5, 5, 0.2, 0.1)
env = mdp_interface.GridworldEnvWrapper(
    random_gridworld,
    # height + width; presumably used as the episode horizon -- TODO confirm.
    random_gridworld.height + random_gridworld.width,
)

In [None]:
# Optimal demonstrations are summarised by a single optimal occupancy measure
# vector; the first return value of mce_occupancy_measures is not needed.
_, optimal_om = mce_irl.mce_occupancy_measures(env)
# Linear reward model sized by the environment's observation dimension.
rmodel = mce_irl.LinearRewardModel(env.obs_dim)
# Momentum optimiser that will drive the MCE IRL updates.
opt_tup = jopt.momentum(step_size=1e-2, mass=0.9)

In [None]:
# Run MCE IRL against the optimal occupancy measure. The call returns the
# learned reward parameters and a second array (printed later as the
# "recovered" state visitation frequencies).
rew_params, visitations = mce_irl.mce_irl(
    env,
    opt_tup,
    rmodel,
    optimal_om,
    print_interval=100,
    max_iter=1000,
)

In [None]:
# Show both occupancy vectors laid out as (height, width) grids so the
# demonstrator's and the recovered visitation patterns can be compared by eye.
grid_shape = (random_gridworld.height, random_gridworld.width)
print('Optimal state visitation frequencies for each grid cell:')
print(optimal_om.reshape(grid_shape))
print('\nRecovered state visitation frequencies for each grid cell:')
print(visitations.reshape(grid_shape))