# Bias model bootstrapping notebook

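Eventually this notebook should contain all the code necessary to run our final experiments.
For now it contains a demo of MCE IRL on some gridworlds from the "learning biases" paper, followed by experiments that fit the individual feedback models (top-K, paired comparisons, blind IRL, scalar feedback) separately and jointly.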

First we have some not-very-interesting setup:

In [None]:
%matplotlib inline

# These are useful for debugging, but make code slower:
%load_ext autoreload
%autoreload 2

import logging

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import jax
import jax.experimental.optimizers as jopt
import jax.random as jrandom
import numpy as np

from pref_bootstrap.envs import gridworld, mdp_interface
from pref_bootstrap.algos import mce_irl
import pref_bootstrap.feedback_learner_blind_irl as fbl_blind_irl
import pref_bootstrap.feedback_learner_paired_comparisons as fbl_paired_comp
import pref_bootstrap.reward_models as r_models
import pref_bootstrap.expert_base as experts

# Notebook-wide display configuration: seaborn plot theme, log verbosity and numpy print format.
sns.set(context='notebook', style='darkgrid')
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Print arrays with 4 decimals and allow lines of up to 100 characters.
np.set_printoptions(precision=4, linewidth=100)

## Very simple example of using environments and MCE IRL code

This code doesn't use the new agent API

In [None]:
# Draw a random 4x4 gridworld.
# NOTE(review): the meaning of 0.2 and 0.1 is defined by GridworldMdp.generate_random, and no
# seed is passed here -- confirm whether the layout is reproducible after a kernel restart.
random_gridworld = gridworld.GridworldMdp.generate_random(4, 4, 0.2, 0.1)
# Wrap the MDP for the algorithms below; the second argument is height + width
# (presumably the episode horizon -- TODO confirm against GridworldEnvWrapper).
env = mdp_interface.GridworldEnvWrapper(random_gridworld, random_gridworld.height + random_gridworld.width)

In [None]:
# generate some optimal demos (represented by a single optimal occupancy measure vector) and run MCE IRL
# Only the second value returned by mce_occupancy_measures is kept.
_, optimal_om = mce_irl.mce_occupancy_measures(env)
# Reward model to fit, sized by the environment's observation dimension.
rmodel = r_models.LinearRewardModel(env.obs_dim)
# Momentum optimiser with step size 1e-2 and mass 0.9.
opt_tup = jopt.momentum(1e-2, 0.9)

In [None]:
# Run MCE IRL against the optimal occupancy measure (max_iter=1000, print_interval=100) and keep
# the recovered reward parameters together with the visitation frequencies they induce.
rew_params, visitations = mce_irl.mce_irl(env, opt_tup, rmodel, optimal_om, print_interval=100, max_iter=1000)

In [None]:
# Show the optimal and the recovered visitation frequencies laid out on the (height, width) grid
# so that the two can be compared cell by cell.
print('Optimal state visitation frequencies for each grid cell:')
print(optimal_om.reshape((random_gridworld.height, random_gridworld.width)))
print('\nRecovered state visitation frequencies for each grid cell:')
print(visitations.reshape((random_gridworld.height, random_gridworld.width)))

## More complex example showing how to use an EnvFeedbackModel to recover both a reward function + sub-rationality model

This code uses the new API. The cells below fit the top-K, paired-comparison, 'blind IRL' and scalar feedback models in turn (the blind IRL expert doesn't support observation blinding yet).

In [None]:
import pref_bootstrap.feedback_learner_topk as fbl_topk

# Fresh reward model plus the top-K feedback model and the simulated top-K expert.
# NOTE(review): the exact meaning of temp=.3 and K=.05 is defined by TopKExpert; below, K is
# used as the probability of keeping a negatively-labelled trajectory.
rmodel = r_models.LinearRewardModel(env.obs_dim)
top_K_feedback_model = fbl_topk.TopKFeedbackModel(env)
top_K_expert = experts.TopKExpert(env, temp=.3, K=.05, seed=42)

In [None]:
# Scratch cell: label 1000 sampled trajectories with the top-K expert and print the fraction of
# positive labels before and after filtering.
import random

n_traj = 1000
# The expert labels trajectories using the true reward.
rmodel = r_models.LinearRewardModel(env.obs_dim)
rmodel.set_params(env.reward_matrix)
traj = mce_irl.mce_irl_sample(env, n_traj, R=np.ones((env.n_states,)))
labels = top_K_expert.interact(traj, rmodel)

# Sub-sampling of negatives is switched off in this cell, so every (label, trajectory)
# pair is kept and both printed fractions are identical.
labels_final, traj_final = [], []
for l, t in zip(labels, traj['states']):
    labels_final.append(l)
    traj_final.append(t)

labels_final = np.array(labels_final)

print(np.sum(labels)/len(labels))
print(np.sum(labels_final)/len(labels_final))


In [None]:
def generate_topk_dataset(n_traj, seed=None):
    """Sample trajectories, label them with the top-K expert and sub-sample the negatives.

    Trajectories are sampled with ``mce_irl.mce_irl_sample`` under an all-ones reward and
    labelled by the module-level ``top_K_expert`` using the true reward
    (``env.reward_matrix``). Every positively-labelled trajectory is kept; a
    negatively-labelled one is kept with probability ``top_K_expert.K``.

    Args:
        n_traj: number of trajectories to sample before filtering.
        seed: optional seed for the sub-sampling of negatives. With the default ``None`` the
            global ``random`` module state is used, exactly as before, so the result is not
            reproducible unless that state is seeded elsewhere.

    Returns:
        dict with ``'trajectories'`` (array of the kept state sequences) and ``'labels'``
        (integer array of the matching labels).
    """
    import random

    rmodel = r_models.LinearRewardModel(env.obs_dim)
    rmodel.set_params(env.reward_matrix)
    traj = mce_irl.mce_irl_sample(env, n_traj, R=np.ones((env.n_states,)))
    labels = top_K_expert.interact(traj, rmodel)

    # A private generator keeps seeded calls from touching the global random state.
    draw = random.random if seed is None else random.Random(seed).random
    keep_prob = top_K_expert.K

    labels_final, traj_final = [], []
    for label, states in zip(labels, traj['states']):
        # TODO if there are issues update this so that we balance our dataset.
        # A random number is drawn only for negatives (short-circuit `or`).
        if label or draw() < keep_prob:
            labels_final.append(int(label))
            traj_final.append(states)

    return {
        'trajectories': np.array(traj_final),
        'labels': np.array(labels_final),
    }

top_K_dataset = generate_topk_dataset(1000)


In [None]:
# NOTE(review): these five module-level values are not read by any visible cell --
# trad_optimize and multi_optimize use their own parameters / locals of the same names.
lr = 1e-3
steps = 1000
loss_prev = float('Inf')
delta = 100
eps = 1e-5

# Re-create the top-K feedback model, the expert (same arguments as before) and a fresh reward model.
import pref_bootstrap.feedback_learner_topk as fbl_topk
top_K_feedback_model = fbl_topk.TopKFeedbackModel(env)
top_K_expert = experts.TopKExpert(env, temp=.3, K=.05, seed=42)
rmodel = r_models.LinearRewardModel(env.obs_dim)
import jax
import jax.numpy as jnp

# JAX PRNG key shared by the following cells; init_bias_params returns an updated key
# together with the initial bias parameters.
rng = jrandom.PRNGKey(23)
rng, top_K_bias_params = top_K_feedback_model.init_bias_params(rng)

# Display the shape of the initial top-K bias parameters.
top_K_bias_params.shape

In [None]:
def trad_optimize(model, data, rmodel, bias_params, use_bias_prior=False, 
                  eps=1e-9, optimize_bias=True,one_bias=False, iters=1000, lr=1e-2, printz=100):
    """Fit a reward model (and optionally bias parameters) by plain gradient ascent.

    Each step first moves the reward parameters along ``model.log_likelihood_grad_rew`` and
    then, using the updated reward model, moves the bias parameters along
    ``model.log_likelihood_grad_bias``.

    Args:
        model: feedback model providing ``log_likelihood``, ``log_likelihood_grad_rew``,
            ``log_likelihood_grad_bias`` and, when ``use_bias_prior`` is set, ``bias_prior``.
        data: dataset passed through unchanged to the model's methods.
        rmodel: reward model with ``get_params`` / ``set_params``; updated in place.
        bias_params: initial bias parameters (never modified in place).
        use_bias_prior: add the gradient of the log prior to the bias gradient and project
            the bias parameters back onto the prior's support after every step.
        eps: accepted for backward compatibility; currently unused (no early stopping).
        optimize_bias: whether the bias parameters are updated at all.
        one_bias: if the bias is not optimised, reset it to all ones on every step.
        iters: number of gradient steps (integer).
        lr: step size used for both the reward and the bias update.
        printz: print the log likelihood every ``printz`` steps; ``0`` or ``None`` disables
            the progress output.

    Returns:
        ``(model, rmodel, bias_params)`` with the fitted reward model and bias parameters.

    Note:
        The loop always runs for ``iters`` steps. The closing message only reports whether
        the change in log likelihood over the last step was below 1e-4; it does not mean
        that the optimisation stopped early.
    """
    # Change in log likelihood over the most recent step; stays at 100 if fewer than two steps run.
    delta = 100
    loss_prev = float('Inf')

    for step in range(iters):
        # Reward step.
        grew = model.log_likelihood_grad_rew(data, rmodel, bias_params)
        rmodel.set_params(rmodel.get_params() + lr*grew)

        # Bias step; the gradient is only evaluated when it is actually applied.
        if optimize_bias:
            gbias = model.log_likelihood_grad_bias(data, rmodel, bias_params)
            if use_bias_prior:
                gbias = gbias + model.bias_prior.log_prior_grad(bias_params)
            bias_params = bias_params + lr*gbias
        elif one_bias:
            bias_params = jnp.ones_like(bias_params) # TODO: 

        if use_bias_prior:
            bias_params = model.bias_prior.project_to_support(bias_params)

        # "loss" is the log likelihood being maximised.
        loss = model.log_likelihood(data, rmodel, bias_params)
        if printz and step % printz == 0:
            print('step %d loss %.3f' %(step, loss))
        if step > 0:
            delta = loss-loss_prev
        loss_prev = loss

    if np.abs(delta) <= .0001:
        print('terminated due to delta')
    else:
        print('terminated due to steps exceeding %d' %iters)

    return model, rmodel, bias_params
# Fit a fresh reward model and the top-K bias parameters (with the bias prior) on the top-K
# dataset; only the fitted reward model is kept.
rmodel = r_models.LinearRewardModel(env.obs_dim)

_, rmodel, _ = trad_optimize(top_K_feedback_model, top_K_dataset, rmodel, top_K_bias_params, optimize_bias=True, 
                            use_bias_prior=True, iters=1000)

In [None]:
def evaluate(rmodel, name):
    """Print and plot how well a fitted reward model matches the true one.

    Prints the optimal occupancy measure (module-level ``optimal_om``) and the one induced by
    ``rmodel`` on the gridworld layout, then plots estimated vs. true reward for every state
    that either occupancy measure visits with frequency above 1e-5. The plot is saved to
    ``./images/reward_<name>.png``.

    Args:
        rmodel: fitted reward model exposing ``get_params``.
        name: label used in the plot title and in the output file name.
    """
    import os

    _, topk_om = mce_irl.mce_occupancy_measures(env, R=rmodel.get_params())
    grid_shape = (random_gridworld.height, random_gridworld.width)

    print('Optimal state visitation frequencies for each grid cell:')
    print(optimal_om.reshape(grid_shape))

    print('Inferred ')
    print(topk_om.reshape(grid_shape))

    visited_states = np.nonzero((optimal_om > 1e-5) | (topk_om > 1e-5))[0]

    # Start a new figure on every call; otherwise repeated calls within one cell draw on
    # the same axes and each saved image accumulates the curves of all previous calls.
    plt.figure()
    plt.plot(visited_states, rmodel.get_params()[visited_states], label='est')
    plt.plot(visited_states, env.reward_matrix[visited_states], label='real_reward')
    plt.legend()
    plt.title('Estimated Reward Function %s' %name)
    # savefig does not create missing directories.
    os.makedirs('./images', exist_ok=True)
    plt.savefig('./images/reward_%s.png' %name)

evaluate(rmodel, name='Top-K feedback model')

In [None]:
# Paired-comparison feedback model, its initial bias parameters (drawn with the shared JAX key,
# which is advanced) and the simulated comparison expert.
pc_feedback_model = fbl_paired_comp.PairedCompFeedbackModel(env)
rng, pc_bias_params = pc_feedback_model.init_bias_params(rng)
pc_expert = experts.PairedComparisonExpert(env, boltz_temp=1.0, seed=42)

def generate_comparison_dataset(pc_ntraj):
    """Sample trajectories and compare each one against one other random trajectory.

    Trajectories are sampled under an all-ones reward. For every trajectory a second, distinct
    trajectory is drawn uniformly at random with ``np.random`` and the module-level
    ``pc_expert`` decides which of the two is better.

    Args:
        pc_ntraj: number of trajectories to sample; one comparison is produced per trajectory.

    Returns:
        dict with ``'trajectories'`` (the sampled trajectory dict) and ``'comparisons'``, an
        array of index pairs ``(better, worse)``.

    Raises:
        ValueError: if ``pc_ntraj`` is 1, because no distinct partner trajectory exists.
    """
    if pc_ntraj == 1:
        raise ValueError('need at least two trajectories to form a comparison, got pc_ntraj=1')

    pc_trajectories = mce_irl.mce_irl_sample(env, pc_ntraj, R=np.ones((env.n_states, )))
    states = pc_trajectories['states']

    comparisons = []
    for first_idx in range(pc_ntraj):
        # Draw from the pc_ntraj - 1 other indices: sample one slot fewer and skip over
        # first_idx so that a trajectory is never compared with itself.
        second_idx = np.random.randint(pc_ntraj - 1)
        if second_idx >= first_idx:
            second_idx += 1
        traj1_is_better = pc_expert.interact(
            dict(states=states[first_idx]),
            dict(states=states[second_idx]))
        if traj1_is_better:
            # the better trajectory comes before the worse one
            comparisons.append((first_idx, second_idx))
        else:
            comparisons.append((second_idx, first_idx))

    return {
        'trajectories': pc_trajectories,
        'comparisons': np.asarray(comparisons),
    }

comparison_dataset = generate_comparison_dataset(100)

In [None]:
# Fit a fresh reward model and the paired-comparison bias parameters (with the bias prior) on
# the comparison dataset, then print/plot the result.
rmodel = r_models.LinearRewardModel(env.obs_dim)
_, rmodel, _ = trad_optimize(pc_feedback_model, comparison_dataset, rmodel, pc_bias_params, use_bias_prior=True, 
                            optimize_bias=True, one_bias=False)
evaluate(rmodel, name='Paired Comparisons')

In [None]:
# Blind-IRL feedback model with a fresh reward model.
irl_feedback_model = fbl_blind_irl.BlindIRLFeedbackModel(env)
rmodel = r_models.LinearRewardModel(env.obs_dim)
# The shared JAX key is re-created here with seed 42.
rng = jrandom.PRNGKey(42)
rng, irl_bias_params = irl_feedback_model.init_bias_params(rng)
# The demonstrator's seed is drawn from numpy's global RNG, so it differs between runs unless
# that RNG is seeded elsewhere.
irl_expert = experts.MEDemonstratorExpert(env, np.random.randint((1 << 31) - 1))
# we'll do IRL based on 10 trajectories
irl_dataset = irl_expert.interact(10)
_, rmodel, _ = trad_optimize(irl_feedback_model, irl_dataset, rmodel, irl_bias_params, use_bias_prior=True, optimize_bias=True, 
                            one_bias=False)
evaluate(rmodel, name='Blind IRL')

In [None]:
from pref_bootstrap import feedback_learner_scalar as fbl_scalar
# Scalar feedback model, its initial bias parameters and the simulated scalar-feedback expert.
s_feedback_model = fbl_scalar.ScalarFeedbackModel(env)

rng, s_feedback_params = s_feedback_model.init_bias_params(rng)
s_expert = experts.ScalarFeedbackExpert(env, seed=3)

# sample some random trajectories and let the scalar-feedback expert respond to them
def generate_corrupted_ds(ntraj=20):
    """Sample ``ntraj`` trajectories under an all-ones reward and return the scalar expert's response to them."""
    trajectories = mce_irl.mce_irl_sample(env, ntraj, R=np.ones((env.n_states, )))
    return s_expert.interact(trajectories)
s_dataset = generate_corrupted_ds(20)
# Fit a fresh reward model on the scalar feedback; here the bias prior is NOT used and only
# 200 steps are run.
rmodel = r_models.LinearRewardModel(env.obs_dim)
_, rmodel, _ = trad_optimize(s_feedback_model, s_dataset, rmodel, s_feedback_params, use_bias_prior=False, 
                            optimize_bias=True, one_bias=False, iters=200)
evaluate(rmodel, name='Scalar feedback')

In [None]:
def multi_optimize(model_list, data_list, rmodel, bias_list, use_bias_list, optimize_reward=True, 
                   optimize_bias=True, one_bias=False, lr=1e-2, iters=1000, printz=100): 
    """Fit one shared reward model to several feedback models by joint gradient ascent.

    Each step sums the reward gradients of all models, updates the shared reward model, and
    then updates every model's own bias parameters using the updated reward model.

    Args:
        model_list: feedback models (same interface as expected by ``trad_optimize``).
        data_list: one dataset per model.
        rmodel: shared reward model with ``get_params`` / ``set_params``; updated in place.
        bias_list: initial bias parameters, one entry per model. The list is copied, so the
            caller's list is left untouched.
        use_bias_list: per-model flag: add the prior gradient to the bias gradient and project
            the bias onto the prior's support after each step.
        optimize_reward: if False the reward gradient is left at zero.
        optimize_bias: whether bias parameters are updated.
        one_bias: if the bias is not optimised, reset it to all ones on every step.
        lr: step size for both reward and bias updates.
        iters: number of gradient steps (integer).
        printz: print every model's log likelihood and bias parameters every ``printz`` steps;
            ``0`` or ``None`` disables the output.

    Returns:
        ``(model_list, rmodel, bias_list)`` where ``bias_list`` is a new list holding the
        final bias parameters.

    Note:
        The per-model lists are combined with ``zip``: if ``use_bias_list`` is shorter than
        ``model_list``, the bias parameters of the trailing models are silently left
        unchanged while those models still contribute to the reward gradient.
    """
    # Work on a copy so that the caller's list is not modified as a side effect.
    bias_list = list(bias_list)

    for step in range(iters):
        # Shared reward step: accumulate the gradient over all feedback models.
        grew = jnp.zeros_like(rmodel.get_params())
        if optimize_reward:
            for model, data, bias_params in zip(model_list, data_list, bias_list):
                grew += model.log_likelihood_grad_rew(data, rmodel, bias_params)
        rmodel.set_params(rmodel.get_params() + lr*grew)

        # Per-model bias step, evaluated with the reward model updated above.
        for k, (model, data, bias_params, use_bias_prior) in enumerate(
                zip(model_list, data_list, bias_list, use_bias_list)):
            if optimize_bias:
                gbias = model.log_likelihood_grad_bias(data, rmodel, bias_params)
                if use_bias_prior:
                    gbias = gbias + model.bias_prior.log_prior_grad(bias_params)
                bias_params = bias_params + lr*gbias
            elif one_bias:
                bias_params = jnp.ones_like(bias_params) # TODO: 

            if use_bias_prior:
                bias_params = model.bias_prior.project_to_support(bias_params)
            bias_list[k] = bias_params

        # The log likelihood is only needed for the progress report, so it is evaluated
        # on reporting steps only.
        if printz and step % printz == 0:
            for k, (model, data, bias_params) in enumerate(zip(model_list, data_list, bias_list)):
                loss = model.log_likelihood(data, rmodel, bias_params)
                print('step %d loss %.3f model %d' %(step, loss, k))
                print('---', bias_params)

    return model_list, rmodel, bias_list

In [None]:
# Reinitializing all the models

def init_models(opt_reward=False):
    """Create fresh feedback models, simulated experts, datasets and bias parameters.

    Builds, in this order, the blind-IRL, paired-comparison, top-K and scalar feedback
    models together with their datasets. numpy's and ``random``'s global generators are
    used for sampling, so the result depends on their state; statement order matters.

    Args:
        opt_reward: if True, the returned reward model is initialised with the true reward
            (``env.reward_matrix``); otherwise it is a freshly constructed model.

    Returns:
        Tuple ``(model_list, data_list, rmodel, bias_list, use_bias_list, names,
        biases_actual)``. The per-model lists are ordered top-K, paired comparisons,
        blind IRL, scalar feedback.
    """
    irl_feedback_model = fbl_blind_irl.BlindIRLFeedbackModel(env)
    rmodel = r_models.LinearRewardModel(env.obs_dim)
    rng = jrandom.PRNGKey(42)
    rng, irl_bias_params = irl_feedback_model.init_bias_params(rng)
    # The demonstrator's seed comes from numpy's global RNG.
    irl_expert = experts.MEDemonstratorExpert(env, np.random.randint((1 << 31) - 1))
    # we'll do IRL based on 20 trajectories
    irl_dataset = irl_expert.interact(20)

    pc_feedback_model = fbl_paired_comp.PairedCompFeedbackModel(env)
    rng, pc_bias_params = pc_feedback_model.init_bias_params(rng)
    pc_expert = experts.PairedComparisonExpert(env, boltz_temp=1.0, seed=42)

    # generate some random trajectories & compare each one against one other random trajectory
    def generate_comparison_dataset(pc_ntraj):
        """Local copy of the module-level helper, bound to the ``pc_expert`` created above."""
        pc_trajectories = mce_irl.mce_irl_sample(env, pc_ntraj, R=np.ones((env.n_states, )))
        # NOTE(review): to_compare_first is never read.
        to_compare_first = np.arange(len(pc_trajectories['states']))
        comparisons = []
        for first_idx in range(pc_ntraj):
            # sample among the other pc_ntraj - 1 indices, skipping first_idx
            second_idx = np.random.randint(pc_ntraj - 1)
            if second_idx >= first_idx:
                second_idx += 1
            traj1_is_better = pc_expert.interact(
                dict(states=pc_trajectories['states'][first_idx]),
                dict(states=pc_trajectories['states'][second_idx]))
            if traj1_is_better:
                # the better trajectory comes before the worse one
                comparisons.append((first_idx, second_idx))
            else:
                comparisons.append((second_idx, first_idx))
        return {
            'trajectories': pc_trajectories,
            'comparisons': np.asarray(comparisons),
        }

    comparison_dataset = generate_comparison_dataset(20)

    # Note K=.01 here, whereas the module-level top-K expert uses K=.05.
    top_K_expert = experts.TopKExpert(env, temp=.3, K=.01, seed=42)
    def generate_topk_dataset(n_traj):
        """Local copy of the module-level helper, bound to the ``top_K_expert`` created above."""
        import random
        # The expert labels trajectories using the true reward.
        rmodel = r_models.LinearRewardModel(env.obs_dim)
        rmodel.set_params(env.reward_matrix)
        traj = mce_irl.mce_irl_sample(env, n_traj, R=np.ones((env.n_states,)))
        labels= top_K_expert.interact(traj, rmodel)

        labels_final, traj_final = [], []
        for l, t in zip(labels, traj['states']): 
            if not(l):
                # negatives are kept with probability K
                if random.random() < top_K_expert.K:#TODO if there are issues update this so that we balance our dataset.
                    labels_final.append(l), traj_final.append(t)
            else: 
                labels_final.append(l), traj_final.append(t)

        labels_final = np.array([int(l) for l in labels_final])

        return {
            'trajectories':np.array(traj_final), 
            'labels':labels_final
        }

    top_K_dataset = generate_topk_dataset(1000)

    import pref_bootstrap.feedback_learner_topk as fbl_topk
    top_K_feedback_model = fbl_topk.TopKFeedbackModel(env)

    rmodel = r_models.LinearRewardModel(env.obs_dim)
    import jax
    import jax.numpy as jnp

    # The JAX key is re-created with seed 23 before the top-K and scalar bias parameters are drawn.
    rng = jrandom.PRNGKey(23)
    rng, top_K_bias_params = top_K_feedback_model.init_bias_params(rng)
    
    
    
    from pref_bootstrap import feedback_learner_scalar as fbl_scalar
    s_feedback_model = fbl_scalar.ScalarFeedbackModel(env)

    rng, s_feedback_params = s_feedback_model.init_bias_params(rng)
    s_expert = experts.ScalarFeedbackExpert(env, seed=3)

    # sample some random trajectories and let the scalar-feedback expert respond to them
    def generate_corrupted_ds(ntraj=20):
        """Local copy of the module-level helper, bound to the ``s_expert`` created above."""
        trajectories = mce_irl.mce_irl_sample(env, ntraj, R=np.ones((env.n_states, )))
        return s_expert.interact(trajectories)
    s_dataset = generate_corrupted_ds(20)
    rmodel = r_models.LinearRewardModel(env.obs_dim)
#     _, rmodel, _ = trad_optimize(s_feedback_model, s_dataset, rmodel, s_feedback_params, use_bias_prior=False, 
#                                 optimize_bias=True, one_bias=False, iters=200)
#     evaluate(rmodel, name='Scalar feedback')

    # Per-model lists; all four share the order given by `names` below.
    model_list = [top_K_feedback_model, pc_feedback_model, irl_feedback_model, s_feedback_model]
    data_list = [top_K_dataset, comparison_dataset, irl_dataset, s_dataset]
    bias_list = [top_K_bias_params, pc_bias_params, irl_bias_params, s_feedback_params]
    use_bias_list = [True, True, True, True]
    # Only this last reward model is returned; the ones created earlier in the function are discarded.
    rmodel = r_models.LinearRewardModel(env.obs_dim)
    if opt_reward:
        rmodel.set_params(env.reward_matrix)
        
    names = ['top_K', 'paired_comparisons', 'blind_irl', 'scalar_feedback']
    
    # Reference ("actual") bias values, in the same order as `names`.
    biases_actual = []
    
    # TOP K biases 
    biases_actual.append((top_K_expert.temp, top_K_expert.cutoff))
    
    # pc actual bias
    biases_actual.append((pc_expert.boltz_temp))
    
    #blind irl actual bias
    # NOTE(review): this appends the *initial* blind-IRL bias parameters, not a value read
    # from the expert -- confirm that this is the intended reference.
    biases_actual.append((irl_bias_params))
    
    # scalar feedback: hard-coded reference pair (0, 1)
    biases_actual.append((0, 1))
        
    return model_list, data_list, rmodel, bias_list, use_bias_list, names, biases_actual

In [None]:
# Re-create all models and datasets, with the reward model initialised to the true reward
# (opt_reward=True). The reference biases (last return value) are discarded.
model_list, data_list, rmodel, bias_list, use_bias_list, names, _ = init_models(opt_reward=True)
# models, rmodel, biases = multi_optimize(model_list, data_list, rmodel, bias_list, use_bias_list, optimize_reward=False)

In [None]:
# NOTE(review): both plot calls are commented out, so this cell only draws an empty, titled
# axes and plt.legend() has no labelled artists to show.
# plt.plot(biases[-1], label="recovered")
# plt.plot(irl_bias_params, label='Actual')
plt.legend()
plt.title('Blind IRL')

In [None]:
# Plot the distribution of rewards and labels: 
def topK_dist(top_K_dataset): 
    """Return the summed reward of every trajectory in a top-K dataset.

    The per-state rewards are taken from the module-level ``rmodel`` (``get_params()``) and
    looked up for each state index stored in ``top_K_dataset['trajectories']``.

    Args:
        top_K_dataset: dict whose ``'trajectories'`` entry is an integer array of state
            indices with trajectories along axis 0 and time steps along axis 1.

    Returns:
        Array with one summed reward per trajectory.
    """
    trajectory_states = top_K_dataset['trajectories']
    reward_per_state = rmodel.get_params()
    # Gather the reward of every visited state, then restore the
    # (trajectory, step, ...) layout before summing over the steps.
    gathered = reward_per_state[trajectory_states.flatten()]
    layout = trajectory_states.shape[:2] + gathered.shape[1:]
    return jnp.sum(jnp.reshape(gathered, layout), axis=1)

In [None]:
# Plot 1. Method comparison. 
# Collect the module-level models, datasets and bias parameters created in the cells above
# (top-K, paired comparisons, blind IRL) and reset the reward model to the true reward.
model_list = [top_K_feedback_model, pc_feedback_model, irl_feedback_model]
data_list = [top_K_dataset, comparison_dataset, irl_dataset]
bias_list = [top_K_bias_params, pc_bias_params, irl_bias_params]
# The bias prior is switched off for the top-K model only.
use_bias_list = [False, True, True]
rmodel = r_models.LinearRewardModel(env.obs_dim)
rmodel.set_params(env.reward_matrix)

In [None]:
import pandas as pd

def get_rew(trajs, rmodel): 
    """Return the mean total reward of a batch of trajectories.

    Args:
        trajs: dict whose ``'states'`` entry is an integer array of state indices with
            trajectories along axis 0 and time steps along axis 1.
        rmodel: despite the name, the per-state reward *vector* (it is indexed directly
            with the state indices), not a reward model object.

    Returns:
        The per-trajectory summed reward, averaged over all trajectories.
    """
    visited = trajs['states']
    # Reward of every visited state, gathered over the flattened index array ...
    gathered = rmodel[visited.flatten()]
    # ... and put back into (trajectory, step, ...) layout so steps can be summed.
    per_step = jnp.reshape(gathered, visited.shape[:2] + gathered.shape[1:])
    return np.mean(jnp.sum(per_step, axis=1))

def evaluate_full(rmodel): 
    """Evaluate a reward model by the behaviour it induces.

    Computes the occupancy measure for the model's reward parameters, samples 100
    trajectories under those parameters and scores them with ``get_rew`` using the same
    parameters.

    Args:
        rmodel: reward model exposing ``get_params``.

    Returns:
        Tuple ``(reward parameters, mean trajectory reward, occupancy measure)``.
    """
    reward_params = rmodel.get_params()
    # Occupancy measure first, sampling second.
    _unused, occupancy = mce_irl.mce_occupancy_measures(env, R=reward_params)
    rollouts = mce_irl.mce_irl_sample(env, 100, R=reward_params)
    mean_reward = get_rew(rollouts, reward_params)
    return reward_params, mean_reward, occupancy



In [None]:
# Single-model experiment: over 3 folds, fit every feedback model on its own and record the
# recovered biases, recovered reward, mean obtained reward and occupancy measure per run.
biases_recovered, biases_actual, recovered_reward, mean_reward_obtained, obs_obtained, method, fold = [],[],[],[],[],[], []

for fold_idx in range(3):
    # Fresh models, experts and datasets for every fold. The reference biases get their own
    # name so that the `biases_actual` accumulator is neither overwritten nor appended to
    # while it is being iterated.
    model_list, data_list, rmodel, bias_list, use_bias_list, names, fold_biases_actual = init_models()
    use_bias_list = [True, True, True, True]
    obs = [True, True, True, True]
    # The scalar-feedback model (last entry) is trained for 200 steps only.
    iterz = [1000, 1000, 1000, 200]
    for model, dataset, bias_params, ub, name, b_actual, ob, it in zip(
            model_list, data_list, bias_list, use_bias_list, names, fold_biases_actual, obs, iterz):
        # Every method starts from a fresh reward model.
        rmodel = r_models.LinearRewardModel(env.obs_dim)
        _, rmodel, bias_p_recovered = trad_optimize(model, dataset, rmodel, bias_params, use_bias_prior=ub, 
                                                   optimize_bias=ob, one_bias=False, printz=10, lr=1e-3,
                                                   iters=it)

        recovered_reward_vec, mean_reward, obs_f = evaluate_full(rmodel)

        biases_recovered.append(bias_p_recovered)
        biases_actual.append(b_actual)
        recovered_reward.append(recovered_reward_vec)
        mean_reward_obtained.append(mean_reward)
        obs_obtained.append(obs_f)
        method.append(name)
        fold.append(fold_idx)
        

In [None]:
# Baseline: evaluate the true reward. NOTE: this overwrites the parameters of the reward
# model left over from the last fitted run.
rmodel.set_params(env.reward_matrix)
r, mean_rew_opt, obs_f = evaluate_full(rmodel)
# Repeat the baseline values once per recorded run so that they fit the table below.
obs_optimal = [obs_f]*len(method)
optimal_rew = [mean_rew_opt]*len(method)
real_rew = [r]*len(method)

# One row per (fold, method) run. `fold`, `biases_actual` and `real_rew` are not stored.
# The key 'biases_recoverd' (sic) is read under this spelling by later cells.
res = pd.DataFrame({
    'biases_recoverd':biases_recovered,
    'recovered_reward':recovered_reward,
    'mean_reward_obtained':mean_reward_obtained,
    'obs_obtained':obs_obtained,
    'obs_optimal':obs_optimal, 
    'optimal_rew':optimal_rew, 
    'method':method
})

from datetime import datetime
# Timestamp used in the names of the result file and of the figures saved below.
now = datetime.now()
ts = now.strftime('%Y_%m_%d_%H_%M_%S')

# The ./results directory must already exist.
res.to_pickle('./results/single_model_res_%s.pkl' %ts)

In [None]:
import seaborn as sns
from datetime import datetime



# Read back the results that were just written (same timestamp `ts`) and display them.
res = pd.read_pickle('./results/single_model_res_%s.pkl' %ts)

res

In [None]:
# Mean obtained reward per method, with the reward obtained under the true reward as a
# horizontal reference line.
sns.barplot(x='method', y='mean_reward_obtained', data=res)
plt.axhline(optimal_rew[0], label='optimal rew')
plt.legend()
plt.ylim([0, optimal_rew[0]+1])
plt.savefig('./images/final-results-optimize-bias_%s.png' %ts)
plt.show()


# Print/plot every recovered reward vector against the true reward.
for r, m in zip(res['recovered_reward'].values, res['method'].values): 
    print(m)
    rmodel = r_models.LinearRewardModel(env.obs_dim)
    rmodel.set_params(r)
    evaluate(rmodel, name=m)


# Show the occupancy measure of every run, grouped by method. `unique()` keeps the methods
# in order of first appearance, so the figure order is the same on every run
# (iterating a `set` of strings is not).
for m in res['method'].unique():
    obs = res.loc[res['method'] == m, 'obs_obtained'].values
    res_L = []
    for o in obs: 
        res_L.append(o)
        plt.imshow(o.reshape((random_gridworld.height, random_gridworld.width)))
        plt.title(m)
        plt.show()
    # Element-wise median occupancy over this method's runs (computed but not plotted).
    res_l = np.median(np.array(res_L), axis=0)
    

In [None]:
# Joint experiment: over 3 folds, fit one shared reward model to all feedback models at once.
biases_recovered, biases_actual, recovered_reward, mean_reward_obtained, obs_obtained, method, fold = [],[],[],[],[],[], []

for fold_idx in range(3):
    # Fresh models, experts and datasets for every fold. `use_bias_list` is used as returned
    # by init_models (one flag per model): a shorter list would make multi_optimize's zip()
    # silently skip the bias update of the trailing model(s).
    model_list, data_list, rmodel, bias_list, use_bias_list, names, fold_biases_actual = init_models()
    rmodel = r_models.LinearRewardModel(env.obs_dim)
    _, rmodel, bias_p_recovered = multi_optimize(model_list, data_list, rmodel, bias_list, use_bias_list,
                                               optimize_bias=True, one_bias=False)

    recovered_reward_vec, mean_reward, obs_f = evaluate_full(rmodel)

    # One record per fold; the bias entries are lists with one element per feedback model.
    biases_recovered.append(bias_p_recovered)
    biases_actual.append(fold_biases_actual)
    recovered_reward.append(recovered_reward_vec)
    mean_reward_obtained.append(mean_reward)
    obs_obtained.append(obs_f)
    method.append('combined')
    fold.append(fold_idx)

# Baseline: evaluate the true reward (this overwrites the parameters of the last fitted model).
rmodel.set_params(env.reward_matrix)
r, mean_rew_opt, obs_f = evaluate_full(rmodel)
obs_optimal = [obs_f]*len(method)
optimal_rew = [mean_rew_opt]*len(method)
real_rew = [r]*len(method)

# Sanity check: all result lists should have one entry per fold.
print(len(biases_recovered), len(real_rew), len(recovered_reward), len(mean_reward_obtained), 
     len(obs_obtained), len(optimal_rew), len(biases_actual))

In [None]:
# Results table of the joint experiment: one row per recorded run, every row labelled
# 'combined'. This replaces the single-model `res` table.
res = pd.DataFrame({
    'biases_recoverd':biases_recovered,
    'recovered_reward':recovered_reward,
    'mean_reward_obtained':mean_reward_obtained,
    'obs_obtained':obs_obtained,
    'obs_optimal':obs_optimal, 
    'optimal_rew':optimal_rew, 
    'method':['combined' for m in method]
})
        

In [None]:
import seaborn as sns
from datetime import datetime

# New timestamp for the joint-experiment figure and result file.
now = datetime.now()
ts = now.strftime('%Y_%m_%d_%H_%M_%S')

# Mean obtained reward of the joint fit, with the true-reward baseline as a reference line.
sns.barplot(x='method', y='mean_reward_obtained', data=res)
plt.axhline(optimal_rew[0], label='optimal rew')
plt.legend()
plt.ylim([0, optimal_rew[0]+4])
plt.savefig('./images/final-results-multi_train_%s.png' %ts)
plt.show()


# Only the method names are printed; the per-run evaluation is disabled.
for r, m in zip(res['recovered_reward'].values, res['method'].values): 
    print(m)
    
#     evaluate(r, name=m)




# The ./results directory must already exist.
res.to_pickle('./results/full_multi_train_%s.pkl' %ts)

# Show the occupancy measure of every run, grouped by method.
for m in set(list(res['method'].values)):
    obs = res.loc[res['method']==m]['obs_obtained']
    obs = obs.values
    res_L = []
    for o in obs: 
        res_L.append(o)
        plt.imshow(o.reshape((random_gridworld.height, random_gridworld.width)))
        plt.title(m)
        plt.show()
    # Element-wise median occupancy over the runs (computed but not plotted).
    res_l = np.array(res_L)
    res_l = np.median(res_l, axis=0)

In [None]:
# Display the current results table.
res

In [None]:
# compare shared bias  params vs the other ones

In [None]:
# Results of a saved single-model run. An earlier run
# (single_model_res_2020_12_16_09_49_03.pkl) used to be loaded first and was then
# overwritten straight away; that dead load has been dropped.
# NOTE(review): absolute, user-specific path -- this cell only works on the original
# machine. Only unpickle files from a trusted source.
df_single = pd.read_pickle('/userdata/smetzger/repos/cvx_project/bootstrapping-bias-learning/results/single_model_res_2020_12_16_10_54_52.pkl')

In [None]:
# Results of a saved joint ("shared") training run; the commented lines name earlier runs.
# NOTE(review): absolute, user-specific path -- only works on the original machine.
df_shared = pd.read_pickle('/userdata/smetzger/repos/cvx_project/bootstrapping-bias-learning/results/full_multi_train_2020_12_16_11_14_54.pkl')
# df_shared = pd.read_pickle('/userdata/smetzger/repos/cvx_project/bootstrapping-bias-learning/results/full_multi_train_2020_12_16_10_10_16.pkl')
#pd.read_pickle('/userdata/smetzger/repos/cvx_project/bootstrapping-bias-learning/results/full_multi_train_2020_12_07_13_57_28.pkl')

In [None]:
# Load a saved scalar-feedback result table. NOTE: this rebinds `res`, which previously held
# the results table built in this notebook; the path is absolute and user-specific.
res = pd.read_pickle('/userdata/smetzger/repos/cvx_project/bootstrapping-bias-learning/results/scalar_feedback_res_2020_12_07_23_13_09.pkl')

In [None]:
# Recovered bias parameters of the single-model runs, one entry per table row
# (the column name is spelled 'biases_recoverd' in the saved tables).
endv_single = df_single['biases_recoverd'].values

In [None]:
# Every row of the shared table stores a list of bias parameters; chain those lists into
# one flat list, keeping the row order.
g = df_shared['biases_recoverd'].values
endv_shared = [bias for gg in g for bias in gg]

In [None]:
# Display the flattened bias parameters of the shared runs.
endv_shared

In [None]:
# Display the flattened bias parameters of the shared runs (same output as the previous cell).
endv_shared

In [None]:
# Display the recovered bias parameters of the single-model runs.
endv_single

In [None]:
# Top-K bias error, shared vs. single training. Both bias lists are assumed to cycle through
# four models per fold with the top-K model in slot 0 (the order produced by init_models) --
# TODO confirm for the loaded result files.
shares = np.array(endv_shared[::4])

# Reference value: the expert's bias vector followed by its cutoff.
# NOTE(review): this reads the module-level `top_K_expert` (K=.05), whereas init_models
# creates its own expert with K=.01 -- confirm that bias/cutoff are the same for both.
b = top_K_expert.bias
print(b.shape)
b = list(np.array(b))
actual = np.array(b + [top_K_expert.cutoff])
print(shares.shape, actual.shape)
b_err_shared = np.linalg.norm(shares-np.array(actual), axis=-1)
print(b_err_shared.shape)
singles = endv_single[::4]
singles = np.array([s for s in singles])


b_err_single = np.linalg.norm(singles-np.array(actual), axis=-1)
print(b_err_shared)
# The group labels are sized from the data instead of assuming exactly three folds each.
sns.barplot(x='Optimization Scheme', y='Bias Error', data = pd.DataFrame({'Optimization Scheme':['Single']*len(b_err_single) + ['Shared']*len(b_err_shared),
                                                                         'Bias Error': np.concatenate((b_err_single, b_err_shared), axis=0)
                                                                        }))
plt.title('Top-K bias parameter error')
plt.savefig('./images/shared_bias_params_topK')

In [None]:
# Paired-comparison bias error, shared vs. single training (slot 1 of every group of four
# bias entries, assuming the order produced by init_models -- TODO confirm for the loaded files).
shares = np.array(endv_shared[1::4])

# Reference value: the Boltzmann temperature the paired-comparison expert is created with.
actual = (1.0)
b_err_shared = np.linalg.norm(np.array(shares)-np.array(actual), axis=-1)
singles = endv_single[1::4]
singles = np.array([s for s in singles])

b_err_single = np.linalg.norm(singles-np.array(actual), axis=-1)
print(b_err_single, b_err_shared)
# The group labels are sized from the data instead of assuming exactly three folds each.
sns.barplot(x='Optimization Scheme', y='Bias Error', data = pd.DataFrame({'Optimization Scheme':['Single']*len(b_err_single) + ['Shared']*len(b_err_shared),
                                                                         'Bias Error': np.concatenate((b_err_single,
                                                                                                       b_err_shared), axis=0)
                                                                        }))
plt.title('Paired Comparisons bias parameter error')
# Saved under its own name: this cell previously wrote to the top-K file name and so
# overwrote the top-K figure.
plt.savefig('./images/shared_bias_params_paired_comparisons')

In [None]:
# Scalar-feedback bias error, shared vs. single training (slot 3 of every group of four bias
# entries, assuming the order produced by init_models -- TODO confirm for the loaded files).
shares = np.array(endv_shared[3::4])

# NOTE(review): the reference value 1.0 is hard-coded here, while init_models records
# (0, 1) as the scalar model's reference bias -- confirm which one is intended.
actual = (1.0)
b_err_shared = np.linalg.norm(np.array(shares)-np.array(actual), axis=-1)
singles = endv_single[3::4]
singles = np.array([s for s in singles])

b_err_single = np.linalg.norm(singles-np.array(actual), axis=-1)
print(b_err_single, b_err_shared)
# The group labels are sized from the data instead of assuming exactly three folds each.
sns.barplot(x='Optimization Scheme', y='Bias Error', data = pd.DataFrame({'Optimization Scheme':['Single']*len(b_err_single) + ['Shared']*len(b_err_shared),
                                                                         'Bias Error': np.concatenate((b_err_single,
                                                                                                       b_err_shared), axis=0)
                                                                        }))
# The title used to read 'Paired Comparisons' (copy-paste); slot 3 is the scalar feedback model.
plt.title('Scalar Feedback bias parameter error')
plt.savefig('./images/shared_bias_params_scaled')

In [None]:
# Heat map of the true reward on the grid. The shape comes from the gridworld itself, like in
# the other cells, instead of a hard-coded (4, 4).
sns.heatmap(env.reward_matrix.reshape(random_gridworld.height, random_gridworld.width))
plt.title('Simple Gridworld')
plt.savefig('./images/simple_gridworld')

In [None]:
# Add the scalar-feedback rows to the single-model table. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0; row labels are kept as they are.
# NOTE: re-running this cell adds the same rows again.
df_single = pd.concat([df_single, res])


In [None]:
# Row-wise difference between obtained and optimal occupancy measures. At this point the
# column holds difference vectors; the next cell reduces each of them to its infinity norm.
df_single['inf norm, visitation'] = df_single['obs_obtained'] - df_single['obs_optimal']

In [None]:
# Reduce every difference vector to its infinity norm (largest absolute entry).
# Not idempotent: running this cell twice applies the norm to the already-reduced values.
df_single['inf norm, visitation'] = df_single['inf norm, visitation'].apply(lambda x: np.linalg.norm(np.array(x), ord=np.inf))

In [None]:
# Display the single-model table including the new norm column.
df_single

In [None]:
# Infinity-norm visitation error per method for the single-model runs.
sns.barplot(x='method', y='inf norm, visitation', data = df_single)
plt.title('Visitation Frequency Comparison')
plt.savefig('./images/visitation_freq')

In [None]:
# Same visitation error for the shared runs: difference vector first, then its infinity norm.
df_shared['inf norm, visitation'] = df_shared['obs_obtained'] - df_shared['obs_optimal']
df_shared['inf norm, visitation'] = df_shared['inf norm, visitation'].apply(lambda x: np.linalg.norm(x, ord=np.inf))

In [None]:
# Display the first rows of the shared table.
df_shared.head()

In [None]:


# Single-model and shared results in one table. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0; row labels are kept as they are.
df_combo = pd.concat([df_single, df_shared])

In [None]:
# Infinity-norm visitation error per method, single-model and shared runs together.
sns.barplot(x='method', y='inf norm, visitation', data = df_combo)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('./images/vistation_results.png')

In [None]:
# Mean obtained reward per method, with the baseline value stored in the first row's
# 'optimal_rew' as a horizontal reference line.
sns.barplot(x='method', y='mean_reward_obtained', data=df_combo)
plt.axhline(df_combo['optimal_rew'].values[0], label='MCE IRL mean r')
# Without a legend the reference line's label is never shown.
plt.legend()
plt.xticks(rotation=45)
plt.title('Obtained Reward With Recovered Policy')
plt.tight_layout()
plt.savefig('./images/reward_results.png')

In [None]:
# Display the combined table.
df_combo

In [None]:
# One row of five panels: the occupancy measures recovered by the four feedback models and
# the optimal one. NOTE(review): the row positions (k+6, -2, 3) and the 4x4 shape are
# hard-coded for the loaded result files -- confirm after regenerating the results.
plt.figure(figsize=(15, 3))
for k in range(3):
    plt.subplot(1, 5, k + 1)
    plt.imshow(df_single['obs_obtained'].values[k+6].reshape(4, 4))
    plt.title(['Top K', 'Paired Comparisons', 'Blind IRL'][k])
    # Axis labels only on the left-most panel.
    if k == 0: 
        plt.xlabel('State X')
        plt.ylabel('State Y')
plt.subplot(1, 5, 4)
plt.imshow(df_single['obs_obtained'].values[-2].reshape(4, 4))
plt.title('Scalar Feedback')
# (An `if k == 0` block used to follow here; it could never run because k is 2 after the loop.)
plt.subplot(1, 5, 5)
plt.imshow(df_single['obs_optimal'].values[3].reshape(4, 4))
plt.title('Optimal (MCE IRL)')
plt.savefig('./images/vistation_example')

In [None]:
# Show the stored optimal occupancy measure (taken from row position 3) as a 4x4 image.
plt.imshow(df_single['obs_optimal'].values[3].reshape(4, 4))

In [None]:
# Display the final single-model table.
df_single