In [313]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2
import numpy as np

from src.argument_parser import get_parser, parse_args_str
from src.GridWorldMDP.utils import draw_path, generate_demonstrations, init_grid_world

# Build the project's argument parser once; it is reused to parse the
# CLI-style string below.
PARSER = get_parser()

# Experiment configuration written as command-line flags (--height 6 and
# --width 6 give 36 grid cells). The meaning of each flag is defined in
# src.argument_parser, which is not shown in this notebook.
ARGS = """
--exp_name test
--height 6
--width 6
--gamma 0.8
--act_random 0.3
--n_trajs 10
--l_traj 6
--learning_rate 0.1
--n_iters 100
--alpha 0.1
--n_query 1
--r_max 1
--error 0.01
--grad_clip 0.5
--weight_decay 10
--hiddens 64 32
--device cpu
--verbose 2
"""

# Parse the flag string into the `args` object that the following cells read.
args = parse_args_str(PARSER, ARGS)
# Bare last expression: displayed by the notebook as a quick sanity check.
args.exp_name

'test'

In [314]:
# Start position(s) for the demonstrations. Use None to let
# generate_demonstrations pick random start states instead.
init_start_pos = [(4, 1)]

# (coordinate, value) pairs passed to init_grid_world.
# NOTE(review): these look like the rewarded cells and their reward values
# (rewards_gt shows one 1.0 and two 0.5 entries) -- confirm in init_grid_world.
coor_rates = [
    ((args.height - 2, args.width - 2), 1.0),
    ((0, args.width - 1), 0.5),
    ((1, 1), 0.5),
]
gw, P_a, rewards_gt, values_gt, policy_gt = init_grid_world(args, coor_rates)

# use identity matrix as feature (one one-hot row per grid cell)
feat_map = np.eye(args.height * args.width)

# Keyword arguments shared by every generate_demonstrations call below.
demo_kwargs = dict(n_trajs=args.n_query, len_traj=args.l_traj)

if init_start_pos is None:
    # No start given: random start states.
    trajs = generate_demonstrations(gw, policy_gt,
                                    rand_start=False if init_start_pos is not None else True,
                                    start_pos=None,
                                    **demo_kwargs)
elif isinstance(init_start_pos[0], (list, tuple)):
    # multiple start points: collect the trajectories of every start in one list
    trajs = []
    for sp in init_start_pos:
        trajs.extend(
            generate_demonstrations(gw, policy_gt,
                                    rand_start=False,
                                    start_pos=sp,
                                    **demo_kwargs)
        )
else:
    # type(init_start_pos[0]) == int: init_start_pos itself is a single start
    trajs = generate_demonstrations(gw, policy_gt,
                                    rand_start=False,
                                    start_pos=init_start_pos,
                                    **demo_kwargs)

# Show the first demonstration as a readable path.
print(draw_path(trajs[0], gw))

[INFO] Initialize Grid World
[INFO] Getting ground truth values and policy via value teration
s=(4, 1), a=r, r=0.0, s'=(4, 2) -> 
s=(4, 2), a=r, r=0.0, s'=(4, 3) -> 
s=(4, 3), a=r, r=0.0, s'=(4, 4) -> 
s=(4, 4), a=s, r=1.0, s'=(4, 4) -> 
s=(4, 4), a=s, r=1.0, s'=(4, 4) -> 
s=(4, 4), a=s, r=1.0, s'=(4, 3)


## Torch Gradient

In [197]:
import torch
import torch.autograd as autograd
import torch.nn as nn

# Toy example: a is a 1x2 row vector and b a 2x1 column vector, both tracked
# by autograd, so c = a @ b is a 1x1 tensor.
a = torch.tensor([[1., 2.]]).float().requires_grad_()
b = torch.tensor([[3.], [4.]]).float().requires_grad_()
print(f'a size={a.size()}, b size={b.size()}')
c = torch.matmul(a, b)
print(f'a @ b =\n {c}')
print(f'c size={c.size()}')

# backward() accumulates into a.grad / b.grad and returns None, so this print
# shows "None". The upstream gradient is -1, which negates the usual gradients.
# retain_graph=True keeps the graph so it can be differentiated again below.
print(c.backward(-torch.ones_like(c), retain_graph=True))
print(f'a.grad =\n {a.grad}')
print(f'b.grad =\n {b.grad}')
print()
# Same gradients through the functional API: autograd.grad returns them as a
# tuple instead of writing to .grad.
a_grad = autograd.grad(c, a, grad_outputs=-torch.ones_like(c), retain_graph=True)[0]
print(f'a_grad =\n {a_grad}')
b_grad = autograd.grad(c, b, grad_outputs=-torch.ones_like(c), retain_graph=True)[0]
print(f'b_grad =\n {b_grad}')

a size=torch.Size([1, 2]), b size=torch.Size([2, 1])
a @ b =
 tensor([[11.]], grad_fn=<MmBackward0>)
c size=torch.Size([1, 1])
None
a.grad =
 tensor([[-3., -4.]])
b.grad =
 tensor([[-1.],
        [-2.]])

a_grad =
 tensor([[-3., -4.]])
b_grad =
 tensor([[-1.],
        [-2.]])


## Check function is working


$$ \mathcal{L}(\theta) = \log P(\mathcal{D}, \theta \vert r) = \log P(\mathcal{D} \vert r) + \log P(\theta \vert r) $$

* first term: log-likelihood of the demonstrated trajectories given the rewards
* second term: L2 regularization loss on the network parameters

$$ \dfrac{\partial \mathcal{L}}{\partial \theta} = \dfrac{\partial \mathcal{L}_{\mathcal{D}}}{\partial \theta} + \dfrac{\partial \mathcal{L}_{\theta}}{\partial \theta}$$

The first term can be decomposed with the chain rule:

$$\begin{aligned}
\dfrac{\partial \mathcal{L}_{\mathcal{D}}}{\partial \theta} &= \dfrac{\partial \mathcal{L}_{\mathcal{D}}}{\partial r} \dfrac{\partial r}{\partial \theta} \\
&= (\mu_{\mathcal{D}} - \Bbb{E}[\mu]) \cdot \dfrac{\partial g(f, \theta)}{\partial \theta}
\end{aligned}$$

where $r = g(f, \theta)$ and $f$ denotes the features.

In [289]:
import numpy as np
import torch
import torch.nn as nn

from src.deepmaxent_irl import DeepIRLFC, demo_svf, compute_state_visition_freq
from src.GridWorldMDP.value_iteration import value_iteration

device = torch.device('cpu')
n_states = args.height * args.width
# Fixed seed so the network built below is reproducible; later cells reseed
# with the same value before rebuilding the model.
torch.manual_seed(0)

# Reward network with one input per feature column and hidden sizes [3, 3].
model = DeepIRLFC(input_dim=feat_map.shape[1], hiddens=[3, 3]).to(device)

# NOTE(review): presumably the per-state visitation counts of the
# demonstrations -- confirm against demo_svf.
mu_D = demo_svf(trajs, n_states)
# Feature matrix as a float tensor on the chosen device.
inputs = torch.from_numpy(feat_map).float().to(device)
mu_D

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0.,
       1., 0.])

In [290]:
# Forward pass over all feature rows. `rewards` keeps its autograd graph for
# the gradient cells below; `rewards_numpy` is a flat, detached copy used by
# value iteration.
rewards = model(inputs)
rewards_numpy = rewards.view(-1).detach().numpy()
print(rewards_numpy.round(4))

[-0.0275  0.01    0.028   0.0278 -0.0097 -0.0675 -0.0177 -0.0824 -0.0662
 -0.0164  0.0196 -0.0329 -0.0114 -0.0172 -0.0055 -0.0141 -0.0122 -0.0514
  0.0153 -0.0151 -0.0065 -0.051  -0.0644 -0.0724  0.0047 -0.0111 -0.0811
 -0.0152  0.0291 -0.0023 -0.0282 -0.0455 -0.0222  0.0067 -0.0502 -0.0366]


In [291]:
# during iteration
# approximate value iteration: derive a policy from the current reward estimate
# (the first return value is discarded)
_, policy = value_iteration(P_a, rewards_numpy, gamma=args.gamma, alpha=args.alpha, error=args.error, deterministic=True)
# propagate policy: state visitation frequency under that policy
mu_exp = compute_state_visition_freq(P_a, trajs, policy, deterministic=True)
# NOTE(review): the label below is misspelled ('poilcy'); it is a runtime string
# matching the recorded output, so it is left unchanged here.
print(f'poilcy = {policy}')
print(f'mu_D = {mu_D.round(4)}')

poilcy = [2. 2. 2. 1. 3. 3. 0. 1. 1. 1. 4. 3. 0. 3. 2. 1. 1. 3. 3. 3. 3. 0. 0. 0.
 1. 3. 2. 2. 4. 3. 1. 1. 2. 1. 1. 1.]
mu_D = [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 1. 0.]


In [292]:
# Gradient of the data term with respect to the rewards: demonstration
# visitation minus the visitation expected under the current policy.
grad_r = mu_D - mu_exp
# Column tensor so it can be used as grad_outputs for `rewards` below.
grad_r = torch.from_numpy(grad_r).float().view(-1, 1).to(device)
grad_r.view(-1)

tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -6.8180e-02,
        -4.9768e-03,  0.0000e+00,  0.0000e+00,  0.0000e+00, -6.3830e-02,
        -2.3854e+00, -2.1033e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        -4.0740e-03,  8.0147e-01, -2.4139e-02,  0.0000e+00,  0.0000e+00,
         0.0000e+00, -2.5955e-04,  9.8884e-01, -1.9051e-03,  0.0000e+00,
         0.0000e+00,  0.0000e+00, -2.0356e-04,  1.9873e+00, -1.5267e-03,
         0.0000e+00,  0.0000e+00,  0.0000e+00, -9.8496e-06,  9.9955e-01,
        -4.9248e-05])

In [293]:
# Chain rule: push -grad_r back through the network to get one gradient per
# parameter tensor. retain_graph=True keeps the graph for the cells below.
all_grads = torch.autograd.grad(rewards, model.parameters(), grad_outputs=-grad_r, retain_graph=True)
# Peek at the last five columns of the first parameter's gradient.
all_grads[0][-5:, -5:]

tensor([[ 0.0000e+00,  0.0000e+00, -1.0467e-06,  1.1638e-01, -5.5125e-06],
        [ 0.0000e+00,  0.0000e+00,  1.2956e-06, -1.3760e-01,  7.2481e-06],
        [ 0.0000e+00,  0.0000e+00,  1.6622e-06, -1.5529e-01,  7.6223e-06]])

In [294]:
# L2 penalty: half the squared norm of every parameter tensor, summed.
l2_loss = torch.stack([torch.sum(p.pow(2))/2 for p in model.parameters()]).sum()
# Its gradient with respect to each parameter (equal to the parameter itself,
# as the printed values show).
l2_grad = torch.autograd.grad(l2_loss, model.parameters(), retain_graph=True)
print(l2_loss)
print(l2_grad[0][-5:, -5:])
# Total gradient: weighted L2 term plus the reward term from `all_grads`.
all_grad_l2 = [args.weight_decay*l2_grad[i]+all_grads[i] for i in range(len(all_grads))]
print('grad l2')
print(all_grad_l2[0][-5:, -5:])

tensor(1.3350, grad_fn=<SumBackward0>)
tensor([[ 0.1440, -0.1080, -0.0767, -0.1164, -0.1561],
        [ 0.0386,  0.1034,  0.1600, -0.1284, -0.0611],
        [-0.0419, -0.1588, -0.0030, -0.1255, -0.1286]])
grad l2
tensor([[ 1.4400, -1.0803, -0.7672, -1.0480, -1.5609],
        [ 0.3862,  1.0340,  1.6003, -1.4220, -0.6108],
        [-0.4195, -1.5877, -0.0300, -1.4104, -1.2856]])


In [295]:
import tensorflow as tf

# Reference result: TensorFlow's global-norm clipping applied to the combined
# gradients. The second return value is discarded.
grad_theta, _ = tf.clip_by_global_norm(all_grad_l2, args.grad_clip)
grad_theta[0].numpy()[-5:, -5:]

array([[ 0.04346468, -0.03260752, -0.02315768, -0.03163309, -0.04711509],
       [ 0.01165621,  0.03120885,  0.04830385, -0.0429206 , -0.01843552],
       [-0.01266133, -0.04792169, -0.00090416, -0.04257084, -0.03880388]],
      dtype=float32)

In [296]:
# PyTorch re-implementation of the formula documented at
# https://www.tensorflow.org/api_docs/python/tf/clip_by_global_norm
clip_norm = args.grad_clip
# Global norm: square root of the summed squared norms of all gradient tensors.
global_norm = torch.sqrt(torch.stack([torch.norm(g).pow(2) for g in all_grad_l2]).sum())
# Scale factor is 1 when the global norm is already within clip_norm.
clip_coef = clip_norm / max(global_norm, clip_norm)
grad_theta = [g * clip_coef for g in all_grad_l2]
# Should match the TensorFlow output of the previous cell.
grad_theta[0].numpy()[-5:, -5:]

array([[ 0.04346468, -0.03260752, -0.02315768, -0.03163309, -0.04711509],
       [ 0.01165621,  0.03120885,  0.04830385, -0.0429206 , -0.01843552],
       [-0.01266133, -0.04792169, -0.00090416, -0.04257084, -0.03880388]],
      dtype=float32)

In [297]:
# Compare, on the first parameter tensor, the result of one gradient step for
# each gradient variant computed above (last five columns shown).
param = list(model.parameters())[0]
print('param')
print(f'{param.detach().cpu().numpy().round(4)[-5:, -5:]}')
print()
print('grad')
print(f'{all_grads[0].detach().cpu().numpy().round(4)[-5:, -5:]}')
print('apply gradient without l2')
print(f'{(param - args.learning_rate * all_grads[0]).detach().cpu().numpy().round(4)[-5:, -5:]}')
print('grad with l2')
print(f'{all_grad_l2[0].detach().cpu().numpy().round(4)[-5:, -5:]}')
# With the first configuration cell's values (learning_rate 0.1, weight_decay 10)
# the decay term is exactly -param, so only -learning_rate * grad remains.
print('apply gradient with l2')
print(f'{(param - args.learning_rate * all_grad_l2[0]).detach().cpu().numpy().round(4)[-5:, -5:]}')
print('grad with clip and l2')
print(f'{grad_theta[0].detach().cpu().numpy().round(4)[-5:, -5:]}')
print('apply gradient with clip and l2')
print(f'{(param - args.learning_rate * grad_theta[0]).detach().cpu().numpy().round(4)[-5:, -5:]}')

param
[[ 0.144  -0.108  -0.0767 -0.1164 -0.1561]
 [ 0.0386  0.1034  0.16   -0.1284 -0.0611]
 [-0.0419 -0.1588 -0.003  -0.1255 -0.1286]]

grad
[[ 0.      0.     -0.      0.1164 -0.    ]
 [ 0.      0.      0.     -0.1376  0.    ]
 [ 0.      0.      0.     -0.1553  0.    ]]
apply gradient without l2
[[ 0.144  -0.108  -0.0767 -0.1281 -0.1561]
 [ 0.0386  0.1034  0.16   -0.1147 -0.0611]
 [-0.0419 -0.1588 -0.003  -0.11   -0.1286]]
grad with l2
[[ 1.44   -1.0803 -0.7672 -1.048  -1.5609]
 [ 0.3862  1.034   1.6003 -1.422  -0.6108]
 [-0.4195 -1.5877 -0.03   -1.4104 -1.2856]]
apply gradient with l2
[[ 0.      0.      0.     -0.0116  0.    ]
 [ 0.     -0.     -0.      0.0138 -0.    ]
 [ 0.      0.     -0.      0.0155 -0.    ]]
grad with clip and l2
[[ 0.0435 -0.0326 -0.0232 -0.0316 -0.0471]
 [ 0.0117  0.0312  0.0483 -0.0429 -0.0184]
 [-0.0127 -0.0479 -0.0009 -0.0426 -0.0388]]
apply gradient with clip and l2
[[ 0.1397 -0.1048 -0.0744 -0.1133 -0.1514]
 [ 0.0375  0.1003  0.1552 -0.1241 -0.0592]
 [-0.0

In [299]:
# Repeat the step with torch.optim.SGD and no weight decay; the result should
# match the manual "apply gradient without l2" numbers.
# Reseed so the rebuilt model has the same initial weights as before.
torch.manual_seed(0)

model = DeepIRLFC(input_dim=feat_map.shape[1], hiddens=[3, 3]).to(device)
param = list(model.parameters())[0]
print('param')
print(f'{param[-5:, -5:]}')

optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, weight_decay=0.0)
optimizer.zero_grad()
rewards = model(inputs)
rewards_numpy = rewards.view(-1).detach().numpy()
_, policy = value_iteration(P_a, rewards_numpy, gamma=args.gamma, alpha=args.alpha, error=args.error, deterministic=True)
# propagate policy
mu_exp = compute_state_visition_freq(P_a, trajs, policy, deterministic=True)
grad_r = mu_D - mu_exp
grad_r = torch.from_numpy(grad_r).float().view(-1, 1).to(device)

# Write the reward-term gradient into each parameter's .grad.
rewards.backward(-grad_r, retain_graph=True)
# nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
# using optimizer
optimizer.step()
param = list(model.parameters())[0]
print('grad\n', param.grad[-5:, -5:])
print('apply gradient\n', param[-5:, -5:])

param
tensor([[ 0.1440, -0.1080, -0.0767, -0.1164, -0.1561],
        [ 0.0386,  0.1034,  0.1600, -0.1284, -0.0611],
        [-0.0419, -0.1588, -0.0030, -0.1255, -0.1286]],
       grad_fn=<SliceBackward0>)
grad
 tensor([[ 0.0000e+00,  0.0000e+00, -1.0467e-06,  1.1638e-01, -5.5125e-06],
        [ 0.0000e+00,  0.0000e+00,  1.2956e-06, -1.3760e-01,  7.2481e-06],
        [ 0.0000e+00,  0.0000e+00,  1.6622e-06, -1.5529e-01,  7.6223e-06]])
apply gradient
 tensor([[ 0.1440, -0.1080, -0.0767, -0.1281, -0.1561],
        [ 0.0386,  0.1034,  0.1600, -0.1147, -0.0611],
        [-0.0419, -0.1588, -0.0030, -0.1100, -0.1286]],
       grad_fn=<SliceBackward0>)


In [300]:
# Check that SGD's weight_decay equals adding weight_decay * (gradient of the
# L2 penalty) to the reward gradient by hand.
# Reseed so the rebuilt model has the same initial weights as before.
torch.manual_seed(0)

model = DeepIRLFC(input_dim=feat_map.shape[1], hiddens=[3, 3]).to(device)
param = list(model.parameters())[0]
print('param')
print(f'{param[-5:, -5:]}')

optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
optimizer.zero_grad()
rewards = model(inputs)
rewards_numpy = rewards.view(-1).detach().numpy()
_, policy = value_iteration(P_a, rewards_numpy, gamma=args.gamma, alpha=args.alpha, error=args.error, deterministic=True)
# propagate policy
mu_exp = compute_state_visition_freq(P_a, trajs, policy, deterministic=True)
grad_r = mu_D - mu_exp
grad_r = torch.from_numpy(grad_r).float().view(-1, 1).to(device)

# Write the reward-term gradient into each parameter's .grad.
rewards.backward(-grad_r, retain_graph=True)

l2_loss = torch.stack([torch.sum(p.pow(2))/2 for p in model.parameters()]).sum()
l2_grad = torch.autograd.grad(l2_loss, model.parameters(), retain_graph=True)
# manual 
print('manual grad\n')
# Expected parameter values after the step, computed before the optimizer
# mutates `param`; clone() keeps a copy of the slice for the comparison below.
mg = (param - args.learning_rate*(param.grad + args.weight_decay*l2_grad[0]))[-5:, -5:].clone()
print(mg)
# using optimizer
optimizer.step()
param = list(model.parameters())[0]
print('grad\n', param.grad[-5:, -5:])
print('apply gradient\n', param[-5:, -5:])
# Element-wise exact comparison of the optimizer result with the manual one.
print(param[-5:, -5:] == mg)

param
tensor([[ 0.1440, -0.1080, -0.0767, -0.1164, -0.1561],
        [ 0.0386,  0.1034,  0.1600, -0.1284, -0.0611],
        [-0.0419, -0.1588, -0.0030, -0.1255, -0.1286]],
       grad_fn=<SliceBackward0>)
manual grad

tensor([[ 0.0000e+00,  7.4506e-09,  1.1176e-07, -1.1638e-02,  5.5134e-07],
        [ 0.0000e+00, -7.4506e-09, -1.3411e-07,  1.3760e-02, -7.2271e-07],
        [ 0.0000e+00,  1.4901e-08, -1.6601e-07,  1.5529e-02, -7.5996e-07]],
       grad_fn=<CloneBackward0>)
grad
 tensor([[ 0.0000e+00,  0.0000e+00, -1.0467e-06,  1.1638e-01, -5.5125e-06],
        [ 0.0000e+00,  0.0000e+00,  1.2956e-06, -1.3760e-01,  7.2481e-06],
        [ 0.0000e+00,  0.0000e+00,  1.6622e-06, -1.5529e-01,  7.6223e-06]])
apply gradient
 tensor([[ 0.0000e+00,  7.4506e-09,  1.1176e-07, -1.1638e-02,  5.5134e-07],
        [ 0.0000e+00, -7.4506e-09, -1.3411e-07,  1.3760e-02, -7.2271e-07],
        [ 0.0000e+00,  1.4901e-08, -1.6601e-07,  1.5529e-02, -7.5996e-07]],
       grad_fn=<SliceBackward0>)
tensor([[True, T

In [302]:
def get_grad_theta(args, rewards, model, grad_r):
    """Return the clipped parameter gradient and the L2 penalty for one step.

    The gradient is the sum of two parts: ``-grad_r`` backpropagated through
    ``rewards``, and ``args.weight_decay`` times the gradient of the L2 penalty
    ``sum(p ** 2) / 2`` over all parameters. The sum is then rescaled so that
    its global norm does not exceed ``args.grad_clip``.

    Args:
        args: object providing ``weight_decay`` and ``grad_clip``.
        rewards: output tensor of ``model`` with its autograd graph attached.
        model: module whose ``parameters()`` are differentiated.
        grad_r: tensor usable as ``grad_outputs`` for ``rewards`` (its negation
            is what is actually passed).

    Returns:
        A tuple ``(grad_theta, l2_loss)``: the list of clipped gradients, one
        per parameter in ``model.parameters()`` order, and the unweighted L2
        penalty tensor.
    """
    params = list(model.parameters())

    # Reward term; the graph is kept so the caller can reuse `rewards`.
    reward_grads = torch.autograd.grad(
        rewards, params, grad_outputs=-grad_r, retain_graph=True
    )

    # L2 penalty and its gradient.
    half_sq_norms = [torch.sum(p.pow(2)) / 2 for p in params]
    l2_loss = torch.stack(half_sq_norms).sum()
    penalty_grads = torch.autograd.grad(l2_loss, params, retain_graph=True)

    combined = [
        args.weight_decay * reg + data
        for reg, data in zip(penalty_grads, reward_grads)
    ]

    # Global-norm clipping: the scale is 1 while the norm is within grad_clip.
    squared_norms = torch.stack([torch.norm(g).pow(2) for g in combined])
    global_norm = torch.sqrt(squared_norms.sum())
    if args.grad_clip > global_norm:
        denominator = args.grad_clip
    else:
        denominator = global_norm
    scale = args.grad_clip / denominator

    grad_theta = [g * scale for g in combined]
    return grad_theta, l2_loss

def apply_gradient(model, grad_theta, args):
    """Take one plain gradient-descent step on the model parameters, in place.

    Each parameter ``p`` is replaced by ``p - args.learning_rate * g``, where
    ``g`` is the matching entry of ``grad_theta``.

    Args:
        model: module whose ``parameters()`` are updated.
        grad_theta: iterable of gradient tensors in ``model.parameters()``
            order, e.g. the first return value of ``get_grad_theta``.
        args: object providing ``learning_rate``.

    Returns:
        None. The parameters are modified in place.
    """
    # Update under no_grad() rather than through `.data`. The step is still
    # kept out of the autograd graph, but the in-place write stays visible to
    # autograd's version tracking, so a later backward through a graph that
    # saved the old values fails loudly instead of silently using new ones.
    with torch.no_grad():
        for p, g in zip(model.parameters(), grad_theta):
            p.sub_(args.learning_rate * g)

In [304]:
# without optimizer: one full update using get_grad_theta / apply_gradient
# Reseed so the rebuilt model has the same initial weights as before.
torch.manual_seed(0)

model = DeepIRLFC(input_dim=feat_map.shape[1], hiddens=[3, 3]).to(device)
param = list(model.parameters())[0]
print('param')
print(f'{param[-5:, -5:]}')

rewards = model(inputs)
rewards_numpy = rewards.view(-1).detach().numpy()
_, policy = value_iteration(P_a, rewards_numpy, gamma=args.gamma, alpha=args.alpha, error=args.error, deterministic=True)
# propagate policy
mu_exp = compute_state_visition_freq(P_a, trajs, policy, deterministic=True)
grad_r = mu_D - mu_exp
grad_r = torch.from_numpy(grad_r).float().view(-1, 1).to(device)
# Clipped (reward + L2) gradient, then an in-place descent step.
grad_theta, l2_loss = get_grad_theta(args, rewards, model, grad_r)
apply_gradient(model, grad_theta, args)
print('grad\n', grad_theta[0][-5:, -5:])
# `param` is the same Parameter object that was just updated in place, so this
# prints the post-step values.
print('apply gradient with clip and l2\n', param[-5:, -5:])

param
tensor([[ 0.1440, -0.1080, -0.0767, -0.1164, -0.1561],
        [ 0.0386,  0.1034,  0.1600, -0.1284, -0.0611],
        [-0.0419, -0.1588, -0.0030, -0.1255, -0.1286]],
       grad_fn=<SliceBackward0>)
grad
 tensor([[ 0.0435, -0.0326, -0.0232, -0.0316, -0.0471],
        [ 0.0117,  0.0312,  0.0483, -0.0429, -0.0184],
        [-0.0127, -0.0479, -0.0009, -0.0426, -0.0388]])
apply gradient with clip and l2
 tensor([[ 0.1397, -0.1048, -0.0744, -0.1133, -0.1514],
        [ 0.0375,  0.1003,  0.1552, -0.1241, -0.0592],
        [-0.0407, -0.1540, -0.0029, -0.1213, -0.1247]],
       grad_fn=<SliceBackward0>)


In [287]:
# Optimizer-based attempt at "clip + L2" for comparison with the manual step.
# Reseed so the rebuilt model has the same initial weights as before.
torch.manual_seed(0)

model = DeepIRLFC(input_dim=feat_map.shape[1], hiddens=[3, 3]).to(device)
param = list(model.parameters())[0]
print('param')
print(f'{param[-5:, -5:]}')

optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
optimizer.zero_grad()
rewards = model(inputs)
rewards_numpy = rewards.view(-1).detach().numpy()
_, policy = value_iteration(P_a, rewards_numpy, gamma=args.gamma, alpha=args.alpha, error=args.error, deterministic=True)
# propagate policy
mu_exp = compute_state_visition_freq(P_a, trajs, policy, deterministic=True)
grad_r = mu_D - mu_exp
grad_r = torch.from_numpy(grad_r).float().view(-1, 1).to(device)

rewards.backward(-grad_r, retain_graph=True)
# NOTE(review): clip_grad_norm_ rescales only what backward() stored in .grad,
# i.e. the reward term. SGD adds the weight-decay term later inside step(), so
# that term is not clipped. get_grad_theta clips the sum of both terms, which
# is why the numbers printed here differ from the manual version.
nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
# using optimizer
optimizer.step()
print('grad\n', param.grad[-5:, -5:])
print('apply gradient with clip and l2\n', param[-5:, -5:])

param
tensor([[ 0.1440, -0.1080, -0.0767, -0.1164, -0.1561],
        [ 0.0386,  0.1034,  0.1600, -0.1284, -0.0611],
        [-0.0419, -0.1588, -0.0030, -0.1255, -0.1286]],
       grad_fn=<SliceBackward0>)
grad
 tensor([[ 0.0000e+00,  0.0000e+00, -1.9331e-07,  2.1494e-02, -1.0181e-06],
        [ 0.0000e+00,  0.0000e+00,  2.3927e-07, -2.5413e-02,  1.3386e-06],
        [ 0.0000e+00,  0.0000e+00,  3.0697e-07, -2.8679e-02,  1.4077e-06]])
apply gradient with clip and l2
 tensor([[ 0.0000e+00,  7.4506e-09,  2.2352e-08, -2.1494e-03,  1.0431e-07],
        [ 0.0000e+00, -7.4506e-09, -2.9802e-08,  2.5413e-03, -1.2666e-07],
        [ 0.0000e+00,  1.4901e-08, -3.0734e-08,  2.8679e-03, -1.3411e-07]],
       grad_fn=<SliceBackward0>)


---

In [310]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import torch
from src.argument_parser import get_parser, parse_args_str
from src.GridWorldMDP.utils import draw_path, generate_demonstrations, init_grid_world
from src.GridWorldMDP.value_iteration import value_iteration
from src.GridWorldMDP.policy_iteration import finite_policy_iteration, policy_evaluation
from src.deepmaxent_irl import DeepIRLFC, demo_svf, compute_state_visition_freq, deepmaxent_irl

# End-to-end run: rebuild the configuration, the grid world and the
# demonstrations, then call deepmaxent_irl.
PARSER = get_parser()

# Same flags as the first configuration cell, with different values for
# gamma, n_trajs, learning_rate, n_iters, error, grad_clip and hiddens.
ARGS = """
--exp_name test
--height 6
--width 6
--gamma 0.9
--act_random 0.3
--n_trajs 200
--l_traj 6
--learning_rate 0.02
--n_iters 20
--alpha 0.1
--n_query 1
--r_max 1
--error 0.001
--grad_clip 100
--weight_decay 10
--hiddens 3 3
--device cpu
--verbose 2
"""
# The learning (rate) has to be tuned carefully.
args = parse_args_str(PARSER, ARGS)
# Not the last expression of the cell, so this line displays nothing.
args.exp_name

# (coordinate, value) pairs passed to init_grid_world.
coor_rates = [
    ((args.height-2, args.width-2), 1.0), 
    ((0, args.width-1), 0.5), 
    ((1, 1), 0.5)
]
gw, P_a, rewards_gt, values_gt, policy_gt = init_grid_world(args, coor_rates)
# use identity matrix as feature
feat_map = np.eye(args.height * args.width)
# Demonstrations from random start states under the ground-truth policy.
trajs = generate_demonstrations(gw, policy_gt, 
                                n_trajs=args.n_trajs, 
                                len_traj=args.l_traj, 
                                rand_start=True, 
                                start_pos=None)

# Returns the learned rewards, a policy and the L2 loss; the next cells compare
# `rewards` with `rewards_gt`.
rewards, policy, l2_loss = deepmaxent_irl(feat_map, P_a, trajs, args)

[INFO] Initialize Grid World
[INFO] Getting ground truth values and policy via value teration


  0%|          | 0/20 [00:00<?, ?it/s]

In [311]:
rewards_gt

array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 1. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. ])

In [312]:
rewards

array([0.35287154, 0.47049537, 0.88217884, 0.5293073 , 0.9998027 ,
       0.5293073 , 0.6469312 , 0.05881192, 0.5881192 , 0.47049537,
       0.17643577, 0.17643577, 0.17643577, 0.05881192, 0.6469312 ,
       0.5881192 , 0.2940596 , 0.17643577, 0.88217884, 0.82336694,
       0.47049537, 0.17643577, 0.764555  , 0.5293073 , 0.11762384,
       0.82336694, 0.82336694, 0.17643577, 0.764555  , 0.        ,
       0.41168347, 0.82336694, 0.5881192 , 0.05881192, 0.2940596 ,
       0.17643577], dtype=float32)

# 