In [11]:
%load_ext autoreload
%autoreload 2
import argparse
import numpy as np

from src.maxent_irl_gridworld import run_maxent_irl, draw_path
from src.img_utils import get_infos, draw_maps, draw_acq_maps, draw_evd
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

PARSER = argparse.ArgumentParser(description=None)
PARSER.add_argument('-hei', '--height', default=5, type=int, help='height of the gridworld')
PARSER.add_argument('-wid', '--width', default=5, type=int, help='width of the gridworld')
PARSER.add_argument('-g', '--gamma', default=0.8, type=float, help='discount factor')
PARSER.add_argument('-a', '--act_random', default=0.3, type=float, help='probability of acting randomly')
PARSER.add_argument('-t', '--n_trajs', default=100, type=int, help='number of expert trajectories')
PARSER.add_argument('-l', '--l_traj', default=20, type=int, help='length of expert trajectory')
PARSER.add_argument('--rand_start', dest='rand_start', action='store_true', help='when sampling trajectories, randomly pick start positions')
PARSER.add_argument('--no-rand_start', dest='rand_start',action='store_false', help='when sampling trajectories, fix start positions')
PARSER.set_defaults(rand_start=False)
PARSER.add_argument('-lr', '--learning_rate', default=0.01, type=float, help='learning rate')
PARSER.add_argument('-ni', '--n_iters', default=20, type=int, help='number of iterations')
PARSER.add_argument('-act', '--active', action='store_true', help='active learning setting')  # store true
PARSER.add_argument('-al', '--alpha', default=1.0, type=float, help='temperature parameter for value iteration')
PARSER.add_argument('-nq', '--n_query', default=1, type=int, help='number of queries to the expert(n_demonstrations)')
PARSER.add_argument('-rm', '--r_max', default=1, type=int, help='maximum reward value')
PARSER.add_argument('-er', '--error', default=0.01, type=float, help='error threshold for policy evaluation and value iteration')
PARSER.add_argument('-c', '--grad_clip', default=0.5, type=float, help='Gradient Clipping maximum L1 norm')
PARSER.add_argument('-lam', '--lam', default=0.1, type=float, help='Regularizing Constant')

def parse_args_str(args_str):
    args = PARSER.parse_args(args_str.split())
    return args

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Baseline Settings: 

```python
COMMOM_ARGS = """
--height 5
--width 5
--gamma 0.8
--act_random 0.3
--n_trajs 400
--l_traj 15
--learning_rate 0.01
--n_iters 20
--alpha 0.25
--n_query 1
--r_max 1
--error 0.01
"""
```
* Alpha = 1.0 일때 exploitation 하는 경향이 있음
* Alpha = 0.1 일때 gamma 을 0.95 보다 높게 주면 MaxEnt IRL 시 value iteration에서 converge가 안됨..

In [12]:
BASE_ARGS = """
--height 5
--width 5
--gamma 0.8
--act_random 0.3
--n_trajs 400
--l_traj 15
--learning_rate 0.01
--n_iters 20
--alpha 0.25
--n_query 1
--r_max 1
--error 0.01
-c 0.5
-lam 0.1
"""

COMMOM_ARGS = """
--height 6
--width 6
--gamma 0.8
--act_random 0.3
--n_trajs 10
--l_traj 6
--learning_rate 0.1
--n_iters 100
--alpha 0.1
--n_query 1
--r_max 1
--error 0.001
-c 0.0
-lam 0.5
"""

def get_exp_name(BASE_ARGS, COMMOM_ARGS):
    # compare base_args and common_args only get different arguments
    # named with [arg1]_[value1]-[arg2]_[value2] without '--'
    # e.g. height_5-width_5
    base_args = BASE_ARGS.split('\n')
    common_args = COMMOM_ARGS.split('\n')
    exp_name = ''
    for arg in common_args:
        if arg in base_args:
            continue
        else:
            arg_name = arg.split()[0][2:]
            arg_value = arg.split()[1]
            exp_name += f'{arg_name}_{arg_value}-'
    exp_name = exp_name[:-1]
    return exp_name

exp_name = get_exp_name(BASE_ARGS, COMMOM_ARGS)
exp_name

'height_6-width_6-n_trajs_10-l_traj_6-learning_rate_0.1-n_iters_100-alpha_0.1-error_0.001-_0.0-am_0.5'

In [None]:
# active learning setting
args = parse_args_str(COMMOM_ARGS + '--active')
print(args)
init_start_pose = np.random.randint(0, args.height, size=(args.n_query, 2)).tolist()

history_act = run_maxent_irl(args, init_start_pose=init_start_pose)

args = parse_args_str(COMMOM_ARGS)
print(args)
init_start_pose = np.random.randint(0, args.height, size=(args.n_query, 2)).tolist()
history_rand = run_maxent_irl(args, init_start_pose=init_start_pose)

In [9]:
from IPython.display import clear_output

vd_act_list = []
vd_rand_list = [] 

for i in range(100):
    # active learning setting
    args = parse_args_str(COMMOM_ARGS + '--active')
    print(args)
    init_start_pose = np.random.randint(0, args.height, size=(args.n_query, 2)).tolist()

    history_act = run_maxent_irl(args, init_start_pose=init_start_pose)

    args = parse_args_str(COMMOM_ARGS)
    print(args)
    init_start_pose = np.random.randint(0, args.height, size=(args.n_query, 2)).tolist()
    history_rand = run_maxent_irl(args, init_start_pose=init_start_pose)

    idxs_act, vd_act_temp, info_act = get_infos(history_act, active=True, search_idx=None)
    vd_act_list.append(vd_act_temp['mean'])
    idxs_rand, vd_rand_temp, info_rand = get_infos(history_rand, active=False, search_idx=None)
    vd_rand_list.append(vd_rand_temp['mean'])
    clear_output(wait=True)
vd_act_array = np.array(vd_act_list)
vd_rand_array = np.array(vd_rand_list)
vd_act = dict()
vd_rand = dict()
vd_act['mean'] = vd_act_array.mean(axis = 0)
vd_act['std'] = vd_act_array.std(axis = 0)
vd_rand['mean'] = vd_rand_array.mean(axis = 0)
vd_rand['std'] = vd_rand_array.std(axis = 0)

file_path = Path(f'./exp_figs/{exp_name}').resolve()
if not file_path.exists():
    file_path.mkdir(parents=True)
draw_evd(idxs_act, idxs_rand, vd_act, vd_rand, search_idx=None, file_path=file_path)

    

Namespace(height=6, width=6, gamma=0.8, act_random=0.3, n_trajs=10, l_traj=6, rand_start=False, learning_rate=0.1, n_iters=100, active=True, alpha=0.1, n_query=1, r_max=1, error=0.001, grad_clip=0.0, lam=0.5)
[INFO] Initialize Grid World
[INFO] Getting ground truth values and policy via value teration
[INFO] Initialize trajectories
[INFO] Trajectory length(Include inital starting point) = 7, First trajectories.
(2, 5)(a=2, r=0.0)->(3, 5)(a=2, r=0.0)->(4, 5)(a=2, r=0.0)->(4, 5)(a=1, r=0.0)->(4, 5)(a=1, r=0.0)->(4, 4)(a=1, r=1.0)->(3, 4)(a=4, r=0.0)
[INFO] Start Learning
[INFO - 00001 ] Training MaxEnt IRL
iteration: 0/100
theta: 
 [[0.326 0.613 0.927 0.666 0.502 0.141]
 [0.849 0.127 0.8   0.151 0.105 0.037]
 [0.707 0.792 0.355 0.793 0.76  0.291]
 [0.348 0.894 0.437 0.388 0.901 0.141]
 [0.902 0.406 0.406 0.957 0.834 0.328]
 [0.675 0.107 0.041 0.461 0.85  0.244]]
grad: 
 [[-0.171 -0.322 -0.488 -0.35  -0.264 -0.074]
 [-0.447 -0.067 -0.421 -0.079 -0.055 -0.019]
 [-0.372 -0.417 -0.187 -0.418

KeyboardInterrupt: 

In [None]:
import pickle
with open(f'./exps/history_act-{exp_name}.pkl', 'wb') as file:
    pickle.dump(history_act, file)
with open(f'./exps/history_rand-{exp_name}.pkl', 'wb') as file:
    pickle.dump(history_rand, file)

In [None]:
import pickle
with open(f'./exps/history_act-{exp_name}.pkl', 'rb') as file:
    history_act = pickle.load(file)
with open(f'./exps/history_rand-{exp_name}.pkl', 'rb') as file:
    history_rand = pickle.load(file)

---

In [None]:
file_path = Path(f'./exp_figs/{exp_name}').resolve()
draw_evd(idxs_act, idxs_rand, vd_act, vd_rand, search_idx=None, file_path=file_path)

In [None]:
idxs_act, vd_act, info_act = get_infos(history_act, active=True, search_idx=None)
idxs_rand, vd_rand, info_rand = get_infos(history_rand, active=False, search_idx=None)

In [None]:
file_path = Path(f'./exp_figs/{exp_name}').resolve()
if not file_path.exists():
    file_path.mkdir(parents=True)
draw_maps(args, info_act, active=True, search_idx=None, file_path=file_path)
draw_maps(args, info_rand, active=False, search_idx=None, file_path=file_path)  # same to info_act
draw_evd(idxs_act, idxs_rand, vd_act, vd_rand, search_idx=None, file_path=file_path)
draw_acq_maps(args, info_act, search_idx=None, file_path=file_path)

In [None]:
search_idx = 2
idxs_act, vd_act, info_act = get_infos(history_act, active=True, search_idx=search_idx)
idxs_rand, vd_rand, info_rand = get_infos(history_rand, active=False, search_idx=search_idx)

In [None]:
draw_maps(args, info_act, active=True, search_idx=search_idx)
draw_maps(args, info_rand, active=False, search_idx=search_idx)

In [None]:
for e in range(len(history_act[0]['trajs'])):
    print('Episode {}:'.format(e), end=' ')
    print(draw_path(history_act[0]['trajs'][e]))

In [None]:
for e in range(len(history_rand[0]['trajs'])):
    print('Episode {}:'.format(e), end=' ')
    print(draw_path(history_rand[0]['trajs'][e]))

In [None]:
draw_acq_maps_w_trajs(args, info_act, history_act, num_trajs=1)
draw_acq_maps_w_trajs(args, info_act, history_act, num_trajs=2)