In [1]:
import pickle
import random
from collections import namedtuple

import gym
import numpy as np
import ptan
import torch
import torch.optim as optim
from ptan.agent import float32_preprocessor

from util import PGN, RewardNet

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
from matplotlib import pyplot as plt

In [2]:
GAMMA = 0.99
LEARNING_RATE = 0.01
EPISODES_TO_TRAIN = 4
DEMO_BATCH = 50
seed = 0
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

EpisodeStep = namedtuple('EpisodeStep', field_names=['state', 'action', 'reward', 'next_state'])
Trajectory = namedtuple('Trajectory', field_names=['prob', 'episode_steps'])

In [3]:
def calc_qvals(rewards):
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r *= GAMMA
        sum_r += r
        res.append(sum_r)
    return list(reversed(res))


def process_demonstrations(demo_samples):
    traj_states, traj_actions, traj_qvals, traj_prob = [], [], [], []
    for traj in demo_samples:
        states, actions, rewards, qvals = [], [], [], []
        traj_prob.append(traj.prob)
        for step in traj.episode_steps:
            states.append(step.state)
            actions.append(step.action)
            rewards.append(step.reward)
        qvals.extend(calc_qvals(rewards))

        traj_states.append(states)
        traj_actions.append(actions)
        traj_qvals.append(qvals)
    traj_states = np.array(traj_states, dtype=np.object)
    traj_actions = np.array(traj_actions, dtype=np.object)
    traj_qvals = np.array(traj_qvals, dtype=np.object)
    traj_prob = np.array(traj_prob, dtype=np.float)
    return {'states': traj_states, 'actions': traj_actions, 'qvals': traj_qvals, 'traj_probs': traj_prob}

In [4]:
env = gym.make('CartPole-v1')
agent_net = PGN(env.observation_space.shape[0], env.action_space.n)
reward_net = RewardNet(env.observation_space.shape[0] + 1)
agent = ptan.agent.PolicyAgent(agent_net, preprocessor=float32_preprocessor, apply_softmax=True)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
optimizer_agent = optim.Adam(agent_net.parameters(), lr=LEARNING_RATE)
optimizer_reward = optim.Adam(reward_net.parameters(), lr=1e-2, weight_decay=1e-4)

In [5]:
with open('demonstrations.list.pkl', 'rb') as f:
    demonstrations = pickle.load(f)
assert (len(demonstrations) > DEMO_BATCH)
print(f'Number of demonstrations: {len(demonstrations)}')
demonstrations = process_demonstrations(demonstrations)

Number of demonstrations: 100


In [6]:
total_rewards = []
step_idx = 0
done_episodes = 0

batch_episodes = 0
batch_states, batch_actions, batch_qvals = [], [], []
cur_rewards = []
loss_rwd = 0.

for step_idx, exp in enumerate(exp_source):
    batch_states.append(exp.state)
    batch_actions.append(int(exp.action))
    x = torch.cat([float32_preprocessor(exp.state), float32_preprocessor([int(exp.action)])]).view(1, -1)
    reward = reward_net(x)
    cur_rewards.append(reward.item())

    if exp.last_state is None:
        batch_qvals.extend(calc_qvals(cur_rewards))
        cur_rewards.clear()
        batch_episodes += 1

    new_rewards = exp_source.pop_total_rewards()
    if new_rewards:
        done_episodes += 1
        reward = new_rewards[0]
        total_rewards.append(reward)
        mean_rewards = float(np.mean(total_rewards[-100:]))
        writer.add_scalar('reward', reward, done_episodes)
        writer.add_scalar('mean_reward', mean_rewards, done_episodes)
        writer.add_scalar('loss_rwd', loss_rwd, done_episodes) 
        print(f'{step_idx}: reward: {reward:6.2f}, mean_100: {mean_rewards:6.2f}, '
              f'episodes: {done_episodes}, reward function loss: {loss_rwd:6.4f}')
        
        if done_episodes%100 == 0 or mean_rewards>=500:
            N_samp = 20
            S1 = np.linspace(-5, 5, N_samp)
            S2 = np.linspace(-3.1457, 3.1457, N_samp)
            S3 = 0*S1
            S4 = 0*S1
            
            S5 = np.ones(N_samp)
            
            for action in [0,1]:
                Reward = np.zeros((N_samp,N_samp))
                for i in range(N_samp):
                    for j in range(N_samp):
                        state = [S1[i], S2[j], 0, 0]
#                         action = 1
                        x = torch.cat([float32_preprocessor(state), float32_preprocessor([int(action)])]).view(1, -1)
                        r = reward_net(x)
                        Reward[i,j] = r

                X, Y = np.meshgrid(S1, S2)
                Z = Reward
                fig = plt.figure()
                ax = plt.axes(projection='3d')
                ax.plot_surface(X, Y, Z, rstride=1, cstride=1,
                                cmap='viridis', edgecolor='none')
                ax.set_title('surface');
                ax.set_xlabel('x')
                ax.set_ylabel('y')
                ax.set_zlabel('z');
                ax.view_init(azim=90, elev=90)
                if action ==0:
                    writer.add_figure('x1 vs x2 with a = 0', fig, global_step=done_episodes/100)
                if action ==1:
                    writer.add_figure('x1 vs x2 with a = 1', fig, global_step=done_episodes/100)    
            
        if mean_rewards >= 600:
            print(f'Solved in {step_idx} steps and {done_episodes} episodes!')
            torch.save(agent_net.state_dict(), 'cartpole_learner.mod')
            torch.save(reward_net.state_dict(), 'cartpole-v1_reward_func.mod')
            break
        

    if batch_episodes < EPISODES_TO_TRAIN:
        continue

    states_v = torch.FloatTensor(batch_states)
    batch_actions_t = torch.LongTensor(batch_actions)
    batch_qvals_v = torch.FloatTensor(batch_qvals)

    # reward function learning
    demo_states = demonstrations['states']
    demo_actions = demonstrations['actions']
    demo_probs = demonstrations['traj_probs']
    for rf_i in range(10):
        # Check is preprocessing is required 
        selected = np.random.choice(len(demonstrations), DEMO_BATCH)
#         print(selected)
        demo_states = demo_states[selected]
        demo_actions = demo_actions[selected]
#         demo_probs = demo_probs[selected]
        demo_batch_states, demo_batch_actions = [], []
        for idx in range(len(demo_states)):
            demo_batch_states.extend(demo_states[idx])
            demo_batch_actions.extend(demo_actions[idx])
        demo_batch_states = torch.FloatTensor(demo_batch_states)
        demo_batch_actions = torch.FloatTensor(demo_batch_actions)
        D_demo = torch.cat([demo_batch_states, demo_batch_actions.view(-1, 1)], dim=-1)
        D_samp = torch.cat([states_v, batch_actions_t.float().view(-1, 1)], dim=-1)
        D_samp = torch.cat([D_demo, D_samp])
        # dummy importance weights - fix later
        z = torch.ones((D_samp.shape[0], 1))

        # objective
        D_demo_out = reward_net(D_demo)
        D_samp_out = reward_net(D_samp)
        D_samp_out = z * torch.exp(D_samp_out)
        loss_rwd = torch.mean(D_demo_out) - torch.log(torch.mean(D_samp_out))
        loss_rwd = -loss_rwd  # for maximization

        # update parameters
        optimizer_reward.zero_grad()
        loss_rwd.backward()
        optimizer_reward.step()

    # agent
    optimizer_agent.zero_grad()
    logits_v = agent_net(states_v)
    log_prob_v = torch.log_softmax(logits_v, dim=1)
    # REINFORCE
    log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
    loss_v = -log_prob_actions_v.mean()
    
    writer.add_scalar('loss_agent_net', loss_v, done_episodes) 
    
    loss_v.backward()
    optimizer_agent.step()

    batch_episodes = 0
    batch_states.clear()
    batch_actions.clear()
    batch_qvals.clear()
env.close()
writer.close()

10: reward:  10.00, mean_100:  10.00, episodes: 1, reward function loss: 0.0000
23: reward:  13.00, mean_100:  11.50, episodes: 2, reward function loss: 0.0000
38: reward:  15.00, mean_100:  12.67, episodes: 3, reward function loss: 0.0000
72: reward:  34.00, mean_100:  18.00, episodes: 4, reward function loss: 0.0001
84: reward:  12.00, mean_100:  16.80, episodes: 5, reward function loss: 0.0001
101: reward:  17.00, mean_100:  16.83, episodes: 6, reward function loss: 0.0001
112: reward:  11.00, mean_100:  16.00, episodes: 7, reward function loss: 0.0001
142: reward:  30.00, mean_100:  17.75, episodes: 8, reward function loss: -0.0000
161: reward:  19.00, mean_100:  17.89, episodes: 9, reward function loss: -0.0000
177: reward:  16.00, mean_100:  17.70, episodes: 10, reward function loss: -0.0000
186: reward:   9.00, mean_100:  16.91, episodes: 11, reward function loss: -0.0000
205: reward:  19.00, mean_100:  17.08, episodes: 12, reward function loss: -0.0000
216: reward:  11.00, mean

2945: reward:  36.00, mean_100:  29.45, episodes: 100, reward function loss: -0.0009
2987: reward:  42.00, mean_100:  29.77, episodes: 101, reward function loss: -0.0009
3002: reward:  15.00, mean_100:  29.79, episodes: 102, reward function loss: -0.0009
3082: reward:  80.00, mean_100:  30.44, episodes: 103, reward function loss: -0.0009
3134: reward:  52.00, mean_100:  30.62, episodes: 104, reward function loss: -0.0000
3174: reward:  40.00, mean_100:  30.90, episodes: 105, reward function loss: -0.0000
3288: reward: 114.00, mean_100:  31.87, episodes: 106, reward function loss: -0.0000
3335: reward:  47.00, mean_100:  32.23, episodes: 107, reward function loss: -0.0000
3450: reward: 115.00, mean_100:  33.08, episodes: 108, reward function loss: -0.0004
3516: reward:  66.00, mean_100:  33.55, episodes: 109, reward function loss: -0.0004
3565: reward:  49.00, mean_100:  33.88, episodes: 110, reward function loss: -0.0004
3635: reward:  70.00, mean_100:  34.49, episodes: 111, reward fun

21887: reward: 500.00, mean_100: 190.97, episodes: 197, reward function loss: -0.0005
22163: reward: 276.00, mean_100: 192.96, episodes: 198, reward function loss: -0.0005
22620: reward: 457.00, mean_100: 197.11, episodes: 199, reward function loss: -0.0005
23010: reward: 390.00, mean_100: 200.65, episodes: 200, reward function loss: -0.0001
23510: reward: 500.00, mean_100: 205.23, episodes: 201, reward function loss: -0.0001
24010: reward: 500.00, mean_100: 210.08, episodes: 202, reward function loss: -0.0001
24324: reward: 314.00, mean_100: 212.42, episodes: 203, reward function loss: -0.0001
24824: reward: 500.00, mean_100: 216.90, episodes: 204, reward function loss: -0.0000
24903: reward:  79.00, mean_100: 217.29, episodes: 205, reward function loss: -0.0000
25403: reward: 500.00, mean_100: 221.15, episodes: 206, reward function loss: -0.0000
25858: reward: 455.00, mean_100: 225.23, episodes: 207, reward function loss: -0.0000
26358: reward: 500.00, mean_100: 229.08, episodes: 208

41432: reward: 131.00, mean_100: 200.45, episodes: 296, reward function loss: -0.0004
41543: reward: 111.00, mean_100: 196.56, episodes: 297, reward function loss: -0.0004
41645: reward: 102.00, mean_100: 194.82, episodes: 298, reward function loss: -0.0004
41767: reward: 122.00, mean_100: 191.47, episodes: 299, reward function loss: -0.0004
41871: reward: 104.00, mean_100: 188.61, episodes: 300, reward function loss: -0.0017
41896: reward:  25.00, mean_100: 183.86, episodes: 301, reward function loss: -0.0017
42007: reward: 111.00, mean_100: 179.97, episodes: 302, reward function loss: -0.0017
42113: reward: 106.00, mean_100: 177.89, episodes: 303, reward function loss: -0.0017
42138: reward:  25.00, mean_100: 173.14, episodes: 304, reward function loss: -0.0017
42166: reward:  28.00, mean_100: 172.63, episodes: 305, reward function loss: -0.0017
42278: reward: 112.00, mean_100: 168.75, episodes: 306, reward function loss: -0.0017
42299: reward:  21.00, mean_100: 164.41, episodes: 307

49974: reward: 139.00, mean_100:  89.51, episodes: 392, reward function loss: -0.0110
50119: reward: 145.00, mean_100:  89.76, episodes: 393, reward function loss: -0.0110
50253: reward: 134.00, mean_100:  89.83, episodes: 394, reward function loss: -0.0110
50400: reward: 147.00, mean_100:  90.99, episodes: 395, reward function loss: -0.0110
50528: reward: 128.00, mean_100:  90.96, episodes: 396, reward function loss: -0.0115
50673: reward: 145.00, mean_100:  91.30, episodes: 397, reward function loss: -0.0115
50808: reward: 135.00, mean_100:  91.63, episodes: 398, reward function loss: -0.0115
50952: reward: 144.00, mean_100:  91.85, episodes: 399, reward function loss: -0.0115
51094: reward: 142.00, mean_100:  92.23, episodes: 400, reward function loss: -0.0104
51237: reward: 143.00, mean_100:  93.41, episodes: 401, reward function loss: -0.0104
51365: reward: 128.00, mean_100:  93.58, episodes: 402, reward function loss: -0.0104
51516: reward: 151.00, mean_100:  94.03, episodes: 403

68516: reward: 359.00, mean_100: 190.71, episodes: 488, reward function loss: -0.0222
68894: reward: 378.00, mean_100: 193.25, episodes: 489, reward function loss: -0.0222
69264: reward: 370.00, mean_100: 195.64, episodes: 490, reward function loss: -0.0222
69618: reward: 354.00, mean_100: 197.83, episodes: 491, reward function loss: -0.0222
69984: reward: 366.00, mean_100: 200.10, episodes: 492, reward function loss: -0.0228
70416: reward: 432.00, mean_100: 202.97, episodes: 493, reward function loss: -0.0228
70812: reward: 396.00, mean_100: 205.59, episodes: 494, reward function loss: -0.0228
71222: reward: 410.00, mean_100: 208.22, episodes: 495, reward function loss: -0.0228
71644: reward: 422.00, mean_100: 211.16, episodes: 496, reward function loss: -0.0279
72144: reward: 500.00, mean_100: 214.71, episodes: 497, reward function loss: -0.0279
72623: reward: 479.00, mean_100: 218.15, episodes: 498, reward function loss: -0.0279
73059: reward: 436.00, mean_100: 221.07, episodes: 499

114948: reward: 500.00, mean_100: 481.57, episodes: 583, reward function loss: -0.0001
115448: reward: 500.00, mean_100: 483.47, episodes: 584, reward function loss: -0.0001
115948: reward: 500.00, mean_100: 484.89, episodes: 585, reward function loss: -0.0001
116448: reward: 500.00, mean_100: 486.70, episodes: 586, reward function loss: -0.0001
116948: reward: 500.00, mean_100: 487.91, episodes: 587, reward function loss: -0.0001
117448: reward: 500.00, mean_100: 489.32, episodes: 588, reward function loss: -0.0002
117948: reward: 500.00, mean_100: 490.54, episodes: 589, reward function loss: -0.0002
118448: reward: 500.00, mean_100: 491.84, episodes: 590, reward function loss: -0.0002
118948: reward: 500.00, mean_100: 493.30, episodes: 591, reward function loss: -0.0002
119448: reward: 500.00, mean_100: 494.64, episodes: 592, reward function loss: -0.0018
119948: reward: 500.00, mean_100: 495.32, episodes: 593, reward function loss: -0.0018
120448: reward: 500.00, mean_100: 496.36, e

161147: reward: 500.00, mean_100: 486.99, episodes: 678, reward function loss: -0.0474
161647: reward: 500.00, mean_100: 486.99, episodes: 679, reward function loss: -0.0474
162147: reward: 500.00, mean_100: 486.99, episodes: 680, reward function loss: -0.0476
162647: reward: 500.00, mean_100: 486.99, episodes: 681, reward function loss: -0.0476
163147: reward: 500.00, mean_100: 486.99, episodes: 682, reward function loss: -0.0476
163647: reward: 500.00, mean_100: 486.99, episodes: 683, reward function loss: -0.0476
164147: reward: 500.00, mean_100: 486.99, episodes: 684, reward function loss: -0.0477
164647: reward: 500.00, mean_100: 486.99, episodes: 685, reward function loss: -0.0477
165147: reward: 500.00, mean_100: 486.99, episodes: 686, reward function loss: -0.0477
165647: reward: 500.00, mean_100: 486.99, episodes: 687, reward function loss: -0.0477
166147: reward: 500.00, mean_100: 486.99, episodes: 688, reward function loss: -0.0476
166647: reward: 500.00, mean_100: 486.99, e

206224: reward: 500.00, mean_100: 475.77, episodes: 773, reward function loss: -0.0477
206724: reward: 500.00, mean_100: 475.77, episodes: 774, reward function loss: -0.0477
207113: reward: 389.00, mean_100: 474.66, episodes: 775, reward function loss: -0.0477
207613: reward: 500.00, mean_100: 474.66, episodes: 776, reward function loss: -0.0449
208113: reward: 500.00, mean_100: 474.66, episodes: 777, reward function loss: -0.0449
208613: reward: 500.00, mean_100: 474.66, episodes: 778, reward function loss: -0.0449
209113: reward: 500.00, mean_100: 474.66, episodes: 779, reward function loss: -0.0449
209304: reward: 191.00, mean_100: 471.57, episodes: 780, reward function loss: -0.0408
209804: reward: 500.00, mean_100: 471.57, episodes: 781, reward function loss: -0.0408
210304: reward: 500.00, mean_100: 471.57, episodes: 782, reward function loss: -0.0408
210804: reward: 500.00, mean_100: 471.57, episodes: 783, reward function loss: -0.0408
211304: reward: 500.00, mean_100: 471.57, e

253304: reward: 500.00, mean_100: 495.80, episodes: 868, reward function loss: -0.0473
253804: reward: 500.00, mean_100: 495.80, episodes: 869, reward function loss: -0.0473
254304: reward: 500.00, mean_100: 495.80, episodes: 870, reward function loss: -0.0473
254804: reward: 500.00, mean_100: 495.80, episodes: 871, reward function loss: -0.0473
255304: reward: 500.00, mean_100: 495.80, episodes: 872, reward function loss: -0.0477
255804: reward: 500.00, mean_100: 495.80, episodes: 873, reward function loss: -0.0477
255873: reward:  69.00, mean_100: 491.49, episodes: 874, reward function loss: -0.0477
256373: reward: 500.00, mean_100: 492.60, episodes: 875, reward function loss: -0.0477
256873: reward: 500.00, mean_100: 492.60, episodes: 876, reward function loss: -0.0377
257373: reward: 500.00, mean_100: 492.60, episodes: 877, reward function loss: -0.0377
257873: reward: 500.00, mean_100: 492.60, episodes: 878, reward function loss: -0.0377
258373: reward: 500.00, mean_100: 492.60, e

300234: reward: 500.00, mean_100: 494.30, episodes: 963, reward function loss: -0.0470
300734: reward: 500.00, mean_100: 494.30, episodes: 964, reward function loss: -0.0474
301234: reward: 500.00, mean_100: 494.30, episodes: 965, reward function loss: -0.0474
301672: reward: 438.00, mean_100: 493.68, episodes: 966, reward function loss: -0.0474
302172: reward: 500.00, mean_100: 493.68, episodes: 967, reward function loss: -0.0474
302672: reward: 500.00, mean_100: 493.68, episodes: 968, reward function loss: -0.0460
303172: reward: 500.00, mean_100: 493.68, episodes: 969, reward function loss: -0.0460
303672: reward: 500.00, mean_100: 493.68, episodes: 970, reward function loss: -0.0460
304172: reward: 500.00, mean_100: 493.68, episodes: 971, reward function loss: -0.0460
304672: reward: 500.00, mean_100: 493.68, episodes: 972, reward function loss: -0.0476
305172: reward: 500.00, mean_100: 493.68, episodes: 973, reward function loss: -0.0476
305672: reward: 500.00, mean_100: 497.99, e

347084: reward: 500.00, mean_100: 498.50, episodes: 1057, reward function loss: -0.0465
347584: reward: 500.00, mean_100: 498.50, episodes: 1058, reward function loss: -0.0465
348084: reward: 500.00, mean_100: 498.50, episodes: 1059, reward function loss: -0.0465
348584: reward: 500.00, mean_100: 498.50, episodes: 1060, reward function loss: -0.0476
349084: reward: 500.00, mean_100: 498.50, episodes: 1061, reward function loss: -0.0476
349584: reward: 500.00, mean_100: 498.50, episodes: 1062, reward function loss: -0.0476
350084: reward: 500.00, mean_100: 498.50, episodes: 1063, reward function loss: -0.0476
350584: reward: 500.00, mean_100: 498.50, episodes: 1064, reward function loss: -0.0474
351084: reward: 500.00, mean_100: 498.50, episodes: 1065, reward function loss: -0.0474
351584: reward: 500.00, mean_100: 499.12, episodes: 1066, reward function loss: -0.0474
352084: reward: 500.00, mean_100: 499.12, episodes: 1067, reward function loss: -0.0474
352584: reward: 500.00, mean_100

393902: reward: 500.00, mean_100: 498.18, episodes: 1151, reward function loss: -0.0001
394402: reward: 500.00, mean_100: 498.18, episodes: 1152, reward function loss: -0.0014
394902: reward: 500.00, mean_100: 498.18, episodes: 1153, reward function loss: -0.0014
395402: reward: 500.00, mean_100: 498.18, episodes: 1154, reward function loss: -0.0014
395902: reward: 500.00, mean_100: 498.18, episodes: 1155, reward function loss: -0.0014
396402: reward: 500.00, mean_100: 498.18, episodes: 1156, reward function loss: -0.0015
396902: reward: 500.00, mean_100: 498.18, episodes: 1157, reward function loss: -0.0015
397402: reward: 500.00, mean_100: 498.18, episodes: 1158, reward function loss: -0.0015
397902: reward: 500.00, mean_100: 498.18, episodes: 1159, reward function loss: -0.0015
398402: reward: 500.00, mean_100: 498.18, episodes: 1160, reward function loss: -0.0046
398902: reward: 500.00, mean_100: 498.18, episodes: 1161, reward function loss: -0.0046
399402: reward: 500.00, mean_100

432992: reward: 500.00, mean_100: 420.90, episodes: 1245, reward function loss: -0.0156
433492: reward: 500.00, mean_100: 420.90, episodes: 1246, reward function loss: -0.0156
433510: reward:  18.00, mean_100: 416.08, episodes: 1247, reward function loss: -0.0156
434010: reward: 500.00, mean_100: 416.08, episodes: 1248, reward function loss: -0.0187
434510: reward: 500.00, mean_100: 416.08, episodes: 1249, reward function loss: -0.0187
435010: reward: 500.00, mean_100: 416.08, episodes: 1250, reward function loss: -0.0187
435510: reward: 500.00, mean_100: 416.08, episodes: 1251, reward function loss: -0.0187
436010: reward: 500.00, mean_100: 416.08, episodes: 1252, reward function loss: -0.0244
436510: reward: 500.00, mean_100: 416.08, episodes: 1253, reward function loss: -0.0244
437010: reward: 500.00, mean_100: 416.08, episodes: 1254, reward function loss: -0.0244
437510: reward: 500.00, mean_100: 416.08, episodes: 1255, reward function loss: -0.0244
438010: reward: 500.00, mean_100

472806: reward: 500.00, mean_100: 419.20, episodes: 1339, reward function loss: -0.0001
473306: reward: 500.00, mean_100: 419.20, episodes: 1340, reward function loss: -0.0001
473806: reward: 500.00, mean_100: 423.53, episodes: 1341, reward function loss: -0.0001
474306: reward: 500.00, mean_100: 423.53, episodes: 1342, reward function loss: -0.0001
474776: reward: 470.00, mean_100: 423.23, episodes: 1343, reward function loss: -0.0001
475026: reward: 250.00, mean_100: 425.34, episodes: 1344, reward function loss: -0.0003
475418: reward: 392.00, mean_100: 424.26, episodes: 1345, reward function loss: -0.0003
475918: reward: 500.00, mean_100: 424.26, episodes: 1346, reward function loss: -0.0003
476148: reward: 230.00, mean_100: 426.38, episodes: 1347, reward function loss: -0.0003
476648: reward: 500.00, mean_100: 426.38, episodes: 1348, reward function loss: -0.0013
477077: reward: 429.00, mean_100: 425.67, episodes: 1349, reward function loss: -0.0013
477577: reward: 500.00, mean_100

506023: reward: 500.00, mean_100: 358.12, episodes: 1433, reward function loss: 0.0000
506156: reward: 133.00, mean_100: 354.45, episodes: 1434, reward function loss: 0.0000
506656: reward: 500.00, mean_100: 355.19, episodes: 1435, reward function loss: 0.0000
507156: reward: 500.00, mean_100: 357.40, episodes: 1436, reward function loss: 0.0000
507656: reward: 500.00, mean_100: 358.50, episodes: 1437, reward function loss: 0.0000
508156: reward: 500.00, mean_100: 358.50, episodes: 1438, reward function loss: 0.0000
508656: reward: 500.00, mean_100: 358.50, episodes: 1439, reward function loss: 0.0000
509156: reward: 500.00, mean_100: 358.50, episodes: 1440, reward function loss: 0.0000
509408: reward: 252.00, mean_100: 356.02, episodes: 1441, reward function loss: 0.0000
509908: reward: 500.00, mean_100: 356.02, episodes: 1442, reward function loss: 0.0000
510408: reward: 500.00, mean_100: 356.32, episodes: 1443, reward function loss: 0.0000
510908: reward: 500.00, mean_100: 358.82, e

552655: reward: 500.00, mean_100: 491.32, episodes: 1528, reward function loss: 0.0000
553155: reward: 500.00, mean_100: 491.32, episodes: 1529, reward function loss: 0.0000
553655: reward: 500.00, mean_100: 491.32, episodes: 1530, reward function loss: 0.0000
554155: reward: 500.00, mean_100: 491.32, episodes: 1531, reward function loss: 0.0000
554655: reward: 500.00, mean_100: 491.32, episodes: 1532, reward function loss: 0.0000
555155: reward: 500.00, mean_100: 491.32, episodes: 1533, reward function loss: 0.0000
555655: reward: 500.00, mean_100: 494.99, episodes: 1534, reward function loss: 0.0000
556155: reward: 500.00, mean_100: 494.99, episodes: 1535, reward function loss: 0.0000
556655: reward: 500.00, mean_100: 494.99, episodes: 1536, reward function loss: 0.0000
557155: reward: 500.00, mean_100: 494.99, episodes: 1537, reward function loss: 0.0000
557655: reward: 500.00, mean_100: 494.99, episodes: 1538, reward function loss: 0.0000
558155: reward: 500.00, mean_100: 494.99, e

598873: reward: 500.00, mean_100: 487.18, episodes: 1623, reward function loss: 0.0000
599244: reward: 371.00, mean_100: 485.89, episodes: 1624, reward function loss: -0.0000
599744: reward: 500.00, mean_100: 485.89, episodes: 1625, reward function loss: -0.0000
600244: reward: 500.00, mean_100: 485.89, episodes: 1626, reward function loss: -0.0000
600744: reward: 500.00, mean_100: 485.89, episodes: 1627, reward function loss: -0.0000
601244: reward: 500.00, mean_100: 485.89, episodes: 1628, reward function loss: 0.0000
601681: reward: 437.00, mean_100: 485.26, episodes: 1629, reward function loss: 0.0000
602055: reward: 374.00, mean_100: 484.00, episodes: 1630, reward function loss: 0.0000
602555: reward: 500.00, mean_100: 484.00, episodes: 1631, reward function loss: 0.0000
603055: reward: 500.00, mean_100: 484.00, episodes: 1632, reward function loss: -0.0000
603555: reward: 500.00, mean_100: 484.00, episodes: 1633, reward function loss: -0.0000
604055: reward: 500.00, mean_100: 484

639817: reward: 500.00, mean_100: 438.45, episodes: 1717, reward function loss: -0.0394
640196: reward: 379.00, mean_100: 437.24, episodes: 1718, reward function loss: -0.0394
640496: reward: 300.00, mean_100: 435.24, episodes: 1719, reward function loss: -0.0394
640996: reward: 500.00, mean_100: 435.24, episodes: 1720, reward function loss: -0.0405
641293: reward: 297.00, mean_100: 433.21, episodes: 1721, reward function loss: -0.0405
641793: reward: 500.00, mean_100: 434.20, episodes: 1722, reward function loss: -0.0405
642156: reward: 363.00, mean_100: 432.83, episodes: 1723, reward function loss: -0.0405
642584: reward: 428.00, mean_100: 433.40, episodes: 1724, reward function loss: -0.0384
643079: reward: 495.00, mean_100: 433.35, episodes: 1725, reward function loss: -0.0384
643308: reward: 229.00, mean_100: 430.64, episodes: 1726, reward function loss: -0.0384
643808: reward: 500.00, mean_100: 430.64, episodes: 1727, reward function loss: -0.0384
644301: reward: 493.00, mean_100

676143: reward: 374.00, mean_100: 389.77, episodes: 1811, reward function loss: -0.0298
676529: reward: 386.00, mean_100: 388.67, episodes: 1812, reward function loss: -0.0360
676823: reward: 294.00, mean_100: 387.88, episodes: 1813, reward function loss: -0.0360
677323: reward: 500.00, mean_100: 387.98, episodes: 1814, reward function loss: -0.0360
677819: reward: 496.00, mean_100: 388.50, episodes: 1815, reward function loss: -0.0360
678134: reward: 315.00, mean_100: 388.17, episodes: 1816, reward function loss: -0.0387
678508: reward: 374.00, mean_100: 386.91, episodes: 1817, reward function loss: -0.0387
679008: reward: 500.00, mean_100: 388.12, episodes: 1818, reward function loss: -0.0387
679502: reward: 494.00, mean_100: 390.06, episodes: 1819, reward function loss: -0.0387
679809: reward: 307.00, mean_100: 388.13, episodes: 1820, reward function loss: -0.0400
680129: reward: 320.00, mean_100: 388.36, episodes: 1821, reward function loss: -0.0400
680499: reward: 370.00, mean_100

712879: reward: 500.00, mean_100: 385.46, episodes: 1906, reward function loss: -0.0339
713184: reward: 305.00, mean_100: 384.90, episodes: 1907, reward function loss: -0.0339
713509: reward: 325.00, mean_100: 385.09, episodes: 1908, reward function loss: -0.0347
714009: reward: 500.00, mean_100: 387.03, episodes: 1909, reward function loss: -0.0347
714309: reward: 300.00, mean_100: 385.40, episodes: 1910, reward function loss: -0.0347
714750: reward: 441.00, mean_100: 386.07, episodes: 1911, reward function loss: -0.0347
715214: reward: 464.00, mean_100: 386.85, episodes: 1912, reward function loss: -0.0411
715648: reward: 434.00, mean_100: 388.25, episodes: 1913, reward function loss: -0.0411
716033: reward: 385.00, mean_100: 387.10, episodes: 1914, reward function loss: -0.0411
716342: reward: 309.00, mean_100: 385.23, episodes: 1915, reward function loss: -0.0411
716800: reward: 458.00, mean_100: 386.66, episodes: 1916, reward function loss: -0.0383
717110: reward: 310.00, mean_100

748469: reward: 315.00, mean_100: 378.20, episodes: 2000, reward function loss: -0.0345
748969: reward: 500.00, mean_100: 378.78, episodes: 2001, reward function loss: -0.0345
749359: reward: 390.00, mean_100: 379.61, episodes: 2002, reward function loss: -0.0345
749679: reward: 320.00, mean_100: 379.09, episodes: 2003, reward function loss: -0.0345
749999: reward: 320.00, mean_100: 379.31, episodes: 2004, reward function loss: -0.0360
750439: reward: 440.00, mean_100: 380.60, episodes: 2005, reward function loss: -0.0360
750827: reward: 388.00, mean_100: 379.48, episodes: 2006, reward function loss: -0.0360
751283: reward: 456.00, mean_100: 380.99, episodes: 2007, reward function loss: -0.0360
751783: reward: 500.00, mean_100: 382.74, episodes: 2008, reward function loss: -0.0424
752235: reward: 452.00, mean_100: 382.26, episodes: 2009, reward function loss: -0.0424
752695: reward: 460.00, mean_100: 383.86, episodes: 2010, reward function loss: -0.0424
753195: reward: 500.00, mean_100

785871: reward: 447.00, mean_100: 395.77, episodes: 2094, reward function loss: -0.0310
786188: reward: 317.00, mean_100: 393.94, episodes: 2095, reward function loss: -0.0310
786562: reward: 374.00, mean_100: 395.25, episodes: 2096, reward function loss: -0.0371
786950: reward: 388.00, mean_100: 394.13, episodes: 2097, reward function loss: -0.0371
787318: reward: 368.00, mean_100: 394.67, episodes: 2098, reward function loss: -0.0371
787642: reward: 324.00, mean_100: 394.88, episodes: 2099, reward function loss: -0.0371
788142: reward: 500.00, mean_100: 396.73, episodes: 2100, reward function loss: -0.0381
788529: reward: 387.00, mean_100: 395.60, episodes: 2101, reward function loss: -0.0381
788961: reward: 432.00, mean_100: 396.02, episodes: 2102, reward function loss: -0.0381
789410: reward: 449.00, mean_100: 397.31, episodes: 2103, reward function loss: -0.0381
789910: reward: 500.00, mean_100: 399.11, episodes: 2104, reward function loss: -0.0418
790363: reward: 453.00, mean_100

822829: reward: 500.00, mean_100: 391.32, episodes: 2188, reward function loss: -0.0306
823329: reward: 500.00, mean_100: 393.20, episodes: 2189, reward function loss: -0.0306
823767: reward: 438.00, mean_100: 394.37, episodes: 2190, reward function loss: -0.0306
824092: reward: 325.00, mean_100: 393.03, episodes: 2191, reward function loss: -0.0306
824477: reward: 385.00, mean_100: 394.48, episodes: 2192, reward function loss: -0.0382
824977: reward: 500.00, mean_100: 395.53, episodes: 2193, reward function loss: -0.0382
825477: reward: 500.00, mean_100: 396.06, episodes: 2194, reward function loss: -0.0382
825977: reward: 500.00, mean_100: 397.89, episodes: 2195, reward function loss: -0.0382
826358: reward: 381.00, mean_100: 397.96, episodes: 2196, reward function loss: -0.0438
826858: reward: 500.00, mean_100: 399.08, episodes: 2197, reward function loss: -0.0438
827220: reward: 362.00, mean_100: 399.02, episodes: 2198, reward function loss: -0.0438
827597: reward: 377.00, mean_100

860640: reward: 375.00, mean_100: 399.83, episodes: 2282, reward function loss: -0.0386
861099: reward: 459.00, mean_100: 400.78, episodes: 2283, reward function loss: -0.0386
861430: reward: 331.00, mean_100: 399.09, episodes: 2284, reward function loss: -0.0375
861750: reward: 320.00, mean_100: 399.33, episodes: 2285, reward function loss: -0.0375
862123: reward: 373.00, mean_100: 400.49, episodes: 2286, reward function loss: -0.0375
862614: reward: 491.00, mean_100: 402.85, episodes: 2287, reward function loss: -0.0375
863114: reward: 500.00, mean_100: 402.85, episodes: 2288, reward function loss: -0.0403
863517: reward: 403.00, mean_100: 401.88, episodes: 2289, reward function loss: -0.0403
864017: reward: 500.00, mean_100: 402.50, episodes: 2290, reward function loss: -0.0403
864420: reward: 403.00, mean_100: 403.28, episodes: 2291, reward function loss: -0.0403
864742: reward: 322.00, mean_100: 402.65, episodes: 2292, reward function loss: -0.0368
865086: reward: 344.00, mean_100

898727: reward: 500.00, mean_100: 404.92, episodes: 2376, reward function loss: -0.0402
898996: reward: 269.00, mean_100: 403.92, episodes: 2377, reward function loss: -0.0402
899494: reward: 498.00, mean_100: 404.93, episodes: 2378, reward function loss: -0.0402
899914: reward: 420.00, mean_100: 405.33, episodes: 2379, reward function loss: -0.0402
900394: reward: 480.00, mean_100: 405.13, episodes: 2380, reward function loss: -0.0371
900894: reward: 500.00, mean_100: 406.29, episodes: 2381, reward function loss: -0.0371
901291: reward: 397.00, mean_100: 406.51, episodes: 2382, reward function loss: -0.0371
901773: reward: 482.00, mean_100: 406.74, episodes: 2383, reward function loss: -0.0371
902273: reward: 500.00, mean_100: 408.43, episodes: 2384, reward function loss: -0.0443
902773: reward: 500.00, mean_100: 410.23, episodes: 2385, reward function loss: -0.0443
903273: reward: 500.00, mean_100: 411.50, episodes: 2386, reward function loss: -0.0443
903773: reward: 500.00, mean_100

941667: reward: 429.00, mean_100: 455.33, episodes: 2470, reward function loss: -0.0380
942167: reward: 500.00, mean_100: 455.74, episodes: 2471, reward function loss: -0.0380
942667: reward: 500.00, mean_100: 456.23, episodes: 2472, reward function loss: -0.0387
943167: reward: 500.00, mean_100: 457.24, episodes: 2473, reward function loss: -0.0387
943667: reward: 500.00, mean_100: 458.27, episodes: 2474, reward function loss: -0.0387
944167: reward: 500.00, mean_100: 459.40, episodes: 2475, reward function loss: -0.0387
944515: reward: 348.00, mean_100: 457.88, episodes: 2476, reward function loss: -0.0359
945015: reward: 500.00, mean_100: 460.19, episodes: 2477, reward function loss: -0.0359
945515: reward: 500.00, mean_100: 460.21, episodes: 2478, reward function loss: -0.0359
946015: reward: 500.00, mean_100: 461.01, episodes: 2479, reward function loss: -0.0359
946481: reward: 466.00, mean_100: 460.87, episodes: 2480, reward function loss: -0.0375
946981: reward: 500.00, mean_100

983831: reward: 500.00, mean_100: 448.71, episodes: 2564, reward function loss: -0.0345
984331: reward: 500.00, mean_100: 449.93, episodes: 2565, reward function loss: -0.0345
984831: reward: 500.00, mean_100: 449.93, episodes: 2566, reward function loss: -0.0345
985331: reward: 500.00, mean_100: 450.93, episodes: 2567, reward function loss: -0.0345
985831: reward: 500.00, mean_100: 450.93, episodes: 2568, reward function loss: -0.0422
986331: reward: 500.00, mean_100: 450.93, episodes: 2569, reward function loss: -0.0422
986831: reward: 500.00, mean_100: 451.64, episodes: 2570, reward function loss: -0.0422
987331: reward: 500.00, mean_100: 451.64, episodes: 2571, reward function loss: -0.0422
987831: reward: 500.00, mean_100: 451.64, episodes: 2572, reward function loss: -0.0434
988331: reward: 500.00, mean_100: 451.64, episodes: 2573, reward function loss: -0.0434
988831: reward: 500.00, mean_100: 451.64, episodes: 2574, reward function loss: -0.0434
989331: reward: 500.00, mean_100

1019768: reward: 500.00, mean_100: 394.37, episodes: 2657, reward function loss: -0.0193
1019923: reward: 155.00, mean_100: 390.92, episodes: 2658, reward function loss: -0.0193
1020423: reward: 500.00, mean_100: 390.92, episodes: 2659, reward function loss: -0.0193
1020579: reward: 156.00, mean_100: 387.48, episodes: 2660, reward function loss: -0.0288
1021079: reward: 500.00, mean_100: 387.48, episodes: 2661, reward function loss: -0.0288
1021579: reward: 500.00, mean_100: 387.48, episodes: 2662, reward function loss: -0.0288
1021729: reward: 150.00, mean_100: 383.98, episodes: 2663, reward function loss: -0.0288
1021883: reward: 154.00, mean_100: 380.52, episodes: 2664, reward function loss: -0.0288
1022041: reward: 158.00, mean_100: 377.10, episodes: 2665, reward function loss: -0.0288
1022541: reward: 500.00, mean_100: 377.10, episodes: 2666, reward function loss: -0.0288
1023041: reward: 500.00, mean_100: 377.10, episodes: 2667, reward function loss: -0.0288
1023541: reward: 500.

1053426: reward: 500.00, mean_100: 357.80, episodes: 2750, reward function loss: -0.0443
1053926: reward: 500.00, mean_100: 361.17, episodes: 2751, reward function loss: -0.0443
1054426: reward: 500.00, mean_100: 361.17, episodes: 2752, reward function loss: -0.0435
1054926: reward: 500.00, mean_100: 364.71, episodes: 2753, reward function loss: -0.0435
1055426: reward: 500.00, mean_100: 368.06, episodes: 2754, reward function loss: -0.0435
1055926: reward: 500.00, mean_100: 371.58, episodes: 2755, reward function loss: -0.0435
1056426: reward: 500.00, mean_100: 371.58, episodes: 2756, reward function loss: -0.0432
1056926: reward: 500.00, mean_100: 371.58, episodes: 2757, reward function loss: -0.0432
1057426: reward: 500.00, mean_100: 375.03, episodes: 2758, reward function loss: -0.0432
1057926: reward: 500.00, mean_100: 375.03, episodes: 2759, reward function loss: -0.0432
1058426: reward: 500.00, mean_100: 378.47, episodes: 2760, reward function loss: -0.0439
1058926: reward: 500.

1099534: reward: 488.00, mean_100: 496.08, episodes: 2843, reward function loss: -0.0191
1100006: reward: 472.00, mean_100: 495.80, episodes: 2844, reward function loss: -0.0144
1100506: reward: 500.00, mean_100: 495.80, episodes: 2845, reward function loss: -0.0144
1101006: reward: 500.00, mean_100: 495.80, episodes: 2846, reward function loss: -0.0144
1101506: reward: 500.00, mean_100: 495.80, episodes: 2847, reward function loss: -0.0144
1102006: reward: 500.00, mean_100: 495.80, episodes: 2848, reward function loss: -0.0088
1102506: reward: 500.00, mean_100: 495.80, episodes: 2849, reward function loss: -0.0088
1102952: reward: 446.00, mean_100: 495.26, episodes: 2850, reward function loss: -0.0088
1103326: reward: 374.00, mean_100: 494.00, episodes: 2851, reward function loss: -0.0088
1103696: reward: 370.00, mean_100: 492.70, episodes: 2852, reward function loss: -0.0093
1104079: reward: 383.00, mean_100: 491.53, episodes: 2853, reward function loss: -0.0093
1104579: reward: 500.

1145032: reward: 500.00, mean_100: 487.76, episodes: 2936, reward function loss: -0.0475
1145532: reward: 500.00, mean_100: 488.04, episodes: 2937, reward function loss: -0.0475
1146032: reward: 500.00, mean_100: 488.04, episodes: 2938, reward function loss: -0.0475
1146532: reward: 500.00, mean_100: 489.29, episodes: 2939, reward function loss: -0.0475
1147032: reward: 500.00, mean_100: 489.48, episodes: 2940, reward function loss: -0.0475
1147532: reward: 500.00, mean_100: 489.86, episodes: 2941, reward function loss: -0.0475
1148032: reward: 500.00, mean_100: 489.86, episodes: 2942, reward function loss: -0.0475
1148532: reward: 500.00, mean_100: 489.98, episodes: 2943, reward function loss: -0.0475
1149032: reward: 500.00, mean_100: 490.26, episodes: 2944, reward function loss: -0.0473
1149532: reward: 500.00, mean_100: 490.26, episodes: 2945, reward function loss: -0.0473
1150032: reward: 500.00, mean_100: 490.26, episodes: 2946, reward function loss: -0.0473
1150532: reward: 500.

1191532: reward: 500.00, mean_100: 500.00, episodes: 3029, reward function loss: -0.0475
1192032: reward: 500.00, mean_100: 500.00, episodes: 3030, reward function loss: -0.0475
1192532: reward: 500.00, mean_100: 500.00, episodes: 3031, reward function loss: -0.0475
1193032: reward: 500.00, mean_100: 500.00, episodes: 3032, reward function loss: -0.0475
1193532: reward: 500.00, mean_100: 500.00, episodes: 3033, reward function loss: -0.0475
1194032: reward: 500.00, mean_100: 500.00, episodes: 3034, reward function loss: -0.0475
1194532: reward: 500.00, mean_100: 500.00, episodes: 3035, reward function loss: -0.0475
1195032: reward: 500.00, mean_100: 500.00, episodes: 3036, reward function loss: -0.0475
1195532: reward: 500.00, mean_100: 500.00, episodes: 3037, reward function loss: -0.0475
1196032: reward: 500.00, mean_100: 500.00, episodes: 3038, reward function loss: -0.0475
1196532: reward: 500.00, mean_100: 500.00, episodes: 3039, reward function loss: -0.0475
1197032: reward: 500.

1238032: reward: 500.00, mean_100: 500.00, episodes: 3122, reward function loss: -0.0478
1238532: reward: 500.00, mean_100: 500.00, episodes: 3123, reward function loss: -0.0478
1239032: reward: 500.00, mean_100: 500.00, episodes: 3124, reward function loss: -0.0475
1239532: reward: 500.00, mean_100: 500.00, episodes: 3125, reward function loss: -0.0475
1240032: reward: 500.00, mean_100: 500.00, episodes: 3126, reward function loss: -0.0475
1240532: reward: 500.00, mean_100: 500.00, episodes: 3127, reward function loss: -0.0475
1241032: reward: 500.00, mean_100: 500.00, episodes: 3128, reward function loss: -0.0475
1241532: reward: 500.00, mean_100: 500.00, episodes: 3129, reward function loss: -0.0475
1242032: reward: 500.00, mean_100: 500.00, episodes: 3130, reward function loss: -0.0475
1242532: reward: 500.00, mean_100: 500.00, episodes: 3131, reward function loss: -0.0475
1243032: reward: 500.00, mean_100: 500.00, episodes: 3132, reward function loss: -0.0474
1243532: reward: 500.

1284532: reward: 500.00, mean_100: 500.00, episodes: 3215, reward function loss: -0.0473
1285032: reward: 500.00, mean_100: 500.00, episodes: 3216, reward function loss: -0.0476
1285532: reward: 500.00, mean_100: 500.00, episodes: 3217, reward function loss: -0.0476
1286032: reward: 500.00, mean_100: 500.00, episodes: 3218, reward function loss: -0.0476
1286532: reward: 500.00, mean_100: 500.00, episodes: 3219, reward function loss: -0.0476
1287032: reward: 500.00, mean_100: 500.00, episodes: 3220, reward function loss: -0.0474
1287532: reward: 500.00, mean_100: 500.00, episodes: 3221, reward function loss: -0.0474
1288032: reward: 500.00, mean_100: 500.00, episodes: 3222, reward function loss: -0.0474
1288532: reward: 500.00, mean_100: 500.00, episodes: 3223, reward function loss: -0.0474
1289032: reward: 500.00, mean_100: 500.00, episodes: 3224, reward function loss: -0.0479
1289532: reward: 500.00, mean_100: 500.00, episodes: 3225, reward function loss: -0.0479
1290032: reward: 500.

1331032: reward: 500.00, mean_100: 500.00, episodes: 3308, reward function loss: -0.0471
1331532: reward: 500.00, mean_100: 500.00, episodes: 3309, reward function loss: -0.0471
1332032: reward: 500.00, mean_100: 500.00, episodes: 3310, reward function loss: -0.0471
1332532: reward: 500.00, mean_100: 500.00, episodes: 3311, reward function loss: -0.0471
1333032: reward: 500.00, mean_100: 500.00, episodes: 3312, reward function loss: -0.0427
1333532: reward: 500.00, mean_100: 500.00, episodes: 3313, reward function loss: -0.0427
1334032: reward: 500.00, mean_100: 500.00, episodes: 3314, reward function loss: -0.0427
1334532: reward: 500.00, mean_100: 500.00, episodes: 3315, reward function loss: -0.0427
1335032: reward: 500.00, mean_100: 500.00, episodes: 3316, reward function loss: -0.0415
1335532: reward: 500.00, mean_100: 500.00, episodes: 3317, reward function loss: -0.0415
1336032: reward: 500.00, mean_100: 500.00, episodes: 3318, reward function loss: -0.0415
1336532: reward: 500.

1377532: reward: 500.00, mean_100: 500.00, episodes: 3401, reward function loss: -0.0317
1378032: reward: 500.00, mean_100: 500.00, episodes: 3402, reward function loss: -0.0317
1378532: reward: 500.00, mean_100: 500.00, episodes: 3403, reward function loss: -0.0317
1378577: reward:  45.00, mean_100: 495.45, episodes: 3404, reward function loss: -0.0211
1379077: reward: 500.00, mean_100: 495.45, episodes: 3405, reward function loss: -0.0211
1379577: reward: 500.00, mean_100: 495.45, episodes: 3406, reward function loss: -0.0211
1380077: reward: 500.00, mean_100: 495.45, episodes: 3407, reward function loss: -0.0211
1380577: reward: 500.00, mean_100: 495.45, episodes: 3408, reward function loss: -0.0065
1380855: reward: 278.00, mean_100: 493.23, episodes: 3409, reward function loss: -0.0065
1381355: reward: 500.00, mean_100: 493.23, episodes: 3410, reward function loss: -0.0065
1381855: reward: 500.00, mean_100: 493.23, episodes: 3411, reward function loss: -0.0065
1382355: reward: 500.

1389523: reward:  10.00, mean_100: 144.91, episodes: 3496, reward function loss: -0.0000
1389533: reward:  10.00, mean_100: 140.01, episodes: 3497, reward function loss: -0.0000
1389543: reward:  10.00, mean_100: 135.11, episodes: 3498, reward function loss: -0.0000
1389552: reward:   9.00, mean_100: 130.20, episodes: 3499, reward function loss: -0.0000
1389563: reward:  11.00, mean_100: 125.31, episodes: 3500, reward function loss: -0.0000
1389572: reward:   9.00, mean_100: 120.40, episodes: 3501, reward function loss: -0.0000
1389581: reward:   9.00, mean_100: 115.49, episodes: 3502, reward function loss: -0.0000
1389591: reward:  10.00, mean_100: 110.59, episodes: 3503, reward function loss: -0.0000
1389697: reward: 106.00, mean_100: 111.20, episodes: 3504, reward function loss: -0.0000
1389706: reward:   9.00, mean_100: 106.29, episodes: 3505, reward function loss: -0.0000
1389715: reward:   9.00, mean_100: 101.38, episodes: 3506, reward function loss: -0.0000
1389725: reward:  10.

1390531: reward:  10.00, mean_100:  10.46, episodes: 3592, reward function loss: -0.0000
1390540: reward:   9.00, mean_100:  10.45, episodes: 3593, reward function loss: -0.0000
1390549: reward:   9.00, mean_100:  10.45, episodes: 3594, reward function loss: -0.0000
1390557: reward:   8.00, mean_100:  10.44, episodes: 3595, reward function loss: -0.0000
1390567: reward:  10.00, mean_100:  10.44, episodes: 3596, reward function loss: -0.0000
1390576: reward:   9.00, mean_100:  10.43, episodes: 3597, reward function loss: -0.0000
1390586: reward:  10.00, mean_100:  10.43, episodes: 3598, reward function loss: -0.0000
1390596: reward:  10.00, mean_100:  10.44, episodes: 3599, reward function loss: -0.0000
1390605: reward:   9.00, mean_100:  10.42, episodes: 3600, reward function loss: -0.0000
1390614: reward:   9.00, mean_100:  10.42, episodes: 3601, reward function loss: -0.0000
1390624: reward:  10.00, mean_100:  10.43, episodes: 3602, reward function loss: -0.0000
1390633: reward:   9.

KeyboardInterrupt: 

In [None]:
## Testing 
from util import Agent
agent_net.eval()
agent_ = Agent(agent_net, apply_softmax=True, preprocessor=ptan.agent.float32_preprocessor)

for i in range(10):
    state = env.reset()
    Reward = 0
    done = False
    while not done:
        env.render()
        action = agent_(state)
        new_state, reward, done, _ = env.step(int(action))
        Reward += reward
    print("Trial :", i, " Reward: ", Reward)
env.close()

In [None]:
selected, len(demonstrations), DEMO_BATCH

In [None]:
%matplotlib
S1 = np.linspace(-5, 5, 100)
S2 = np.linspace(-3.1457, 3.1457, 100)
S3 = 0*S1
S4 = 0*S1
S5 = np.ones(100)
Reward = np.zeros((100,100))
for i in range(100):
    for j in range(100):
        state = [S1[i], S2[j], 0, 0]
        action = 1
        x = torch.cat([float32_preprocessor(state), float32_preprocessor([int(action)])]).view(1, -1)
        r = reward_net(x)
        Reward[i,j] = r
        
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1,
                cmap='viridis', edgecolor='none')
ax.set_title('surface');
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z');
ax.view_init(azim=90, elev=90)

In [None]:
demonstrations['states'][0]