## 02 frozenlake_naive.py

In [18]:
import random
import gym
import gym.spaces
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [19]:
HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70

In [3]:
# 코드를 재활용하기 위해서 gym의 ObservationWrapper를 DiscreteOneHotWrapper 클래스에 상속시킨다.
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)

    def observation(self, observation):
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res

In [4]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

In [5]:
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

In [6]:
def iterate_batches(env, net, batch_size):
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

In [7]:
def filter_batch(batch, percentile):
    rewards = list(map(lambda s: s.reward, batch))
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for example in batch:
        if example.reward < reward_bound:
            continue
        train_obs.extend(map(lambda step: step.observation, example.steps))
        train_act.extend(map(lambda step: step.action, example.steps))

    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean

In [9]:
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
# env = gym.wrappers.Monitor(env, directory="mon", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)
writer = SummaryWriter(comment="-frozenlake-naive")

for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, acts_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (iter_no, loss_v.item(), reward_m, reward_b))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean", reward_m, iter_no)
    if reward_m > 0.8:
        print("Solved!")
        break
    if iter_no > 3000: ########################################################
        print("Wrapper does not improve the score!!!") ########################
        break
writer.close()

0: loss=1.390, reward_mean=0.0, reward_bound=0.0
1: loss=1.356, reward_mean=0.0, reward_bound=0.0
2: loss=1.348, reward_mean=0.0, reward_bound=0.0
3: loss=1.352, reward_mean=0.0, reward_bound=0.0
4: loss=1.277, reward_mean=0.0, reward_bound=0.0
5: loss=1.288, reward_mean=0.1, reward_bound=0.0
6: loss=1.242, reward_mean=0.1, reward_bound=0.0
7: loss=1.122, reward_mean=0.0, reward_bound=0.0
8: loss=1.195, reward_mean=0.0, reward_bound=0.0
9: loss=1.045, reward_mean=0.0, reward_bound=0.0
10: loss=1.045, reward_mean=0.0, reward_bound=0.0
11: loss=1.038, reward_mean=0.0, reward_bound=0.0
12: loss=0.958, reward_mean=0.0, reward_bound=0.0
13: loss=0.958, reward_mean=0.0, reward_bound=0.0
14: loss=0.949, reward_mean=0.0, reward_bound=0.0
15: loss=0.914, reward_mean=0.0, reward_bound=0.0
16: loss=0.973, reward_mean=0.0, reward_bound=0.0
17: loss=0.921, reward_mean=0.0, reward_bound=0.0
18: loss=0.890, reward_mean=0.0, reward_bound=0.0
19: loss=0.866, reward_mean=0.0, reward_bound=0.0
20: loss=0

166: loss=1.026, reward_mean=0.0, reward_bound=0.0
167: loss=1.097, reward_mean=0.0, reward_bound=0.0
168: loss=1.168, reward_mean=0.0, reward_bound=0.0
169: loss=1.055, reward_mean=0.0, reward_bound=0.0
170: loss=1.095, reward_mean=0.0, reward_bound=0.0
171: loss=1.085, reward_mean=0.1, reward_bound=0.0
172: loss=1.087, reward_mean=0.0, reward_bound=0.0
173: loss=1.118, reward_mean=0.0, reward_bound=0.0
174: loss=1.095, reward_mean=0.0, reward_bound=0.0
175: loss=1.188, reward_mean=0.0, reward_bound=0.0
176: loss=1.130, reward_mean=0.0, reward_bound=0.0
177: loss=1.074, reward_mean=0.0, reward_bound=0.0
178: loss=1.244, reward_mean=0.0, reward_bound=0.0
179: loss=1.266, reward_mean=0.0, reward_bound=0.0
180: loss=1.161, reward_mean=0.0, reward_bound=0.0
181: loss=1.222, reward_mean=0.0, reward_bound=0.0
182: loss=1.193, reward_mean=0.0, reward_bound=0.0
183: loss=1.205, reward_mean=0.0, reward_bound=0.0
184: loss=1.276, reward_mean=0.0, reward_bound=0.0
185: loss=1.271, reward_mean=0.

334: loss=1.003, reward_mean=0.0, reward_bound=0.0
335: loss=0.944, reward_mean=0.0, reward_bound=0.0
336: loss=1.101, reward_mean=0.0, reward_bound=0.0
337: loss=0.958, reward_mean=0.0, reward_bound=0.0
338: loss=0.981, reward_mean=0.0, reward_bound=0.0
339: loss=1.021, reward_mean=0.1, reward_bound=0.0
340: loss=1.079, reward_mean=0.0, reward_bound=0.0
341: loss=1.001, reward_mean=0.0, reward_bound=0.0
342: loss=1.051, reward_mean=0.0, reward_bound=0.0
343: loss=0.933, reward_mean=0.0, reward_bound=0.0
344: loss=1.001, reward_mean=0.0, reward_bound=0.0
345: loss=1.028, reward_mean=0.0, reward_bound=0.0
346: loss=0.962, reward_mean=0.1, reward_bound=0.0
347: loss=0.930, reward_mean=0.0, reward_bound=0.0
348: loss=0.949, reward_mean=0.1, reward_bound=0.0
349: loss=1.041, reward_mean=0.0, reward_bound=0.0
350: loss=1.049, reward_mean=0.0, reward_bound=0.0
351: loss=1.034, reward_mean=0.0, reward_bound=0.0
352: loss=0.946, reward_mean=0.0, reward_bound=0.0
353: loss=0.975, reward_mean=0.

502: loss=1.115, reward_mean=0.0, reward_bound=0.0
503: loss=1.035, reward_mean=0.0, reward_bound=0.0
504: loss=0.970, reward_mean=0.1, reward_bound=0.0
505: loss=0.992, reward_mean=0.0, reward_bound=0.0
506: loss=1.087, reward_mean=0.0, reward_bound=0.0
507: loss=1.057, reward_mean=0.0, reward_bound=0.0
508: loss=1.158, reward_mean=0.0, reward_bound=0.0
509: loss=1.054, reward_mean=0.0, reward_bound=0.0
510: loss=1.117, reward_mean=0.0, reward_bound=0.0
511: loss=1.068, reward_mean=0.0, reward_bound=0.0
512: loss=1.065, reward_mean=0.0, reward_bound=0.0
513: loss=0.966, reward_mean=0.0, reward_bound=0.0
514: loss=1.134, reward_mean=0.0, reward_bound=0.0
515: loss=0.928, reward_mean=0.0, reward_bound=0.0
516: loss=1.022, reward_mean=0.0, reward_bound=0.0
517: loss=1.107, reward_mean=0.0, reward_bound=0.0
518: loss=1.050, reward_mean=0.0, reward_bound=0.0
519: loss=1.112, reward_mean=0.0, reward_bound=0.0
520: loss=1.012, reward_mean=0.0, reward_bound=0.0
521: loss=1.103, reward_mean=0.

664: loss=0.776, reward_mean=0.0, reward_bound=0.0
665: loss=0.793, reward_mean=0.0, reward_bound=0.0
666: loss=0.833, reward_mean=0.0, reward_bound=0.0
667: loss=0.868, reward_mean=0.0, reward_bound=0.0
668: loss=0.729, reward_mean=0.0, reward_bound=0.0
669: loss=0.874, reward_mean=0.0, reward_bound=0.0
670: loss=0.854, reward_mean=0.0, reward_bound=0.0
671: loss=0.915, reward_mean=0.0, reward_bound=0.0
672: loss=0.968, reward_mean=0.0, reward_bound=0.0
673: loss=0.801, reward_mean=0.0, reward_bound=0.0
674: loss=0.881, reward_mean=0.0, reward_bound=0.0
675: loss=0.827, reward_mean=0.0, reward_bound=0.0
676: loss=0.844, reward_mean=0.0, reward_bound=0.0
677: loss=0.929, reward_mean=0.0, reward_bound=0.0
678: loss=0.777, reward_mean=0.0, reward_bound=0.0
679: loss=0.994, reward_mean=0.0, reward_bound=0.0
680: loss=0.891, reward_mean=0.0, reward_bound=0.0
681: loss=0.759, reward_mean=0.0, reward_bound=0.0
682: loss=0.816, reward_mean=0.0, reward_bound=0.0
683: loss=0.854, reward_mean=0.

832: loss=0.705, reward_mean=0.1, reward_bound=0.0
833: loss=0.692, reward_mean=0.0, reward_bound=0.0
834: loss=0.581, reward_mean=0.0, reward_bound=0.0
835: loss=0.508, reward_mean=0.0, reward_bound=0.0
836: loss=0.695, reward_mean=0.0, reward_bound=0.0
837: loss=0.727, reward_mean=0.0, reward_bound=0.0
838: loss=0.743, reward_mean=0.1, reward_bound=0.0
839: loss=0.637, reward_mean=0.0, reward_bound=0.0
840: loss=0.679, reward_mean=0.0, reward_bound=0.0
841: loss=0.805, reward_mean=0.0, reward_bound=0.0
842: loss=0.619, reward_mean=0.0, reward_bound=0.0
843: loss=0.560, reward_mean=0.0, reward_bound=0.0
844: loss=0.787, reward_mean=0.0, reward_bound=0.0
845: loss=0.728, reward_mean=0.0, reward_bound=0.0
846: loss=0.689, reward_mean=0.0, reward_bound=0.0
847: loss=0.658, reward_mean=0.0, reward_bound=0.0
848: loss=0.790, reward_mean=0.0, reward_bound=0.0
849: loss=0.700, reward_mean=0.0, reward_bound=0.0
850: loss=0.778, reward_mean=0.0, reward_bound=0.0
851: loss=0.759, reward_mean=0.

994: loss=0.773, reward_mean=0.0, reward_bound=0.0
995: loss=0.668, reward_mean=0.0, reward_bound=0.0
996: loss=0.672, reward_mean=0.0, reward_bound=0.0
997: loss=0.702, reward_mean=0.0, reward_bound=0.0
998: loss=0.860, reward_mean=0.0, reward_bound=0.0
999: loss=0.717, reward_mean=0.0, reward_bound=0.0
1000: loss=0.743, reward_mean=0.0, reward_bound=0.0
1001: loss=0.801, reward_mean=0.0, reward_bound=0.0
1002: loss=0.766, reward_mean=0.0, reward_bound=0.0
1003: loss=0.783, reward_mean=0.0, reward_bound=0.0
1004: loss=0.861, reward_mean=0.0, reward_bound=0.0
1005: loss=0.842, reward_mean=0.0, reward_bound=0.0
1006: loss=0.756, reward_mean=0.0, reward_bound=0.0
1007: loss=0.836, reward_mean=0.0, reward_bound=0.0
1008: loss=0.928, reward_mean=0.0, reward_bound=0.0
1009: loss=0.881, reward_mean=0.0, reward_bound=0.0
1010: loss=0.856, reward_mean=0.0, reward_bound=0.0
1011: loss=0.886, reward_mean=0.0, reward_bound=0.0
1012: loss=0.922, reward_mean=0.0, reward_bound=0.0
1013: loss=0.792, 

1158: loss=0.862, reward_mean=0.0, reward_bound=0.0
1159: loss=0.831, reward_mean=0.0, reward_bound=0.0
1160: loss=0.845, reward_mean=0.0, reward_bound=0.0
1161: loss=0.802, reward_mean=0.0, reward_bound=0.0
1162: loss=0.867, reward_mean=0.0, reward_bound=0.0
1163: loss=0.922, reward_mean=0.0, reward_bound=0.0
1164: loss=0.801, reward_mean=0.0, reward_bound=0.0
1165: loss=0.769, reward_mean=0.0, reward_bound=0.0
1166: loss=0.774, reward_mean=0.0, reward_bound=0.0
1167: loss=0.869, reward_mean=0.0, reward_bound=0.0
1168: loss=0.797, reward_mean=0.0, reward_bound=0.0
1169: loss=0.772, reward_mean=0.0, reward_bound=0.0
1170: loss=0.752, reward_mean=0.0, reward_bound=0.0
1171: loss=0.865, reward_mean=0.0, reward_bound=0.0
1172: loss=0.729, reward_mean=0.0, reward_bound=0.0
1173: loss=0.838, reward_mean=0.0, reward_bound=0.0
1174: loss=0.842, reward_mean=0.0, reward_bound=0.0
1175: loss=0.787, reward_mean=0.0, reward_bound=0.0
1176: loss=0.787, reward_mean=0.0, reward_bound=0.0
1177: loss=0

1319: loss=0.790, reward_mean=0.0, reward_bound=0.0
1320: loss=0.933, reward_mean=0.0, reward_bound=0.0
1321: loss=0.814, reward_mean=0.0, reward_bound=0.0
1322: loss=0.674, reward_mean=0.0, reward_bound=0.0
1323: loss=0.685, reward_mean=0.0, reward_bound=0.0
1324: loss=0.829, reward_mean=0.0, reward_bound=0.0
1325: loss=0.739, reward_mean=0.0, reward_bound=0.0
1326: loss=0.777, reward_mean=0.0, reward_bound=0.0
1327: loss=0.767, reward_mean=0.0, reward_bound=0.0
1328: loss=0.761, reward_mean=0.0, reward_bound=0.0
1329: loss=0.733, reward_mean=0.0, reward_bound=0.0
1330: loss=0.611, reward_mean=0.1, reward_bound=0.0
1331: loss=0.709, reward_mean=0.0, reward_bound=0.0
1332: loss=0.776, reward_mean=0.0, reward_bound=0.0
1333: loss=0.807, reward_mean=0.0, reward_bound=0.0
1334: loss=0.749, reward_mean=0.1, reward_bound=0.0
1335: loss=0.766, reward_mean=0.0, reward_bound=0.0
1336: loss=0.745, reward_mean=0.0, reward_bound=0.0
1337: loss=0.861, reward_mean=0.0, reward_bound=0.0
1338: loss=0

1480: loss=0.408, reward_mean=0.0, reward_bound=0.0
1481: loss=0.470, reward_mean=0.0, reward_bound=0.0
1482: loss=0.505, reward_mean=0.0, reward_bound=0.0
1483: loss=0.558, reward_mean=0.0, reward_bound=0.0
1484: loss=0.606, reward_mean=0.0, reward_bound=0.0
1485: loss=0.501, reward_mean=0.0, reward_bound=0.0
1486: loss=0.742, reward_mean=0.1, reward_bound=0.0
1487: loss=0.617, reward_mean=0.1, reward_bound=0.0
1488: loss=0.546, reward_mean=0.0, reward_bound=0.0
1489: loss=0.480, reward_mean=0.0, reward_bound=0.0
1490: loss=0.415, reward_mean=0.1, reward_bound=0.0
1491: loss=0.428, reward_mean=0.1, reward_bound=0.0
1492: loss=0.454, reward_mean=0.0, reward_bound=0.0
1493: loss=0.410, reward_mean=0.1, reward_bound=0.0
1494: loss=0.494, reward_mean=0.0, reward_bound=0.0
1495: loss=0.607, reward_mean=0.0, reward_bound=0.0
1496: loss=0.582, reward_mean=0.0, reward_bound=0.0
1497: loss=0.500, reward_mean=0.0, reward_bound=0.0
1498: loss=0.517, reward_mean=0.0, reward_bound=0.0
1499: loss=0

1643: loss=0.382, reward_mean=0.0, reward_bound=0.0
1644: loss=0.440, reward_mean=0.0, reward_bound=0.0
1645: loss=0.426, reward_mean=0.1, reward_bound=0.0
1646: loss=0.439, reward_mean=0.0, reward_bound=0.0
1647: loss=0.250, reward_mean=0.0, reward_bound=0.0
1648: loss=0.547, reward_mean=0.1, reward_bound=0.0
1649: loss=0.332, reward_mean=0.1, reward_bound=0.0
1650: loss=0.488, reward_mean=0.0, reward_bound=0.0
1651: loss=0.446, reward_mean=0.1, reward_bound=0.0
1652: loss=0.307, reward_mean=0.1, reward_bound=0.0
1653: loss=0.294, reward_mean=0.0, reward_bound=0.0
1654: loss=0.634, reward_mean=0.0, reward_bound=0.0
1655: loss=0.442, reward_mean=0.0, reward_bound=0.0
1656: loss=0.431, reward_mean=0.0, reward_bound=0.0
1657: loss=0.500, reward_mean=0.1, reward_bound=0.0
1658: loss=0.384, reward_mean=0.0, reward_bound=0.0
1659: loss=0.338, reward_mean=0.0, reward_bound=0.0
1660: loss=0.400, reward_mean=0.1, reward_bound=0.0
1661: loss=0.394, reward_mean=0.0, reward_bound=0.0
1662: loss=0

1801: loss=0.209, reward_mean=0.0, reward_bound=0.0
1802: loss=0.373, reward_mean=0.0, reward_bound=0.0
1803: loss=0.218, reward_mean=0.0, reward_bound=0.0
1804: loss=0.313, reward_mean=0.0, reward_bound=0.0
1805: loss=0.132, reward_mean=0.0, reward_bound=0.0
1806: loss=0.336, reward_mean=0.0, reward_bound=0.0
1807: loss=0.183, reward_mean=0.0, reward_bound=0.0
1808: loss=0.309, reward_mean=0.0, reward_bound=0.0
1809: loss=0.239, reward_mean=0.0, reward_bound=0.0
1810: loss=0.220, reward_mean=0.0, reward_bound=0.0
1811: loss=0.355, reward_mean=0.0, reward_bound=0.0
1812: loss=0.200, reward_mean=0.0, reward_bound=0.0
1813: loss=0.312, reward_mean=0.0, reward_bound=0.0
1814: loss=0.322, reward_mean=0.0, reward_bound=0.0
1815: loss=0.113, reward_mean=0.0, reward_bound=0.0
1816: loss=0.240, reward_mean=0.0, reward_bound=0.0
1817: loss=0.314, reward_mean=0.0, reward_bound=0.0
1818: loss=0.296, reward_mean=0.0, reward_bound=0.0
1819: loss=0.113, reward_mean=0.0, reward_bound=0.0
1820: loss=0

1963: loss=0.359, reward_mean=0.0, reward_bound=0.0
1964: loss=0.445, reward_mean=0.0, reward_bound=0.0
1965: loss=0.471, reward_mean=0.0, reward_bound=0.0
1966: loss=0.514, reward_mean=0.1, reward_bound=0.0
1967: loss=0.390, reward_mean=0.0, reward_bound=0.0
1968: loss=0.475, reward_mean=0.0, reward_bound=0.0
1969: loss=0.305, reward_mean=0.0, reward_bound=0.0
1970: loss=0.244, reward_mean=0.0, reward_bound=0.0
1971: loss=0.550, reward_mean=0.0, reward_bound=0.0
1972: loss=0.506, reward_mean=0.0, reward_bound=0.0
1973: loss=0.490, reward_mean=0.0, reward_bound=0.0
1974: loss=0.335, reward_mean=0.0, reward_bound=0.0
1975: loss=0.546, reward_mean=0.0, reward_bound=0.0
1976: loss=0.450, reward_mean=0.0, reward_bound=0.0
1977: loss=0.542, reward_mean=0.0, reward_bound=0.0
1978: loss=0.483, reward_mean=0.0, reward_bound=0.0
1979: loss=0.415, reward_mean=0.0, reward_bound=0.0
1980: loss=0.380, reward_mean=0.0, reward_bound=0.0
1981: loss=0.389, reward_mean=0.0, reward_bound=0.0
1982: loss=0

2130: loss=0.624, reward_mean=0.0, reward_bound=0.0
2131: loss=0.519, reward_mean=0.0, reward_bound=0.0
2132: loss=0.450, reward_mean=0.0, reward_bound=0.0
2133: loss=0.653, reward_mean=0.0, reward_bound=0.0
2134: loss=0.550, reward_mean=0.0, reward_bound=0.0
2135: loss=0.543, reward_mean=0.0, reward_bound=0.0
2136: loss=0.430, reward_mean=0.0, reward_bound=0.0
2137: loss=0.516, reward_mean=0.0, reward_bound=0.0
2138: loss=0.583, reward_mean=0.0, reward_bound=0.0
2139: loss=0.388, reward_mean=0.0, reward_bound=0.0
2140: loss=0.492, reward_mean=0.0, reward_bound=0.0
2141: loss=0.553, reward_mean=0.0, reward_bound=0.0
2142: loss=0.721, reward_mean=0.1, reward_bound=0.0
2143: loss=0.456, reward_mean=0.0, reward_bound=0.0
2144: loss=0.416, reward_mean=0.0, reward_bound=0.0
2145: loss=0.429, reward_mean=0.0, reward_bound=0.0
2146: loss=0.636, reward_mean=0.0, reward_bound=0.0
2147: loss=0.566, reward_mean=0.0, reward_bound=0.0
2148: loss=0.410, reward_mean=0.0, reward_bound=0.0
2149: loss=0

2298: loss=0.690, reward_mean=0.0, reward_bound=0.0
2299: loss=0.566, reward_mean=0.0, reward_bound=0.0
2300: loss=0.451, reward_mean=0.0, reward_bound=0.0
2301: loss=0.543, reward_mean=0.0, reward_bound=0.0
2302: loss=0.418, reward_mean=0.0, reward_bound=0.0
2303: loss=0.470, reward_mean=0.0, reward_bound=0.0
2304: loss=0.512, reward_mean=0.0, reward_bound=0.0
2305: loss=0.553, reward_mean=0.0, reward_bound=0.0
2306: loss=0.516, reward_mean=0.0, reward_bound=0.0
2307: loss=0.459, reward_mean=0.0, reward_bound=0.0
2308: loss=0.457, reward_mean=0.0, reward_bound=0.0
2309: loss=0.516, reward_mean=0.0, reward_bound=0.0
2310: loss=0.591, reward_mean=0.0, reward_bound=0.0
2311: loss=0.450, reward_mean=0.0, reward_bound=0.0
2312: loss=0.395, reward_mean=0.0, reward_bound=0.0
2313: loss=0.366, reward_mean=0.0, reward_bound=0.0
2314: loss=0.496, reward_mean=0.0, reward_bound=0.0
2315: loss=0.402, reward_mean=0.0, reward_bound=0.0
2316: loss=0.485, reward_mean=0.0, reward_bound=0.0
2317: loss=0

2466: loss=0.482, reward_mean=0.0, reward_bound=0.0
2467: loss=0.405, reward_mean=0.0, reward_bound=0.0
2468: loss=0.547, reward_mean=0.0, reward_bound=0.0
2469: loss=0.549, reward_mean=0.0, reward_bound=0.0
2470: loss=0.382, reward_mean=0.0, reward_bound=0.0
2471: loss=0.503, reward_mean=0.0, reward_bound=0.0
2472: loss=0.664, reward_mean=0.0, reward_bound=0.0
2473: loss=0.531, reward_mean=0.0, reward_bound=0.0
2474: loss=0.597, reward_mean=0.0, reward_bound=0.0
2475: loss=0.569, reward_mean=0.0, reward_bound=0.0
2476: loss=0.552, reward_mean=0.0, reward_bound=0.0
2477: loss=0.563, reward_mean=0.0, reward_bound=0.0
2478: loss=0.546, reward_mean=0.0, reward_bound=0.0
2479: loss=0.667, reward_mean=0.0, reward_bound=0.0
2480: loss=0.496, reward_mean=0.0, reward_bound=0.0
2481: loss=0.405, reward_mean=0.0, reward_bound=0.0
2482: loss=0.437, reward_mean=0.0, reward_bound=0.0
2483: loss=0.406, reward_mean=0.0, reward_bound=0.0
2484: loss=0.492, reward_mean=0.0, reward_bound=0.0
2485: loss=0

2624: loss=0.644, reward_mean=0.0, reward_bound=0.0
2625: loss=0.633, reward_mean=0.0, reward_bound=0.0
2626: loss=0.547, reward_mean=0.0, reward_bound=0.0
2627: loss=0.522, reward_mean=0.0, reward_bound=0.0
2628: loss=0.789, reward_mean=0.0, reward_bound=0.0
2629: loss=0.485, reward_mean=0.0, reward_bound=0.0
2630: loss=0.634, reward_mean=0.0, reward_bound=0.0
2631: loss=0.581, reward_mean=0.0, reward_bound=0.0
2632: loss=0.573, reward_mean=0.0, reward_bound=0.0
2633: loss=0.646, reward_mean=0.0, reward_bound=0.0
2634: loss=0.641, reward_mean=0.0, reward_bound=0.0
2635: loss=0.590, reward_mean=0.0, reward_bound=0.0
2636: loss=0.662, reward_mean=0.0, reward_bound=0.0
2637: loss=0.551, reward_mean=0.0, reward_bound=0.0
2638: loss=0.592, reward_mean=0.0, reward_bound=0.0
2639: loss=0.407, reward_mean=0.0, reward_bound=0.0
2640: loss=0.517, reward_mean=0.0, reward_bound=0.0
2641: loss=0.649, reward_mean=0.0, reward_bound=0.0
2642: loss=0.659, reward_mean=0.0, reward_bound=0.0
2643: loss=0

2788: loss=0.595, reward_mean=0.0, reward_bound=0.0
2789: loss=0.637, reward_mean=0.0, reward_bound=0.0
2790: loss=0.639, reward_mean=0.0, reward_bound=0.0
2791: loss=0.734, reward_mean=0.0, reward_bound=0.0
2792: loss=0.619, reward_mean=0.0, reward_bound=0.0
2793: loss=0.840, reward_mean=0.0, reward_bound=0.0
2794: loss=0.628, reward_mean=0.0, reward_bound=0.0
2795: loss=0.719, reward_mean=0.0, reward_bound=0.0
2796: loss=0.630, reward_mean=0.0, reward_bound=0.0
2797: loss=0.770, reward_mean=0.0, reward_bound=0.0
2798: loss=0.671, reward_mean=0.0, reward_bound=0.0
2799: loss=0.574, reward_mean=0.0, reward_bound=0.0
2800: loss=0.889, reward_mean=0.0, reward_bound=0.0
2801: loss=0.596, reward_mean=0.0, reward_bound=0.0
2802: loss=0.763, reward_mean=0.0, reward_bound=0.0
2803: loss=0.698, reward_mean=0.0, reward_bound=0.0
2804: loss=0.636, reward_mean=0.0, reward_bound=0.0
2805: loss=0.733, reward_mean=0.0, reward_bound=0.0
2806: loss=0.759, reward_mean=0.0, reward_bound=0.0
2807: loss=0

2949: loss=0.536, reward_mean=0.0, reward_bound=0.0
2950: loss=0.511, reward_mean=0.0, reward_bound=0.0
2951: loss=0.448, reward_mean=0.0, reward_bound=0.0
2952: loss=0.552, reward_mean=0.0, reward_bound=0.0
2953: loss=0.412, reward_mean=0.0, reward_bound=0.0
2954: loss=0.653, reward_mean=0.0, reward_bound=0.0
2955: loss=0.554, reward_mean=0.0, reward_bound=0.0
2956: loss=0.497, reward_mean=0.0, reward_bound=0.0
2957: loss=0.463, reward_mean=0.0, reward_bound=0.0
2958: loss=0.681, reward_mean=0.0, reward_bound=0.0
2959: loss=0.726, reward_mean=0.0, reward_bound=0.0
2960: loss=0.520, reward_mean=0.0, reward_bound=0.0
2961: loss=0.492, reward_mean=0.0, reward_bound=0.0
2962: loss=0.520, reward_mean=0.0, reward_bound=0.0
2963: loss=0.599, reward_mean=0.0, reward_bound=0.0
2964: loss=0.484, reward_mean=0.0, reward_bound=0.0
2965: loss=0.601, reward_mean=0.0, reward_bound=0.0
2966: loss=0.433, reward_mean=0.0, reward_bound=0.0
2967: loss=0.563, reward_mean=0.0, reward_bound=0.0
2968: loss=0

<img src="./images/LackofConvergence.png" />

Cartpole과 비교했을 때 FrozenLake는 환경, 에피소드, 보상이 다르다.
<img src="./images/RewardCartPole.png" />
보상 1.0을 받기 위해서는 목표지점에 도착해야만 가능할 뿐만 아니라, 이런 보상이 각 에피소드가 얼만큼 유익했는지(예: 목표지점까지 신속하거나 효율적으로 이동했는지)알려주질 않는다는 점이 차이가 난다. 오로지 성공 (1) 또는 실패(0)만 있을 뿐이다.  
엘리트 에피소드를 퍼센티지로 선택하는 방식은 Frozen Lake 문제에서는 완전히 잘못되었고, 훈련시키는데 나쁜 예만을 가져다 준다.
<img src="./images/RewardFrozenLake.png" />

## 03 frozenlake_tweaked.py

In [7]:
# BATCH_SIZE = 16
# PERCENTILE = 70

BATCH_SIZE = 100
PERCENTILE = 30
GAMMA = 0.9

교차 엔트로피로 Lake 문제를 해결하기 위해서는 코드 상으로 몇가지 수정이 필요하다.  
Cartpole에서는 배치크기를 16으로 하는 것으로 충분하였지만, Lake에서는 최소 100이상의 배치가 필요하다.  
오래걸리는 에피소드보다 적은시간이 소요되는 에피소드에 보상이 상대적으로 높아야 한다. 이는 할인(discount)개념을 써서 구현한다.  
FrozenLake는 Cartpole보다 성공적인 에피소드의 숫자가 적으므로 바로 바로 에피소드를 환경으로부터 샘플링하고 버릴 수 없고, 몇번의 반복동안 성공적인 에피소드를 가지고 있어야 한다.  
학습율을 줄여서 더 많은 훈련샘플들을 평균할 시간을 준다.  
성공적인 에피소드들이 50%이상 쌓이기 위해서는 적어도 5000번의 반복이 필요하다.

In [8]:
def filter_batch(batch, percentile):
#     rewards = list(map(lambda s: s.reward, batch))
#     reward_bound = np.percentile(rewards, percentile)
#     reward_mean = float(np.mean(rewards))
    disc_rewards = list(map(lambda s: s.reward * (GAMMA ** len(s.steps)), batch))
    ## discounted reward를 계산해준다.
    
    reward_bound = np.percentile(disc_rewards, percentile)

    train_obs = []
    train_act = []
    
    elite_batch = [] ## 엘리트 에피소드를 가지고 있다가 다음 학습 iteration으로 넘겨주는 리스트
    
#     for example in batch:
#         if example.reward < reward_bound:
#             continue
#         train_obs.extend(map(lambda step: step.observation, example.steps))
#         train_act.extend(map(lambda step: step.action, example.steps))
    
    for example, discounted_reward in zip(batch, disc_rewards):
        if discounted_reward > reward_bound:
            train_obs.extend(map(lambda step: step.observation, example.steps))
            train_act.extend(map(lambda step: step.action, example.steps))
            elite_batch.append(example)
    
#     train_obs_v = torch.FloatTensor(train_obs)
#     train_act_v = torch.LongTensor(train_act)
#     return train_obs_v, train_act_v, reward_bound, reward_mean


    return elite_batch, train_obs, train_act, reward_bound

In [13]:
random.seed(12345)
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))
# env = gym.wrappers.Monitor(env, directory="mon", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)
writer = SummaryWriter(comment="-frozenlake-tweaked")

full_batch = []
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    reward_mean = float(np.mean(list(map(lambda s: s.reward, batch))))
    full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE)
    if not full_batch:
        continue
    obs_v = torch.FloatTensor(obs)
    acts_v = torch.LongTensor(acts)
    full_batch = full_batch[-500:]
    
    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, acts_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d" % (iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch)))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_mean", reward_mean, iter_no)
    writer.add_scalar("reward_bound", reward_bound, iter_no)
    if reward_mean > 0.8:
        print("Solved!")
        break

writer.close()

0: loss=1.364, reward_mean=0.010, reward_bound=0.000, batch=1
1: loss=1.374, reward_mean=0.050, reward_bound=0.000, batch=6
2: loss=1.373, reward_mean=0.010, reward_bound=0.000, batch=7
3: loss=1.371, reward_mean=0.000, reward_bound=0.000, batch=7
4: loss=1.366, reward_mean=0.020, reward_bound=0.000, batch=9
5: loss=1.364, reward_mean=0.000, reward_bound=0.000, batch=9
6: loss=1.372, reward_mean=0.030, reward_bound=0.000, batch=12
7: loss=1.376, reward_mean=0.050, reward_bound=0.000, batch=17
8: loss=1.376, reward_mean=0.010, reward_bound=0.000, batch=18
9: loss=1.377, reward_mean=0.020, reward_bound=0.000, batch=20
10: loss=1.378, reward_mean=0.020, reward_bound=0.000, batch=22
11: loss=1.381, reward_mean=0.010, reward_bound=0.000, batch=23
12: loss=1.378, reward_mean=0.030, reward_bound=0.000, batch=26
13: loss=1.376, reward_mean=0.010, reward_bound=0.000, batch=27
14: loss=1.373, reward_mean=0.010, reward_bound=0.000, batch=28
15: loss=1.372, reward_mean=0.000, reward_bound=0.000, b

128: loss=1.262, reward_mean=0.040, reward_bound=0.185, batch=229
129: loss=1.262, reward_mean=0.010, reward_bound=0.082, batch=230
130: loss=1.261, reward_mean=0.010, reward_bound=0.144, batch=231
131: loss=1.255, reward_mean=0.060, reward_bound=0.206, batch=221
132: loss=1.255, reward_mean=0.020, reward_bound=0.000, batch=223
133: loss=1.255, reward_mean=0.040, reward_bound=0.098, batch=226
134: loss=1.255, reward_mean=0.050, reward_bound=0.143, batch=228
135: loss=1.251, reward_mean=0.050, reward_bound=0.206, batch=228
136: loss=1.249, reward_mean=0.030, reward_bound=0.229, batch=206
137: loss=1.249, reward_mean=0.020, reward_bound=0.000, batch=208
138: loss=1.248, reward_mean=0.010, reward_bound=0.000, batch=209
139: loss=1.246, reward_mean=0.010, reward_bound=0.000, batch=210
140: loss=1.244, reward_mean=0.070, reward_bound=0.008, batch=217
141: loss=1.244, reward_mean=0.060, reward_bound=0.030, batch=222
142: loss=1.243, reward_mean=0.020, reward_bound=0.000, batch=224
143: loss=

254: loss=1.125, reward_mean=0.050, reward_bound=0.282, batch=220
255: loss=1.125, reward_mean=0.050, reward_bound=0.153, batch=224
256: loss=1.125, reward_mean=0.060, reward_bound=0.280, batch=227
257: loss=1.123, reward_mean=0.050, reward_bound=0.237, batch=229
258: loss=1.123, reward_mean=0.050, reward_bound=0.254, batch=229
259: loss=1.121, reward_mean=0.050, reward_bound=0.282, batch=229
260: loss=1.121, reward_mean=0.060, reward_bound=0.314, batch=221
261: loss=1.120, reward_mean=0.060, reward_bound=0.229, batch=224
262: loss=1.124, reward_mean=0.060, reward_bound=0.349, batch=214
263: loss=1.127, reward_mean=0.050, reward_bound=0.000, batch=219
264: loss=1.129, reward_mean=0.030, reward_bound=0.000, batch=222
265: loss=1.132, reward_mean=0.030, reward_bound=0.007, batch=225
266: loss=1.131, reward_mean=0.050, reward_bound=0.091, batch=227
267: loss=1.126, reward_mean=0.040, reward_bound=0.135, batch=228
268: loss=1.123, reward_mean=0.070, reward_bound=0.231, batch=229
269: loss=

380: loss=1.093, reward_mean=0.070, reward_bound=0.308, batch=229
381: loss=1.104, reward_mean=0.060, reward_bound=0.314, batch=225
382: loss=1.103, reward_mean=0.090, reward_bound=0.314, batch=226
383: loss=1.105, reward_mean=0.040, reward_bound=0.229, batch=228
384: loss=1.102, reward_mean=0.040, reward_bound=0.349, batch=222
385: loss=1.101, reward_mean=0.060, reward_bound=0.238, batch=225
386: loss=1.102, reward_mean=0.040, reward_bound=0.289, batch=227
387: loss=1.099, reward_mean=0.050, reward_bound=0.284, batch=229
388: loss=1.100, reward_mean=0.050, reward_bound=0.328, batch=230
389: loss=1.099, reward_mean=0.060, reward_bound=0.376, batch=231
390: loss=1.097, reward_mean=0.060, reward_bound=0.387, batch=222
391: loss=1.098, reward_mean=0.080, reward_bound=0.324, batch=225
392: loss=1.097, reward_mean=0.070, reward_bound=0.387, batch=225
393: loss=1.096, reward_mean=0.050, reward_bound=0.289, batch=227
394: loss=1.094, reward_mean=0.060, reward_bound=0.342, batch=229
395: loss=

506: loss=1.067, reward_mean=0.110, reward_bound=0.000, batch=181
507: loss=1.058, reward_mean=0.170, reward_bound=0.052, batch=195
508: loss=1.052, reward_mean=0.110, reward_bound=0.007, batch=206
509: loss=1.056, reward_mean=0.130, reward_bound=0.098, batch=213
510: loss=1.055, reward_mean=0.070, reward_bound=0.091, batch=219
511: loss=1.055, reward_mean=0.050, reward_bound=0.120, batch=223
512: loss=1.055, reward_mean=0.130, reward_bound=0.167, batch=225
513: loss=1.059, reward_mean=0.110, reward_bound=0.206, batch=225
514: loss=1.055, reward_mean=0.090, reward_bound=0.234, batch=227
515: loss=1.056, reward_mean=0.050, reward_bound=0.205, batch=229
516: loss=1.059, reward_mean=0.120, reward_bound=0.282, batch=225
517: loss=1.061, reward_mean=0.100, reward_bound=0.314, batch=219
518: loss=1.059, reward_mean=0.060, reward_bound=0.328, batch=223
519: loss=1.063, reward_mean=0.060, reward_bound=0.349, batch=218
520: loss=1.061, reward_mean=0.070, reward_bound=0.234, batch=222
521: loss=

632: loss=1.034, reward_mean=0.080, reward_bound=0.282, batch=220
633: loss=1.033, reward_mean=0.070, reward_bound=0.254, batch=223
634: loss=1.033, reward_mean=0.100, reward_bound=0.335, batch=226
635: loss=1.034, reward_mean=0.050, reward_bound=0.331, batch=228
636: loss=1.034, reward_mean=0.040, reward_bound=0.282, batch=228
637: loss=1.038, reward_mean=0.080, reward_bound=0.353, batch=229
638: loss=1.037, reward_mean=0.070, reward_bound=0.343, batch=230
639: loss=1.036, reward_mean=0.060, reward_bound=0.387, batch=225
640: loss=1.032, reward_mean=0.070, reward_bound=0.289, batch=227
641: loss=1.034, reward_mean=0.080, reward_bound=0.342, batch=229
642: loss=1.035, reward_mean=0.040, reward_bound=0.250, batch=230
643: loss=1.033, reward_mean=0.040, reward_bound=0.329, batch=231
644: loss=1.035, reward_mean=0.020, reward_bound=0.349, batch=230
645: loss=1.035, reward_mean=0.050, reward_bound=0.356, batch=231
646: loss=1.035, reward_mean=0.070, reward_bound=0.387, batch=231
647: loss=

758: loss=1.023, reward_mean=0.060, reward_bound=0.192, batch=226
759: loss=1.023, reward_mean=0.090, reward_bound=0.254, batch=227
760: loss=1.024, reward_mean=0.060, reward_bound=0.342, batch=229
761: loss=1.023, reward_mean=0.060, reward_bound=0.349, batch=228
762: loss=1.021, reward_mean=0.040, reward_bound=0.286, batch=229
763: loss=1.023, reward_mean=0.070, reward_bound=0.328, batch=230
764: loss=1.020, reward_mean=0.120, reward_bound=0.376, batch=231
765: loss=1.021, reward_mean=0.030, reward_bound=0.387, batch=230
766: loss=1.021, reward_mean=0.050, reward_bound=0.418, batch=231
767: loss=1.021, reward_mean=0.100, reward_bound=0.349, batch=231
768: loss=1.022, reward_mean=0.090, reward_bound=0.430, batch=229
769: loss=1.021, reward_mean=0.020, reward_bound=0.324, batch=230
770: loss=1.022, reward_mean=0.090, reward_bound=0.387, batch=230
771: loss=1.022, reward_mean=0.040, reward_bound=0.420, batch=231
772: loss=1.019, reward_mean=0.110, reward_bound=0.478, batch=224
773: loss=

885: loss=0.927, reward_mean=0.130, reward_bound=0.180, batch=224
886: loss=0.933, reward_mean=0.110, reward_bound=0.206, batch=221
887: loss=0.940, reward_mean=0.170, reward_bound=0.254, batch=218
888: loss=0.938, reward_mean=0.090, reward_bound=0.113, batch=222
889: loss=0.933, reward_mean=0.120, reward_bound=0.263, batch=225
890: loss=0.933, reward_mean=0.110, reward_bound=0.282, batch=222
891: loss=0.931, reward_mean=0.090, reward_bound=0.272, batch=225
892: loss=0.930, reward_mean=0.110, reward_bound=0.266, batch=227
893: loss=0.930, reward_mean=0.090, reward_bound=0.314, batch=218
894: loss=0.928, reward_mean=0.090, reward_bound=0.231, batch=222
895: loss=0.929, reward_mean=0.070, reward_bound=0.263, batch=225
896: loss=0.927, reward_mean=0.040, reward_bound=0.154, batch=227
897: loss=0.930, reward_mean=0.070, reward_bound=0.308, batch=229
898: loss=0.929, reward_mean=0.090, reward_bound=0.314, batch=229
899: loss=0.930, reward_mean=0.140, reward_bound=0.349, batch=223
900: loss=

1009: loss=0.910, reward_mean=0.120, reward_bound=0.167, batch=214
1010: loss=0.908, reward_mean=0.080, reward_bound=0.195, batch=220
1011: loss=0.908, reward_mean=0.090, reward_bound=0.175, batch=224
1012: loss=0.906, reward_mean=0.140, reward_bound=0.229, batch=222
1013: loss=0.906, reward_mean=0.180, reward_bound=0.282, batch=222
1014: loss=0.907, reward_mean=0.070, reward_bound=0.236, batch=225
1015: loss=0.905, reward_mean=0.060, reward_bound=0.254, batch=226
1016: loss=0.907, reward_mean=0.140, reward_bound=0.314, batch=223
1017: loss=0.911, reward_mean=0.120, reward_bound=0.349, batch=220
1018: loss=0.913, reward_mean=0.150, reward_bound=0.365, batch=224
1019: loss=0.912, reward_mean=0.150, reward_bound=0.349, batch=226
1020: loss=0.913, reward_mean=0.150, reward_bound=0.387, batch=223
1021: loss=0.915, reward_mean=0.080, reward_bound=0.384, batch=226
1022: loss=0.916, reward_mean=0.110, reward_bound=0.349, batch=227
1023: loss=0.915, reward_mean=0.090, reward_bound=0.387, batch

1132: loss=0.813, reward_mean=0.100, reward_bound=0.207, batch=229
1133: loss=0.820, reward_mean=0.110, reward_bound=0.282, batch=226
1134: loss=0.815, reward_mean=0.210, reward_bound=0.314, batch=226
1135: loss=0.815, reward_mean=0.130, reward_bound=0.331, batch=228
1136: loss=0.814, reward_mean=0.100, reward_bound=0.289, batch=229
1137: loss=0.814, reward_mean=0.140, reward_bound=0.309, batch=230
1138: loss=0.833, reward_mean=0.180, reward_bound=0.349, batch=196
1139: loss=0.830, reward_mean=0.150, reward_bound=0.150, batch=206
1140: loss=0.829, reward_mean=0.150, reward_bound=0.158, batch=214
1141: loss=0.822, reward_mean=0.130, reward_bound=0.146, batch=220
1142: loss=0.827, reward_mean=0.150, reward_bound=0.234, batch=224
1143: loss=0.823, reward_mean=0.130, reward_bound=0.254, batch=225
1144: loss=0.827, reward_mean=0.140, reward_bound=0.314, batch=221
1145: loss=0.832, reward_mean=0.080, reward_bound=0.349, batch=209
1146: loss=0.831, reward_mean=0.180, reward_bound=0.265, batch

1255: loss=0.832, reward_mean=0.090, reward_bound=0.000, batch=200
1256: loss=0.844, reward_mean=0.140, reward_bound=0.022, batch=210
1257: loss=0.848, reward_mean=0.140, reward_bound=0.065, batch=216
1258: loss=0.847, reward_mean=0.110, reward_bound=0.109, batch=220
1259: loss=0.848, reward_mean=0.090, reward_bound=0.135, batch=222
1260: loss=0.856, reward_mean=0.120, reward_bound=0.150, batch=221
1261: loss=0.856, reward_mean=0.090, reward_bound=0.167, batch=223
1262: loss=0.855, reward_mean=0.080, reward_bound=0.185, batch=220
1263: loss=0.854, reward_mean=0.140, reward_bound=0.206, batch=226
1264: loss=0.854, reward_mean=0.150, reward_bound=0.206, batch=224
1265: loss=0.864, reward_mean=0.100, reward_bound=0.229, batch=221
1266: loss=0.859, reward_mean=0.140, reward_bound=0.254, batch=218
1267: loss=0.861, reward_mean=0.100, reward_bound=0.178, batch=222
1268: loss=0.860, reward_mean=0.130, reward_bound=0.282, batch=219
1269: loss=0.858, reward_mean=0.160, reward_bound=0.282, batch

1378: loss=0.838, reward_mean=0.090, reward_bound=0.351, batch=228
1379: loss=0.844, reward_mean=0.130, reward_bound=0.387, batch=221
1380: loss=0.843, reward_mean=0.120, reward_bound=0.314, batch=224
1381: loss=0.839, reward_mean=0.050, reward_bound=0.180, batch=227
1382: loss=0.836, reward_mean=0.060, reward_bound=0.182, batch=229
1383: loss=0.841, reward_mean=0.080, reward_bound=0.295, batch=230
1384: loss=0.844, reward_mean=0.080, reward_bound=0.338, batch=231
1385: loss=0.840, reward_mean=0.110, reward_bound=0.349, batch=229
1386: loss=0.842, reward_mean=0.080, reward_bound=0.430, batch=222
1387: loss=0.843, reward_mean=0.080, reward_bound=0.276, batch=225
1388: loss=0.840, reward_mean=0.100, reward_bound=0.387, batch=225
1389: loss=0.838, reward_mean=0.120, reward_bound=0.356, batch=227
1390: loss=0.839, reward_mean=0.100, reward_bound=0.387, batch=228
1391: loss=0.840, reward_mean=0.100, reward_bound=0.392, batch=229
1392: loss=0.840, reward_mean=0.080, reward_bound=0.387, batch

1502: loss=0.712, reward_mean=0.110, reward_bound=0.038, batch=216
1503: loss=0.711, reward_mean=0.140, reward_bound=0.052, batch=217
1504: loss=0.701, reward_mean=0.260, reward_bound=0.080, batch=220
1505: loss=0.702, reward_mean=0.260, reward_bound=0.109, batch=219
1506: loss=0.703, reward_mean=0.150, reward_bound=0.122, batch=222
1507: loss=0.697, reward_mean=0.140, reward_bound=0.135, batch=214
1508: loss=0.707, reward_mean=0.170, reward_bound=0.150, batch=208
1509: loss=0.712, reward_mean=0.260, reward_bound=0.167, batch=212
1510: loss=0.702, reward_mean=0.160, reward_bound=0.185, batch=208
1511: loss=0.702, reward_mean=0.130, reward_bound=0.171, batch=215
1512: loss=0.691, reward_mean=0.250, reward_bound=0.206, batch=201
1513: loss=0.684, reward_mean=0.180, reward_bound=0.122, batch=210
1514: loss=0.681, reward_mean=0.200, reward_bound=0.167, batch=216
1515: loss=0.682, reward_mean=0.230, reward_bound=0.229, batch=196
1516: loss=0.679, reward_mean=0.200, reward_bound=0.158, batch

1625: loss=0.643, reward_mean=0.240, reward_bound=0.229, batch=218
1626: loss=0.654, reward_mean=0.230, reward_bound=0.254, batch=212
1627: loss=0.648, reward_mean=0.220, reward_bound=0.198, batch=218
1628: loss=0.649, reward_mean=0.300, reward_bound=0.282, batch=221
1629: loss=0.661, reward_mean=0.250, reward_bound=0.314, batch=209
1630: loss=0.670, reward_mean=0.310, reward_bound=0.349, batch=200
1631: loss=0.657, reward_mean=0.240, reward_bound=0.162, batch=210
1632: loss=0.652, reward_mean=0.210, reward_bound=0.167, batch=216
1633: loss=0.653, reward_mean=0.230, reward_bound=0.217, batch=221
1634: loss=0.664, reward_mean=0.220, reward_bound=0.254, batch=222
1635: loss=0.662, reward_mean=0.230, reward_bound=0.282, batch=223
1636: loss=0.667, reward_mean=0.190, reward_bound=0.314, batch=221
1637: loss=0.664, reward_mean=0.300, reward_bound=0.349, batch=220
1638: loss=0.668, reward_mean=0.200, reward_bound=0.274, batch=224
1639: loss=0.668, reward_mean=0.300, reward_bound=0.345, batch

1748: loss=0.667, reward_mean=0.200, reward_bound=0.204, batch=206
1749: loss=0.663, reward_mean=0.180, reward_bound=0.196, batch=214
1750: loss=0.664, reward_mean=0.270, reward_bound=0.206, batch=216
1751: loss=0.668, reward_mean=0.230, reward_bound=0.254, batch=219
1752: loss=0.665, reward_mean=0.170, reward_bound=0.282, batch=216
1753: loss=0.667, reward_mean=0.210, reward_bound=0.314, batch=217
1754: loss=0.669, reward_mean=0.220, reward_bound=0.229, batch=221
1755: loss=0.669, reward_mean=0.160, reward_bound=0.314, batch=224
1756: loss=0.671, reward_mean=0.160, reward_bound=0.252, batch=227
1757: loss=0.669, reward_mean=0.180, reward_bound=0.308, batch=229
1758: loss=0.671, reward_mean=0.240, reward_bound=0.349, batch=222
1759: loss=0.671, reward_mean=0.210, reward_bound=0.282, batch=224
1760: loss=0.670, reward_mean=0.150, reward_bound=0.342, batch=227
1761: loss=0.672, reward_mean=0.240, reward_bound=0.349, batch=228
1762: loss=0.671, reward_mean=0.110, reward_bound=0.353, batch

1871: loss=0.625, reward_mean=0.190, reward_bound=0.229, batch=219
1872: loss=0.635, reward_mean=0.240, reward_bound=0.265, batch=223
1873: loss=0.633, reward_mean=0.210, reward_bound=0.282, batch=224
1874: loss=0.628, reward_mean=0.160, reward_bound=0.311, batch=227
1875: loss=0.633, reward_mean=0.200, reward_bound=0.314, batch=226
1876: loss=0.624, reward_mean=0.280, reward_bound=0.349, batch=219
1877: loss=0.624, reward_mean=0.230, reward_bound=0.309, batch=223
1878: loss=0.620, reward_mean=0.190, reward_bound=0.290, batch=226
1879: loss=0.620, reward_mean=0.180, reward_bound=0.331, batch=228
1880: loss=0.624, reward_mean=0.210, reward_bound=0.387, batch=206
1881: loss=0.620, reward_mean=0.190, reward_bound=0.176, batch=214
1882: loss=0.624, reward_mean=0.240, reward_bound=0.280, batch=220
1883: loss=0.625, reward_mean=0.170, reward_bound=0.206, batch=225
1884: loss=0.630, reward_mean=0.220, reward_bound=0.234, batch=227
1885: loss=0.624, reward_mean=0.160, reward_bound=0.254, batch

1994: loss=0.622, reward_mean=0.220, reward_bound=0.422, batch=229
1995: loss=0.622, reward_mean=0.220, reward_bound=0.430, batch=228
1996: loss=0.620, reward_mean=0.200, reward_bound=0.478, batch=230
1997: loss=0.620, reward_mean=0.280, reward_bound=0.418, batch=231
1998: loss=0.619, reward_mean=0.220, reward_bound=0.430, batch=231
1999: loss=0.616, reward_mean=0.170, reward_bound=0.478, batch=221
2000: loss=0.613, reward_mean=0.330, reward_bound=0.387, batch=224
2001: loss=0.616, reward_mean=0.200, reward_bound=0.430, batch=223
2002: loss=0.616, reward_mean=0.260, reward_bound=0.335, batch=226
2003: loss=0.618, reward_mean=0.260, reward_bound=0.349, batch=227
2004: loss=0.619, reward_mean=0.260, reward_bound=0.380, batch=229
2005: loss=0.619, reward_mean=0.240, reward_bound=0.405, batch=230
2006: loss=0.617, reward_mean=0.220, reward_bound=0.418, batch=231
2007: loss=0.617, reward_mean=0.220, reward_bound=0.430, batch=229
2008: loss=0.616, reward_mean=0.220, reward_bound=0.450, batch

2118: loss=0.507, reward_mean=0.360, reward_bound=0.080, batch=186
2119: loss=0.500, reward_mean=0.300, reward_bound=0.109, batch=196
2120: loss=0.498, reward_mean=0.210, reward_bound=0.104, batch=207
2121: loss=0.496, reward_mean=0.340, reward_bound=0.135, batch=211
2122: loss=0.492, reward_mean=0.280, reward_bound=0.150, batch=214
2123: loss=0.510, reward_mean=0.330, reward_bound=0.185, batch=217
2124: loss=0.520, reward_mean=0.270, reward_bound=0.206, batch=210
2125: loss=0.518, reward_mean=0.150, reward_bound=0.115, batch=217
2126: loss=0.522, reward_mean=0.270, reward_bound=0.229, batch=215
2127: loss=0.526, reward_mean=0.200, reward_bound=0.254, batch=209
2128: loss=0.517, reward_mean=0.290, reward_bound=0.215, batch=216
2129: loss=0.519, reward_mean=0.260, reward_bound=0.229, batch=220
2130: loss=0.516, reward_mean=0.280, reward_bound=0.274, batch=224
2131: loss=0.520, reward_mean=0.340, reward_bound=0.282, batch=217
2132: loss=0.524, reward_mean=0.310, reward_bound=0.314, batch

2241: loss=0.476, reward_mean=0.310, reward_bound=0.268, batch=228
2242: loss=0.471, reward_mean=0.230, reward_bound=0.286, batch=229
2243: loss=0.472, reward_mean=0.290, reward_bound=0.314, batch=229
2244: loss=0.479, reward_mean=0.280, reward_bound=0.430, batch=204
2245: loss=0.470, reward_mean=0.340, reward_bound=0.226, batch=213
2246: loss=0.470, reward_mean=0.300, reward_bound=0.198, batch=219
2247: loss=0.471, reward_mean=0.270, reward_bound=0.229, batch=222
2248: loss=0.474, reward_mean=0.270, reward_bound=0.254, batch=224
2249: loss=0.473, reward_mean=0.280, reward_bound=0.311, batch=227
2250: loss=0.471, reward_mean=0.320, reward_bound=0.314, batch=223
2251: loss=0.468, reward_mean=0.270, reward_bound=0.290, batch=226
2252: loss=0.467, reward_mean=0.310, reward_bound=0.331, batch=228
2253: loss=0.470, reward_mean=0.260, reward_bound=0.349, batch=228
2254: loss=0.471, reward_mean=0.210, reward_bound=0.387, batch=222
2255: loss=0.470, reward_mean=0.260, reward_bound=0.272, batch

2364: loss=0.442, reward_mean=0.350, reward_bound=0.387, batch=225
2365: loss=0.449, reward_mean=0.270, reward_bound=0.430, batch=200
2366: loss=0.440, reward_mean=0.320, reward_bound=0.106, batch=210
2367: loss=0.443, reward_mean=0.290, reward_bound=0.180, batch=217
2368: loss=0.444, reward_mean=0.310, reward_bound=0.206, batch=220
2369: loss=0.439, reward_mean=0.280, reward_bound=0.254, batch=219
2370: loss=0.450, reward_mean=0.320, reward_bound=0.328, batch=223
2371: loss=0.456, reward_mean=0.310, reward_bound=0.349, batch=220
2372: loss=0.458, reward_mean=0.330, reward_bound=0.304, batch=224
2373: loss=0.458, reward_mean=0.260, reward_bound=0.252, batch=227
2374: loss=0.453, reward_mean=0.340, reward_bound=0.342, batch=229
2375: loss=0.451, reward_mean=0.240, reward_bound=0.309, batch=230
2376: loss=0.461, reward_mean=0.290, reward_bound=0.376, batch=231
2377: loss=0.456, reward_mean=0.340, reward_bound=0.387, batch=221
2378: loss=0.453, reward_mean=0.340, reward_bound=0.349, batch

2487: loss=0.428, reward_mean=0.240, reward_bound=0.286, batch=222
2488: loss=0.431, reward_mean=0.350, reward_bound=0.349, batch=215
2489: loss=0.427, reward_mean=0.200, reward_bound=0.161, batch=220
2490: loss=0.432, reward_mean=0.310, reward_bound=0.282, batch=222
2491: loss=0.435, reward_mean=0.300, reward_bound=0.324, batch=225
2492: loss=0.432, reward_mean=0.290, reward_bound=0.321, batch=227
2493: loss=0.432, reward_mean=0.250, reward_bound=0.349, batch=226
2494: loss=0.431, reward_mean=0.310, reward_bound=0.349, batch=226
2495: loss=0.436, reward_mean=0.260, reward_bound=0.387, batch=213
2496: loss=0.437, reward_mean=0.330, reward_bound=0.280, batch=219
2497: loss=0.427, reward_mean=0.300, reward_bound=0.349, batch=222
2498: loss=0.428, reward_mean=0.360, reward_bound=0.360, batch=225
2499: loss=0.427, reward_mean=0.310, reward_bound=0.349, batch=226
2500: loss=0.427, reward_mean=0.240, reward_bound=0.349, batch=227
2501: loss=0.433, reward_mean=0.340, reward_bound=0.387, batch

2610: loss=0.451, reward_mean=0.230, reward_bound=0.349, batch=226
2611: loss=0.451, reward_mean=0.180, reward_bound=0.390, batch=228
2612: loss=0.452, reward_mean=0.240, reward_bound=0.392, batch=229
2613: loss=0.451, reward_mean=0.330, reward_bound=0.430, batch=226
2614: loss=0.449, reward_mean=0.210, reward_bound=0.433, batch=228
2615: loss=0.449, reward_mean=0.310, reward_bound=0.430, batch=228
2616: loss=0.449, reward_mean=0.280, reward_bound=0.387, batch=228
2617: loss=0.449, reward_mean=0.310, reward_bound=0.435, batch=229
2618: loss=0.449, reward_mean=0.310, reward_bound=0.343, batch=230
2619: loss=0.451, reward_mean=0.360, reward_bound=0.387, batch=230
2620: loss=0.450, reward_mean=0.330, reward_bound=0.478, batch=230
2621: loss=0.451, reward_mean=0.290, reward_bound=0.418, batch=231
2622: loss=0.450, reward_mean=0.270, reward_bound=0.478, batch=230
2623: loss=0.450, reward_mean=0.380, reward_bound=0.451, batch=231
2624: loss=0.450, reward_mean=0.320, reward_bound=0.430, batch

2734: loss=0.359, reward_mean=0.310, reward_bound=0.229, batch=223
2735: loss=0.361, reward_mean=0.340, reward_bound=0.271, batch=226
2736: loss=0.361, reward_mean=0.350, reward_bound=0.298, batch=228
2737: loss=0.365, reward_mean=0.310, reward_bound=0.317, batch=229
2738: loss=0.359, reward_mean=0.350, reward_bound=0.349, batch=204
2739: loss=0.359, reward_mean=0.310, reward_bound=0.182, batch=213
2740: loss=0.361, reward_mean=0.280, reward_bound=0.206, batch=215
2741: loss=0.362, reward_mean=0.310, reward_bound=0.210, batch=220
2742: loss=0.362, reward_mean=0.320, reward_bound=0.247, batch=224
2743: loss=0.362, reward_mean=0.290, reward_bound=0.254, batch=226
2744: loss=0.362, reward_mean=0.290, reward_bound=0.282, batch=225
2745: loss=0.361, reward_mean=0.320, reward_bound=0.314, batch=222
2746: loss=0.364, reward_mean=0.310, reward_bound=0.349, batch=220
2747: loss=0.360, reward_mean=0.390, reward_bound=0.349, batch=223
2748: loss=0.357, reward_mean=0.360, reward_bound=0.387, batch

2857: loss=0.366, reward_mean=0.310, reward_bound=0.210, batch=220
2858: loss=0.363, reward_mean=0.220, reward_bound=0.180, batch=224
2859: loss=0.355, reward_mean=0.360, reward_bound=0.314, batch=221
2860: loss=0.356, reward_mean=0.430, reward_bound=0.349, batch=219
2861: loss=0.353, reward_mean=0.420, reward_bound=0.254, batch=221
2862: loss=0.357, reward_mean=0.320, reward_bound=0.282, batch=224
2863: loss=0.356, reward_mean=0.340, reward_bound=0.349, batch=226
2864: loss=0.356, reward_mean=0.290, reward_bound=0.387, batch=223
2865: loss=0.356, reward_mean=0.330, reward_bound=0.413, batch=226
2866: loss=0.355, reward_mean=0.280, reward_bound=0.351, batch=228
2867: loss=0.358, reward_mean=0.350, reward_bound=0.387, batch=228
2868: loss=0.360, reward_mean=0.380, reward_bound=0.430, batch=223
2869: loss=0.358, reward_mean=0.360, reward_bound=0.398, batch=226
2870: loss=0.358, reward_mean=0.340, reward_bound=0.282, batch=227
2871: loss=0.358, reward_mean=0.360, reward_bound=0.349, batch

2980: loss=0.350, reward_mean=0.340, reward_bound=0.376, batch=224
2981: loss=0.352, reward_mean=0.350, reward_bound=0.387, batch=226
2982: loss=0.352, reward_mean=0.260, reward_bound=0.390, batch=228
2983: loss=0.352, reward_mean=0.380, reward_bound=0.392, batch=229
2984: loss=0.355, reward_mean=0.310, reward_bound=0.430, batch=227
2985: loss=0.356, reward_mean=0.270, reward_bound=0.380, batch=229
2986: loss=0.355, reward_mean=0.270, reward_bound=0.405, batch=230
2987: loss=0.354, reward_mean=0.400, reward_bound=0.430, batch=229
2988: loss=0.353, reward_mean=0.330, reward_bound=0.478, batch=232
2989: loss=0.361, reward_mean=0.280, reward_bound=0.478, batch=158
2990: loss=0.352, reward_mean=0.360, reward_bound=0.080, batch=179
2991: loss=0.346, reward_mean=0.340, reward_bound=0.109, batch=193
2992: loss=0.349, reward_mean=0.270, reward_bound=0.144, batch=205
2993: loss=0.351, reward_mean=0.330, reward_bound=0.150, batch=212
2994: loss=0.348, reward_mean=0.340, reward_bound=0.172, batch

3103: loss=0.355, reward_mean=0.300, reward_bound=0.321, batch=222
3104: loss=0.356, reward_mean=0.350, reward_bound=0.292, batch=225
3105: loss=0.354, reward_mean=0.350, reward_bound=0.356, batch=227
3106: loss=0.352, reward_mean=0.350, reward_bound=0.387, batch=223
3107: loss=0.353, reward_mean=0.270, reward_bound=0.358, batch=226
3108: loss=0.350, reward_mean=0.280, reward_bound=0.271, batch=228
3109: loss=0.353, reward_mean=0.370, reward_bound=0.387, batch=228
3110: loss=0.353, reward_mean=0.330, reward_bound=0.387, batch=228
3111: loss=0.353, reward_mean=0.420, reward_bound=0.430, batch=220
3112: loss=0.353, reward_mean=0.370, reward_bound=0.349, batch=223
3113: loss=0.355, reward_mean=0.340, reward_bound=0.387, batch=225
3114: loss=0.354, reward_mean=0.360, reward_bound=0.387, batch=226
3115: loss=0.354, reward_mean=0.290, reward_bound=0.387, batch=226
3116: loss=0.352, reward_mean=0.360, reward_bound=0.409, batch=228
3117: loss=0.349, reward_mean=0.200, reward_bound=0.430, batch

3227: loss=0.310, reward_mean=0.340, reward_bound=0.282, batch=221
3228: loss=0.310, reward_mean=0.380, reward_bound=0.314, batch=224
3229: loss=0.314, reward_mean=0.390, reward_bound=0.349, batch=225
3230: loss=0.310, reward_mean=0.380, reward_bound=0.387, batch=144
3231: loss=0.295, reward_mean=0.370, reward_bound=0.015, batch=171
3232: loss=0.295, reward_mean=0.370, reward_bound=0.089, batch=189
3233: loss=0.295, reward_mean=0.270, reward_bound=0.098, batch=200
3234: loss=0.302, reward_mean=0.300, reward_bound=0.103, batch=210
3235: loss=0.296, reward_mean=0.370, reward_bound=0.135, batch=216
3236: loss=0.300, reward_mean=0.330, reward_bound=0.150, batch=217
3237: loss=0.301, reward_mean=0.360, reward_bound=0.167, batch=220
3238: loss=0.297, reward_mean=0.390, reward_bound=0.206, batch=226
3239: loss=0.296, reward_mean=0.310, reward_bound=0.206, batch=219
3240: loss=0.294, reward_mean=0.340, reward_bound=0.229, batch=215
3241: loss=0.300, reward_mean=0.340, reward_bound=0.254, batch

3350: loss=0.309, reward_mean=0.330, reward_bound=0.254, batch=218
3351: loss=0.306, reward_mean=0.250, reward_bound=0.260, batch=222
3352: loss=0.305, reward_mean=0.390, reward_bound=0.387, batch=220
3353: loss=0.306, reward_mean=0.350, reward_bound=0.376, batch=224
3354: loss=0.308, reward_mean=0.380, reward_bound=0.349, batch=226
3355: loss=0.305, reward_mean=0.340, reward_bound=0.387, batch=227
3356: loss=0.307, reward_mean=0.340, reward_bound=0.422, batch=229
3357: loss=0.308, reward_mean=0.420, reward_bound=0.324, batch=230
3358: loss=0.307, reward_mean=0.330, reward_bound=0.430, batch=221
3359: loss=0.306, reward_mean=0.310, reward_bound=0.282, batch=224
3360: loss=0.308, reward_mean=0.330, reward_bound=0.345, batch=227
3361: loss=0.307, reward_mean=0.440, reward_bound=0.387, batch=228
3362: loss=0.306, reward_mean=0.400, reward_bound=0.430, batch=225
3363: loss=0.305, reward_mean=0.310, reward_bound=0.430, batch=226
3364: loss=0.304, reward_mean=0.380, reward_bound=0.430, batch

3473: loss=0.322, reward_mean=0.280, reward_bound=0.229, batch=222
3474: loss=0.318, reward_mean=0.360, reward_bound=0.254, batch=223
3475: loss=0.318, reward_mean=0.400, reward_bound=0.282, batch=223
3476: loss=0.321, reward_mean=0.390, reward_bound=0.314, batch=221
3477: loss=0.318, reward_mean=0.370, reward_bound=0.349, batch=223
3478: loss=0.320, reward_mean=0.400, reward_bound=0.387, batch=219
3479: loss=0.321, reward_mean=0.370, reward_bound=0.387, batch=222
3480: loss=0.321, reward_mean=0.390, reward_bound=0.373, batch=225
3481: loss=0.319, reward_mean=0.310, reward_bound=0.396, batch=227
3482: loss=0.319, reward_mean=0.280, reward_bound=0.380, batch=229
3483: loss=0.319, reward_mean=0.360, reward_bound=0.405, batch=230
3484: loss=0.313, reward_mean=0.360, reward_bound=0.430, batch=202
3485: loss=0.312, reward_mean=0.320, reward_bound=0.236, batch=211
3486: loss=0.310, reward_mean=0.250, reward_bound=0.185, batch=217
3487: loss=0.312, reward_mean=0.350, reward_bound=0.254, batch

3596: loss=0.304, reward_mean=0.270, reward_bound=0.080, batch=196
3597: loss=0.302, reward_mean=0.340, reward_bound=0.122, batch=204
3598: loss=0.299, reward_mean=0.310, reward_bound=0.135, batch=212
3599: loss=0.303, reward_mean=0.360, reward_bound=0.191, batch=218
3600: loss=0.303, reward_mean=0.310, reward_bound=0.229, batch=221
3601: loss=0.308, reward_mean=0.440, reward_bound=0.254, batch=220
3602: loss=0.313, reward_mean=0.240, reward_bound=0.282, batch=220
3603: loss=0.317, reward_mean=0.420, reward_bound=0.314, batch=214
3604: loss=0.316, reward_mean=0.370, reward_bound=0.185, batch=219
3605: loss=0.317, reward_mean=0.310, reward_bound=0.254, batch=222
3606: loss=0.317, reward_mean=0.400, reward_bound=0.314, batch=224
3607: loss=0.316, reward_mean=0.320, reward_bound=0.345, batch=227
3608: loss=0.320, reward_mean=0.350, reward_bound=0.349, batch=216
3609: loss=0.321, reward_mean=0.440, reward_bound=0.331, batch=221
3610: loss=0.321, reward_mean=0.350, reward_bound=0.314, batch

3719: loss=0.306, reward_mean=0.360, reward_bound=0.365, batch=231
3720: loss=0.307, reward_mean=0.420, reward_bound=0.387, batch=231
3721: loss=0.306, reward_mean=0.370, reward_bound=0.430, batch=228
3722: loss=0.306, reward_mean=0.350, reward_bound=0.478, batch=230
3723: loss=0.306, reward_mean=0.280, reward_bound=0.464, batch=231
3724: loss=0.307, reward_mean=0.330, reward_bound=0.478, batch=224
3725: loss=0.307, reward_mean=0.350, reward_bound=0.349, batch=226
3726: loss=0.305, reward_mean=0.290, reward_bound=0.387, batch=227
3727: loss=0.306, reward_mean=0.330, reward_bound=0.430, batch=228
3728: loss=0.305, reward_mean=0.320, reward_bound=0.478, batch=230
3729: loss=0.305, reward_mean=0.310, reward_bound=0.478, batch=229
3730: loss=0.303, reward_mean=0.280, reward_bound=0.478, batch=231
3731: loss=0.304, reward_mean=0.340, reward_bound=0.478, batch=230
3732: loss=0.305, reward_mean=0.280, reward_bound=0.320, batch=231
3733: loss=0.304, reward_mean=0.330, reward_bound=0.430, batch

3843: loss=0.266, reward_mean=0.270, reward_bound=0.284, batch=221
3844: loss=0.267, reward_mean=0.270, reward_bound=0.254, batch=224
3845: loss=0.263, reward_mean=0.400, reward_bound=0.349, batch=206
3846: loss=0.265, reward_mean=0.380, reward_bound=0.241, batch=214
3847: loss=0.262, reward_mean=0.350, reward_bound=0.254, batch=217
3848: loss=0.256, reward_mean=0.390, reward_bound=0.282, batch=221
3849: loss=0.257, reward_mean=0.520, reward_bound=0.314, batch=221
3850: loss=0.262, reward_mean=0.370, reward_bound=0.349, batch=219
3851: loss=0.262, reward_mean=0.360, reward_bound=0.309, batch=223
3852: loss=0.262, reward_mean=0.400, reward_bound=0.335, batch=226
3853: loss=0.263, reward_mean=0.310, reward_bound=0.387, batch=200
3854: loss=0.262, reward_mean=0.450, reward_bound=0.314, batch=208
3855: loss=0.260, reward_mean=0.440, reward_bound=0.229, batch=213
3856: loss=0.264, reward_mean=0.400, reward_bound=0.282, batch=217
3857: loss=0.266, reward_mean=0.400, reward_bound=0.308, batch

3966: loss=0.261, reward_mean=0.340, reward_bound=0.464, batch=231
3967: loss=0.259, reward_mean=0.420, reward_bound=0.478, batch=95
3968: loss=0.247, reward_mean=0.350, reward_bound=0.000, batch=130
3969: loss=0.234, reward_mean=0.350, reward_bound=0.003, batch=161
3970: loss=0.231, reward_mean=0.370, reward_bound=0.013, batch=181
3971: loss=0.236, reward_mean=0.330, reward_bound=0.031, batch=195
3972: loss=0.233, reward_mean=0.310, reward_bound=0.059, batch=206
3973: loss=0.225, reward_mean=0.470, reward_bound=0.098, batch=206
3974: loss=0.220, reward_mean=0.350, reward_bound=0.122, batch=202
3975: loss=0.223, reward_mean=0.400, reward_bound=0.135, batch=207
3976: loss=0.229, reward_mean=0.410, reward_bound=0.150, batch=214
3977: loss=0.240, reward_mean=0.350, reward_bound=0.167, batch=211
3978: loss=0.235, reward_mean=0.320, reward_bound=0.185, batch=211
3979: loss=0.237, reward_mean=0.440, reward_bound=0.206, batch=204
3980: loss=0.240, reward_mean=0.390, reward_bound=0.204, batch=

4089: loss=0.260, reward_mean=0.330, reward_bound=0.387, batch=229
4090: loss=0.256, reward_mean=0.350, reward_bound=0.430, batch=225
4091: loss=0.247, reward_mean=0.350, reward_bound=0.478, batch=147
4092: loss=0.242, reward_mean=0.340, reward_bound=0.033, batch=173
4093: loss=0.234, reward_mean=0.370, reward_bound=0.050, batch=191
4094: loss=0.238, reward_mean=0.400, reward_bound=0.098, batch=201
4095: loss=0.238, reward_mean=0.390, reward_bound=0.122, batch=210
4096: loss=0.234, reward_mean=0.350, reward_bound=0.150, batch=216
4097: loss=0.225, reward_mean=0.380, reward_bound=0.206, batch=217
4098: loss=0.234, reward_mean=0.360, reward_bound=0.229, batch=211
4099: loss=0.246, reward_mean=0.310, reward_bound=0.254, batch=209
4100: loss=0.246, reward_mean=0.410, reward_bound=0.282, batch=204
4101: loss=0.241, reward_mean=0.330, reward_bound=0.183, batch=213
4102: loss=0.246, reward_mean=0.370, reward_bound=0.229, batch=218
4103: loss=0.247, reward_mean=0.470, reward_bound=0.257, batch

4212: loss=0.264, reward_mean=0.330, reward_bound=0.301, batch=226
4213: loss=0.266, reward_mean=0.250, reward_bound=0.298, batch=228
4214: loss=0.266, reward_mean=0.300, reward_bound=0.314, batch=228
4215: loss=0.264, reward_mean=0.260, reward_bound=0.349, batch=225
4216: loss=0.263, reward_mean=0.400, reward_bound=0.387, batch=226
4217: loss=0.262, reward_mean=0.330, reward_bound=0.430, batch=223
4218: loss=0.263, reward_mean=0.350, reward_bound=0.398, batch=226
4219: loss=0.264, reward_mean=0.340, reward_bound=0.409, batch=228
4220: loss=0.264, reward_mean=0.360, reward_bound=0.317, batch=229
4221: loss=0.264, reward_mean=0.310, reward_bound=0.292, batch=230
4222: loss=0.262, reward_mean=0.380, reward_bound=0.376, batch=231
4223: loss=0.262, reward_mean=0.330, reward_bound=0.387, batch=230
4224: loss=0.262, reward_mean=0.470, reward_bound=0.418, batch=231
4225: loss=0.262, reward_mean=0.340, reward_bound=0.430, batch=226
4226: loss=0.260, reward_mean=0.280, reward_bound=0.390, batch

4336: loss=0.223, reward_mean=0.440, reward_bound=0.263, batch=204
4337: loss=0.219, reward_mean=0.440, reward_bound=0.249, batch=213
4338: loss=0.220, reward_mean=0.360, reward_bound=0.254, batch=217
4339: loss=0.220, reward_mean=0.470, reward_bound=0.245, batch=222
4340: loss=0.222, reward_mean=0.350, reward_bound=0.282, batch=187
4341: loss=0.220, reward_mean=0.350, reward_bound=0.167, batch=200
4342: loss=0.224, reward_mean=0.340, reward_bound=0.185, batch=207
4343: loss=0.219, reward_mean=0.420, reward_bound=0.254, batch=212
4344: loss=0.221, reward_mean=0.360, reward_bound=0.206, batch=219
4345: loss=0.223, reward_mean=0.320, reward_bound=0.239, batch=223
4346: loss=0.222, reward_mean=0.420, reward_bound=0.282, batch=219
4347: loss=0.236, reward_mean=0.440, reward_bound=0.314, batch=180
4348: loss=0.230, reward_mean=0.310, reward_bound=0.075, batch=196
4349: loss=0.220, reward_mean=0.480, reward_bound=0.158, batch=207
4350: loss=0.226, reward_mean=0.450, reward_bound=0.206, batch

4459: loss=0.238, reward_mean=0.370, reward_bound=0.176, batch=200
4460: loss=0.235, reward_mean=0.400, reward_bound=0.247, batch=210
4461: loss=0.235, reward_mean=0.390, reward_bound=0.254, batch=213
4462: loss=0.237, reward_mean=0.370, reward_bound=0.282, batch=213
4463: loss=0.236, reward_mean=0.330, reward_bound=0.198, batch=219
4464: loss=0.236, reward_mean=0.430, reward_bound=0.254, batch=222
4465: loss=0.238, reward_mean=0.350, reward_bound=0.282, batch=224
4466: loss=0.238, reward_mean=0.290, reward_bound=0.229, batch=226
4467: loss=0.234, reward_mean=0.390, reward_bound=0.314, batch=221
4468: loss=0.230, reward_mean=0.450, reward_bound=0.349, batch=212
4469: loss=0.229, reward_mean=0.450, reward_bound=0.236, batch=218
4470: loss=0.226, reward_mean=0.320, reward_bound=0.254, batch=220
4471: loss=0.222, reward_mean=0.280, reward_bound=0.304, batch=224
4472: loss=0.225, reward_mean=0.350, reward_bound=0.349, batch=222
4473: loss=0.228, reward_mean=0.400, reward_bound=0.360, batch

4582: loss=0.202, reward_mean=0.360, reward_bound=0.349, batch=213
4583: loss=0.206, reward_mean=0.340, reward_bound=0.335, batch=219
4584: loss=0.207, reward_mean=0.360, reward_bound=0.282, batch=222
4585: loss=0.206, reward_mean=0.390, reward_bound=0.292, batch=225
4586: loss=0.204, reward_mean=0.320, reward_bound=0.321, batch=227
4587: loss=0.202, reward_mean=0.380, reward_bound=0.349, batch=228
4588: loss=0.209, reward_mean=0.350, reward_bound=0.387, batch=210
4589: loss=0.211, reward_mean=0.340, reward_bound=0.167, batch=216
4590: loss=0.210, reward_mean=0.320, reward_bound=0.254, batch=218
4591: loss=0.210, reward_mean=0.350, reward_bound=0.286, batch=222
4592: loss=0.214, reward_mean=0.510, reward_bound=0.314, batch=224
4593: loss=0.213, reward_mean=0.380, reward_bound=0.349, batch=224
4594: loss=0.212, reward_mean=0.380, reward_bound=0.345, batch=227
4595: loss=0.212, reward_mean=0.380, reward_bound=0.277, batch=229
4596: loss=0.209, reward_mean=0.450, reward_bound=0.295, batch

4705: loss=0.208, reward_mean=0.340, reward_bound=0.282, batch=201
4706: loss=0.205, reward_mean=0.370, reward_bound=0.229, batch=209
4707: loss=0.199, reward_mean=0.330, reward_bound=0.194, batch=216
4708: loss=0.201, reward_mean=0.350, reward_bound=0.282, batch=217
4709: loss=0.210, reward_mean=0.410, reward_bound=0.314, batch=199
4710: loss=0.201, reward_mean=0.420, reward_bound=0.194, batch=209
4711: loss=0.207, reward_mean=0.390, reward_bound=0.239, batch=216
4712: loss=0.208, reward_mean=0.410, reward_bound=0.268, batch=221
4713: loss=0.210, reward_mean=0.410, reward_bound=0.282, batch=222
4714: loss=0.209, reward_mean=0.440, reward_bound=0.314, batch=217
4715: loss=0.207, reward_mean=0.460, reward_bound=0.349, batch=205
4716: loss=0.202, reward_mean=0.470, reward_bound=0.206, batch=212
4717: loss=0.205, reward_mean=0.420, reward_bound=0.229, batch=216
4718: loss=0.205, reward_mean=0.410, reward_bound=0.282, batch=220
4719: loss=0.204, reward_mean=0.410, reward_bound=0.314, batch

4828: loss=0.225, reward_mean=0.410, reward_bound=0.430, batch=206
4829: loss=0.223, reward_mean=0.380, reward_bound=0.241, batch=214
4830: loss=0.221, reward_mean=0.390, reward_bound=0.252, batch=220
4831: loss=0.220, reward_mean=0.360, reward_bound=0.185, batch=223
4832: loss=0.222, reward_mean=0.380, reward_bound=0.254, batch=223
4833: loss=0.225, reward_mean=0.420, reward_bound=0.282, batch=225
4834: loss=0.223, reward_mean=0.420, reward_bound=0.314, batch=224
4835: loss=0.224, reward_mean=0.370, reward_bound=0.345, batch=227
4836: loss=0.224, reward_mean=0.430, reward_bound=0.349, batch=227
4837: loss=0.225, reward_mean=0.430, reward_bound=0.292, batch=229
4838: loss=0.223, reward_mean=0.320, reward_bound=0.314, batch=229
4839: loss=0.222, reward_mean=0.400, reward_bound=0.364, batch=230
4840: loss=0.221, reward_mean=0.390, reward_bound=0.387, batch=225
4841: loss=0.220, reward_mean=0.380, reward_bound=0.365, batch=227
4842: loss=0.219, reward_mean=0.370, reward_bound=0.282, batch

4951: loss=0.216, reward_mean=0.360, reward_bound=0.430, batch=230
4952: loss=0.215, reward_mean=0.340, reward_bound=0.477, batch=231
4954: loss=0.183, reward_mean=0.490, reward_bound=0.000, batch=49
4955: loss=0.205, reward_mean=0.490, reward_bound=0.000, batch=98
4956: loss=0.202, reward_mean=0.480, reward_bound=0.001, batch=138
4957: loss=0.199, reward_mean=0.370, reward_bound=0.003, batch=166
4958: loss=0.207, reward_mean=0.490, reward_bound=0.015, batch=184
4959: loss=0.214, reward_mean=0.490, reward_bound=0.028, batch=197
4960: loss=0.208, reward_mean=0.490, reward_bound=0.042, batch=208
4961: loss=0.211, reward_mean=0.320, reward_bound=0.053, batch=215
4962: loss=0.206, reward_mean=0.350, reward_bound=0.072, batch=209
4963: loss=0.205, reward_mean=0.400, reward_bound=0.089, batch=205
4964: loss=0.206, reward_mean=0.410, reward_bound=0.098, batch=208
4965: loss=0.199, reward_mean=0.400, reward_bound=0.122, batch=210
4966: loss=0.191, reward_mean=0.430, reward_bound=0.135, batch=2

5075: loss=0.190, reward_mean=0.460, reward_bound=0.109, batch=193
5076: loss=0.200, reward_mean=0.420, reward_bound=0.135, batch=201
5077: loss=0.201, reward_mean=0.460, reward_bound=0.167, batch=205
5078: loss=0.193, reward_mean=0.370, reward_bound=0.185, batch=207
5079: loss=0.197, reward_mean=0.440, reward_bound=0.206, batch=209
5080: loss=0.200, reward_mean=0.400, reward_bound=0.229, batch=211
5081: loss=0.199, reward_mean=0.430, reward_bound=0.206, batch=217
5082: loss=0.195, reward_mean=0.460, reward_bound=0.254, batch=207
5083: loss=0.193, reward_mean=0.380, reward_bound=0.185, batch=214
5084: loss=0.196, reward_mean=0.460, reward_bound=0.254, batch=219
5085: loss=0.199, reward_mean=0.420, reward_bound=0.282, batch=203
5086: loss=0.196, reward_mean=0.410, reward_bound=0.229, batch=211
5087: loss=0.190, reward_mean=0.400, reward_bound=0.314, batch=199
5088: loss=0.194, reward_mean=0.520, reward_bound=0.250, batch=209
5089: loss=0.191, reward_mean=0.530, reward_bound=0.295, batch

5198: loss=0.166, reward_mean=0.370, reward_bound=0.229, batch=209
5199: loss=0.162, reward_mean=0.470, reward_bound=0.254, batch=197
5200: loss=0.162, reward_mean=0.510, reward_bound=0.254, batch=207
5201: loss=0.162, reward_mean=0.390, reward_bound=0.249, batch=215
5202: loss=0.164, reward_mean=0.480, reward_bound=0.282, batch=203
5203: loss=0.166, reward_mean=0.510, reward_bound=0.229, batch=211
5204: loss=0.165, reward_mean=0.450, reward_bound=0.206, batch=217
5205: loss=0.160, reward_mean=0.430, reward_bound=0.254, batch=220
5206: loss=0.164, reward_mean=0.450, reward_bound=0.282, batch=220
5207: loss=0.168, reward_mean=0.500, reward_bound=0.314, batch=194
5208: loss=0.164, reward_mean=0.340, reward_bound=0.135, batch=205
5209: loss=0.167, reward_mean=0.450, reward_bound=0.185, batch=211
5210: loss=0.167, reward_mean=0.460, reward_bound=0.254, batch=215
5211: loss=0.167, reward_mean=0.430, reward_bound=0.282, batch=216
5212: loss=0.165, reward_mean=0.440, reward_bound=0.314, batch

5321: loss=0.174, reward_mean=0.410, reward_bound=0.314, batch=219
5322: loss=0.173, reward_mean=0.450, reward_bound=0.349, batch=218
5323: loss=0.172, reward_mean=0.500, reward_bound=0.282, batch=221
5324: loss=0.172, reward_mean=0.420, reward_bound=0.282, batch=224
5325: loss=0.175, reward_mean=0.440, reward_bound=0.345, batch=227
5326: loss=0.174, reward_mean=0.400, reward_bound=0.380, batch=229
5327: loss=0.175, reward_mean=0.480, reward_bound=0.387, batch=205
5328: loss=0.177, reward_mean=0.460, reward_bound=0.282, batch=211
5329: loss=0.178, reward_mean=0.350, reward_bound=0.229, batch=217
5330: loss=0.176, reward_mean=0.490, reward_bound=0.249, batch=222
5331: loss=0.177, reward_mean=0.300, reward_bound=0.229, batch=223
5332: loss=0.175, reward_mean=0.490, reward_bound=0.282, batch=222
5333: loss=0.176, reward_mean=0.470, reward_bound=0.314, batch=223
5334: loss=0.176, reward_mean=0.410, reward_bound=0.314, batch=225
5335: loss=0.170, reward_mean=0.420, reward_bound=0.349, batch

5444: loss=0.172, reward_mean=0.360, reward_bound=0.430, batch=219
5445: loss=0.173, reward_mean=0.440, reward_bound=0.265, batch=223
5446: loss=0.174, reward_mean=0.540, reward_bound=0.372, batch=226
5447: loss=0.173, reward_mean=0.440, reward_bound=0.349, batch=227
5448: loss=0.173, reward_mean=0.440, reward_bound=0.349, batch=228
5449: loss=0.173, reward_mean=0.400, reward_bound=0.392, batch=229
5450: loss=0.175, reward_mean=0.530, reward_bound=0.430, batch=224
5451: loss=0.174, reward_mean=0.390, reward_bound=0.387, batch=226
5452: loss=0.173, reward_mean=0.500, reward_bound=0.368, batch=228
5453: loss=0.175, reward_mean=0.400, reward_bound=0.430, batch=227
5454: loss=0.176, reward_mean=0.420, reward_bound=0.422, batch=229
5455: loss=0.176, reward_mean=0.420, reward_bound=0.478, batch=231
5456: loss=0.176, reward_mean=0.460, reward_bound=0.387, batch=231
5457: loss=0.177, reward_mean=0.450, reward_bound=0.478, batch=213
5458: loss=0.180, reward_mean=0.380, reward_bound=0.301, batch

5568: loss=0.187, reward_mean=0.420, reward_bound=0.280, batch=220
5569: loss=0.189, reward_mean=0.460, reward_bound=0.282, batch=213
5570: loss=0.189, reward_mean=0.430, reward_bound=0.282, batch=218
5571: loss=0.189, reward_mean=0.430, reward_bound=0.314, batch=213
5572: loss=0.184, reward_mean=0.380, reward_bound=0.349, batch=205
5573: loss=0.185, reward_mean=0.370, reward_bound=0.234, batch=213
5574: loss=0.184, reward_mean=0.440, reward_bound=0.271, batch=219
5575: loss=0.177, reward_mean=0.430, reward_bound=0.282, batch=217
5576: loss=0.177, reward_mean=0.450, reward_bound=0.308, batch=222
5577: loss=0.180, reward_mean=0.450, reward_bound=0.272, batch=225
5578: loss=0.185, reward_mean=0.430, reward_bound=0.349, batch=225
5579: loss=0.184, reward_mean=0.440, reward_bound=0.329, batch=227
5580: loss=0.184, reward_mean=0.420, reward_bound=0.308, batch=229
5581: loss=0.186, reward_mean=0.390, reward_bound=0.387, batch=198
5582: loss=0.184, reward_mean=0.450, reward_bound=0.229, batch

5691: loss=0.168, reward_mean=0.460, reward_bound=0.387, batch=221
5692: loss=0.170, reward_mean=0.480, reward_bound=0.282, batch=224
5693: loss=0.171, reward_mean=0.400, reward_bound=0.314, batch=226
5694: loss=0.175, reward_mean=0.450, reward_bound=0.349, batch=227
5695: loss=0.177, reward_mean=0.360, reward_bound=0.373, batch=229
5696: loss=0.173, reward_mean=0.380, reward_bound=0.387, batch=226
5697: loss=0.173, reward_mean=0.430, reward_bound=0.368, batch=228
5698: loss=0.167, reward_mean=0.420, reward_bound=0.430, batch=220
5699: loss=0.169, reward_mean=0.440, reward_bound=0.314, batch=223
5700: loss=0.166, reward_mean=0.470, reward_bound=0.387, batch=225
5701: loss=0.166, reward_mean=0.400, reward_bound=0.365, batch=227
5702: loss=0.165, reward_mean=0.470, reward_bound=0.414, batch=229
5703: loss=0.165, reward_mean=0.460, reward_bound=0.430, batch=223
5704: loss=0.167, reward_mean=0.420, reward_bound=0.413, batch=226
5705: loss=0.169, reward_mean=0.430, reward_bound=0.351, batch

5814: loss=0.175, reward_mean=0.450, reward_bound=0.282, batch=218
5815: loss=0.174, reward_mean=0.460, reward_bound=0.260, batch=222
5816: loss=0.175, reward_mean=0.480, reward_bound=0.314, batch=222
5817: loss=0.176, reward_mean=0.400, reward_bound=0.324, batch=225
5818: loss=0.177, reward_mean=0.350, reward_bound=0.349, batch=222
5819: loss=0.176, reward_mean=0.480, reward_bound=0.360, batch=225
5820: loss=0.176, reward_mean=0.440, reward_bound=0.387, batch=226
5821: loss=0.174, reward_mean=0.480, reward_bound=0.409, batch=228
5822: loss=0.174, reward_mean=0.370, reward_bound=0.387, batch=228
5823: loss=0.173, reward_mean=0.400, reward_bound=0.430, batch=217
5824: loss=0.174, reward_mean=0.380, reward_bound=0.335, batch=222
5825: loss=0.175, reward_mean=0.440, reward_bound=0.349, batch=223
5826: loss=0.174, reward_mean=0.400, reward_bound=0.387, batch=220
5827: loss=0.176, reward_mean=0.420, reward_bound=0.282, batch=222
5828: loss=0.177, reward_mean=0.410, reward_bound=0.213, batch

5937: loss=0.173, reward_mean=0.460, reward_bound=0.217, batch=222
5938: loss=0.171, reward_mean=0.370, reward_bound=0.324, batch=225
5939: loss=0.171, reward_mean=0.370, reward_bound=0.356, batch=227
5940: loss=0.170, reward_mean=0.400, reward_bound=0.380, batch=229
5941: loss=0.168, reward_mean=0.450, reward_bound=0.387, batch=216
5942: loss=0.168, reward_mean=0.450, reward_bound=0.298, batch=221
5943: loss=0.169, reward_mean=0.410, reward_bound=0.349, batch=224
5944: loss=0.168, reward_mean=0.320, reward_bound=0.374, batch=227
5945: loss=0.167, reward_mean=0.340, reward_bound=0.277, batch=229
5946: loss=0.166, reward_mean=0.310, reward_bound=0.265, batch=230
5947: loss=0.167, reward_mean=0.430, reward_bound=0.282, batch=230
5948: loss=0.167, reward_mean=0.340, reward_bound=0.314, batch=230
5949: loss=0.167, reward_mean=0.380, reward_bound=0.349, batch=230
5950: loss=0.166, reward_mean=0.470, reward_bound=0.387, batch=229
5951: loss=0.170, reward_mean=0.370, reward_bound=0.430, batch

6061: loss=0.184, reward_mean=0.360, reward_bound=0.038, batch=206
6062: loss=0.190, reward_mean=0.480, reward_bound=0.052, batch=211
6063: loss=0.190, reward_mean=0.390, reward_bound=0.072, batch=215
6064: loss=0.193, reward_mean=0.400, reward_bound=0.089, batch=221
6065: loss=0.196, reward_mean=0.460, reward_bound=0.098, batch=220
6066: loss=0.192, reward_mean=0.410, reward_bound=0.122, batch=208
6067: loss=0.186, reward_mean=0.390, reward_bound=0.135, batch=207
6068: loss=0.194, reward_mean=0.450, reward_bound=0.150, batch=198
6069: loss=0.188, reward_mean=0.490, reward_bound=0.167, batch=185
6070: loss=0.185, reward_mean=0.540, reward_bound=0.185, batch=177
6071: loss=0.184, reward_mean=0.480, reward_bound=0.163, batch=194
6072: loss=0.183, reward_mean=0.350, reward_bound=0.167, batch=205
6073: loss=0.185, reward_mean=0.500, reward_bound=0.206, batch=184
6074: loss=0.186, reward_mean=0.440, reward_bound=0.165, batch=199
6075: loss=0.186, reward_mean=0.410, reward_bound=0.167, batch

6184: loss=0.207, reward_mean=0.400, reward_bound=0.387, batch=226
6185: loss=0.207, reward_mean=0.410, reward_bound=0.372, batch=228
6186: loss=0.199, reward_mean=0.430, reward_bound=0.430, batch=124
6187: loss=0.182, reward_mean=0.490, reward_bound=0.024, batch=157
6188: loss=0.198, reward_mean=0.420, reward_bound=0.031, batch=179
6189: loss=0.198, reward_mean=0.410, reward_bound=0.080, batch=191
6190: loss=0.202, reward_mean=0.430, reward_bound=0.098, batch=200
6191: loss=0.205, reward_mean=0.500, reward_bound=0.122, batch=200
6192: loss=0.204, reward_mean=0.390, reward_bound=0.135, batch=206
6193: loss=0.207, reward_mean=0.430, reward_bound=0.150, batch=212
6194: loss=0.200, reward_mean=0.500, reward_bound=0.206, batch=222
6195: loss=0.201, reward_mean=0.480, reward_bound=0.206, batch=234
6196: loss=0.196, reward_mean=0.460, reward_bound=0.226, batch=234
6197: loss=0.201, reward_mean=0.480, reward_bound=0.229, batch=230
6198: loss=0.205, reward_mean=0.510, reward_bound=0.254, batch

6307: loss=0.189, reward_mean=0.420, reward_bound=0.254, batch=199
6308: loss=0.187, reward_mean=0.430, reward_bound=0.215, batch=209
6309: loss=0.189, reward_mean=0.520, reward_bound=0.229, batch=213
6310: loss=0.187, reward_mean=0.420, reward_bound=0.254, batch=218
6311: loss=0.187, reward_mean=0.410, reward_bound=0.282, batch=196
6312: loss=0.185, reward_mean=0.530, reward_bound=0.168, batch=207
6313: loss=0.186, reward_mean=0.530, reward_bound=0.308, batch=215
6314: loss=0.185, reward_mean=0.370, reward_bound=0.289, batch=220
6315: loss=0.183, reward_mean=0.450, reward_bound=0.314, batch=204
6316: loss=0.177, reward_mean=0.440, reward_bound=0.206, batch=212
6317: loss=0.178, reward_mean=0.470, reward_bound=0.236, batch=218
6318: loss=0.182, reward_mean=0.420, reward_bound=0.257, batch=222
6319: loss=0.181, reward_mean=0.480, reward_bound=0.314, batch=220
6320: loss=0.182, reward_mean=0.480, reward_bound=0.349, batch=182
6321: loss=0.183, reward_mean=0.480, reward_bound=0.089, batch

6430: loss=0.187, reward_mean=0.410, reward_bound=0.118, batch=210
6431: loss=0.185, reward_mean=0.440, reward_bound=0.131, batch=217
6432: loss=0.191, reward_mean=0.470, reward_bound=0.206, batch=212
6433: loss=0.196, reward_mean=0.470, reward_bound=0.229, batch=215
6434: loss=0.192, reward_mean=0.410, reward_bound=0.254, batch=212
6435: loss=0.191, reward_mean=0.460, reward_bound=0.220, batch=218
6436: loss=0.196, reward_mean=0.490, reward_bound=0.282, batch=210
6437: loss=0.196, reward_mean=0.380, reward_bound=0.304, batch=217
6438: loss=0.194, reward_mean=0.520, reward_bound=0.308, batch=222
6439: loss=0.195, reward_mean=0.380, reward_bound=0.292, batch=225
6440: loss=0.199, reward_mean=0.440, reward_bound=0.314, batch=213
6441: loss=0.197, reward_mean=0.410, reward_bound=0.211, batch=219
6442: loss=0.195, reward_mean=0.460, reward_bound=0.239, batch=223
6443: loss=0.197, reward_mean=0.400, reward_bound=0.314, batch=220
6444: loss=0.199, reward_mean=0.450, reward_bound=0.349, batch

6553: loss=0.185, reward_mean=0.440, reward_bound=0.405, batch=230
6554: loss=0.185, reward_mean=0.400, reward_bound=0.347, batch=231
6555: loss=0.185, reward_mean=0.520, reward_bound=0.387, batch=231
6556: loss=0.185, reward_mean=0.500, reward_bound=0.478, batch=206
6557: loss=0.185, reward_mean=0.370, reward_bound=0.217, batch=214
6558: loss=0.185, reward_mean=0.430, reward_bound=0.229, batch=216
6559: loss=0.187, reward_mean=0.520, reward_bound=0.241, batch=221
6560: loss=0.188, reward_mean=0.420, reward_bound=0.282, batch=222
6561: loss=0.186, reward_mean=0.420, reward_bound=0.282, batch=224
6562: loss=0.185, reward_mean=0.520, reward_bound=0.314, batch=223
6563: loss=0.185, reward_mean=0.520, reward_bound=0.349, batch=222
6564: loss=0.186, reward_mean=0.520, reward_bound=0.360, batch=225
6565: loss=0.186, reward_mean=0.530, reward_bound=0.282, batch=226
6566: loss=0.188, reward_mean=0.410, reward_bound=0.387, batch=220
6567: loss=0.187, reward_mean=0.470, reward_bound=0.418, batch

6677: loss=0.151, reward_mean=0.510, reward_bound=0.167, batch=207
6678: loss=0.159, reward_mean=0.480, reward_bound=0.206, batch=209
6679: loss=0.157, reward_mean=0.420, reward_bound=0.203, batch=216
6680: loss=0.153, reward_mean=0.430, reward_bound=0.229, batch=220
6681: loss=0.154, reward_mean=0.470, reward_bound=0.254, batch=214
6682: loss=0.157, reward_mean=0.460, reward_bound=0.252, batch=220
6683: loss=0.157, reward_mean=0.500, reward_bound=0.282, batch=213
6684: loss=0.157, reward_mean=0.490, reward_bound=0.282, batch=218
6685: loss=0.156, reward_mean=0.340, reward_bound=0.229, batch=221
6686: loss=0.162, reward_mean=0.430, reward_bound=0.314, batch=207
6687: loss=0.162, reward_mean=0.420, reward_bound=0.282, batch=213
6688: loss=0.161, reward_mean=0.400, reward_bound=0.314, batch=216
6689: loss=0.166, reward_mean=0.480, reward_bound=0.349, batch=204
6690: loss=0.165, reward_mean=0.470, reward_bound=0.252, batch=213
6691: loss=0.162, reward_mean=0.570, reward_bound=0.244, batch

6800: loss=0.165, reward_mean=0.430, reward_bound=0.185, batch=209
6801: loss=0.155, reward_mean=0.540, reward_bound=0.254, batch=214
6802: loss=0.155, reward_mean=0.450, reward_bound=0.282, batch=214
6803: loss=0.156, reward_mean=0.400, reward_bound=0.206, batch=219
6804: loss=0.155, reward_mean=0.330, reward_bound=0.229, batch=222
6805: loss=0.159, reward_mean=0.370, reward_bound=0.236, batch=225
6806: loss=0.160, reward_mean=0.480, reward_bound=0.314, batch=221
6807: loss=0.160, reward_mean=0.400, reward_bound=0.314, batch=224
6808: loss=0.159, reward_mean=0.500, reward_bound=0.314, batch=226
6809: loss=0.165, reward_mean=0.470, reward_bound=0.349, batch=217
6810: loss=0.162, reward_mean=0.450, reward_bound=0.254, batch=221
6811: loss=0.163, reward_mean=0.370, reward_bound=0.254, batch=222
6812: loss=0.165, reward_mean=0.470, reward_bound=0.324, batch=225
6813: loss=0.164, reward_mean=0.510, reward_bound=0.321, batch=227
6814: loss=0.168, reward_mean=0.430, reward_bound=0.380, batch

6923: loss=0.155, reward_mean=0.420, reward_bound=0.301, batch=226
6924: loss=0.155, reward_mean=0.490, reward_bound=0.314, batch=226
6925: loss=0.154, reward_mean=0.440, reward_bound=0.349, batch=225
6926: loss=0.157, reward_mean=0.570, reward_bound=0.387, batch=212
6927: loss=0.161, reward_mean=0.480, reward_bound=0.213, batch=218
6928: loss=0.156, reward_mean=0.360, reward_bound=0.282, batch=221
6929: loss=0.156, reward_mean=0.460, reward_bound=0.349, batch=223
6930: loss=0.156, reward_mean=0.410, reward_bound=0.335, batch=226
6931: loss=0.155, reward_mean=0.480, reward_bound=0.349, batch=227
6932: loss=0.154, reward_mean=0.410, reward_bound=0.380, batch=229
6933: loss=0.155, reward_mean=0.430, reward_bound=0.387, batch=223
6934: loss=0.154, reward_mean=0.440, reward_bound=0.398, batch=226
6935: loss=0.153, reward_mean=0.440, reward_bound=0.430, batch=149
6936: loss=0.147, reward_mean=0.410, reward_bound=0.072, batch=173
6937: loss=0.142, reward_mean=0.530, reward_bound=0.122, batch

7046: loss=0.140, reward_mean=0.410, reward_bound=0.206, batch=213
7047: loss=0.137, reward_mean=0.420, reward_bound=0.254, batch=218
7048: loss=0.138, reward_mean=0.480, reward_bound=0.282, batch=219
7049: loss=0.136, reward_mean=0.380, reward_bound=0.254, batch=222
7050: loss=0.139, reward_mean=0.420, reward_bound=0.292, batch=225
7051: loss=0.142, reward_mean=0.510, reward_bound=0.314, batch=220
7052: loss=0.142, reward_mean=0.370, reward_bound=0.349, batch=214
7053: loss=0.142, reward_mean=0.400, reward_bound=0.339, batch=220
7054: loss=0.142, reward_mean=0.440, reward_bound=0.387, batch=198
7055: loss=0.141, reward_mean=0.360, reward_bound=0.208, batch=208
7056: loss=0.142, reward_mean=0.480, reward_bound=0.229, batch=213
7057: loss=0.138, reward_mean=0.370, reward_bound=0.254, batch=216
7058: loss=0.135, reward_mean=0.550, reward_bound=0.282, batch=219
7059: loss=0.138, reward_mean=0.330, reward_bound=0.314, batch=218
7060: loss=0.137, reward_mean=0.480, reward_bound=0.317, batch

7169: loss=0.140, reward_mean=0.450, reward_bound=0.430, batch=231
7170: loss=0.137, reward_mean=0.390, reward_bound=0.478, batch=202
7171: loss=0.145, reward_mean=0.390, reward_bound=0.185, batch=210
7172: loss=0.134, reward_mean=0.500, reward_bound=0.282, batch=213
7173: loss=0.133, reward_mean=0.370, reward_bound=0.282, batch=218
7174: loss=0.131, reward_mean=0.460, reward_bound=0.231, batch=222
7175: loss=0.132, reward_mean=0.440, reward_bound=0.282, batch=224
7176: loss=0.132, reward_mean=0.430, reward_bound=0.314, batch=221
7177: loss=0.135, reward_mean=0.440, reward_bound=0.349, batch=222
7178: loss=0.137, reward_mean=0.410, reward_bound=0.349, batch=224
7179: loss=0.137, reward_mean=0.380, reward_bound=0.384, batch=227
7180: loss=0.137, reward_mean=0.480, reward_bound=0.387, batch=222
7181: loss=0.137, reward_mean=0.400, reward_bound=0.387, batch=223
7182: loss=0.136, reward_mean=0.470, reward_bound=0.314, batch=225
7183: loss=0.136, reward_mean=0.400, reward_bound=0.430, batch

7293: loss=0.147, reward_mean=0.330, reward_bound=0.282, batch=210
7294: loss=0.149, reward_mean=0.430, reward_bound=0.194, batch=217
7295: loss=0.148, reward_mean=0.380, reward_bound=0.229, batch=220
7296: loss=0.148, reward_mean=0.370, reward_bound=0.304, batch=224
7297: loss=0.149, reward_mean=0.400, reward_bound=0.314, batch=176
7298: loss=0.153, reward_mean=0.460, reward_bound=0.128, batch=193
7299: loss=0.152, reward_mean=0.450, reward_bound=0.135, batch=203
7300: loss=0.147, reward_mean=0.450, reward_bound=0.185, batch=207
7301: loss=0.142, reward_mean=0.500, reward_bound=0.229, batch=214
7302: loss=0.144, reward_mean=0.390, reward_bound=0.254, batch=209
7303: loss=0.146, reward_mean=0.520, reward_bound=0.282, batch=203
7304: loss=0.145, reward_mean=0.410, reward_bound=0.229, batch=210
7305: loss=0.145, reward_mean=0.420, reward_bound=0.167, batch=216
7306: loss=0.145, reward_mean=0.420, reward_bound=0.241, batch=221
7307: loss=0.145, reward_mean=0.440, reward_bound=0.282, batch

7416: loss=0.166, reward_mean=0.480, reward_bound=0.387, batch=185
7417: loss=0.164, reward_mean=0.390, reward_bound=0.115, batch=199
7418: loss=0.158, reward_mean=0.620, reward_bound=0.265, batch=209
7419: loss=0.158, reward_mean=0.520, reward_bound=0.254, batch=215
7420: loss=0.159, reward_mean=0.350, reward_bound=0.260, batch=220
7421: loss=0.161, reward_mean=0.370, reward_bound=0.282, batch=220
7422: loss=0.164, reward_mean=0.440, reward_bound=0.314, batch=209
7423: loss=0.165, reward_mean=0.470, reward_bound=0.314, batch=215
7424: loss=0.166, reward_mean=0.510, reward_bound=0.349, batch=211
7425: loss=0.165, reward_mean=0.510, reward_bound=0.282, batch=215
7426: loss=0.165, reward_mean=0.430, reward_bound=0.314, batch=219
7427: loss=0.165, reward_mean=0.450, reward_bound=0.254, batch=222
7428: loss=0.165, reward_mean=0.430, reward_bound=0.282, batch=222
7429: loss=0.166, reward_mean=0.470, reward_bound=0.324, batch=225
7430: loss=0.163, reward_mean=0.450, reward_bound=0.387, batch

7539: loss=0.169, reward_mean=0.420, reward_bound=0.145, batch=204
7540: loss=0.170, reward_mean=0.460, reward_bound=0.226, batch=213
7541: loss=0.167, reward_mean=0.360, reward_bound=0.229, batch=211
7542: loss=0.167, reward_mean=0.390, reward_bound=0.254, batch=215
7543: loss=0.166, reward_mean=0.440, reward_bound=0.282, batch=212
7544: loss=0.166, reward_mean=0.400, reward_bound=0.263, batch=218
7545: loss=0.169, reward_mean=0.460, reward_bound=0.314, batch=211
7546: loss=0.169, reward_mean=0.400, reward_bound=0.314, batch=217
7547: loss=0.166, reward_mean=0.440, reward_bound=0.349, batch=212
7548: loss=0.168, reward_mean=0.420, reward_bound=0.314, batch=216
7549: loss=0.170, reward_mean=0.430, reward_bound=0.314, batch=218
7550: loss=0.171, reward_mean=0.420, reward_bound=0.282, batch=221
7551: loss=0.171, reward_mean=0.490, reward_bound=0.349, batch=221
7552: loss=0.173, reward_mean=0.430, reward_bound=0.387, batch=210
7553: loss=0.172, reward_mean=0.480, reward_bound=0.296, batch

7662: loss=0.174, reward_mean=0.440, reward_bound=0.274, batch=217
7663: loss=0.175, reward_mean=0.530, reward_bound=0.277, batch=222
7664: loss=0.172, reward_mean=0.440, reward_bound=0.292, batch=225
7665: loss=0.173, reward_mean=0.450, reward_bound=0.349, batch=204
7666: loss=0.171, reward_mean=0.400, reward_bound=0.183, batch=213
7667: loss=0.173, reward_mean=0.440, reward_bound=0.206, batch=218
7668: loss=0.176, reward_mean=0.470, reward_bound=0.254, batch=217
7669: loss=0.171, reward_mean=0.460, reward_bound=0.282, batch=216
7670: loss=0.170, reward_mean=0.410, reward_bound=0.314, batch=218
7671: loss=0.170, reward_mean=0.440, reward_bound=0.260, batch=222
7672: loss=0.173, reward_mean=0.410, reward_bound=0.314, batch=222
7673: loss=0.171, reward_mean=0.430, reward_bound=0.263, batch=225
7674: loss=0.170, reward_mean=0.470, reward_bound=0.349, batch=221
7675: loss=0.170, reward_mean=0.440, reward_bound=0.349, batch=224
7676: loss=0.170, reward_mean=0.400, reward_bound=0.345, batch

7785: loss=0.172, reward_mean=0.500, reward_bound=0.282, batch=215
7786: loss=0.171, reward_mean=0.460, reward_bound=0.314, batch=214
7787: loss=0.171, reward_mean=0.400, reward_bound=0.314, batch=219
7788: loss=0.172, reward_mean=0.340, reward_bound=0.314, batch=222
7789: loss=0.169, reward_mean=0.470, reward_bound=0.349, batch=222
7790: loss=0.169, reward_mean=0.450, reward_bound=0.360, batch=225
7791: loss=0.169, reward_mean=0.410, reward_bound=0.314, batch=226
7792: loss=0.170, reward_mean=0.400, reward_bound=0.387, batch=216
7793: loss=0.169, reward_mean=0.320, reward_bound=0.241, batch=221
7794: loss=0.169, reward_mean=0.450, reward_bound=0.282, batch=224
7795: loss=0.168, reward_mean=0.340, reward_bound=0.345, batch=227
7796: loss=0.172, reward_mean=0.440, reward_bound=0.349, batch=228
7797: loss=0.172, reward_mean=0.420, reward_bound=0.321, batch=229
7798: loss=0.171, reward_mean=0.470, reward_bound=0.405, batch=230
7799: loss=0.171, reward_mean=0.390, reward_bound=0.406, batch

7909: loss=0.160, reward_mean=0.420, reward_bound=0.349, batch=157
7910: loss=0.161, reward_mean=0.450, reward_bound=0.037, batch=180
7911: loss=0.169, reward_mean=0.300, reward_bound=0.041, batch=196
7912: loss=0.162, reward_mean=0.420, reward_bound=0.068, batch=207
7913: loss=0.161, reward_mean=0.450, reward_bound=0.132, batch=215
7914: loss=0.161, reward_mean=0.400, reward_bound=0.138, batch=220
7915: loss=0.159, reward_mean=0.380, reward_bound=0.167, batch=221
7916: loss=0.156, reward_mean=0.370, reward_bound=0.185, batch=220
7917: loss=0.159, reward_mean=0.420, reward_bound=0.206, batch=231
7918: loss=0.163, reward_mean=0.470, reward_bound=0.206, batch=229
7919: loss=0.163, reward_mean=0.470, reward_bound=0.229, batch=228
7920: loss=0.159, reward_mean=0.370, reward_bound=0.254, batch=220
7921: loss=0.165, reward_mean=0.410, reward_bound=0.282, batch=212
7922: loss=0.166, reward_mean=0.370, reward_bound=0.314, batch=205
7923: loss=0.167, reward_mean=0.440, reward_bound=0.289, batch

8032: loss=0.174, reward_mean=0.380, reward_bound=0.387, batch=213
8033: loss=0.173, reward_mean=0.370, reward_bound=0.271, batch=219
8034: loss=0.174, reward_mean=0.490, reward_bound=0.265, batch=223
8035: loss=0.173, reward_mean=0.350, reward_bound=0.282, batch=224
8036: loss=0.173, reward_mean=0.370, reward_bound=0.314, batch=226
8037: loss=0.173, reward_mean=0.440, reward_bound=0.349, batch=226
8038: loss=0.173, reward_mean=0.430, reward_bound=0.387, batch=223
8039: loss=0.172, reward_mean=0.320, reward_bound=0.314, batch=225
8040: loss=0.172, reward_mean=0.490, reward_bound=0.314, batch=225
8041: loss=0.172, reward_mean=0.390, reward_bound=0.314, batch=226
8042: loss=0.166, reward_mean=0.480, reward_bound=0.430, batch=173
8043: loss=0.168, reward_mean=0.440, reward_bound=0.144, batch=191
8044: loss=0.172, reward_mean=0.390, reward_bound=0.150, batch=201
8045: loss=0.176, reward_mean=0.350, reward_bound=0.167, batch=209
8046: loss=0.172, reward_mean=0.420, reward_bound=0.185, batch

8155: loss=0.154, reward_mean=0.460, reward_bound=0.314, batch=221
8156: loss=0.153, reward_mean=0.430, reward_bound=0.314, batch=224
8157: loss=0.154, reward_mean=0.400, reward_bound=0.387, batch=210
8158: loss=0.154, reward_mean=0.410, reward_bound=0.254, batch=216
8159: loss=0.155, reward_mean=0.340, reward_bound=0.282, batch=220
8160: loss=0.156, reward_mean=0.390, reward_bound=0.329, batch=224
8161: loss=0.157, reward_mean=0.390, reward_bound=0.349, batch=224
8162: loss=0.157, reward_mean=0.380, reward_bound=0.384, batch=227
8163: loss=0.154, reward_mean=0.490, reward_bound=0.387, batch=222
8164: loss=0.154, reward_mean=0.440, reward_bound=0.400, batch=225
8165: loss=0.153, reward_mean=0.500, reward_bound=0.430, batch=180
8166: loss=0.151, reward_mean=0.390, reward_bound=0.135, batch=193
8167: loss=0.151, reward_mean=0.440, reward_bound=0.167, batch=203
8168: loss=0.154, reward_mean=0.450, reward_bound=0.229, batch=210
8169: loss=0.154, reward_mean=0.450, reward_bound=0.222, batch

8278: loss=0.159, reward_mean=0.480, reward_bound=0.206, batch=213
8279: loss=0.161, reward_mean=0.390, reward_bound=0.206, batch=218
8280: loss=0.165, reward_mean=0.410, reward_bound=0.254, batch=220
8281: loss=0.162, reward_mean=0.390, reward_bound=0.282, batch=223
8282: loss=0.162, reward_mean=0.430, reward_bound=0.314, batch=223
8283: loss=0.156, reward_mean=0.520, reward_bound=0.349, batch=212
8284: loss=0.162, reward_mean=0.480, reward_bound=0.206, batch=222
8285: loss=0.161, reward_mean=0.480, reward_bound=0.206, batch=229
8286: loss=0.154, reward_mean=0.420, reward_bound=0.239, batch=230
8287: loss=0.155, reward_mean=0.330, reward_bound=0.282, batch=229
8288: loss=0.158, reward_mean=0.470, reward_bound=0.314, batch=227
8289: loss=0.159, reward_mean=0.380, reward_bound=0.249, batch=229
8290: loss=0.161, reward_mean=0.410, reward_bound=0.282, batch=229
8291: loss=0.161, reward_mean=0.440, reward_bound=0.349, batch=228
8292: loss=0.161, reward_mean=0.430, reward_bound=0.387, batch

8402: loss=0.140, reward_mean=0.410, reward_bound=0.254, batch=217
8403: loss=0.135, reward_mean=0.470, reward_bound=0.282, batch=213
8404: loss=0.133, reward_mean=0.480, reward_bound=0.314, batch=208
8405: loss=0.134, reward_mean=0.430, reward_bound=0.257, batch=215
8406: loss=0.135, reward_mean=0.420, reward_bound=0.282, batch=219
8407: loss=0.135, reward_mean=0.430, reward_bound=0.314, batch=221
8408: loss=0.132, reward_mean=0.350, reward_bound=0.349, batch=167
8409: loss=0.133, reward_mean=0.420, reward_bound=0.089, batch=189
8410: loss=0.133, reward_mean=0.490, reward_bound=0.122, batch=200
8411: loss=0.128, reward_mean=0.420, reward_bound=0.162, batch=210
8412: loss=0.134, reward_mean=0.350, reward_bound=0.167, batch=214
8413: loss=0.132, reward_mean=0.490, reward_bound=0.206, batch=216
8414: loss=0.132, reward_mean=0.490, reward_bound=0.217, batch=221
8415: loss=0.130, reward_mean=0.440, reward_bound=0.229, batch=219
8416: loss=0.128, reward_mean=0.500, reward_bound=0.282, batch

8525: loss=0.127, reward_mean=0.410, reward_bound=0.282, batch=225
8526: loss=0.126, reward_mean=0.500, reward_bound=0.314, batch=223
8527: loss=0.127, reward_mean=0.410, reward_bound=0.349, batch=219
8528: loss=0.128, reward_mean=0.420, reward_bound=0.387, batch=209
8529: loss=0.127, reward_mean=0.460, reward_bound=0.328, batch=216
8530: loss=0.127, reward_mean=0.410, reward_bound=0.298, batch=221
8531: loss=0.127, reward_mean=0.390, reward_bound=0.314, batch=223
8532: loss=0.128, reward_mean=0.410, reward_bound=0.349, batch=222
8533: loss=0.128, reward_mean=0.380, reward_bound=0.360, batch=225
8534: loss=0.128, reward_mean=0.380, reward_bound=0.260, batch=227
8535: loss=0.129, reward_mean=0.480, reward_bound=0.308, batch=229
8536: loss=0.128, reward_mean=0.370, reward_bound=0.387, batch=220
8537: loss=0.127, reward_mean=0.360, reward_bound=0.240, batch=224
8538: loss=0.127, reward_mean=0.410, reward_bound=0.280, batch=227
8539: loss=0.128, reward_mean=0.460, reward_bound=0.342, batch

8648: loss=0.121, reward_mean=0.350, reward_bound=0.257, batch=222
8649: loss=0.119, reward_mean=0.450, reward_bound=0.349, batch=222
8650: loss=0.119, reward_mean=0.490, reward_bound=0.387, batch=221
8651: loss=0.119, reward_mean=0.380, reward_bound=0.387, batch=223
8652: loss=0.120, reward_mean=0.370, reward_bound=0.301, batch=226
8653: loss=0.119, reward_mean=0.480, reward_bound=0.349, batch=226
8654: loss=0.119, reward_mean=0.500, reward_bound=0.368, batch=228
8655: loss=0.120, reward_mean=0.460, reward_bound=0.392, batch=229
8656: loss=0.119, reward_mean=0.480, reward_bound=0.430, batch=153
8657: loss=0.120, reward_mean=0.470, reward_bound=0.080, batch=176
8658: loss=0.114, reward_mean=0.420, reward_bound=0.104, batch=193
8659: loss=0.115, reward_mean=0.410, reward_bound=0.122, batch=204
8660: loss=0.112, reward_mean=0.440, reward_bound=0.150, batch=210
8661: loss=0.112, reward_mean=0.390, reward_bound=0.185, batch=210
8662: loss=0.114, reward_mean=0.420, reward_bound=0.206, batch

8771: loss=0.114, reward_mean=0.410, reward_bound=0.282, batch=224
8772: loss=0.113, reward_mean=0.370, reward_bound=0.349, batch=224
8773: loss=0.113, reward_mean=0.440, reward_bound=0.387, batch=198
8774: loss=0.115, reward_mean=0.450, reward_bound=0.152, batch=208
8775: loss=0.110, reward_mean=0.430, reward_bound=0.187, batch=215
8776: loss=0.112, reward_mean=0.510, reward_bound=0.254, batch=218
8777: loss=0.114, reward_mean=0.400, reward_bound=0.231, batch=222
8778: loss=0.114, reward_mean=0.420, reward_bound=0.254, batch=223
8779: loss=0.114, reward_mean=0.470, reward_bound=0.282, batch=222
8780: loss=0.113, reward_mean=0.410, reward_bound=0.314, batch=219
8781: loss=0.113, reward_mean=0.400, reward_bound=0.349, batch=218
8782: loss=0.113, reward_mean=0.480, reward_bound=0.282, batch=221
8783: loss=0.113, reward_mean=0.400, reward_bound=0.314, batch=224
8784: loss=0.112, reward_mean=0.510, reward_bound=0.311, batch=227
8785: loss=0.112, reward_mean=0.430, reward_bound=0.349, batch

8894: loss=0.112, reward_mean=0.450, reward_bound=0.380, batch=229
8895: loss=0.112, reward_mean=0.380, reward_bound=0.387, batch=229
8896: loss=0.112, reward_mean=0.470, reward_bound=0.430, batch=228
8897: loss=0.113, reward_mean=0.420, reward_bound=0.478, batch=231
8898: loss=0.112, reward_mean=0.500, reward_bound=0.478, batch=227
8899: loss=0.112, reward_mean=0.450, reward_bound=0.422, batch=229
8900: loss=0.112, reward_mean=0.380, reward_bound=0.450, batch=230
8901: loss=0.112, reward_mean=0.440, reward_bound=0.430, batch=230
8902: loss=0.112, reward_mean=0.450, reward_bound=0.439, batch=231
8903: loss=0.112, reward_mean=0.540, reward_bound=0.349, batch=231
8904: loss=0.112, reward_mean=0.480, reward_bound=0.478, batch=231
8905: loss=0.112, reward_mean=0.370, reward_bound=0.282, batch=231
8906: loss=0.112, reward_mean=0.390, reward_bound=0.430, batch=231
8908: loss=0.105, reward_mean=0.410, reward_bound=0.000, batch=41
8909: loss=0.101, reward_mean=0.440, reward_bound=0.000, batch=

9018: loss=0.108, reward_mean=0.430, reward_bound=0.257, batch=222
9019: loss=0.110, reward_mean=0.470, reward_bound=0.349, batch=219
9020: loss=0.109, reward_mean=0.480, reward_bound=0.282, batch=222
9021: loss=0.110, reward_mean=0.460, reward_bound=0.349, batch=224
9022: loss=0.111, reward_mean=0.440, reward_bound=0.387, batch=224
9023: loss=0.112, reward_mean=0.540, reward_bound=0.345, batch=227
9024: loss=0.112, reward_mean=0.370, reward_bound=0.380, batch=229
9025: loss=0.111, reward_mean=0.510, reward_bound=0.387, batch=228
9026: loss=0.112, reward_mean=0.420, reward_bound=0.430, batch=118
9027: loss=0.097, reward_mean=0.410, reward_bound=0.010, batch=152
9028: loss=0.108, reward_mean=0.470, reward_bound=0.026, batch=176
9029: loss=0.109, reward_mean=0.520, reward_bound=0.061, batch=193
9030: loss=0.116, reward_mean=0.380, reward_bound=0.080, batch=204
9031: loss=0.115, reward_mean=0.450, reward_bound=0.120, batch=213
9032: loss=0.115, reward_mean=0.420, reward_bound=0.122, batch

9141: loss=0.117, reward_mean=0.350, reward_bound=0.150, batch=213
9142: loss=0.117, reward_mean=0.500, reward_bound=0.167, batch=215
9143: loss=0.123, reward_mean=0.400, reward_bound=0.185, batch=212
9144: loss=0.123, reward_mean=0.420, reward_bound=0.206, batch=224
9145: loss=0.124, reward_mean=0.510, reward_bound=0.206, batch=215
9146: loss=0.125, reward_mean=0.540, reward_bound=0.229, batch=204
9147: loss=0.122, reward_mean=0.450, reward_bound=0.183, batch=213
9148: loss=0.120, reward_mean=0.460, reward_bound=0.254, batch=202
9149: loss=0.121, reward_mean=0.480, reward_bound=0.245, batch=211
9150: loss=0.120, reward_mean=0.410, reward_bound=0.254, batch=217
9151: loss=0.113, reward_mean=0.490, reward_bound=0.282, batch=199
9152: loss=0.112, reward_mean=0.500, reward_bound=0.215, batch=209
9153: loss=0.112, reward_mean=0.370, reward_bound=0.215, batch=216
9154: loss=0.115, reward_mean=0.460, reward_bound=0.229, batch=219
9155: loss=0.114, reward_mean=0.460, reward_bound=0.254, batch

9264: loss=0.109, reward_mean=0.380, reward_bound=0.430, batch=220
9265: loss=0.109, reward_mean=0.350, reward_bound=0.376, batch=224
9266: loss=0.109, reward_mean=0.490, reward_bound=0.387, batch=225
9267: loss=0.109, reward_mean=0.520, reward_bound=0.396, batch=227
9268: loss=0.109, reward_mean=0.410, reward_bound=0.430, batch=225
9269: loss=0.107, reward_mean=0.450, reward_bound=0.478, batch=151
9270: loss=0.102, reward_mean=0.420, reward_bound=0.038, batch=175
9271: loss=0.101, reward_mean=0.500, reward_bound=0.101, batch=192
9272: loss=0.104, reward_mean=0.410, reward_bound=0.135, batch=203
9273: loss=0.109, reward_mean=0.570, reward_bound=0.150, batch=211
9274: loss=0.107, reward_mean=0.420, reward_bound=0.185, batch=211
9275: loss=0.113, reward_mean=0.490, reward_bound=0.206, batch=216
9276: loss=0.112, reward_mean=0.400, reward_bound=0.229, batch=209
9277: loss=0.111, reward_mean=0.470, reward_bound=0.254, batch=202
9278: loss=0.110, reward_mean=0.440, reward_bound=0.191, batch

9387: loss=0.111, reward_mean=0.460, reward_bound=0.293, batch=229
9388: loss=0.111, reward_mean=0.510, reward_bound=0.430, batch=217
9389: loss=0.110, reward_mean=0.530, reward_bound=0.380, batch=222
9390: loss=0.110, reward_mean=0.530, reward_bound=0.349, batch=224
9391: loss=0.110, reward_mean=0.430, reward_bound=0.387, batch=223
9392: loss=0.110, reward_mean=0.480, reward_bound=0.349, batch=225
9393: loss=0.110, reward_mean=0.290, reward_bound=0.356, batch=227
9394: loss=0.110, reward_mean=0.450, reward_bound=0.373, batch=229
9395: loss=0.110, reward_mean=0.450, reward_bound=0.387, batch=228
9396: loss=0.110, reward_mean=0.360, reward_bound=0.353, batch=229
9397: loss=0.110, reward_mean=0.390, reward_bound=0.430, batch=224
9398: loss=0.110, reward_mean=0.490, reward_bound=0.387, batch=226
9399: loss=0.110, reward_mean=0.450, reward_bound=0.368, batch=228
9400: loss=0.109, reward_mean=0.450, reward_bound=0.397, batch=229
9401: loss=0.109, reward_mean=0.470, reward_bound=0.478, batch

9511: loss=0.111, reward_mean=0.440, reward_bound=0.295, batch=223
9512: loss=0.111, reward_mean=0.330, reward_bound=0.301, batch=226
9513: loss=0.111, reward_mean=0.410, reward_bound=0.314, batch=225
9514: loss=0.107, reward_mean=0.470, reward_bound=0.349, batch=154
9515: loss=0.105, reward_mean=0.410, reward_bound=0.065, batch=176
9516: loss=0.106, reward_mean=0.510, reward_bound=0.104, batch=193
9517: loss=0.107, reward_mean=0.420, reward_bound=0.135, batch=202
9518: loss=0.108, reward_mean=0.500, reward_bound=0.150, batch=209
9519: loss=0.106, reward_mean=0.490, reward_bound=0.185, batch=211
9520: loss=0.111, reward_mean=0.450, reward_bound=0.229, batch=212
9521: loss=0.113, reward_mean=0.390, reward_bound=0.254, batch=205
9522: loss=0.113, reward_mean=0.450, reward_bound=0.260, batch=213
9523: loss=0.111, reward_mean=0.380, reward_bound=0.229, batch=218
9524: loss=0.112, reward_mean=0.400, reward_bound=0.282, batch=210
9525: loss=0.115, reward_mean=0.440, reward_bound=0.247, batch

9634: loss=0.111, reward_mean=0.450, reward_bound=0.206, batch=206
9635: loss=0.110, reward_mean=0.510, reward_bound=0.229, batch=212
9636: loss=0.112, reward_mean=0.400, reward_bound=0.254, batch=215
9637: loss=0.111, reward_mean=0.470, reward_bound=0.260, batch=220
9638: loss=0.111, reward_mean=0.470, reward_bound=0.282, batch=220
9639: loss=0.111, reward_mean=0.520, reward_bound=0.314, batch=221
9640: loss=0.113, reward_mean=0.530, reward_bound=0.282, batch=224
9641: loss=0.112, reward_mean=0.460, reward_bound=0.314, batch=225
9642: loss=0.107, reward_mean=0.420, reward_bound=0.349, batch=215
9643: loss=0.107, reward_mean=0.500, reward_bound=0.234, batch=220
9644: loss=0.107, reward_mean=0.440, reward_bound=0.338, batch=224
9645: loss=0.108, reward_mean=0.430, reward_bound=0.349, batch=226
9646: loss=0.108, reward_mean=0.480, reward_bound=0.387, batch=216
9647: loss=0.109, reward_mean=0.440, reward_bound=0.351, batch=221
9648: loss=0.109, reward_mean=0.490, reward_bound=0.349, batch

9757: loss=0.095, reward_mean=0.430, reward_bound=0.387, batch=226
9758: loss=0.095, reward_mean=0.480, reward_bound=0.351, batch=228
9759: loss=0.096, reward_mean=0.480, reward_bound=0.387, batch=228
9760: loss=0.095, reward_mean=0.380, reward_bound=0.325, batch=229
9761: loss=0.095, reward_mean=0.400, reward_bound=0.309, batch=230
9762: loss=0.095, reward_mean=0.470, reward_bound=0.418, batch=231
9763: loss=0.095, reward_mean=0.450, reward_bound=0.387, batch=231
9764: loss=0.095, reward_mean=0.420, reward_bound=0.387, batch=231
9765: loss=0.101, reward_mean=0.460, reward_bound=0.430, batch=169
9766: loss=0.096, reward_mean=0.380, reward_bound=0.049, batch=188
9767: loss=0.098, reward_mean=0.390, reward_bound=0.080, batch=200
9768: loss=0.096, reward_mean=0.440, reward_bound=0.122, batch=207
9769: loss=0.094, reward_mean=0.490, reward_bound=0.147, batch=215
9770: loss=0.097, reward_mean=0.460, reward_bound=0.167, batch=218
9771: loss=0.095, reward_mean=0.500, reward_bound=0.185, batch

9880: loss=0.105, reward_mean=0.420, reward_bound=0.387, batch=229
9881: loss=0.106, reward_mean=0.530, reward_bound=0.430, batch=192
9882: loss=0.105, reward_mean=0.500, reward_bound=0.167, batch=203
9883: loss=0.105, reward_mean=0.530, reward_bound=0.198, batch=212
9884: loss=0.108, reward_mean=0.500, reward_bound=0.254, batch=215
9885: loss=0.107, reward_mean=0.420, reward_bound=0.282, batch=213
9886: loss=0.106, reward_mean=0.480, reward_bound=0.314, batch=214
9887: loss=0.105, reward_mean=0.400, reward_bound=0.183, batch=220
9888: loss=0.105, reward_mean=0.510, reward_bound=0.314, batch=223
9889: loss=0.107, reward_mean=0.460, reward_bound=0.349, batch=215
9890: loss=0.107, reward_mean=0.500, reward_bound=0.356, batch=220
9891: loss=0.113, reward_mean=0.420, reward_bound=0.365, batch=224
9892: loss=0.107, reward_mean=0.480, reward_bound=0.387, batch=213
9893: loss=0.105, reward_mean=0.530, reward_bound=0.290, batch=219
9894: loss=0.105, reward_mean=0.450, reward_bound=0.295, batch

10003: loss=0.107, reward_mean=0.440, reward_bound=0.478, batch=216
10004: loss=0.106, reward_mean=0.420, reward_bound=0.284, batch=221
10005: loss=0.106, reward_mean=0.330, reward_bound=0.314, batch=224
10006: loss=0.108, reward_mean=0.460, reward_bound=0.311, batch=227
10007: loss=0.107, reward_mean=0.420, reward_bound=0.342, batch=229
10008: loss=0.106, reward_mean=0.480, reward_bound=0.349, batch=227
10009: loss=0.107, reward_mean=0.520, reward_bound=0.387, batch=226
10010: loss=0.107, reward_mean=0.390, reward_bound=0.430, batch=224
10011: loss=0.106, reward_mean=0.350, reward_bound=0.345, batch=227
10012: loss=0.108, reward_mean=0.450, reward_bound=0.387, batch=226
10013: loss=0.108, reward_mean=0.450, reward_bound=0.430, batch=227
10014: loss=0.108, reward_mean=0.460, reward_bound=0.478, batch=225
10015: loss=0.109, reward_mean=0.410, reward_bound=0.440, batch=227
10016: loss=0.109, reward_mean=0.390, reward_bound=0.460, batch=229
10017: loss=0.109, reward_mean=0.430, reward_bou

10125: loss=0.086, reward_mean=0.500, reward_bound=0.257, batch=215
10126: loss=0.087, reward_mean=0.520, reward_bound=0.282, batch=218
10127: loss=0.088, reward_mean=0.510, reward_bound=0.282, batch=221
10128: loss=0.086, reward_mean=0.520, reward_bound=0.314, batch=221
10129: loss=0.084, reward_mean=0.400, reward_bound=0.349, batch=202
10130: loss=0.083, reward_mean=0.340, reward_bound=0.150, batch=209
10131: loss=0.084, reward_mean=0.470, reward_bound=0.229, batch=212
10132: loss=0.085, reward_mean=0.440, reward_bound=0.282, batch=216
10133: loss=0.086, reward_mean=0.340, reward_bound=0.241, batch=221
10134: loss=0.085, reward_mean=0.480, reward_bound=0.314, batch=220
10135: loss=0.084, reward_mean=0.500, reward_bound=0.349, batch=219
10136: loss=0.084, reward_mean=0.420, reward_bound=0.203, batch=223
10137: loss=0.086, reward_mean=0.460, reward_bound=0.271, batch=226
10138: loss=0.086, reward_mean=0.370, reward_bound=0.254, batch=227
10139: loss=0.085, reward_mean=0.350, reward_bou

10246: loss=0.082, reward_mean=0.510, reward_bound=0.376, batch=224
10247: loss=0.083, reward_mean=0.420, reward_bound=0.387, batch=223
10248: loss=0.084, reward_mean=0.390, reward_bound=0.360, batch=226
10249: loss=0.084, reward_mean=0.540, reward_bound=0.316, batch=228
10250: loss=0.085, reward_mean=0.480, reward_bound=0.353, batch=229
10251: loss=0.085, reward_mean=0.440, reward_bound=0.405, batch=230
10252: loss=0.083, reward_mean=0.520, reward_bound=0.430, batch=223
10253: loss=0.082, reward_mean=0.490, reward_bound=0.290, batch=226
10254: loss=0.084, reward_mean=0.410, reward_bound=0.387, batch=227
10255: loss=0.083, reward_mean=0.460, reward_bound=0.430, batch=228
10256: loss=0.083, reward_mean=0.440, reward_bound=0.362, batch=229
10257: loss=0.084, reward_mean=0.590, reward_bound=0.478, batch=232
10258: loss=0.082, reward_mean=0.500, reward_bound=0.478, batch=93
10259: loss=0.077, reward_mean=0.430, reward_bound=0.003, batch=135
10260: loss=0.077, reward_mean=0.490, reward_boun

10367: loss=0.081, reward_mean=0.520, reward_bound=0.349, batch=228
10368: loss=0.082, reward_mean=0.370, reward_bound=0.392, batch=229
10369: loss=0.081, reward_mean=0.490, reward_bound=0.430, batch=199
10370: loss=0.079, reward_mean=0.470, reward_bound=0.229, batch=208
10371: loss=0.081, reward_mean=0.490, reward_bound=0.231, batch=215
10372: loss=0.078, reward_mean=0.400, reward_bound=0.282, batch=218
10373: loss=0.079, reward_mean=0.440, reward_bound=0.314, batch=217
10374: loss=0.079, reward_mean=0.400, reward_bound=0.342, batch=222
10375: loss=0.079, reward_mean=0.400, reward_bound=0.314, batch=224
10376: loss=0.077, reward_mean=0.510, reward_bound=0.349, batch=219
10377: loss=0.078, reward_mean=0.460, reward_bound=0.282, batch=222
10378: loss=0.077, reward_mean=0.470, reward_bound=0.349, batch=224
10379: loss=0.076, reward_mean=0.370, reward_bound=0.345, batch=227
10380: loss=0.076, reward_mean=0.410, reward_bound=0.229, batch=228
10381: loss=0.076, reward_mean=0.450, reward_bou

10488: loss=0.072, reward_mean=0.460, reward_bound=0.380, batch=222
10489: loss=0.073, reward_mean=0.480, reward_bound=0.324, batch=225
10490: loss=0.074, reward_mean=0.360, reward_bound=0.356, batch=227
10491: loss=0.076, reward_mean=0.440, reward_bound=0.387, batch=215
10492: loss=0.076, reward_mean=0.480, reward_bound=0.281, batch=220
10493: loss=0.076, reward_mean=0.430, reward_bound=0.338, batch=224
10494: loss=0.073, reward_mean=0.490, reward_bound=0.430, batch=207
10495: loss=0.071, reward_mean=0.380, reward_bound=0.314, batch=212
10496: loss=0.076, reward_mean=0.420, reward_bound=0.213, batch=218
10497: loss=0.075, reward_mean=0.320, reward_bound=0.257, batch=222
10498: loss=0.073, reward_mean=0.430, reward_bound=0.314, batch=222
10499: loss=0.073, reward_mean=0.520, reward_bound=0.349, batch=221
10500: loss=0.073, reward_mean=0.510, reward_bound=0.349, batch=223
10501: loss=0.072, reward_mean=0.520, reward_bound=0.335, batch=226
10502: loss=0.071, reward_mean=0.510, reward_bou

10610: loss=0.076, reward_mean=0.450, reward_bound=0.135, batch=196
10611: loss=0.075, reward_mean=0.510, reward_bound=0.178, batch=207
10612: loss=0.073, reward_mean=0.440, reward_bound=0.206, batch=214
10613: loss=0.072, reward_mean=0.400, reward_bound=0.229, batch=217
10614: loss=0.070, reward_mean=0.470, reward_bound=0.254, batch=212
10615: loss=0.070, reward_mean=0.450, reward_bound=0.282, batch=211
10616: loss=0.070, reward_mean=0.490, reward_bound=0.314, batch=169
10617: loss=0.066, reward_mean=0.440, reward_bound=0.065, batch=187
10618: loss=0.069, reward_mean=0.400, reward_bound=0.087, batch=201
10619: loss=0.074, reward_mean=0.330, reward_bound=0.109, batch=206
10620: loss=0.071, reward_mean=0.490, reward_bound=0.167, batch=211
10621: loss=0.072, reward_mean=0.430, reward_bound=0.206, batch=215
10622: loss=0.071, reward_mean=0.440, reward_bound=0.229, batch=214
10623: loss=0.073, reward_mean=0.440, reward_bound=0.254, batch=212
10624: loss=0.074, reward_mean=0.410, reward_bou

10731: loss=0.066, reward_mean=0.440, reward_bound=0.247, batch=217
10732: loss=0.068, reward_mean=0.450, reward_bound=0.254, batch=221
10733: loss=0.069, reward_mean=0.330, reward_bound=0.282, batch=219
10734: loss=0.068, reward_mean=0.430, reward_bound=0.314, batch=218
10735: loss=0.068, reward_mean=0.380, reward_bound=0.254, batch=221
10736: loss=0.067, reward_mean=0.500, reward_bound=0.282, batch=223
10737: loss=0.073, reward_mean=0.510, reward_bound=0.349, batch=191
10738: loss=0.072, reward_mean=0.520, reward_bound=0.229, batch=202
10739: loss=0.071, reward_mean=0.390, reward_bound=0.213, batch=211
10740: loss=0.073, reward_mean=0.520, reward_bound=0.206, batch=217
10741: loss=0.072, reward_mean=0.420, reward_bound=0.249, batch=222
10742: loss=0.072, reward_mean=0.410, reward_bound=0.254, batch=220
10743: loss=0.073, reward_mean=0.470, reward_bound=0.282, batch=215
10744: loss=0.073, reward_mean=0.480, reward_bound=0.234, batch=220
10745: loss=0.073, reward_mean=0.520, reward_bou

10852: loss=0.068, reward_mean=0.410, reward_bound=0.254, batch=197
10853: loss=0.067, reward_mean=0.450, reward_bound=0.150, batch=207
10854: loss=0.069, reward_mean=0.430, reward_bound=0.206, batch=214
10855: loss=0.072, reward_mean=0.440, reward_bound=0.229, batch=219
10856: loss=0.069, reward_mean=0.400, reward_bound=0.254, batch=220
10857: loss=0.065, reward_mean=0.450, reward_bound=0.282, batch=195
10858: loss=0.066, reward_mean=0.480, reward_bound=0.153, batch=206
10859: loss=0.065, reward_mean=0.460, reward_bound=0.176, batch=214
10860: loss=0.067, reward_mean=0.560, reward_bound=0.226, batch=220
10861: loss=0.065, reward_mean=0.450, reward_bound=0.229, batch=223
10862: loss=0.067, reward_mean=0.460, reward_bound=0.254, batch=220
10863: loss=0.066, reward_mean=0.410, reward_bound=0.282, batch=220
10864: loss=0.067, reward_mean=0.440, reward_bound=0.314, batch=201
10865: loss=0.069, reward_mean=0.440, reward_bound=0.167, batch=209
10866: loss=0.068, reward_mean=0.500, reward_bou

10973: loss=0.081, reward_mean=0.500, reward_bound=0.282, batch=220
10974: loss=0.082, reward_mean=0.410, reward_bound=0.254, batch=222
10975: loss=0.082, reward_mean=0.470, reward_bound=0.314, batch=221
10976: loss=0.082, reward_mean=0.420, reward_bound=0.314, batch=223
10977: loss=0.082, reward_mean=0.470, reward_bound=0.335, batch=226
10978: loss=0.078, reward_mean=0.520, reward_bound=0.349, batch=211
10979: loss=0.077, reward_mean=0.370, reward_bound=0.254, batch=215
10980: loss=0.079, reward_mean=0.470, reward_bound=0.314, batch=219
10981: loss=0.079, reward_mean=0.500, reward_bound=0.314, batch=222
10982: loss=0.080, reward_mean=0.520, reward_bound=0.229, batch=224
10983: loss=0.080, reward_mean=0.410, reward_bound=0.345, batch=227
10984: loss=0.078, reward_mean=0.380, reward_bound=0.349, batch=223
10985: loss=0.079, reward_mean=0.360, reward_bound=0.372, batch=226
10986: loss=0.079, reward_mean=0.500, reward_bound=0.387, batch=206
10987: loss=0.076, reward_mean=0.510, reward_bou

11094: loss=0.077, reward_mean=0.550, reward_bound=0.387, batch=228
11095: loss=0.077, reward_mean=0.530, reward_bound=0.430, batch=226
11096: loss=0.078, reward_mean=0.480, reward_bound=0.331, batch=228
11097: loss=0.078, reward_mean=0.440, reward_bound=0.387, batch=227
11098: loss=0.080, reward_mean=0.370, reward_bound=0.478, batch=214
11099: loss=0.083, reward_mean=0.440, reward_bound=0.282, batch=219
11100: loss=0.081, reward_mean=0.470, reward_bound=0.349, batch=221
11101: loss=0.080, reward_mean=0.400, reward_bound=0.430, batch=221
11102: loss=0.080, reward_mean=0.450, reward_bound=0.349, batch=224
11103: loss=0.079, reward_mean=0.490, reward_bound=0.426, batch=227
11104: loss=0.079, reward_mean=0.480, reward_bound=0.422, batch=229
11105: loss=0.079, reward_mean=0.440, reward_bound=0.405, batch=230
11106: loss=0.079, reward_mean=0.440, reward_bound=0.365, batch=231
11107: loss=0.079, reward_mean=0.480, reward_bound=0.430, batch=226
11108: loss=0.080, reward_mean=0.400, reward_bou

11216: loss=0.067, reward_mean=0.580, reward_bound=0.229, batch=221
11217: loss=0.063, reward_mean=0.520, reward_bound=0.254, batch=215
11218: loss=0.061, reward_mean=0.430, reward_bound=0.189, batch=220
11219: loss=0.062, reward_mean=0.450, reward_bound=0.282, batch=209
11220: loss=0.063, reward_mean=0.500, reward_bound=0.254, batch=215
11221: loss=0.063, reward_mean=0.400, reward_bound=0.314, batch=206
11222: loss=0.065, reward_mean=0.380, reward_bound=0.185, batch=213
11223: loss=0.064, reward_mean=0.480, reward_bound=0.244, batch=219
11224: loss=0.063, reward_mean=0.420, reward_bound=0.265, batch=223
11225: loss=0.062, reward_mean=0.370, reward_bound=0.229, batch=225
11226: loss=0.062, reward_mean=0.450, reward_bound=0.314, batch=223
11227: loss=0.061, reward_mean=0.320, reward_bound=0.349, batch=209
11228: loss=0.059, reward_mean=0.420, reward_bound=0.314, batch=215
11229: loss=0.058, reward_mean=0.450, reward_bound=0.296, batch=220
11230: loss=0.059, reward_mean=0.410, reward_bou

11337: loss=0.072, reward_mean=0.450, reward_bound=0.349, batch=219
11338: loss=0.074, reward_mean=0.420, reward_bound=0.265, batch=223
11339: loss=0.073, reward_mean=0.450, reward_bound=0.335, batch=226
11340: loss=0.072, reward_mean=0.460, reward_bound=0.349, batch=227
11341: loss=0.068, reward_mean=0.440, reward_bound=0.387, batch=222
11342: loss=0.067, reward_mean=0.400, reward_bound=0.272, batch=225
11343: loss=0.067, reward_mean=0.360, reward_bound=0.314, batch=225
11344: loss=0.067, reward_mean=0.380, reward_bound=0.387, batch=225
11345: loss=0.067, reward_mean=0.420, reward_bound=0.396, batch=227
11346: loss=0.066, reward_mean=0.500, reward_bound=0.387, batch=228
11347: loss=0.066, reward_mean=0.400, reward_bound=0.357, batch=229
11348: loss=0.068, reward_mean=0.420, reward_bound=0.430, batch=216
11349: loss=0.067, reward_mean=0.450, reward_bound=0.314, batch=220
11350: loss=0.068, reward_mean=0.460, reward_bound=0.349, batch=222
11351: loss=0.067, reward_mean=0.420, reward_bou

11458: loss=0.061, reward_mean=0.490, reward_bound=0.268, batch=214
11459: loss=0.059, reward_mean=0.440, reward_bound=0.280, batch=220
11460: loss=0.058, reward_mean=0.430, reward_bound=0.266, batch=224
11461: loss=0.059, reward_mean=0.470, reward_bound=0.311, batch=227
11462: loss=0.060, reward_mean=0.460, reward_bound=0.314, batch=223
11463: loss=0.059, reward_mean=0.430, reward_bound=0.349, batch=223
11464: loss=0.058, reward_mean=0.380, reward_bound=0.282, batch=224
11465: loss=0.058, reward_mean=0.420, reward_bound=0.308, batch=227
11466: loss=0.057, reward_mean=0.470, reward_bound=0.387, batch=198
11467: loss=0.061, reward_mean=0.430, reward_bound=0.229, batch=207
11468: loss=0.055, reward_mean=0.480, reward_bound=0.277, batch=215
11469: loss=0.055, reward_mean=0.420, reward_bound=0.282, batch=213
11470: loss=0.059, reward_mean=0.500, reward_bound=0.314, batch=216
11471: loss=0.058, reward_mean=0.450, reward_bound=0.298, batch=221
11472: loss=0.059, reward_mean=0.490, reward_bou

11579: loss=0.061, reward_mean=0.440, reward_bound=0.387, batch=212
11580: loss=0.060, reward_mean=0.390, reward_bound=0.191, batch=218
11581: loss=0.063, reward_mean=0.460, reward_bound=0.231, batch=222
11582: loss=0.061, reward_mean=0.480, reward_bound=0.292, batch=225
11583: loss=0.061, reward_mean=0.470, reward_bound=0.387, batch=222
11584: loss=0.061, reward_mean=0.470, reward_bound=0.400, batch=225
11585: loss=0.060, reward_mean=0.360, reward_bound=0.321, batch=227
11586: loss=0.060, reward_mean=0.450, reward_bound=0.387, batch=227
11587: loss=0.060, reward_mean=0.460, reward_bound=0.349, batch=228
11588: loss=0.062, reward_mean=0.520, reward_bound=0.430, batch=214
11589: loss=0.060, reward_mean=0.460, reward_bound=0.254, batch=219
11590: loss=0.060, reward_mean=0.430, reward_bound=0.229, batch=221
11591: loss=0.061, reward_mean=0.430, reward_bound=0.314, batch=223
11592: loss=0.060, reward_mean=0.420, reward_bound=0.271, batch=226
11593: loss=0.064, reward_mean=0.430, reward_bou

11700: loss=0.062, reward_mean=0.390, reward_bound=0.282, batch=225
11701: loss=0.064, reward_mean=0.490, reward_bound=0.396, batch=227
11702: loss=0.063, reward_mean=0.410, reward_bound=0.387, batch=228
11703: loss=0.063, reward_mean=0.440, reward_bound=0.353, batch=229
11704: loss=0.064, reward_mean=0.420, reward_bound=0.430, batch=227
11705: loss=0.063, reward_mean=0.380, reward_bound=0.373, batch=229
11706: loss=0.063, reward_mean=0.430, reward_bound=0.387, batch=229
11707: loss=0.063, reward_mean=0.300, reward_bound=0.424, batch=230
11708: loss=0.063, reward_mean=0.450, reward_bound=0.430, batch=230
11709: loss=0.063, reward_mean=0.420, reward_bound=0.430, batch=230
11710: loss=0.064, reward_mean=0.430, reward_bound=0.338, batch=231
11711: loss=0.063, reward_mean=0.360, reward_bound=0.478, batch=225
11712: loss=0.062, reward_mean=0.520, reward_bound=0.387, batch=226
11713: loss=0.062, reward_mean=0.400, reward_bound=0.430, batch=226
11714: loss=0.062, reward_mean=0.420, reward_bou

11822: loss=0.067, reward_mean=0.470, reward_bound=0.229, batch=212
11823: loss=0.067, reward_mean=0.430, reward_bound=0.206, batch=218
11824: loss=0.070, reward_mean=0.440, reward_bound=0.314, batch=209
11825: loss=0.069, reward_mean=0.520, reward_bound=0.314, batch=213
11826: loss=0.068, reward_mean=0.460, reward_bound=0.282, batch=217
11827: loss=0.068, reward_mean=0.430, reward_bound=0.277, batch=222
11828: loss=0.070, reward_mean=0.400, reward_bound=0.314, batch=223
11829: loss=0.070, reward_mean=0.500, reward_bound=0.282, batch=225
11830: loss=0.069, reward_mean=0.420, reward_bound=0.266, batch=227
11831: loss=0.067, reward_mean=0.440, reward_bound=0.349, batch=206
11832: loss=0.067, reward_mean=0.380, reward_bound=0.230, batch=214
11833: loss=0.065, reward_mean=0.450, reward_bound=0.204, batch=220
11834: loss=0.068, reward_mean=0.430, reward_bound=0.274, batch=224
11835: loss=0.068, reward_mean=0.520, reward_bound=0.345, batch=227
11836: loss=0.066, reward_mean=0.460, reward_bou

11943: loss=0.062, reward_mean=0.480, reward_bound=0.387, batch=222
11944: loss=0.062, reward_mean=0.410, reward_bound=0.254, batch=224
11945: loss=0.062, reward_mean=0.500, reward_bound=0.311, batch=227
11946: loss=0.064, reward_mean=0.450, reward_bound=0.430, batch=216
11947: loss=0.062, reward_mean=0.450, reward_bound=0.316, batch=221
11948: loss=0.061, reward_mean=0.460, reward_bound=0.206, batch=224
11949: loss=0.061, reward_mean=0.440, reward_bound=0.349, batch=226
11950: loss=0.064, reward_mean=0.410, reward_bound=0.387, batch=223
11951: loss=0.063, reward_mean=0.440, reward_bound=0.322, batch=226
11952: loss=0.063, reward_mean=0.540, reward_bound=0.349, batch=226
11953: loss=0.063, reward_mean=0.440, reward_bound=0.409, batch=228
11954: loss=0.063, reward_mean=0.470, reward_bound=0.430, batch=221
11955: loss=0.062, reward_mean=0.470, reward_bound=0.349, batch=224
11956: loss=0.062, reward_mean=0.410, reward_bound=0.426, batch=227
11957: loss=0.061, reward_mean=0.390, reward_bou

12064: loss=0.053, reward_mean=0.430, reward_bound=0.387, batch=223
12065: loss=0.052, reward_mean=0.500, reward_bound=0.387, batch=224
12066: loss=0.052, reward_mean=0.520, reward_bound=0.387, batch=225
12067: loss=0.052, reward_mean=0.450, reward_bound=0.387, batch=226
12068: loss=0.052, reward_mean=0.420, reward_bound=0.387, batch=227
12069: loss=0.052, reward_mean=0.500, reward_bound=0.387, batch=227
12070: loss=0.052, reward_mean=0.500, reward_bound=0.422, batch=229
12071: loss=0.057, reward_mean=0.460, reward_bound=0.430, batch=206
12072: loss=0.054, reward_mean=0.540, reward_bound=0.241, batch=214
12073: loss=0.053, reward_mean=0.490, reward_bound=0.254, batch=219
12074: loss=0.052, reward_mean=0.520, reward_bound=0.314, batch=222
12075: loss=0.052, reward_mean=0.410, reward_bound=0.324, batch=225
12076: loss=0.056, reward_mean=0.500, reward_bound=0.349, batch=218
12077: loss=0.055, reward_mean=0.350, reward_bound=0.349, batch=221
12078: loss=0.056, reward_mean=0.430, reward_bou

12185: loss=0.065, reward_mean=0.460, reward_bound=0.317, batch=222
12186: loss=0.065, reward_mean=0.480, reward_bound=0.349, batch=223
12187: loss=0.066, reward_mean=0.420, reward_bound=0.229, batch=225
12188: loss=0.067, reward_mean=0.410, reward_bound=0.356, batch=227
12189: loss=0.064, reward_mean=0.460, reward_bound=0.387, batch=225
12190: loss=0.065, reward_mean=0.490, reward_bound=0.296, batch=227
12191: loss=0.064, reward_mean=0.430, reward_bound=0.349, batch=226
12192: loss=0.063, reward_mean=0.380, reward_bound=0.409, batch=228
12193: loss=0.065, reward_mean=0.420, reward_bound=0.353, batch=229
12194: loss=0.061, reward_mean=0.510, reward_bound=0.430, batch=202
12195: loss=0.060, reward_mean=0.470, reward_bound=0.191, batch=211
12196: loss=0.059, reward_mean=0.480, reward_bound=0.206, batch=217
12197: loss=0.058, reward_mean=0.470, reward_bound=0.229, batch=220
12198: loss=0.059, reward_mean=0.460, reward_bound=0.254, batch=223
12199: loss=0.058, reward_mean=0.540, reward_bou

12307: loss=0.054, reward_mean=0.510, reward_bound=0.101, batch=220
12308: loss=0.053, reward_mean=0.480, reward_bound=0.118, batch=224
12309: loss=0.053, reward_mean=0.450, reward_bound=0.122, batch=225
12310: loss=0.054, reward_mean=0.400, reward_bound=0.135, batch=223
12311: loss=0.052, reward_mean=0.490, reward_bound=0.150, batch=224
12312: loss=0.051, reward_mean=0.370, reward_bound=0.167, batch=206
12313: loss=0.052, reward_mean=0.410, reward_bound=0.138, batch=214
12314: loss=0.055, reward_mean=0.470, reward_bound=0.185, batch=204
12315: loss=0.053, reward_mean=0.410, reward_bound=0.206, batch=185
12316: loss=0.053, reward_mean=0.330, reward_bound=0.052, batch=198
12317: loss=0.052, reward_mean=0.460, reward_bound=0.154, batch=208
12318: loss=0.051, reward_mean=0.490, reward_bound=0.206, batch=213
12319: loss=0.049, reward_mean=0.420, reward_bound=0.229, batch=191
12320: loss=0.047, reward_mean=0.450, reward_bound=0.150, batch=203
12321: loss=0.048, reward_mean=0.460, reward_bou

12428: loss=0.042, reward_mean=0.480, reward_bound=0.135, batch=218
12429: loss=0.039, reward_mean=0.420, reward_bound=0.167, batch=217
12430: loss=0.037, reward_mean=0.410, reward_bound=0.185, batch=210
12431: loss=0.038, reward_mean=0.430, reward_bound=0.206, batch=225
12432: loss=0.038, reward_mean=0.370, reward_bound=0.206, batch=218
12433: loss=0.038, reward_mean=0.470, reward_bound=0.229, batch=207
12434: loss=0.040, reward_mean=0.500, reward_bound=0.254, batch=199
12435: loss=0.040, reward_mean=0.470, reward_bound=0.229, batch=208
12436: loss=0.039, reward_mean=0.450, reward_bound=0.254, batch=214
12437: loss=0.038, reward_mean=0.440, reward_bound=0.229, batch=219
12438: loss=0.044, reward_mean=0.520, reward_bound=0.282, batch=201
12439: loss=0.044, reward_mean=0.360, reward_bound=0.167, batch=210
12440: loss=0.043, reward_mean=0.450, reward_bound=0.247, batch=217
12441: loss=0.042, reward_mean=0.490, reward_bound=0.277, batch=222
12442: loss=0.042, reward_mean=0.360, reward_bou

12549: loss=0.048, reward_mean=0.350, reward_bound=0.185, batch=216
12550: loss=0.048, reward_mean=0.510, reward_bound=0.229, batch=214
12551: loss=0.048, reward_mean=0.480, reward_bound=0.229, batch=219
12552: loss=0.054, reward_mean=0.430, reward_bound=0.254, batch=205
12553: loss=0.055, reward_mean=0.480, reward_bound=0.210, batch=213
12554: loss=0.055, reward_mean=0.390, reward_bound=0.229, batch=218
12555: loss=0.051, reward_mean=0.380, reward_bound=0.282, batch=196
12556: loss=0.052, reward_mean=0.540, reward_bound=0.298, batch=207
12557: loss=0.052, reward_mean=0.360, reward_bound=0.206, batch=213
12558: loss=0.052, reward_mean=0.470, reward_bound=0.254, batch=217
12559: loss=0.051, reward_mean=0.480, reward_bound=0.308, batch=222
12560: loss=0.050, reward_mean=0.440, reward_bound=0.314, batch=200
12561: loss=0.049, reward_mean=0.480, reward_bound=0.222, batch=210
12562: loss=0.049, reward_mean=0.360, reward_bound=0.222, batch=217
12563: loss=0.054, reward_mean=0.500, reward_bou

12670: loss=0.047, reward_mean=0.470, reward_bound=0.364, batch=230
12671: loss=0.047, reward_mean=0.490, reward_bound=0.365, batch=231
12672: loss=0.047, reward_mean=0.460, reward_bound=0.430, batch=231
12673: loss=0.047, reward_mean=0.460, reward_bound=0.387, batch=231
12674: loss=0.047, reward_mean=0.440, reward_bound=0.387, batch=231
12675: loss=0.047, reward_mean=0.520, reward_bound=0.430, batch=231
12676: loss=0.047, reward_mean=0.420, reward_bound=0.282, batch=231
12677: loss=0.047, reward_mean=0.480, reward_bound=0.430, batch=231
12678: loss=0.047, reward_mean=0.430, reward_bound=0.387, batch=231
12679: loss=0.047, reward_mean=0.400, reward_bound=0.387, batch=231
12680: loss=0.041, reward_mean=0.560, reward_bound=0.478, batch=150
12681: loss=0.037, reward_mean=0.430, reward_bound=0.109, batch=174
12682: loss=0.040, reward_mean=0.410, reward_bound=0.108, batch=192
12683: loss=0.041, reward_mean=0.470, reward_bound=0.122, batch=201
12684: loss=0.045, reward_mean=0.380, reward_bou

12791: loss=0.038, reward_mean=0.490, reward_bound=0.387, batch=226
12792: loss=0.037, reward_mean=0.460, reward_bound=0.368, batch=228
12793: loss=0.039, reward_mean=0.450, reward_bound=0.430, batch=205
12794: loss=0.038, reward_mean=0.450, reward_bound=0.289, batch=213
12795: loss=0.040, reward_mean=0.460, reward_bound=0.314, batch=215
12796: loss=0.039, reward_mean=0.400, reward_bound=0.349, batch=219
12797: loss=0.038, reward_mean=0.520, reward_bound=0.295, batch=223
12798: loss=0.038, reward_mean=0.440, reward_bound=0.314, batch=225
12799: loss=0.040, reward_mean=0.340, reward_bound=0.356, batch=227
12800: loss=0.038, reward_mean=0.510, reward_bound=0.387, batch=225
12801: loss=0.038, reward_mean=0.430, reward_bound=0.349, batch=226
12802: loss=0.040, reward_mean=0.430, reward_bound=0.430, batch=217
12803: loss=0.039, reward_mean=0.440, reward_bound=0.422, batch=222
12804: loss=0.038, reward_mean=0.450, reward_bound=0.324, batch=225
12805: loss=0.040, reward_mean=0.460, reward_bou

12913: loss=0.053, reward_mean=0.480, reward_bound=0.229, batch=215
12914: loss=0.056, reward_mean=0.470, reward_bound=0.254, batch=184
12915: loss=0.056, reward_mean=0.460, reward_bound=0.149, batch=199
12916: loss=0.060, reward_mean=0.490, reward_bound=0.185, batch=207
12917: loss=0.058, reward_mean=0.400, reward_bound=0.206, batch=212
12918: loss=0.058, reward_mean=0.440, reward_bound=0.229, batch=213
12919: loss=0.057, reward_mean=0.400, reward_bound=0.220, batch=219
12920: loss=0.059, reward_mean=0.450, reward_bound=0.254, batch=215
12921: loss=0.064, reward_mean=0.420, reward_bound=0.282, batch=182
12922: loss=0.062, reward_mean=0.460, reward_bound=0.167, batch=196
12923: loss=0.065, reward_mean=0.330, reward_bound=0.168, batch=207
12924: loss=0.067, reward_mean=0.470, reward_bound=0.185, batch=214
12925: loss=0.066, reward_mean=0.390, reward_bound=0.206, batch=217
12926: loss=0.066, reward_mean=0.400, reward_bound=0.229, batch=218
12927: loss=0.066, reward_mean=0.430, reward_bou

13034: loss=0.079, reward_mean=0.510, reward_bound=0.349, batch=219
13035: loss=0.080, reward_mean=0.390, reward_bound=0.254, batch=222
13036: loss=0.081, reward_mean=0.450, reward_bound=0.314, batch=222
13037: loss=0.078, reward_mean=0.420, reward_bound=0.360, batch=225
13038: loss=0.083, reward_mean=0.500, reward_bound=0.387, batch=195
13039: loss=0.083, reward_mean=0.380, reward_bound=0.210, batch=206
13040: loss=0.082, reward_mean=0.530, reward_bound=0.282, batch=212
13041: loss=0.084, reward_mean=0.440, reward_bound=0.314, batch=213
13042: loss=0.083, reward_mean=0.570, reward_bound=0.301, batch=219
13043: loss=0.085, reward_mean=0.490, reward_bound=0.314, batch=221
13044: loss=0.085, reward_mean=0.420, reward_bound=0.349, batch=216
13045: loss=0.084, reward_mean=0.430, reward_bound=0.349, batch=219
13046: loss=0.084, reward_mean=0.380, reward_bound=0.295, batch=223
13047: loss=0.084, reward_mean=0.470, reward_bound=0.349, batch=223
13048: loss=0.085, reward_mean=0.390, reward_bou

13155: loss=0.061, reward_mean=0.490, reward_bound=0.185, batch=195
13156: loss=0.060, reward_mean=0.490, reward_bound=0.150, batch=204
13157: loss=0.059, reward_mean=0.470, reward_bound=0.206, batch=210
13158: loss=0.058, reward_mean=0.440, reward_bound=0.229, batch=215
13159: loss=0.063, reward_mean=0.460, reward_bound=0.210, batch=220
13160: loss=0.065, reward_mean=0.480, reward_bound=0.254, batch=222
13161: loss=0.068, reward_mean=0.400, reward_bound=0.282, batch=218
13162: loss=0.067, reward_mean=0.450, reward_bound=0.231, batch=222
13163: loss=0.068, reward_mean=0.510, reward_bound=0.314, batch=219
13164: loss=0.063, reward_mean=0.380, reward_bound=0.349, batch=206
13165: loss=0.062, reward_mean=0.490, reward_bound=0.282, batch=212
13166: loss=0.064, reward_mean=0.430, reward_bound=0.229, batch=217
13167: loss=0.063, reward_mean=0.460, reward_bound=0.282, batch=221
13168: loss=0.064, reward_mean=0.430, reward_bound=0.282, batch=224
13169: loss=0.065, reward_mean=0.450, reward_bou

13276: loss=0.075, reward_mean=0.410, reward_bound=0.342, batch=215
13277: loss=0.074, reward_mean=0.520, reward_bound=0.314, batch=219
13278: loss=0.075, reward_mean=0.400, reward_bound=0.349, batch=222
13279: loss=0.075, reward_mean=0.440, reward_bound=0.272, batch=225
13280: loss=0.075, reward_mean=0.460, reward_bound=0.314, batch=226
13281: loss=0.074, reward_mean=0.370, reward_bound=0.349, batch=227
13282: loss=0.071, reward_mean=0.450, reward_bound=0.387, batch=197
13283: loss=0.070, reward_mean=0.410, reward_bound=0.206, batch=207
13284: loss=0.070, reward_mean=0.390, reward_bound=0.224, batch=215
13285: loss=0.064, reward_mean=0.490, reward_bound=0.254, batch=218
13286: loss=0.065, reward_mean=0.450, reward_bound=0.254, batch=220
13287: loss=0.064, reward_mean=0.420, reward_bound=0.282, batch=223
13288: loss=0.064, reward_mean=0.460, reward_bound=0.301, batch=226
13289: loss=0.065, reward_mean=0.530, reward_bound=0.314, batch=221
13290: loss=0.066, reward_mean=0.530, reward_bou

13397: loss=0.071, reward_mean=0.540, reward_bound=0.349, batch=222
13398: loss=0.070, reward_mean=0.490, reward_bound=0.387, batch=220
13399: loss=0.071, reward_mean=0.510, reward_bound=0.349, batch=223
13400: loss=0.072, reward_mean=0.420, reward_bound=0.314, batch=225
13401: loss=0.071, reward_mean=0.420, reward_bound=0.273, batch=227
13402: loss=0.071, reward_mean=0.500, reward_bound=0.335, batch=229
13403: loss=0.072, reward_mean=0.400, reward_bound=0.405, batch=230
13404: loss=0.073, reward_mean=0.480, reward_bound=0.338, batch=231
13405: loss=0.072, reward_mean=0.430, reward_bound=0.349, batch=230
13406: loss=0.073, reward_mean=0.550, reward_bound=0.430, batch=219
13407: loss=0.075, reward_mean=0.480, reward_bound=0.349, batch=222
13408: loss=0.074, reward_mean=0.380, reward_bound=0.387, batch=224
13409: loss=0.074, reward_mean=0.430, reward_bound=0.380, batch=227
13410: loss=0.073, reward_mean=0.440, reward_bound=0.342, batch=229
13411: loss=0.073, reward_mean=0.490, reward_bou

13519: loss=0.080, reward_mean=0.420, reward_bound=0.282, batch=216
13520: loss=0.079, reward_mean=0.350, reward_bound=0.282, batch=219
13521: loss=0.078, reward_mean=0.430, reward_bound=0.278, batch=223
13522: loss=0.079, reward_mean=0.450, reward_bound=0.314, batch=208
13523: loss=0.078, reward_mean=0.500, reward_bound=0.282, batch=213
13524: loss=0.079, reward_mean=0.450, reward_bound=0.301, batch=219
13525: loss=0.080, reward_mean=0.430, reward_bound=0.314, batch=220
13526: loss=0.080, reward_mean=0.420, reward_bound=0.296, batch=224
13527: loss=0.080, reward_mean=0.440, reward_bound=0.345, batch=227
13528: loss=0.079, reward_mean=0.400, reward_bound=0.314, batch=228
13529: loss=0.079, reward_mean=0.450, reward_bound=0.289, batch=229
13530: loss=0.093, reward_mean=0.420, reward_bound=0.349, batch=160
13531: loss=0.087, reward_mean=0.450, reward_bound=0.131, batch=182
13532: loss=0.087, reward_mean=0.500, reward_bound=0.072, batch=196
13533: loss=0.089, reward_mean=0.420, reward_bou

13640: loss=0.096, reward_mean=0.410, reward_bound=0.254, batch=210
13641: loss=0.096, reward_mean=0.410, reward_bound=0.282, batch=212
13642: loss=0.098, reward_mean=0.370, reward_bound=0.206, batch=220
13643: loss=0.095, reward_mean=0.460, reward_bound=0.222, batch=224
13644: loss=0.093, reward_mean=0.510, reward_bound=0.314, batch=220
13645: loss=0.093, reward_mean=0.420, reward_bound=0.254, batch=223
13646: loss=0.094, reward_mean=0.540, reward_bound=0.349, batch=217
13647: loss=0.096, reward_mean=0.450, reward_bound=0.254, batch=221
13648: loss=0.095, reward_mean=0.430, reward_bound=0.314, batch=224
13649: loss=0.097, reward_mean=0.390, reward_bound=0.349, batch=226
13650: loss=0.096, reward_mean=0.400, reward_bound=0.368, batch=228
13651: loss=0.098, reward_mean=0.380, reward_bound=0.387, batch=217
13652: loss=0.097, reward_mean=0.400, reward_bound=0.314, batch=221
13653: loss=0.097, reward_mean=0.440, reward_bound=0.387, batch=223
13654: loss=0.096, reward_mean=0.440, reward_bou

13761: loss=0.095, reward_mean=0.450, reward_bound=0.282, batch=210
13762: loss=0.095, reward_mean=0.450, reward_bound=0.282, batch=215
13763: loss=0.094, reward_mean=0.480, reward_bound=0.314, batch=213
13764: loss=0.095, reward_mean=0.440, reward_bound=0.335, batch=219
13765: loss=0.095, reward_mean=0.430, reward_bound=0.314, batch=222
13766: loss=0.095, reward_mean=0.380, reward_bound=0.349, batch=210
13767: loss=0.095, reward_mean=0.460, reward_bound=0.247, batch=217
13768: loss=0.093, reward_mean=0.420, reward_bound=0.282, batch=219
13769: loss=0.094, reward_mean=0.410, reward_bound=0.349, batch=220
13770: loss=0.094, reward_mean=0.460, reward_bound=0.314, batch=223
13771: loss=0.093, reward_mean=0.470, reward_bound=0.349, batch=224
13772: loss=0.093, reward_mean=0.440, reward_bound=0.384, batch=227
13773: loss=0.094, reward_mean=0.520, reward_bound=0.380, batch=229
13774: loss=0.094, reward_mean=0.500, reward_bound=0.387, batch=219
13775: loss=0.094, reward_mean=0.470, reward_bou

13882: loss=0.093, reward_mean=0.440, reward_bound=0.254, batch=220
13883: loss=0.093, reward_mean=0.560, reward_bound=0.338, batch=224
13884: loss=0.095, reward_mean=0.480, reward_bound=0.311, batch=227
13885: loss=0.092, reward_mean=0.440, reward_bound=0.314, batch=228
13886: loss=0.092, reward_mean=0.440, reward_bound=0.349, batch=224
13887: loss=0.092, reward_mean=0.510, reward_bound=0.387, batch=220
13888: loss=0.094, reward_mean=0.460, reward_bound=0.274, batch=224
13889: loss=0.094, reward_mean=0.510, reward_bound=0.349, batch=225
13890: loss=0.094, reward_mean=0.400, reward_bound=0.349, batch=226
13891: loss=0.093, reward_mean=0.380, reward_bound=0.368, batch=228
13892: loss=0.093, reward_mean=0.400, reward_bound=0.387, batch=227
13893: loss=0.089, reward_mean=0.450, reward_bound=0.430, batch=200
13894: loss=0.088, reward_mean=0.500, reward_bound=0.222, batch=210
13895: loss=0.088, reward_mean=0.500, reward_bound=0.274, batch=217
13896: loss=0.088, reward_mean=0.510, reward_bou

14003: loss=0.085, reward_mean=0.460, reward_bound=0.430, batch=214
14004: loss=0.084, reward_mean=0.450, reward_bound=0.229, batch=219
14005: loss=0.082, reward_mean=0.500, reward_bound=0.364, batch=223
14006: loss=0.083, reward_mean=0.440, reward_bound=0.387, batch=225
14007: loss=0.082, reward_mean=0.430, reward_bound=0.430, batch=224
14008: loss=0.082, reward_mean=0.440, reward_bound=0.465, batch=227
14009: loss=0.082, reward_mean=0.410, reward_bound=0.430, batch=228
14010: loss=0.081, reward_mean=0.310, reward_bound=0.430, batch=228
14011: loss=0.081, reward_mean=0.460, reward_bound=0.478, batch=230
14012: loss=0.081, reward_mean=0.440, reward_bound=0.376, batch=231
14013: loss=0.081, reward_mean=0.390, reward_bound=0.430, batch=230
14014: loss=0.081, reward_mean=0.380, reward_bound=0.365, batch=231
14015: loss=0.081, reward_mean=0.400, reward_bound=0.349, batch=231
14016: loss=0.082, reward_mean=0.450, reward_bound=0.387, batch=231
14017: loss=0.081, reward_mean=0.470, reward_bou

14125: loss=0.079, reward_mean=0.430, reward_bound=0.314, batch=205
14126: loss=0.079, reward_mean=0.380, reward_bound=0.229, batch=212
14127: loss=0.077, reward_mean=0.400, reward_bound=0.292, batch=218
14128: loss=0.076, reward_mean=0.380, reward_bound=0.257, batch=222
14129: loss=0.078, reward_mean=0.360, reward_bound=0.314, batch=222
14130: loss=0.078, reward_mean=0.420, reward_bound=0.349, batch=214
14131: loss=0.080, reward_mean=0.490, reward_bound=0.311, batch=220
14132: loss=0.080, reward_mean=0.470, reward_bound=0.314, batch=222
14133: loss=0.080, reward_mean=0.440, reward_bound=0.349, batch=224
14134: loss=0.079, reward_mean=0.410, reward_bound=0.384, batch=227
14135: loss=0.083, reward_mean=0.450, reward_bound=0.387, batch=154
14136: loss=0.080, reward_mean=0.410, reward_bound=0.071, batch=178
14137: loss=0.077, reward_mean=0.500, reward_bound=0.098, batch=192
14138: loss=0.077, reward_mean=0.390, reward_bound=0.122, batch=198
14139: loss=0.078, reward_mean=0.490, reward_bou

14246: loss=0.084, reward_mean=0.410, reward_bound=0.185, batch=218
14247: loss=0.080, reward_mean=0.430, reward_bound=0.206, batch=221
14248: loss=0.085, reward_mean=0.440, reward_bound=0.254, batch=216
14249: loss=0.085, reward_mean=0.490, reward_bound=0.282, batch=216
14250: loss=0.085, reward_mean=0.530, reward_bound=0.314, batch=207
14251: loss=0.085, reward_mean=0.440, reward_bound=0.254, batch=214
14252: loss=0.085, reward_mean=0.350, reward_bound=0.282, batch=218
14253: loss=0.084, reward_mean=0.440, reward_bound=0.282, batch=221
14254: loss=0.087, reward_mean=0.470, reward_bound=0.314, batch=223
14255: loss=0.088, reward_mean=0.440, reward_bound=0.349, batch=212
14256: loss=0.087, reward_mean=0.440, reward_bound=0.254, batch=217
14257: loss=0.087, reward_mean=0.530, reward_bound=0.308, batch=222
14258: loss=0.087, reward_mean=0.490, reward_bound=0.292, batch=225
14259: loss=0.089, reward_mean=0.460, reward_bound=0.321, batch=227
14260: loss=0.090, reward_mean=0.480, reward_bou

14367: loss=0.086, reward_mean=0.360, reward_bound=0.260, batch=220
14368: loss=0.086, reward_mean=0.420, reward_bound=0.282, batch=221
14369: loss=0.087, reward_mean=0.440, reward_bound=0.314, batch=218
14370: loss=0.087, reward_mean=0.470, reward_bound=0.314, batch=221
14371: loss=0.086, reward_mean=0.410, reward_bound=0.349, batch=211
14372: loss=0.087, reward_mean=0.510, reward_bound=0.282, batch=217
14373: loss=0.091, reward_mean=0.500, reward_bound=0.277, batch=222
14374: loss=0.088, reward_mean=0.440, reward_bound=0.282, batch=224
14375: loss=0.087, reward_mean=0.340, reward_bound=0.204, batch=227
14376: loss=0.088, reward_mean=0.380, reward_bound=0.314, batch=225
14377: loss=0.086, reward_mean=0.430, reward_bound=0.356, batch=227
14378: loss=0.091, reward_mean=0.400, reward_bound=0.387, batch=207
14379: loss=0.092, reward_mean=0.340, reward_bound=0.182, batch=215
14380: loss=0.092, reward_mean=0.430, reward_bound=0.260, batch=220
14381: loss=0.092, reward_mean=0.430, reward_bou

14488: loss=0.095, reward_mean=0.380, reward_bound=0.392, batch=229
14489: loss=0.096, reward_mean=0.380, reward_bound=0.328, batch=230
14490: loss=0.097, reward_mean=0.370, reward_bound=0.430, batch=211
14491: loss=0.097, reward_mean=0.500, reward_bound=0.349, batch=216
14492: loss=0.096, reward_mean=0.460, reward_bound=0.230, batch=221
14493: loss=0.095, reward_mean=0.440, reward_bound=0.282, batch=223
14494: loss=0.097, reward_mean=0.490, reward_bound=0.349, batch=221
14495: loss=0.096, reward_mean=0.450, reward_bound=0.349, batch=224
14496: loss=0.097, reward_mean=0.490, reward_bound=0.280, batch=227
14497: loss=0.097, reward_mean=0.340, reward_bound=0.282, batch=228
14498: loss=0.098, reward_mean=0.440, reward_bound=0.353, batch=229
14499: loss=0.097, reward_mean=0.480, reward_bound=0.387, batch=222
14500: loss=0.097, reward_mean=0.460, reward_bound=0.360, batch=225
14501: loss=0.095, reward_mean=0.390, reward_bound=0.387, batch=226
14502: loss=0.095, reward_mean=0.500, reward_bou

14609: loss=0.090, reward_mean=0.450, reward_bound=0.430, batch=228
14610: loss=0.091, reward_mean=0.400, reward_bound=0.392, batch=229
14611: loss=0.091, reward_mean=0.450, reward_bound=0.450, batch=230
14612: loss=0.091, reward_mean=0.430, reward_bound=0.430, batch=230
14613: loss=0.091, reward_mean=0.470, reward_bound=0.464, batch=231
14614: loss=0.091, reward_mean=0.470, reward_bound=0.478, batch=226
14615: loss=0.091, reward_mean=0.500, reward_bound=0.478, batch=227
14616: loss=0.092, reward_mean=0.410, reward_bound=0.330, batch=229
14617: loss=0.092, reward_mean=0.480, reward_bound=0.405, batch=230
14618: loss=0.091, reward_mean=0.430, reward_bound=0.430, batch=230
14619: loss=0.091, reward_mean=0.350, reward_bound=0.464, batch=231
14620: loss=0.092, reward_mean=0.480, reward_bound=0.478, batch=230
14621: loss=0.092, reward_mean=0.320, reward_bound=0.387, batch=230
14622: loss=0.092, reward_mean=0.480, reward_bound=0.478, batch=230
14623: loss=0.092, reward_mean=0.460, reward_bou

14731: loss=0.105, reward_mean=0.530, reward_bound=0.282, batch=202
14732: loss=0.103, reward_mean=0.400, reward_bound=0.172, batch=211
14733: loss=0.106, reward_mean=0.450, reward_bound=0.185, batch=217
14734: loss=0.103, reward_mean=0.410, reward_bound=0.206, batch=221
14735: loss=0.104, reward_mean=0.480, reward_bound=0.254, batch=224
14736: loss=0.105, reward_mean=0.520, reward_bound=0.282, batch=226
14737: loss=0.108, reward_mean=0.550, reward_bound=0.314, batch=209
14738: loss=0.110, reward_mean=0.500, reward_bound=0.265, batch=216
14739: loss=0.109, reward_mean=0.420, reward_bound=0.314, batch=220
14740: loss=0.108, reward_mean=0.510, reward_bound=0.274, batch=224
14741: loss=0.109, reward_mean=0.500, reward_bound=0.282, batch=225
14742: loss=0.108, reward_mean=0.490, reward_bound=0.349, batch=196
14743: loss=0.106, reward_mean=0.480, reward_bound=0.268, batch=207
14744: loss=0.107, reward_mean=0.470, reward_bound=0.229, batch=214
14745: loss=0.106, reward_mean=0.390, reward_bou

14852: loss=0.099, reward_mean=0.440, reward_bound=0.274, batch=217
14853: loss=0.102, reward_mean=0.430, reward_bound=0.277, batch=222
14854: loss=0.100, reward_mean=0.390, reward_bound=0.292, batch=225
14855: loss=0.100, reward_mean=0.290, reward_bound=0.314, batch=225
14856: loss=0.099, reward_mean=0.460, reward_bound=0.349, batch=225
14857: loss=0.099, reward_mean=0.450, reward_bound=0.329, batch=227
14858: loss=0.099, reward_mean=0.400, reward_bound=0.380, batch=229
14859: loss=0.101, reward_mean=0.430, reward_bound=0.387, batch=214
14860: loss=0.101, reward_mean=0.390, reward_bound=0.303, batch=220
14861: loss=0.101, reward_mean=0.410, reward_bound=0.314, batch=222
14862: loss=0.100, reward_mean=0.450, reward_bound=0.263, batch=225
14863: loss=0.100, reward_mean=0.400, reward_bound=0.349, batch=226
14864: loss=0.100, reward_mean=0.430, reward_bound=0.349, batch=227
14865: loss=0.100, reward_mean=0.460, reward_bound=0.387, batch=226
14866: loss=0.099, reward_mean=0.430, reward_bou

14973: loss=0.096, reward_mean=0.450, reward_bound=0.254, batch=213
14974: loss=0.094, reward_mean=0.460, reward_bound=0.282, batch=208
14975: loss=0.093, reward_mean=0.530, reward_bound=0.282, batch=213
14976: loss=0.095, reward_mean=0.420, reward_bound=0.314, batch=207
14977: loss=0.095, reward_mean=0.440, reward_bound=0.150, batch=214
14978: loss=0.093, reward_mean=0.370, reward_bound=0.229, batch=219
14979: loss=0.092, reward_mean=0.510, reward_bound=0.282, batch=220
14980: loss=0.095, reward_mean=0.440, reward_bound=0.314, batch=219
14981: loss=0.096, reward_mean=0.470, reward_bound=0.349, batch=214
14982: loss=0.095, reward_mean=0.390, reward_bound=0.206, batch=219
14983: loss=0.096, reward_mean=0.450, reward_bound=0.239, batch=223
14984: loss=0.095, reward_mean=0.480, reward_bound=0.301, batch=226
14985: loss=0.099, reward_mean=0.490, reward_bound=0.314, batch=226
14986: loss=0.096, reward_mean=0.470, reward_bound=0.349, batch=226
14987: loss=0.096, reward_mean=0.440, reward_bou

15094: loss=0.082, reward_mean=0.540, reward_bound=0.282, batch=219
15095: loss=0.085, reward_mean=0.490, reward_bound=0.349, batch=219
15096: loss=0.087, reward_mean=0.470, reward_bound=0.387, batch=218
15097: loss=0.088, reward_mean=0.520, reward_bound=0.289, batch=222
15098: loss=0.088, reward_mean=0.460, reward_bound=0.282, batch=224
15099: loss=0.089, reward_mean=0.410, reward_bound=0.349, batch=226
15100: loss=0.089, reward_mean=0.480, reward_bound=0.331, batch=228
15101: loss=0.089, reward_mean=0.410, reward_bound=0.349, batch=228
15102: loss=0.089, reward_mean=0.380, reward_bound=0.353, batch=229
15103: loss=0.087, reward_mean=0.480, reward_bound=0.387, batch=226
15104: loss=0.088, reward_mean=0.410, reward_bound=0.368, batch=228
15105: loss=0.088, reward_mean=0.410, reward_bound=0.353, batch=229
15106: loss=0.086, reward_mean=0.540, reward_bound=0.430, batch=222
15107: loss=0.090, reward_mean=0.600, reward_bound=0.478, batch=173
15108: loss=0.095, reward_mean=0.370, reward_bou

15215: loss=0.085, reward_mean=0.430, reward_bound=0.368, batch=228
15216: loss=0.085, reward_mean=0.470, reward_bound=0.430, batch=223
15217: loss=0.085, reward_mean=0.500, reward_bound=0.244, batch=226
15218: loss=0.087, reward_mean=0.440, reward_bound=0.351, batch=228
15219: loss=0.087, reward_mean=0.490, reward_bound=0.392, batch=229
15220: loss=0.085, reward_mean=0.550, reward_bound=0.430, batch=224
15221: loss=0.085, reward_mean=0.370, reward_bound=0.469, batch=227
15222: loss=0.085, reward_mean=0.450, reward_bound=0.380, batch=229
15223: loss=0.084, reward_mean=0.440, reward_bound=0.450, batch=230
15224: loss=0.084, reward_mean=0.420, reward_bound=0.365, batch=231
15225: loss=0.084, reward_mean=0.440, reward_bound=0.387, batch=230
15226: loss=0.084, reward_mean=0.490, reward_bound=0.478, batch=223
15227: loss=0.084, reward_mean=0.470, reward_bound=0.387, batch=224
15228: loss=0.084, reward_mean=0.420, reward_bound=0.280, batch=227
15229: loss=0.086, reward_mean=0.390, reward_bou

15337: loss=0.088, reward_mean=0.480, reward_bound=0.301, batch=226
15338: loss=0.089, reward_mean=0.430, reward_bound=0.349, batch=227
15339: loss=0.090, reward_mean=0.380, reward_bound=0.277, batch=229
15340: loss=0.088, reward_mean=0.350, reward_bound=0.387, batch=150
15341: loss=0.089, reward_mean=0.460, reward_bound=0.086, batch=175
15342: loss=0.088, reward_mean=0.420, reward_bound=0.109, batch=193
15343: loss=0.089, reward_mean=0.460, reward_bound=0.130, batch=205
15344: loss=0.083, reward_mean=0.510, reward_bound=0.150, batch=211
15345: loss=0.083, reward_mean=0.500, reward_bound=0.167, batch=212
15346: loss=0.087, reward_mean=0.370, reward_bound=0.185, batch=215
15347: loss=0.084, reward_mean=0.400, reward_bound=0.206, batch=215
15348: loss=0.087, reward_mean=0.390, reward_bound=0.229, batch=214
15349: loss=0.085, reward_mean=0.400, reward_bound=0.229, batch=217
15350: loss=0.091, reward_mean=0.440, reward_bound=0.254, batch=211
15351: loss=0.091, reward_mean=0.370, reward_bou

15458: loss=0.093, reward_mean=0.470, reward_bound=0.430, batch=210
15459: loss=0.093, reward_mean=0.440, reward_bound=0.254, batch=215
15460: loss=0.095, reward_mean=0.430, reward_bound=0.321, batch=220
15461: loss=0.094, reward_mean=0.350, reward_bound=0.266, batch=224
15462: loss=0.096, reward_mean=0.430, reward_bound=0.311, batch=227
15463: loss=0.097, reward_mean=0.420, reward_bound=0.308, batch=229
15464: loss=0.097, reward_mean=0.490, reward_bound=0.295, batch=230
15465: loss=0.096, reward_mean=0.440, reward_bound=0.338, batch=231
15466: loss=0.092, reward_mean=0.360, reward_bound=0.349, batch=227
15467: loss=0.093, reward_mean=0.550, reward_bound=0.387, batch=225
15468: loss=0.093, reward_mean=0.400, reward_bound=0.314, batch=226
15469: loss=0.093, reward_mean=0.440, reward_bound=0.349, batch=227
15470: loss=0.093, reward_mean=0.370, reward_bound=0.335, batch=229
15471: loss=0.093, reward_mean=0.410, reward_bound=0.314, batch=229
15472: loss=0.093, reward_mean=0.450, reward_bou

15579: loss=0.079, reward_mean=0.460, reward_bound=0.387, batch=216
15580: loss=0.078, reward_mean=0.500, reward_bound=0.314, batch=219
15581: loss=0.078, reward_mean=0.440, reward_bound=0.349, batch=222
15582: loss=0.081, reward_mean=0.430, reward_bound=0.387, batch=224
15583: loss=0.083, reward_mean=0.450, reward_bound=0.430, batch=199
15584: loss=0.082, reward_mean=0.350, reward_bound=0.203, batch=209
15585: loss=0.081, reward_mean=0.450, reward_bound=0.203, batch=216
15586: loss=0.084, reward_mean=0.450, reward_bound=0.254, batch=220
15587: loss=0.083, reward_mean=0.400, reward_bound=0.282, batch=218
15588: loss=0.082, reward_mean=0.400, reward_bound=0.257, batch=222
15589: loss=0.083, reward_mean=0.460, reward_bound=0.314, batch=223
15590: loss=0.083, reward_mean=0.440, reward_bound=0.335, batch=226
15591: loss=0.085, reward_mean=0.430, reward_bound=0.349, batch=217
15592: loss=0.084, reward_mean=0.410, reward_bound=0.282, batch=220
15593: loss=0.083, reward_mean=0.480, reward_bou

15700: loss=0.088, reward_mean=0.500, reward_bound=0.349, batch=225
15701: loss=0.088, reward_mean=0.410, reward_bound=0.349, batch=225
15702: loss=0.089, reward_mean=0.470, reward_bound=0.387, batch=224
15703: loss=0.089, reward_mean=0.460, reward_bound=0.387, batch=226
15704: loss=0.090, reward_mean=0.450, reward_bound=0.368, batch=228
15705: loss=0.089, reward_mean=0.480, reward_bound=0.387, batch=228
15706: loss=0.091, reward_mean=0.460, reward_bound=0.430, batch=220
15707: loss=0.091, reward_mean=0.480, reward_bound=0.349, batch=222
15708: loss=0.090, reward_mean=0.430, reward_bound=0.292, batch=225
15709: loss=0.090, reward_mean=0.380, reward_bound=0.349, batch=226
15710: loss=0.091, reward_mean=0.560, reward_bound=0.387, batch=225
15711: loss=0.091, reward_mean=0.430, reward_bound=0.365, batch=227
15712: loss=0.090, reward_mean=0.280, reward_bound=0.422, batch=229
15713: loss=0.090, reward_mean=0.440, reward_bound=0.430, batch=226
15714: loss=0.091, reward_mean=0.380, reward_bou

15821: loss=0.086, reward_mean=0.450, reward_bound=0.314, batch=226
15822: loss=0.087, reward_mean=0.520, reward_bound=0.368, batch=228
15823: loss=0.087, reward_mean=0.350, reward_bound=0.387, batch=227
15824: loss=0.087, reward_mean=0.480, reward_bound=0.430, batch=226
15825: loss=0.088, reward_mean=0.360, reward_bound=0.433, batch=228
15826: loss=0.089, reward_mean=0.450, reward_bound=0.478, batch=232
15827: loss=0.087, reward_mean=0.410, reward_bound=0.478, batch=223
15828: loss=0.086, reward_mean=0.440, reward_bound=0.314, batch=224
15829: loss=0.086, reward_mean=0.440, reward_bound=0.282, batch=226
15830: loss=0.086, reward_mean=0.420, reward_bound=0.387, batch=226
15831: loss=0.086, reward_mean=0.450, reward_bound=0.268, batch=228
15832: loss=0.087, reward_mean=0.420, reward_bound=0.353, batch=229
15833: loss=0.087, reward_mean=0.580, reward_bound=0.405, batch=230
15834: loss=0.087, reward_mean=0.520, reward_bound=0.387, batch=230
15835: loss=0.088, reward_mean=0.420, reward_bou

15943: loss=0.094, reward_mean=0.460, reward_bound=0.224, batch=215
15944: loss=0.096, reward_mean=0.510, reward_bound=0.282, batch=216
15945: loss=0.097, reward_mean=0.340, reward_bound=0.282, batch=218
15946: loss=0.098, reward_mean=0.540, reward_bound=0.314, batch=217
15947: loss=0.098, reward_mean=0.390, reward_bound=0.224, batch=222
15948: loss=0.098, reward_mean=0.460, reward_bound=0.282, batch=224
15949: loss=0.097, reward_mean=0.400, reward_bound=0.311, batch=227
15950: loss=0.097, reward_mean=0.470, reward_bound=0.308, batch=229
15951: loss=0.094, reward_mean=0.390, reward_bound=0.349, batch=220
15952: loss=0.093, reward_mean=0.460, reward_bound=0.349, batch=223
15953: loss=0.093, reward_mean=0.420, reward_bound=0.358, batch=226
15954: loss=0.093, reward_mean=0.530, reward_bound=0.387, batch=195
15955: loss=0.093, reward_mean=0.390, reward_bound=0.141, batch=206
15956: loss=0.093, reward_mean=0.460, reward_bound=0.217, batch=214
15957: loss=0.093, reward_mean=0.410, reward_bou

16064: loss=0.090, reward_mean=0.410, reward_bound=0.314, batch=224
16065: loss=0.090, reward_mean=0.420, reward_bound=0.349, batch=218
16066: loss=0.090, reward_mean=0.440, reward_bound=0.353, batch=222
16067: loss=0.091, reward_mean=0.450, reward_bound=0.349, batch=224
16068: loss=0.092, reward_mean=0.500, reward_bound=0.349, batch=226
16069: loss=0.091, reward_mean=0.430, reward_bound=0.349, batch=227
16070: loss=0.092, reward_mean=0.450, reward_bound=0.330, batch=229
16071: loss=0.091, reward_mean=0.430, reward_bound=0.387, batch=223
16072: loss=0.090, reward_mean=0.390, reward_bound=0.384, batch=226
16073: loss=0.091, reward_mean=0.440, reward_bound=0.430, batch=203
16074: loss=0.088, reward_mean=0.360, reward_bound=0.185, batch=210
16075: loss=0.090, reward_mean=0.520, reward_bound=0.229, batch=213
16076: loss=0.093, reward_mean=0.510, reward_bound=0.301, batch=219
16077: loss=0.093, reward_mean=0.400, reward_bound=0.314, batch=220
16078: loss=0.092, reward_mean=0.440, reward_bou

16185: loss=0.085, reward_mean=0.510, reward_bound=0.229, batch=216
16186: loss=0.082, reward_mean=0.450, reward_bound=0.254, batch=214
16187: loss=0.079, reward_mean=0.400, reward_bound=0.224, batch=220
16188: loss=0.083, reward_mean=0.470, reward_bound=0.247, batch=224
16189: loss=0.084, reward_mean=0.460, reward_bound=0.282, batch=224
16190: loss=0.084, reward_mean=0.450, reward_bound=0.280, batch=227
16191: loss=0.078, reward_mean=0.420, reward_bound=0.314, batch=210
16192: loss=0.079, reward_mean=0.390, reward_bound=0.282, batch=213
16193: loss=0.077, reward_mean=0.430, reward_bound=0.229, batch=218
16194: loss=0.077, reward_mean=0.470, reward_bound=0.254, batch=221
16195: loss=0.078, reward_mean=0.410, reward_bound=0.254, batch=224
16196: loss=0.076, reward_mean=0.460, reward_bound=0.314, batch=226
16197: loss=0.075, reward_mean=0.430, reward_bound=0.349, batch=211
16198: loss=0.077, reward_mean=0.500, reward_bound=0.229, batch=217
16199: loss=0.076, reward_mean=0.490, reward_bou

16306: loss=0.077, reward_mean=0.470, reward_bound=0.254, batch=223
16307: loss=0.078, reward_mean=0.390, reward_bound=0.349, batch=225
16308: loss=0.080, reward_mean=0.440, reward_bound=0.387, batch=218
16309: loss=0.080, reward_mean=0.480, reward_bound=0.353, batch=222
16310: loss=0.081, reward_mean=0.470, reward_bound=0.349, batch=224
16311: loss=0.081, reward_mean=0.410, reward_bound=0.311, batch=227
16312: loss=0.081, reward_mean=0.440, reward_bound=0.314, batch=228
16313: loss=0.080, reward_mean=0.430, reward_bound=0.349, batch=227
16314: loss=0.079, reward_mean=0.560, reward_bound=0.373, batch=229
16315: loss=0.079, reward_mean=0.420, reward_bound=0.387, batch=229
16316: loss=0.076, reward_mean=0.390, reward_bound=0.430, batch=209
16317: loss=0.076, reward_mean=0.440, reward_bound=0.295, batch=216
16318: loss=0.076, reward_mean=0.500, reward_bound=0.349, batch=216
16319: loss=0.074, reward_mean=0.420, reward_bound=0.368, batch=221
16320: loss=0.074, reward_mean=0.390, reward_bou

16427: loss=0.073, reward_mean=0.520, reward_bound=0.277, batch=222
16428: loss=0.073, reward_mean=0.460, reward_bound=0.349, batch=221
16429: loss=0.075, reward_mean=0.360, reward_bound=0.314, batch=224
16430: loss=0.074, reward_mean=0.440, reward_bound=0.380, batch=227
16431: loss=0.071, reward_mean=0.500, reward_bound=0.387, batch=226
16432: loss=0.071, reward_mean=0.450, reward_bound=0.368, batch=228
16433: loss=0.071, reward_mean=0.370, reward_bound=0.387, batch=228
16434: loss=0.072, reward_mean=0.460, reward_bound=0.430, batch=217
16435: loss=0.071, reward_mean=0.440, reward_bound=0.277, batch=222
16436: loss=0.070, reward_mean=0.430, reward_bound=0.292, batch=225
16437: loss=0.072, reward_mean=0.440, reward_bound=0.314, batch=224
16438: loss=0.074, reward_mean=0.390, reward_bound=0.349, batch=225
16439: loss=0.074, reward_mean=0.520, reward_bound=0.349, batch=226
16440: loss=0.074, reward_mean=0.420, reward_bound=0.368, batch=228
16441: loss=0.072, reward_mean=0.480, reward_bou

16549: loss=0.072, reward_mean=0.520, reward_bound=0.254, batch=215
16550: loss=0.071, reward_mean=0.420, reward_bound=0.314, batch=218
16551: loss=0.076, reward_mean=0.480, reward_bound=0.349, batch=203
16552: loss=0.076, reward_mean=0.360, reward_bound=0.178, batch=212
16553: loss=0.073, reward_mean=0.450, reward_bound=0.206, batch=221
16554: loss=0.073, reward_mean=0.470, reward_bound=0.206, batch=224
16555: loss=0.073, reward_mean=0.370, reward_bound=0.280, batch=227
16556: loss=0.073, reward_mean=0.450, reward_bound=0.282, batch=227
16557: loss=0.073, reward_mean=0.410, reward_bound=0.314, batch=227
16558: loss=0.074, reward_mean=0.490, reward_bound=0.349, batch=222
16559: loss=0.068, reward_mean=0.540, reward_bound=0.387, batch=145
16560: loss=0.074, reward_mean=0.400, reward_bound=0.031, batch=169
16561: loss=0.071, reward_mean=0.520, reward_bound=0.089, batch=185
16562: loss=0.070, reward_mean=0.560, reward_bound=0.153, batch=199
16563: loss=0.070, reward_mean=0.470, reward_bou

16670: loss=0.064, reward_mean=0.380, reward_bound=0.277, batch=222
16671: loss=0.067, reward_mean=0.310, reward_bound=0.292, batch=225
16672: loss=0.065, reward_mean=0.390, reward_bound=0.314, batch=222
16673: loss=0.063, reward_mean=0.470, reward_bound=0.349, batch=222
16674: loss=0.063, reward_mean=0.440, reward_bound=0.324, batch=225
16675: loss=0.062, reward_mean=0.480, reward_bound=0.349, batch=226
16676: loss=0.065, reward_mean=0.440, reward_bound=0.387, batch=224
16677: loss=0.064, reward_mean=0.400, reward_bound=0.426, batch=227
16678: loss=0.064, reward_mean=0.450, reward_bound=0.430, batch=221
16679: loss=0.064, reward_mean=0.460, reward_bound=0.314, batch=223
16680: loss=0.063, reward_mean=0.330, reward_bound=0.271, batch=226
16681: loss=0.062, reward_mean=0.410, reward_bound=0.316, batch=228
16682: loss=0.062, reward_mean=0.480, reward_bound=0.349, batch=228
16683: loss=0.062, reward_mean=0.520, reward_bound=0.317, batch=229
16684: loss=0.063, reward_mean=0.450, reward_bou

16791: loss=0.048, reward_mean=0.380, reward_bound=0.236, batch=218
16792: loss=0.051, reward_mean=0.390, reward_bound=0.257, batch=222
16793: loss=0.049, reward_mean=0.300, reward_bound=0.282, batch=222
16794: loss=0.054, reward_mean=0.450, reward_bound=0.387, batch=212
16795: loss=0.056, reward_mean=0.410, reward_bound=0.324, batch=218
16796: loss=0.056, reward_mean=0.450, reward_bound=0.349, batch=219
16797: loss=0.055, reward_mean=0.500, reward_bound=0.405, batch=223
16798: loss=0.054, reward_mean=0.490, reward_bound=0.387, batch=225
16799: loss=0.053, reward_mean=0.500, reward_bound=0.430, batch=198
16800: loss=0.051, reward_mean=0.420, reward_bound=0.152, batch=208
16801: loss=0.053, reward_mean=0.430, reward_bound=0.229, batch=212
16802: loss=0.051, reward_mean=0.430, reward_bound=0.254, batch=217
16803: loss=0.049, reward_mean=0.430, reward_bound=0.178, batch=222
16804: loss=0.052, reward_mean=0.480, reward_bound=0.254, batch=224
16805: loss=0.049, reward_mean=0.460, reward_bou

16912: loss=0.033, reward_mean=0.400, reward_bound=0.339, batch=227
16913: loss=0.035, reward_mean=0.360, reward_bound=0.308, batch=229
16914: loss=0.033, reward_mean=0.480, reward_bound=0.349, batch=229
16915: loss=0.033, reward_mean=0.470, reward_bound=0.364, batch=230
16916: loss=0.034, reward_mean=0.330, reward_bound=0.387, batch=224
16917: loss=0.033, reward_mean=0.430, reward_bound=0.384, batch=227
16918: loss=0.033, reward_mean=0.490, reward_bound=0.349, batch=228
16919: loss=0.033, reward_mean=0.520, reward_bound=0.387, batch=228
16920: loss=0.033, reward_mean=0.370, reward_bound=0.297, batch=229
16921: loss=0.034, reward_mean=0.460, reward_bound=0.430, batch=222
16922: loss=0.034, reward_mean=0.430, reward_bound=0.324, batch=225
16923: loss=0.036, reward_mean=0.440, reward_bound=0.321, batch=227
16924: loss=0.037, reward_mean=0.390, reward_bound=0.422, batch=229
16925: loss=0.034, reward_mean=0.490, reward_bound=0.430, batch=223
16926: loss=0.036, reward_mean=0.500, reward_bou

17033: loss=0.046, reward_mean=0.440, reward_bound=0.387, batch=225
17034: loss=0.046, reward_mean=0.390, reward_bound=0.321, batch=227
17035: loss=0.049, reward_mean=0.460, reward_bound=0.342, batch=229
17036: loss=0.049, reward_mean=0.380, reward_bound=0.364, batch=230
17037: loss=0.049, reward_mean=0.380, reward_bound=0.387, batch=228
17038: loss=0.049, reward_mean=0.560, reward_bound=0.387, batch=228
17039: loss=0.046, reward_mean=0.410, reward_bound=0.430, batch=225
17040: loss=0.046, reward_mean=0.430, reward_bound=0.356, batch=227
17041: loss=0.045, reward_mean=0.340, reward_bound=0.308, batch=229
17042: loss=0.045, reward_mean=0.480, reward_bound=0.405, batch=230
17043: loss=0.045, reward_mean=0.420, reward_bound=0.464, batch=231
17044: loss=0.045, reward_mean=0.590, reward_bound=0.430, batch=231
17045: loss=0.045, reward_mean=0.420, reward_bound=0.430, batch=231
17046: loss=0.045, reward_mean=0.480, reward_bound=0.430, batch=231
17047: loss=0.047, reward_mean=0.470, reward_bou

17155: loss=0.042, reward_mean=0.480, reward_bound=0.254, batch=217
17156: loss=0.042, reward_mean=0.430, reward_bound=0.229, batch=221
17157: loss=0.041, reward_mean=0.440, reward_bound=0.282, batch=216
17158: loss=0.041, reward_mean=0.410, reward_bound=0.314, batch=205
17159: loss=0.039, reward_mean=0.550, reward_bound=0.210, batch=213
17160: loss=0.041, reward_mean=0.370, reward_bound=0.197, batch=219
17161: loss=0.041, reward_mean=0.460, reward_bound=0.254, batch=220
17162: loss=0.041, reward_mean=0.380, reward_bound=0.274, batch=224
17163: loss=0.039, reward_mean=0.410, reward_bound=0.314, batch=226
17164: loss=0.039, reward_mean=0.480, reward_bound=0.282, batch=227
17165: loss=0.038, reward_mean=0.390, reward_bound=0.349, batch=203
17166: loss=0.036, reward_mean=0.450, reward_bound=0.167, batch=211
17167: loss=0.036, reward_mean=0.530, reward_bound=0.254, batch=216
17168: loss=0.035, reward_mean=0.370, reward_bound=0.143, batch=221
17169: loss=0.037, reward_mean=0.440, reward_bou

17276: loss=0.050, reward_mean=0.440, reward_bound=0.418, batch=231
17277: loss=0.050, reward_mean=0.400, reward_bound=0.430, batch=230
17278: loss=0.050, reward_mean=0.350, reward_bound=0.464, batch=231
17279: loss=0.057, reward_mean=0.420, reward_bound=0.478, batch=94
17280: loss=0.052, reward_mean=0.350, reward_bound=0.000, batch=129
17281: loss=0.048, reward_mean=0.440, reward_bound=0.007, batch=160
17282: loss=0.050, reward_mean=0.330, reward_bound=0.020, batch=182
17283: loss=0.054, reward_mean=0.440, reward_bound=0.044, batch=197
17284: loss=0.050, reward_mean=0.420, reward_bound=0.070, batch=208
17285: loss=0.050, reward_mean=0.430, reward_bound=0.098, batch=208
17286: loss=0.050, reward_mean=0.410, reward_bound=0.122, batch=212
17287: loss=0.052, reward_mean=0.500, reward_bound=0.135, batch=217
17288: loss=0.054, reward_mean=0.480, reward_bound=0.163, batch=222
17289: loss=0.056, reward_mean=0.480, reward_bound=0.167, batch=219
17290: loss=0.056, reward_mean=0.430, reward_boun

17397: loss=0.057, reward_mean=0.430, reward_bound=0.430, batch=215
17398: loss=0.056, reward_mean=0.470, reward_bound=0.329, batch=220
17399: loss=0.055, reward_mean=0.430, reward_bound=0.304, batch=224
17400: loss=0.054, reward_mean=0.450, reward_bound=0.384, batch=227
17401: loss=0.055, reward_mean=0.480, reward_bound=0.387, batch=226
17402: loss=0.057, reward_mean=0.380, reward_bound=0.331, batch=228
17403: loss=0.054, reward_mean=0.480, reward_bound=0.392, batch=229
17404: loss=0.055, reward_mean=0.450, reward_bound=0.430, batch=224
17405: loss=0.054, reward_mean=0.430, reward_bound=0.345, batch=227
17406: loss=0.054, reward_mean=0.420, reward_bound=0.349, batch=228
17407: loss=0.056, reward_mean=0.480, reward_bound=0.392, batch=229
17408: loss=0.056, reward_mean=0.510, reward_bound=0.405, batch=230
17409: loss=0.057, reward_mean=0.420, reward_bound=0.430, batch=230
17410: loss=0.050, reward_mean=0.500, reward_bound=0.478, batch=146
17411: loss=0.052, reward_mean=0.450, reward_bou

17518: loss=0.053, reward_mean=0.360, reward_bound=0.249, batch=222
17519: loss=0.054, reward_mean=0.460, reward_bound=0.263, batch=225
17520: loss=0.055, reward_mean=0.420, reward_bound=0.282, batch=224
17521: loss=0.056, reward_mean=0.390, reward_bound=0.349, batch=209
17522: loss=0.057, reward_mean=0.420, reward_bound=0.174, batch=216
17523: loss=0.058, reward_mean=0.510, reward_bound=0.268, batch=221
17524: loss=0.055, reward_mean=0.510, reward_bound=0.314, batch=223
17525: loss=0.057, reward_mean=0.480, reward_bound=0.349, batch=219
17526: loss=0.059, reward_mean=0.460, reward_bound=0.265, batch=223
17527: loss=0.057, reward_mean=0.470, reward_bound=0.387, batch=206
17528: loss=0.055, reward_mean=0.400, reward_bound=0.225, batch=214
17529: loss=0.056, reward_mean=0.480, reward_bound=0.314, batch=219
17530: loss=0.056, reward_mean=0.420, reward_bound=0.349, batch=222
17531: loss=0.055, reward_mean=0.400, reward_bound=0.324, batch=225
17532: loss=0.056, reward_mean=0.400, reward_bou

17639: loss=0.054, reward_mean=0.450, reward_bound=0.387, batch=229
17640: loss=0.054, reward_mean=0.420, reward_bound=0.500, batch=230
17641: loss=0.054, reward_mean=0.570, reward_bound=0.376, batch=231
17642: loss=0.054, reward_mean=0.520, reward_bound=0.478, batch=231
17643: loss=0.054, reward_mean=0.370, reward_bound=0.478, batch=231
17644: loss=0.054, reward_mean=0.480, reward_bound=0.478, batch=231
17646: loss=0.040, reward_mean=0.460, reward_bound=0.000, batch=46
17647: loss=0.037, reward_mean=0.520, reward_bound=0.000, batch=98
17648: loss=0.043, reward_mean=0.490, reward_bound=0.001, batch=138
17649: loss=0.049, reward_mean=0.510, reward_bound=0.006, batch=166
17650: loss=0.050, reward_mean=0.420, reward_bound=0.015, batch=182
17651: loss=0.049, reward_mean=0.530, reward_bound=0.029, batch=197
17652: loss=0.054, reward_mean=0.480, reward_bound=0.047, batch=204
17653: loss=0.052, reward_mean=0.480, reward_bound=0.065, batch=211
17654: loss=0.051, reward_mean=0.390, reward_bound

17761: loss=0.034, reward_mean=0.510, reward_bound=0.314, batch=223
17762: loss=0.038, reward_mean=0.460, reward_bound=0.349, batch=220
17763: loss=0.040, reward_mean=0.560, reward_bound=0.338, batch=224
17764: loss=0.038, reward_mean=0.500, reward_bound=0.387, batch=216
17765: loss=0.037, reward_mean=0.480, reward_bound=0.331, batch=221
17766: loss=0.037, reward_mean=0.450, reward_bound=0.349, batch=222
17767: loss=0.036, reward_mean=0.450, reward_bound=0.373, batch=225
17768: loss=0.036, reward_mean=0.520, reward_bound=0.387, batch=226
17769: loss=0.029, reward_mean=0.480, reward_bound=0.430, batch=138
17770: loss=0.028, reward_mean=0.440, reward_bound=0.053, batch=166
17771: loss=0.027, reward_mean=0.550, reward_bound=0.089, batch=184
17772: loss=0.022, reward_mean=0.520, reward_bound=0.122, batch=196
17773: loss=0.030, reward_mean=0.390, reward_bound=0.143, batch=207
17774: loss=0.028, reward_mean=0.400, reward_bound=0.150, batch=214
17775: loss=0.027, reward_mean=0.460, reward_bou

17882: loss=0.019, reward_mean=0.500, reward_bound=0.215, batch=216
17883: loss=0.019, reward_mean=0.390, reward_bound=0.229, batch=220
17884: loss=0.019, reward_mean=0.380, reward_bound=0.229, batch=223
17885: loss=0.017, reward_mean=0.500, reward_bound=0.254, batch=221
17886: loss=0.017, reward_mean=0.420, reward_bound=0.282, batch=222
17887: loss=0.020, reward_mean=0.500, reward_bound=0.314, batch=192
17888: loss=0.019, reward_mean=0.430, reward_bound=0.254, batch=202
17889: loss=0.020, reward_mean=0.430, reward_bound=0.229, batch=210
17890: loss=0.020, reward_mean=0.430, reward_bound=0.229, batch=215
17891: loss=0.019, reward_mean=0.490, reward_bound=0.260, batch=220
17892: loss=0.019, reward_mean=0.490, reward_bound=0.282, batch=221
17893: loss=0.020, reward_mean=0.450, reward_bound=0.314, batch=218
17894: loss=0.024, reward_mean=0.570, reward_bound=0.349, batch=185
17895: loss=0.021, reward_mean=0.380, reward_bound=0.109, batch=200
17896: loss=0.023, reward_mean=0.470, reward_bou

18003: loss=0.023, reward_mean=0.430, reward_bound=0.087, batch=194
18004: loss=0.020, reward_mean=0.510, reward_bound=0.109, batch=204
18005: loss=0.020, reward_mean=0.470, reward_bound=0.150, batch=209
18006: loss=0.020, reward_mean=0.370, reward_bound=0.185, batch=213
18007: loss=0.020, reward_mean=0.560, reward_bound=0.206, batch=218
18008: loss=0.019, reward_mean=0.410, reward_bound=0.229, batch=214
18009: loss=0.019, reward_mean=0.410, reward_bound=0.254, batch=213
18010: loss=0.019, reward_mean=0.370, reward_bound=0.254, batch=218
18011: loss=0.017, reward_mean=0.480, reward_bound=0.282, batch=210
18012: loss=0.017, reward_mean=0.460, reward_bound=0.282, batch=216
18013: loss=0.016, reward_mean=0.460, reward_bound=0.268, batch=221
18014: loss=0.018, reward_mean=0.450, reward_bound=0.314, batch=202
18015: loss=0.017, reward_mean=0.460, reward_bound=0.245, batch=211
18016: loss=0.017, reward_mean=0.390, reward_bound=0.229, batch=216
18017: loss=0.017, reward_mean=0.480, reward_bou

18124: loss=0.023, reward_mean=0.350, reward_bound=0.236, batch=225
18125: loss=0.020, reward_mean=0.440, reward_bound=0.349, batch=225
18126: loss=0.020, reward_mean=0.540, reward_bound=0.356, batch=227
18127: loss=0.020, reward_mean=0.510, reward_bound=0.387, batch=223
18128: loss=0.020, reward_mean=0.430, reward_bound=0.372, batch=226
18129: loss=0.020, reward_mean=0.490, reward_bound=0.387, batch=227
18130: loss=0.021, reward_mean=0.470, reward_bound=0.430, batch=222
18131: loss=0.020, reward_mean=0.380, reward_bound=0.387, batch=224
18132: loss=0.020, reward_mean=0.420, reward_bound=0.254, batch=226
18133: loss=0.020, reward_mean=0.460, reward_bound=0.368, batch=228
18134: loss=0.020, reward_mean=0.460, reward_bound=0.387, batch=227
18135: loss=0.020, reward_mean=0.520, reward_bound=0.373, batch=229
18136: loss=0.020, reward_mean=0.480, reward_bound=0.405, batch=230
18137: loss=0.020, reward_mean=0.490, reward_bound=0.430, batch=227
18138: loss=0.021, reward_mean=0.550, reward_bou

18246: loss=0.025, reward_mean=0.440, reward_bound=0.349, batch=222
18247: loss=0.025, reward_mean=0.430, reward_bound=0.360, batch=225
18248: loss=0.024, reward_mean=0.400, reward_bound=0.387, batch=150
18249: loss=0.031, reward_mean=0.380, reward_bound=0.063, batch=175
18250: loss=0.033, reward_mean=0.460, reward_bound=0.059, batch=192
18251: loss=0.033, reward_mean=0.390, reward_bound=0.089, batch=203
18252: loss=0.027, reward_mean=0.440, reward_bound=0.130, batch=212
18253: loss=0.026, reward_mean=0.470, reward_bound=0.140, batch=218
18254: loss=0.028, reward_mean=0.490, reward_bound=0.150, batch=221
18255: loss=0.025, reward_mean=0.460, reward_bound=0.185, batch=214
18256: loss=0.023, reward_mean=0.460, reward_bound=0.206, batch=211
18257: loss=0.025, reward_mean=0.500, reward_bound=0.229, batch=215
18258: loss=0.025, reward_mean=0.460, reward_bound=0.254, batch=217
18259: loss=0.024, reward_mean=0.450, reward_bound=0.282, batch=209
18260: loss=0.024, reward_mean=0.390, reward_bou

18367: loss=0.032, reward_mean=0.460, reward_bound=0.263, batch=218
18368: loss=0.030, reward_mean=0.400, reward_bound=0.317, batch=222
18369: loss=0.031, reward_mean=0.420, reward_bound=0.172, batch=225
18370: loss=0.030, reward_mean=0.440, reward_bound=0.349, batch=223
18371: loss=0.030, reward_mean=0.420, reward_bound=0.349, batch=225
18372: loss=0.030, reward_mean=0.370, reward_bound=0.349, batch=226
18373: loss=0.033, reward_mean=0.490, reward_bound=0.387, batch=221
18374: loss=0.032, reward_mean=0.460, reward_bound=0.314, batch=223
18375: loss=0.034, reward_mean=0.460, reward_bound=0.372, batch=226
18376: loss=0.034, reward_mean=0.510, reward_bound=0.331, batch=228
18377: loss=0.034, reward_mean=0.430, reward_bound=0.387, batch=227
18378: loss=0.033, reward_mean=0.520, reward_bound=0.430, batch=204
18379: loss=0.031, reward_mean=0.490, reward_bound=0.280, batch=213
18380: loss=0.030, reward_mean=0.390, reward_bound=0.254, batch=218
18381: loss=0.031, reward_mean=0.400, reward_bou

18488: loss=0.022, reward_mean=0.400, reward_bound=0.314, batch=214
18489: loss=0.022, reward_mean=0.420, reward_bound=0.282, batch=219
18490: loss=0.022, reward_mean=0.450, reward_bound=0.282, batch=222
18491: loss=0.022, reward_mean=0.410, reward_bound=0.349, batch=219
18492: loss=0.022, reward_mean=0.440, reward_bound=0.364, batch=223
18493: loss=0.020, reward_mean=0.490, reward_bound=0.387, batch=220
18494: loss=0.019, reward_mean=0.470, reward_bound=0.338, batch=224
18495: loss=0.019, reward_mean=0.380, reward_bound=0.349, batch=225
18496: loss=0.019, reward_mean=0.450, reward_bound=0.396, batch=227
18497: loss=0.019, reward_mean=0.490, reward_bound=0.422, batch=229
18498: loss=0.019, reward_mean=0.410, reward_bound=0.430, batch=192
18499: loss=0.018, reward_mean=0.370, reward_bound=0.167, batch=203
18500: loss=0.020, reward_mean=0.500, reward_bound=0.185, batch=211
18501: loss=0.017, reward_mean=0.450, reward_bound=0.206, batch=213
18502: loss=0.017, reward_mean=0.400, reward_bou

18609: loss=0.027, reward_mean=0.460, reward_bound=0.109, batch=206
18610: loss=0.032, reward_mean=0.490, reward_bound=0.150, batch=211
18611: loss=0.031, reward_mean=0.450, reward_bound=0.206, batch=217
18612: loss=0.031, reward_mean=0.450, reward_bound=0.229, batch=219
18613: loss=0.031, reward_mean=0.460, reward_bound=0.254, batch=222
18614: loss=0.024, reward_mean=0.480, reward_bound=0.282, batch=222
18615: loss=0.025, reward_mean=0.490, reward_bound=0.314, batch=219
18616: loss=0.026, reward_mean=0.510, reward_bound=0.349, batch=212
18617: loss=0.025, reward_mean=0.470, reward_bound=0.314, batch=216
18618: loss=0.025, reward_mean=0.420, reward_bound=0.314, batch=220
18619: loss=0.029, reward_mean=0.430, reward_bound=0.349, batch=223
18620: loss=0.029, reward_mean=0.480, reward_bound=0.335, batch=226
18621: loss=0.030, reward_mean=0.420, reward_bound=0.387, batch=217
18622: loss=0.030, reward_mean=0.490, reward_bound=0.335, batch=222
18623: loss=0.032, reward_mean=0.480, reward_bou

18731: loss=0.073, reward_mean=0.490, reward_bound=0.229, batch=208
18732: loss=0.076, reward_mean=0.540, reward_bound=0.257, batch=215
18733: loss=0.076, reward_mean=0.500, reward_bound=0.282, batch=208
18734: loss=0.076, reward_mean=0.480, reward_bound=0.257, batch=215
18735: loss=0.076, reward_mean=0.470, reward_bound=0.282, batch=215
18736: loss=0.075, reward_mean=0.540, reward_bound=0.289, batch=220
18737: loss=0.076, reward_mean=0.480, reward_bound=0.314, batch=212
18738: loss=0.075, reward_mean=0.390, reward_bound=0.292, batch=218
18739: loss=0.075, reward_mean=0.360, reward_bound=0.286, batch=222
18740: loss=0.074, reward_mean=0.490, reward_bound=0.292, batch=225
18741: loss=0.074, reward_mean=0.540, reward_bound=0.314, batch=225
18742: loss=0.072, reward_mean=0.540, reward_bound=0.349, batch=173
18743: loss=0.071, reward_mean=0.460, reward_bound=0.171, batch=191
18744: loss=0.070, reward_mean=0.410, reward_bound=0.122, batch=203
18745: loss=0.074, reward_mean=0.490, reward_bou

18852: loss=0.057, reward_mean=0.430, reward_bound=0.430, batch=190
18853: loss=0.054, reward_mean=0.450, reward_bound=0.167, batch=202
18854: loss=0.058, reward_mean=0.400, reward_bound=0.185, batch=208
18855: loss=0.057, reward_mean=0.470, reward_bound=0.234, batch=215
18856: loss=0.058, reward_mean=0.400, reward_bound=0.282, batch=214
18857: loss=0.057, reward_mean=0.430, reward_bound=0.226, batch=220
18858: loss=0.058, reward_mean=0.480, reward_bound=0.282, batch=221
18859: loss=0.057, reward_mean=0.430, reward_bound=0.282, batch=224
18860: loss=0.055, reward_mean=0.470, reward_bound=0.314, batch=224
18861: loss=0.055, reward_mean=0.490, reward_bound=0.252, batch=227
18862: loss=0.055, reward_mean=0.480, reward_bound=0.349, batch=221
18863: loss=0.056, reward_mean=0.450, reward_bound=0.349, batch=223
18864: loss=0.057, reward_mean=0.470, reward_bound=0.301, batch=226
18865: loss=0.058, reward_mean=0.480, reward_bound=0.368, batch=228
18866: loss=0.059, reward_mean=0.450, reward_bou

18973: loss=0.070, reward_mean=0.380, reward_bound=0.254, batch=213
18974: loss=0.071, reward_mean=0.390, reward_bound=0.282, batch=211
18975: loss=0.069, reward_mean=0.460, reward_bound=0.314, batch=207
18976: loss=0.068, reward_mean=0.450, reward_bound=0.254, batch=213
18977: loss=0.067, reward_mean=0.490, reward_bound=0.349, batch=210
18978: loss=0.065, reward_mean=0.400, reward_bound=0.247, batch=217
18979: loss=0.063, reward_mean=0.500, reward_bound=0.277, batch=222
18980: loss=0.065, reward_mean=0.400, reward_bound=0.314, batch=220
18981: loss=0.065, reward_mean=0.470, reward_bound=0.349, batch=218
18982: loss=0.066, reward_mean=0.540, reward_bound=0.387, batch=203
18983: loss=0.065, reward_mean=0.440, reward_bound=0.271, batch=212
18984: loss=0.064, reward_mean=0.400, reward_bound=0.263, batch=218
18985: loss=0.062, reward_mean=0.440, reward_bound=0.282, batch=221
18986: loss=0.063, reward_mean=0.430, reward_bound=0.314, batch=220
18987: loss=0.063, reward_mean=0.490, reward_bou

19094: loss=0.056, reward_mean=0.500, reward_bound=0.372, batch=226
19095: loss=0.055, reward_mean=0.460, reward_bound=0.430, batch=224
19096: loss=0.055, reward_mean=0.460, reward_bound=0.422, batch=227
19097: loss=0.055, reward_mean=0.420, reward_bound=0.430, batch=227
19098: loss=0.056, reward_mean=0.500, reward_bound=0.380, batch=229
19099: loss=0.054, reward_mean=0.420, reward_bound=0.387, batch=229
19100: loss=0.054, reward_mean=0.450, reward_bound=0.430, batch=229
19101: loss=0.054, reward_mean=0.510, reward_bound=0.405, batch=230
19102: loss=0.054, reward_mean=0.480, reward_bound=0.464, batch=231
19103: loss=0.054, reward_mean=0.390, reward_bound=0.282, batch=231
19104: loss=0.054, reward_mean=0.430, reward_bound=0.430, batch=231
19105: loss=0.061, reward_mean=0.550, reward_bound=0.478, batch=180
19106: loss=0.057, reward_mean=0.460, reward_bound=0.118, batch=196
19107: loss=0.058, reward_mean=0.410, reward_bound=0.136, batch=207
19108: loss=0.061, reward_mean=0.450, reward_bou

19215: loss=0.063, reward_mean=0.490, reward_bound=0.515, batch=231
19216: loss=0.063, reward_mean=0.470, reward_bound=0.430, batch=231
19217: loss=0.063, reward_mean=0.460, reward_bound=0.478, batch=231
19219: loss=0.061, reward_mean=0.510, reward_bound=0.000, batch=51
19220: loss=0.067, reward_mean=0.500, reward_bound=0.000, batch=101
19221: loss=0.067, reward_mean=0.490, reward_bound=0.002, batch=140
19222: loss=0.066, reward_mean=0.500, reward_bound=0.008, batch=168
19223: loss=0.069, reward_mean=0.430, reward_bound=0.017, batch=187
19224: loss=0.074, reward_mean=0.430, reward_bound=0.028, batch=196
19225: loss=0.077, reward_mean=0.420, reward_bound=0.038, batch=206
19226: loss=0.071, reward_mean=0.390, reward_bound=0.047, batch=212
19227: loss=0.069, reward_mean=0.470, reward_bound=0.058, batch=215
19228: loss=0.070, reward_mean=0.440, reward_bound=0.073, batch=220
19229: loss=0.072, reward_mean=0.390, reward_bound=0.089, batch=216
19230: loss=0.076, reward_mean=0.480, reward_boun

19337: loss=0.058, reward_mean=0.470, reward_bound=0.095, batch=205
19338: loss=0.063, reward_mean=0.480, reward_bound=0.122, batch=209
19339: loss=0.062, reward_mean=0.430, reward_bound=0.135, batch=211
19340: loss=0.065, reward_mean=0.440, reward_bound=0.150, batch=210
19341: loss=0.067, reward_mean=0.500, reward_bound=0.185, batch=204
19342: loss=0.065, reward_mean=0.460, reward_bound=0.185, batch=211
19343: loss=0.059, reward_mean=0.470, reward_bound=0.206, batch=208
19344: loss=0.054, reward_mean=0.400, reward_bound=0.229, batch=208
19345: loss=0.057, reward_mean=0.420, reward_bound=0.254, batch=206
19346: loss=0.057, reward_mean=0.420, reward_bound=0.254, batch=213
19347: loss=0.056, reward_mean=0.510, reward_bound=0.254, batch=218
19348: loss=0.056, reward_mean=0.480, reward_bound=0.282, batch=208
19349: loss=0.055, reward_mean=0.410, reward_bound=0.229, batch=211
19350: loss=0.059, reward_mean=0.550, reward_bound=0.314, batch=199
19351: loss=0.059, reward_mean=0.410, reward_bou

19458: loss=0.055, reward_mean=0.470, reward_bound=0.024, batch=141
19459: loss=0.061, reward_mean=0.420, reward_bound=0.031, batch=168
19460: loss=0.060, reward_mean=0.470, reward_bound=0.052, batch=186
19461: loss=0.064, reward_mean=0.410, reward_bound=0.065, batch=198
19462: loss=0.063, reward_mean=0.450, reward_bound=0.089, batch=207
19463: loss=0.065, reward_mean=0.410, reward_bound=0.098, batch=214
19464: loss=0.067, reward_mean=0.430, reward_bound=0.122, batch=215
19465: loss=0.060, reward_mean=0.420, reward_bound=0.135, batch=218
19466: loss=0.056, reward_mean=0.420, reward_bound=0.150, batch=216
19467: loss=0.054, reward_mean=0.500, reward_bound=0.167, batch=220
19468: loss=0.053, reward_mean=0.530, reward_bound=0.185, batch=221
19469: loss=0.053, reward_mean=0.440, reward_bound=0.206, batch=208
19470: loss=0.052, reward_mean=0.460, reward_bound=0.229, batch=200
19471: loss=0.052, reward_mean=0.570, reward_bound=0.254, batch=200
19472: loss=0.055, reward_mean=0.350, reward_bou

19579: loss=0.066, reward_mean=0.450, reward_bound=0.430, batch=228
19580: loss=0.066, reward_mean=0.490, reward_bound=0.435, batch=229
19581: loss=0.067, reward_mean=0.450, reward_bound=0.401, batch=230
19582: loss=0.062, reward_mean=0.500, reward_bound=0.478, batch=157
19583: loss=0.052, reward_mean=0.460, reward_bound=0.072, batch=179
19584: loss=0.057, reward_mean=0.440, reward_bound=0.089, batch=194
19585: loss=0.063, reward_mean=0.440, reward_bound=0.122, batch=205
19586: loss=0.066, reward_mean=0.400, reward_bound=0.150, batch=211
19587: loss=0.063, reward_mean=0.480, reward_bound=0.167, batch=216
19588: loss=0.061, reward_mean=0.510, reward_bound=0.196, batch=221
19589: loss=0.068, reward_mean=0.570, reward_bound=0.229, batch=218
19590: loss=0.064, reward_mean=0.440, reward_bound=0.254, batch=213
19591: loss=0.068, reward_mean=0.460, reward_bound=0.282, batch=210
19592: loss=0.072, reward_mean=0.410, reward_bound=0.247, batch=217
19593: loss=0.070, reward_mean=0.370, reward_bou

19700: loss=0.067, reward_mean=0.470, reward_bound=0.422, batch=229
19701: loss=0.067, reward_mean=0.340, reward_bound=0.430, batch=228
19702: loss=0.067, reward_mean=0.430, reward_bound=0.435, batch=229
19703: loss=0.067, reward_mean=0.420, reward_bound=0.424, batch=230
19704: loss=0.067, reward_mean=0.430, reward_bound=0.406, batch=231
19705: loss=0.067, reward_mean=0.470, reward_bound=0.430, batch=230
19706: loss=0.067, reward_mean=0.360, reward_bound=0.451, batch=231
19707: loss=0.067, reward_mean=0.530, reward_bound=0.430, batch=231
19708: loss=0.067, reward_mean=0.360, reward_bound=0.478, batch=202
19709: loss=0.069, reward_mean=0.540, reward_bound=0.206, batch=213
19710: loss=0.068, reward_mean=0.490, reward_bound=0.244, batch=219
19711: loss=0.067, reward_mean=0.430, reward_bound=0.254, batch=222
19712: loss=0.067, reward_mean=0.450, reward_bound=0.314, batch=222
19713: loss=0.067, reward_mean=0.430, reward_bound=0.314, batch=224
19714: loss=0.067, reward_mean=0.470, reward_bou

19822: loss=0.047, reward_mean=0.390, reward_bound=0.036, batch=181
19823: loss=0.055, reward_mean=0.380, reward_bound=0.098, batch=193
19824: loss=0.053, reward_mean=0.520, reward_bound=0.185, batch=201
19825: loss=0.055, reward_mean=0.360, reward_bound=0.206, batch=201
19826: loss=0.054, reward_mean=0.510, reward_bound=0.206, batch=210
19827: loss=0.053, reward_mean=0.470, reward_bound=0.229, batch=210
19828: loss=0.055, reward_mean=0.490, reward_bound=0.254, batch=207
19829: loss=0.052, reward_mean=0.440, reward_bound=0.282, batch=209
19830: loss=0.051, reward_mean=0.420, reward_bound=0.157, batch=216
19831: loss=0.052, reward_mean=0.410, reward_bound=0.254, batch=219
19832: loss=0.052, reward_mean=0.460, reward_bound=0.229, batch=222
19833: loss=0.052, reward_mean=0.410, reward_bound=0.254, batch=223
19834: loss=0.052, reward_mean=0.450, reward_bound=0.282, batch=225
19835: loss=0.055, reward_mean=0.410, reward_bound=0.314, batch=218
19836: loss=0.055, reward_mean=0.410, reward_bou

19943: loss=0.050, reward_mean=0.520, reward_bound=0.282, batch=218
19944: loss=0.051, reward_mean=0.480, reward_bound=0.349, batch=220
19945: loss=0.051, reward_mean=0.540, reward_bound=0.338, batch=224
19946: loss=0.051, reward_mean=0.460, reward_bound=0.387, batch=224
19947: loss=0.047, reward_mean=0.550, reward_bound=0.430, batch=171
19948: loss=0.043, reward_mean=0.500, reward_bound=0.098, batch=189
19949: loss=0.041, reward_mean=0.490, reward_bound=0.157, batch=202
19950: loss=0.041, reward_mean=0.450, reward_bound=0.185, batch=208
19951: loss=0.046, reward_mean=0.420, reward_bound=0.229, batch=212
19952: loss=0.046, reward_mean=0.450, reward_bound=0.191, batch=218
19953: loss=0.045, reward_mean=0.410, reward_bound=0.254, batch=212
19954: loss=0.044, reward_mean=0.490, reward_bound=0.254, batch=214
19955: loss=0.043, reward_mean=0.300, reward_bound=0.280, batch=220
19956: loss=0.046, reward_mean=0.450, reward_bound=0.282, batch=222
19957: loss=0.050, reward_mean=0.470, reward_bou

20064: loss=0.032, reward_mean=0.520, reward_bound=0.229, batch=216
20065: loss=0.034, reward_mean=0.370, reward_bound=0.254, batch=219
20066: loss=0.034, reward_mean=0.470, reward_bound=0.282, batch=221
20067: loss=0.033, reward_mean=0.300, reward_bound=0.282, batch=224
20068: loss=0.034, reward_mean=0.430, reward_bound=0.314, batch=224
20069: loss=0.035, reward_mean=0.390, reward_bound=0.342, batch=227
20070: loss=0.037, reward_mean=0.420, reward_bound=0.349, batch=219
20071: loss=0.036, reward_mean=0.560, reward_bound=0.282, batch=222
20072: loss=0.036, reward_mean=0.340, reward_bound=0.324, batch=225
20073: loss=0.037, reward_mean=0.430, reward_bound=0.349, batch=226
20074: loss=0.037, reward_mean=0.480, reward_bound=0.368, batch=228
20075: loss=0.035, reward_mean=0.490, reward_bound=0.387, batch=214
20076: loss=0.037, reward_mean=0.360, reward_bound=0.229, batch=219
20077: loss=0.035, reward_mean=0.520, reward_bound=0.349, batch=220
20078: loss=0.034, reward_mean=0.430, reward_bou

20185: loss=0.039, reward_mean=0.490, reward_bound=0.277, batch=222
20186: loss=0.037, reward_mean=0.390, reward_bound=0.282, batch=219
20187: loss=0.038, reward_mean=0.400, reward_bound=0.265, batch=223
20188: loss=0.037, reward_mean=0.410, reward_bound=0.220, batch=226
20189: loss=0.038, reward_mean=0.490, reward_bound=0.282, batch=225
20190: loss=0.039, reward_mean=0.530, reward_bound=0.314, batch=213
20191: loss=0.037, reward_mean=0.400, reward_bound=0.178, batch=219
20192: loss=0.037, reward_mean=0.440, reward_bound=0.265, batch=223
20193: loss=0.040, reward_mean=0.430, reward_bound=0.301, batch=226
20194: loss=0.041, reward_mean=0.500, reward_bound=0.349, batch=215
20195: loss=0.039, reward_mean=0.480, reward_bound=0.260, batch=220
20196: loss=0.041, reward_mean=0.440, reward_bound=0.304, batch=224
20197: loss=0.039, reward_mean=0.450, reward_bound=0.314, batch=225
20198: loss=0.038, reward_mean=0.550, reward_bound=0.321, batch=227
20199: loss=0.038, reward_mean=0.370, reward_bou

20306: loss=0.039, reward_mean=0.360, reward_bound=0.430, batch=221
20307: loss=0.039, reward_mean=0.420, reward_bound=0.387, batch=224
20308: loss=0.038, reward_mean=0.600, reward_bound=0.426, batch=227
20309: loss=0.038, reward_mean=0.340, reward_bound=0.422, batch=229
20310: loss=0.040, reward_mean=0.380, reward_bound=0.328, batch=230
20311: loss=0.038, reward_mean=0.550, reward_bound=0.349, batch=229
20312: loss=0.038, reward_mean=0.420, reward_bound=0.430, batch=229
20313: loss=0.038, reward_mean=0.490, reward_bound=0.401, batch=230
20314: loss=0.038, reward_mean=0.510, reward_bound=0.464, batch=231
20315: loss=0.038, reward_mean=0.450, reward_bound=0.430, batch=231
20316: loss=0.039, reward_mean=0.510, reward_bound=0.478, batch=223
20317: loss=0.039, reward_mean=0.500, reward_bound=0.459, batch=226
20318: loss=0.038, reward_mean=0.460, reward_bound=0.454, batch=228
20319: loss=0.038, reward_mean=0.450, reward_bound=0.392, batch=229
20320: loss=0.038, reward_mean=0.430, reward_bou

20428: loss=0.018, reward_mean=0.380, reward_bound=0.109, batch=201
20429: loss=0.019, reward_mean=0.400, reward_bound=0.109, batch=208
20430: loss=0.017, reward_mean=0.380, reward_bound=0.122, batch=212
20431: loss=0.019, reward_mean=0.520, reward_bound=0.135, batch=213
20432: loss=0.018, reward_mean=0.460, reward_bound=0.167, batch=218
20433: loss=0.018, reward_mean=0.400, reward_bound=0.185, batch=212
20434: loss=0.018, reward_mean=0.360, reward_bound=0.206, batch=219
20435: loss=0.018, reward_mean=0.470, reward_bound=0.206, batch=219
20436: loss=0.018, reward_mean=0.430, reward_bound=0.203, batch=223
20437: loss=0.021, reward_mean=0.450, reward_bound=0.229, batch=221
20438: loss=0.023, reward_mean=0.460, reward_bound=0.254, batch=218
20439: loss=0.022, reward_mean=0.420, reward_bound=0.282, batch=214
20440: loss=0.021, reward_mean=0.440, reward_bound=0.311, batch=220
20441: loss=0.021, reward_mean=0.400, reward_bound=0.314, batch=207
20442: loss=0.020, reward_mean=0.520, reward_bou

20549: loss=0.014, reward_mean=0.510, reward_bound=0.349, batch=223
20550: loss=0.014, reward_mean=0.490, reward_bound=0.349, batch=225
20551: loss=0.015, reward_mean=0.410, reward_bound=0.387, batch=209
20552: loss=0.014, reward_mean=0.360, reward_bound=0.194, batch=216
20553: loss=0.014, reward_mean=0.380, reward_bound=0.254, batch=220
20554: loss=0.014, reward_mean=0.490, reward_bound=0.314, batch=223
20555: loss=0.014, reward_mean=0.450, reward_bound=0.229, batch=225
20556: loss=0.014, reward_mean=0.440, reward_bound=0.349, batch=225
20557: loss=0.014, reward_mean=0.430, reward_bound=0.349, batch=226
20558: loss=0.013, reward_mean=0.420, reward_bound=0.368, batch=228
20559: loss=0.014, reward_mean=0.440, reward_bound=0.387, batch=223


KeyboardInterrupt: 

## 04 frozenlake_nonslippery.py

In [15]:
import gym.wrappers
import gym.envs.toy_text.frozen_lake

In [16]:
random.seed(12345)
env = gym.envs.toy_text.frozen_lake.FrozenLakeEnv(is_slippery=False)
env = gym.wrappers.TimeLimit(env, max_episode_steps=100)
env = DiscreteOneHotWrapper(env)
# env = gym.wrappers.Monitor(env, directory="mon", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(obs_size, HIDDEN_SIZE, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)
writer = SummaryWriter(comment="-frozenlake-nonslippery")

full_batch = []
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    reward_mean = float(np.mean(list(map(lambda s: s.reward, batch))))
    full_batch, obs, acts, reward_bound = filter_batch(full_batch + batch, PERCENTILE)
    if not full_batch:
        continue
    obs_v = torch.FloatTensor(obs)
    acts_v = torch.LongTensor(acts)
    full_batch = full_batch[-500:]

    optimizer.zero_grad()
    action_scores_v = net(obs_v)
    loss_v = objective(action_scores_v, acts_v)
    loss_v.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.3f, reward_bound=%.3f, batch=%d" % (
        iter_no, loss_v.item(), reward_mean, reward_bound, len(full_batch)))
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_mean", reward_mean, iter_no)
    writer.add_scalar("reward_bound", reward_bound, iter_no)
    if reward_mean > 0.8:
        print("Solved!")
        break
writer.close()

AttributeError: 'NoneType' object has no attribute 'max_episode_steps'