In [81]:
''' imports '''

import gym
import numpy as np 


''' model imports'''
import torch 
import torch.nn as nn 

''' data imports'''
from collections import namedtuple 

''' visualisation '''
from tensorboardX import SummaryWriter

# Some Gym Concepts and Analysis 

    - observation space  Box(4,) 
    - action space:  Discrete(2) 
    - sample observation:  [-0.00220126 -0.01694481  0.01935941 -0.03202244]
    - high:  3.4028235e+38
    - low:  -3.4028235e+38

**Some concepts** 
***
    - The Discrete space allows a fixed range of non-negative numbers, so in this case valid actions are either 0 or 1. 
    
```python
                Discrete(4) # output will be a number from 0 to 3 inclusive
```
    
    - The Box space represents an n-dimensional box, so valid observations will be an array of n numbers. We can also check the Box’s bounds:
      
 ```python 

        Box(4,) # output will be an array of 4.
                # since it's an array it must have a range 
                # that can be found out by high and low 
            
print(env.observation_space.high)
#> array([ 2.4       ,         inf,  0.20943951,         inf])
print(env.observation_space.low)
#> array([-2.4       ,        -inf, -0.20943951,        -inf])
```



***

**An example of environments** 

```python

################### Frozen Lake Environment ############################## 

''' description of frozen lake '''
env = gym.make('FrozenLake-v0')

print("observation space ", env.observation_space)
print("action space: ", env.action_space)

print("sample observation: ", env.reset())
print("environment " , env.render())
env.close() 


> observation space  Discrete(16)  # meaning -> output will be number between 0-15 inclusive 
> action space:  Discrete(4)       # meaning -> output will be a number from {0,1,2,3} 
> sample observation:  0

>   SFFF                           # meaning -> this is an array , the environment 
    FHFH
    FFFH
    HFFG
environment  None

```
***
```python
################### Cartpole Environment ################################### 
env = gym.make('CartPole-v0')

print("observation space ", env.observation_space)
print("action space: ", env.action_space)
print("sample observation: ", env.reset())

print(" high: ", np.sort(env.observation_space.high)[-1])
print("low: ", np.sort(env.observation_space.low)[0])

env.close() 

> observation space  Box(4,)
> action space:  Discrete(2)
> sample observation:  [-0.00220126 -0.01694481  0.01935941 -0.03202244]
> high:  3.4028235e+38
> low:  -3.4028235e+38

```


# Building The solution

**issue**:

    - the observation space is Discrete(16) for FrozenLake, but our cross-entropy method requires a Box(16,), i.e. an array of 16 numbers 
    
**solution**
    - we can use an observation wrapper (built on gym.spaces) to modify the observation into an array of shape (16,) 
    - alternatively, we can one-hot encode each observation ourselves after receiving it from the environment 
    - method 1 is the more elegant and standard way of dealing with this, so it is the one used below 

In [23]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Present a Discrete observation space as a one-hot float32 Box.

    The wrapped environment must expose a ``gym.spaces.Discrete`` observation
    space with ``n`` states. This wrapper advertises ``Box(0.0, 1.0, (n,))``
    instead and converts every observation into a vector of that shape.
    """

    def __init__(self, env):
        super().__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(source_space.n,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a vector that is 1.0 at index ``observation`` and 0.0 elsewhere."""
        # ``low`` is the lower bound of the Box built in __init__ (0.0 in every
        # slot); copy it so the space's own array is never modified.
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

In [27]:
# Wrap FrozenLake so observations arrive as one-hot vectors instead of integers.
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))

# Show the wrapped spaces and the observation returned by reset().
print("observation space: ", env.observation_space)
print("action space: ", env.action_space)
print("sample observation: ", env.reset())


# Network input / output sizes, taken from the wrapped spaces.
n_obs = env.observation_space.shape[0]
n_actions = env.action_space.n 


observation space:  Box(16,)
action space:  Discrete(4)
sample observation:  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [34]:
# Number of units in the hidden layer; passed to Net(...) when the network is built.
hidden = 128 

class Net(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        n_obs: size of the input observation vector.
        hidden: number of units in the hidden layer.
        n_actions: number of outputs (one raw score per action).
    """

    def __init__(self, n_obs, hidden, n_actions):
        super().__init__()
        # Layers are created in forward order and stored under ``pipe`` so the
        # parameter names stay ``pipe.0.*`` and ``pipe.2.*``.
        layers = [
            nn.Linear(n_obs, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        ]
        self.pipe = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised action scores (no softmax is applied here)."""
        scores = self.pipe(x)
        return scores
    
# Build the policy network from the sizes computed above and show its layers.
net = Net(n_obs, hidden, n_actions)
print(net)

Net(
  (pipe): Sequential(
    (0): Linear(in_features=16, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=4, bias=True)
  )
)


In [31]:
# One finished episode: its summed reward and the list of steps that produced it.
Episode = namedtuple("Episode", ["reward", "steps"])
# One step inside an episode: the observation seen and the action chosen for it.
Episode_step = namedtuple("Episode_step", ["observation", "action"])

In [89]:
def iterate_batches(env, net, batch_size):
    """Play episodes forever with the current policy and yield them in batches.

    For every step the observation is fed through ``net``, the output is turned
    into probabilities with a softmax, and an action is sampled from them with
    ``np.random.choice``. When an episode ends it is stored as an ``Episode``
    and the environment is reset; once ``batch_size`` episodes have been
    collected they are yielded and collection starts again.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(observation, reward, done, info)``.
        net: callable mapping a ``(1, n_obs)`` float tensor to a
            ``(1, n_actions)`` tensor of action scores.
        batch_size: number of finished episodes per yielded batch (>= 1).

    Yields:
        list: a new list of ``batch_size`` ``Episode`` tuples. Each yielded
        list is independent, so callers may keep it after resuming the
        generator.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced); such a value could never be reached
            and would otherwise loop forever.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    batch = []
    episode_steps = []
    episode_reward = 0.0

    sm = nn.Softmax(dim=1)
    obs_ = env.reset()

    while True:
        # Add the batch dimension the network expects; converting through a
        # single float32 array avoids building a tensor from a list of arrays.
        obs = torch.tensor(np.asarray(obs_, dtype=np.float32)).unsqueeze(0)

        # Sampling an action is pure inference, so no autograd graph is needed.
        with torch.no_grad():
            action_probs = sm(net(obs)).numpy()[0]

        # The number of actions is taken from the policy output itself rather
        # than from a notebook-level global.
        action = np.random.choice(len(action_probs), p=action_probs)

        # Record the observation the decision was based on, then act.
        episode_steps.append(Episode_step(observation=obs_, action=action))
        new_obs, reward, done, _ = env.step(action)
        episode_reward += reward

        if done:
            # Hand the step list over to the Episode and start fresh ones.
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_steps = []
            episode_reward = 0.0
            new_obs = env.reset()

            if len(batch) >= batch_size:
                yield batch
                # Rebind instead of clear(): the caller may still hold the
                # list that was just yielded.
                batch = []

        obs_ = new_obs

In [92]:
def filterBatches(batch, percentile):
    """Select the training data from the episodes that reach a reward percentile.

    Args:
        batch: sequence of episodes; each has a ``reward`` and a ``steps``
            sequence whose items carry ``observation`` and ``action``.
        percentile: percentile (as accepted by ``np.percentile``) of the batch
            rewards used as the cut-off.

    Returns:
        tuple: ``(observations, actions, rewards_mean, rewards_bound)`` where
        ``observations`` is a float tensor and ``actions`` a long tensor built
        from the steps of every episode whose reward is >= the cut-off,
        ``rewards_mean`` is the mean reward of the whole batch and
        ``rewards_bound`` is the cut-off itself.
    """
    episode_rewards = [episode.reward for episode in batch]
    reward_mean = np.mean(episode_rewards)
    reward_bound = np.percentile(episode_rewards, percentile)

    elite_obs, elite_actions = [], []
    for episode in batch:
        if episode.reward >= reward_bound:
            # Flatten the episode: one training sample per step.
            for step in episode.steps:
                elite_obs.append(step.observation)
                elite_actions.append(step.action)

    return (
        torch.FloatTensor(elite_obs),
        torch.LongTensor(elite_actions),
        reward_mean,
        reward_bound,
    )
    
    
    
# for batch in iterate_batches(env, net, 3):
#     break 
    
# o, a, m, b = filterBatches(batch, 70)
# print(o.shape, a.shape, m, b)

# # torch.Size([7, 16]) torch.Size([7]) 0.0 0.0

In [83]:
# Use gym's logger for progress messages and set its level to INFO; the
# training loop reports through logs.info(...).
logs = gym.logger 
logs.set_level(gym.logger.INFO)

In [84]:
# Summary writer that receives the loss / reward scalars logged by the training loop.
writer = SummaryWriter(comment= 'naive')

In [86]:
# Loss between the network's action scores and the actions taken in the kept episodes.
objective = nn.CrossEntropyLoss()
# Adam over all network parameters; learning rate and betas are spelled out explicitly.
opt = torch.optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999))

In [95]:
# Main training loop: sample a batch of episodes with the current policy, keep
# the ones that reach the reward percentile, and take one cross-entropy step on
# their (observation, action) pairs. Stops once a batch's mean reward is > 0.8.
#
# NOTE(review): in the recorded run below bound_reward is always 0.0, so the
# `>=` test in filterBatches keeps every episode of the batch - confirm whether
# that is intended for this "naive" variant.
try:
    for i, batch in enumerate(iterate_batches(env, net, batch_size= 16)):

        # get data
        obs, actions, r_mean, r_bound = filterBatches(batch, percentile= 70)

        ############################# train ##################################
        # set gradients zero
        opt.zero_grad()

        # forward
        logits = net(obs)
        # loss
        loss = objective(logits, actions)
        # gradients
        loss.backward()
        # optimize
        opt.step()

        ############################ writer ####################################
        # All three values use three decimals (the reward fields previously used
        # "%3f", which sets a minimum width rather than a precision).
        logs.info("loss: %.3f mean_reward: %.3f bound_reward: %.3f ", loss.item(), r_mean, r_bound)
        writer.add_scalar('loss', loss.item(), i)
        writer.add_scalar('mean_reward', r_mean, i)
        writer.add_scalar('bound_reward', r_bound, i)

        if r_mean > 0.8:
            print("solved with mean of ", r_mean)
            break
finally:
    # Close the writer so buffered scalars reach disk even when the loop is
    # interrupted (it does not terminate on its own unless the task is solved).
    writer.close()

INFO: loss: 1.352 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.360 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.351 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.376 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.367 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.382 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.356 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.353 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.352 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.352 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.369 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.318 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.372 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.362 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.356 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.345 mean_reward: 0.000000 

INFO: loss: 1.319 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.329 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.332 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.366 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.329 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.375 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.362 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.336 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.353 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.364 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.368 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.348 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.368 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.306 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.318 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.339 mean_reward: 0.000000 

INFO: loss: 1.307 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.261 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.353 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.249 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.244 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.342 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.335 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.371 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.332 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.276 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.302 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.260 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.301 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.298 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.295 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.264 mean_reward: 0.000000 

INFO: loss: 1.125 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.172 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.164 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.094 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.208 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.207 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.149 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.199 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.130 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.132 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.971 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.137 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.075 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.190 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.088 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.148 mean_reward: 0.062500 

INFO: loss: 1.061 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.073 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.144 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.110 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.073 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.040 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.028 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.098 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.076 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.051 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.073 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.004 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.034 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.144 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.106 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.161 mean_reward: 0.000000 

INFO: loss: 1.232 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.224 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.151 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.259 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.131 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.109 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.105 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.106 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.041 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.015 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.136 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.122 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.195 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.095 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.183 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.962 mean_reward: 0.000000 

INFO: loss: 1.107 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.121 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.194 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.204 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.185 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.192 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.186 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.218 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.149 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.261 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.179 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.085 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.282 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.183 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.100 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.159 mean_reward: 0.062500 

INFO: loss: 1.200 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.047 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.156 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.205 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.063 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.319 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.098 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.227 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.106 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.172 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.160 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.319 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.203 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.150 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.135 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.242 mean_reward: 0.000000 

INFO: loss: 1.142 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.227 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.272 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.204 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.167 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.320 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.267 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.180 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.189 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.289 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.238 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.247 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.293 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.298 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.217 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.325 mean_reward: 0.000000 

INFO: loss: 1.196 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.272 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.349 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.251 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.277 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.230 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.237 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.218 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.247 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.262 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.190 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.243 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.199 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.270 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.284 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.253 mean_reward: 0.062500 

INFO: loss: 1.195 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.204 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.288 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.233 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.148 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.351 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.210 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.149 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.262 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.129 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.183 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.080 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.259 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.179 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.196 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.164 mean_reward: 0.062500 

INFO: loss: 1.169 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.118 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.133 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.171 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.175 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.138 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.092 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.177 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.093 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.231 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.143 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.048 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.119 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.157 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.124 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.011 mean_reward: 0.000000 

INFO: loss: 1.188 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.149 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.103 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.025 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.187 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.126 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.132 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.169 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.097 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.945 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.053 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.082 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.197 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.168 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.200 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.093 mean_reward: 0.000000 

INFO: loss: 1.223 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.213 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.280 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.220 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.287 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.176 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.221 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.181 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.161 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.143 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.192 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.144 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.106 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.232 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.248 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.247 mean_reward: 0.000000 

INFO: loss: 1.272 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.355 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.104 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.260 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.136 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.249 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.223 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.198 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.222 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.122 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.308 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.223 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.211 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.182 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.167 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.261 mean_reward: 0.000000 

INFO: loss: 1.056 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.173 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.257 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.335 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.238 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.192 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.208 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.244 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.286 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.147 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.155 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.219 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.185 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.258 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.201 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.191 mean_reward: 0.062500 

INFO: loss: 1.287 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.175 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.179 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.233 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.154 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.140 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.172 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.167 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.146 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.179 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.084 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.095 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.211 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.101 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.147 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.236 mean_reward: 0.062500 

INFO: loss: 1.182 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.189 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.198 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.189 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.101 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.230 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.259 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.274 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.238 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.115 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.148 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.142 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.196 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.252 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.148 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.324 mean_reward: 0.000000 

INFO: loss: 1.158 mean_reward: 0.187500 bound_reward: 0.000000 
INFO: loss: 1.192 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.228 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.225 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.152 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.144 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.210 mean_reward: 0.187500 bound_reward: 0.000000 
INFO: loss: 1.048 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.214 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.178 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.126 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.087 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.172 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.009 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.218 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.193 mean_reward: 0.062500 

INFO: loss: 1.151 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.215 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.272 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.221 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.224 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.148 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.241 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.105 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.199 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.051 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.130 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.113 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.245 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.278 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.153 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.211 mean_reward: 0.000000 

INFO: loss: 0.979 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.143 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.203 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.255 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.209 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.135 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.111 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.138 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.162 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.101 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.157 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.172 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.085 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.115 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.124 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.196 mean_reward: 0.000000 

INFO: loss: 1.075 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.216 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.216 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.148 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.284 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.077 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.040 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.063 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.138 mean_reward: 0.187500 bound_reward: 0.000000 
INFO: loss: 1.216 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.142 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.241 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.186 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.173 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.095 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.089 mean_reward: 0.062500 

INFO: loss: 0.940 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.967 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.068 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.155 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.996 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.021 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.038 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.940 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.815 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.214 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.052 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.162 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.124 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.862 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.028 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.020 mean_reward: 0.000000 

INFO: loss: 0.805 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.785 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.855 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.897 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.027 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.764 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.775 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.774 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.900 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.817 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.920 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.959 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.976 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.827 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.876 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.902 mean_reward: 0.062500 

INFO: loss: 1.015 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.018 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.943 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.895 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.906 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.925 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.898 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.928 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.931 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.911 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.853 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.896 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.920 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.888 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.937 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.981 mean_reward: 0.000000 

INFO: loss: 0.807 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.068 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.913 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.837 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.904 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.921 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.963 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.102 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.953 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.993 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.103 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.869 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.031 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.799 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.050 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.990 mean_reward: 0.000000 

INFO: loss: 0.788 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.004 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.909 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.786 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.955 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.043 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.807 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.979 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.060 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.850 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.908 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.078 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.917 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.734 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.837 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.869 mean_reward: 0.062500 

INFO: loss: 0.967 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.895 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.779 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.026 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.804 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.677 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.855 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.852 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.856 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.815 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.785 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.914 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.832 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.766 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.699 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.861 mean_reward: 0.000000 

INFO: loss: 0.992 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.845 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.704 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.802 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.866 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.984 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.010 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.790 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.938 mean_reward: 0.187500 bound_reward: 0.000000 
INFO: loss: 0.885 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.744 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.748 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.962 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.890 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.675 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.852 mean_reward: 0.000000 

INFO: loss: 0.877 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.832 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.892 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 0.875 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.887 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.801 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.751 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.804 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.782 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.851 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.748 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.849 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 0.794 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.953 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.787 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.881 mean_reward: 0.000000 

INFO: loss: 0.839 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.790 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.741 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.711 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.787 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.019 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.692 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.919 mean_reward: 0.187500 bound_reward: 0.000000 
INFO: loss: 0.660 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.000 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.707 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 0.797 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.005 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.932 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.849 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.841 mean_reward: 0.062500 

INFO: loss: 0.893 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.859 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.749 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.906 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.842 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.143 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.860 mean_reward: 0.187500 bound_reward: 0.000000 
INFO: loss: 0.825 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.833 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.911 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.909 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.659 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.959 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.829 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.898 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.060 mean_reward: 0.000000 

INFO: loss: 0.900 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 0.987 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.817 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.995 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 0.940 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.054 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.923 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.080 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.145 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 1.062 mean_reward: 0.187500 bound_reward: 0.000000 
INFO: loss: 0.879 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.027 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.007 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.899 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.881 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.017 mean_reward: 0.000000 

INFO: loss: 1.082 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.083 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.920 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.914 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.069 mean_reward: 0.125000 bound_reward: 0.000000 
INFO: loss: 0.970 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.918 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.984 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.930 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.024 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.047 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.048 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.934 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.039 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.083 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.987 mean_reward: 0.062500 

INFO: loss: 1.005 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.907 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.013 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.014 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.798 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.079 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.989 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 1.008 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.746 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.994 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.053 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.894 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 1.113 mean_reward: 0.062500 bound_reward: 0.000000 
INFO: loss: 0.944 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.827 mean_reward: 0.000000 bound_reward: 0.000000 
INFO: loss: 0.926 mean_reward: 0.000000 

KeyboardInterrupt: 

<img src="./bound_reward_naive curve.svg" style="width:400px;"> **Reward bound**
 > The reward bound is not improving, which indicates that the 70th-percentile episode reward in each batch is still zero.
 
<img src="./loss_naive curve.svg" style="width:600px;"> **Loss curve**
<img src="./mean_reward_naive curve.svg" style="width:400px;"> **Mean reward curve**