In [16]:
import os
import sys

# Make the reversi-game project importable (it provides `game_logic`).
# The location can be overridden with the REVERSI_GAME_ROOT environment
# variable so the notebook is not tied to one machine's home directory; the
# original hard-coded path is kept as the default for backward compatibility.
PROJECT_ROOT = os.environ.get('REVERSI_GAME_ROOT', '/home/rasa/PycharmProjects/reversi-game/')
# Guard against duplicate entries when the cell is re-run in the same kernel.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [17]:
import torch
import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv, sync_envs_normalization

import stable_baselines3.common.callbacks as callbacks_module
from sb3_contrib.common.maskable.evaluation import evaluate_policy as masked_evaluate_policy

# Modify the namespace of EvalCallback directly
callbacks_module.evaluate_policy = masked_evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback

# from sb3_contrib.common.maskable.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
# from stable_baselines3.common.callbacks import EvalCallback

from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.ppo_mask import MaskablePPO

from shutil import copyfile # keep track of generations
from collections import OrderedDict

from gymnasium.spaces import Discrete, Box, Dict, MultiDiscrete
from gymnasium.wrappers import FlattenObservation
import gymnasium.spaces as spaces
from game_logic import Othello
import numpy as np
import os, math
from itertools import cycle

In [23]:
# Settings
# Seed intended for reproducible runs.
SEED = 19
# Total number of environment steps passed to `model.learn`.
NUM_TIMESTEPS = 30_000_000
# Evaluation cadence (in steps) and number of episodes played per evaluation.
EVAL_FREQ = 20_000
EVAL_EPISODES = 200
# The evaluated model must reach a mean reward above this value to replace the
# previous best version of itself.
BEST_THRESHOLD = 0.30

# Set this to False if you plan on running the full set of trials.
RENDER_MODE = False

# Directory used for the saved models and evaluation logs.
LOGDIR = "ppo_masked/v2/4x_64/"

In [24]:
class OthelloEnv(gym.Env):
    """Gymnasium environment exposing Othello as a single-agent task.

    The learning agent controls one colour (`agent_turn`); every move of the
    other colour is produced inside the environment by `other_agent`, a model
    installed through `change_to_latest_agent`. The colour played by the
    learning agent alternates between 1 and 2 on every `reset`.

    The environment implements `action_masks()` so that it can be used with
    `MaskablePPO`.
    """

    def __init__(self):
        self.game = Othello()
        self.agent_turn = 1
        shape = self.game.board.shape
        n_cells = shape[0] * shape[1]
        # One discrete action per board cell (flat index, decoded by
        # `Othello.get_decoded_field`).
        self.action_space = Discrete(n_cells)
        # self.observation_space = Dict({
        #                                 'board' : Box(0, 2, shape=shape, dtype=int),
        #                                 'player': Discrete(2, start=1)
        #                               })
        # Flattened encoded state: three values per cell (64*3 on the standard
        # board). NOTE(review): assumes `get_encoded_state()` yields exactly
        # 3 * n_cells values in [0, 1] -- confirm against `game_logic`.
        self.observation_space = Box(low=0, high=1, shape=(n_cells * 3,), dtype=np.float32)
        self.other_agent = None
        self.reset_othello_gen = self.reset_othello()
        self.episodes = 0

    def reset_othello(self):
        '''Infinite generator yielding `(game, model_turn)` pairs.

        Each item is a fresh `Othello` game together with the colour the
        learning agent plays, alternating 1, 2, 1, 2, ...
        '''
        infinite_player_turn = cycle([1, 2])
        while True:
            game = Othello()
            model_turn = next(infinite_player_turn)
            yield game, model_turn

    def change_to_latest_agent(self, agent):
        """Install `agent` as the opponent used for all following moves."""
        self.other_agent = agent

    def get_obs(self):
        """Return the encoded game state flattened to a 1-D array."""
        encoded_board = self.game.get_encoded_state().reshape(-1)
        return encoded_board

    def check_game_ended(self):
        """Return `(reward, done)` for the current position.

        `done` is True once `game.get_winner()` is not None. The reward is
        +1 when the winner is the learning agent, -1 when it is the opponent
        and 0 otherwise (game still running, or any other winner value).

        Side effect: when the game is over the episode counter is incremented
        and progress is printed every 10 episodes.
        """
        reward = 0
        done = False
        winner = self.game.get_winner()
        if winner is not None:
            self.episodes += 1
            if self.episodes % 10 == 0:
                print(f'Ep done - {self.episodes}.')

            done = True
            if winner == self.agent_turn:
                reward = 1
            elif winner == 3 - self.agent_turn:  # the opponent's colour
                reward = -1
        return reward, done

    def render(self):  # todo
        pass

    def close(self):  # todo
        pass

    def other_agent_play_move(self):
        """Let the opponent model choose and play one (masked, sampled) move.

        Requires that an opponent has been installed with
        `change_to_latest_agent`.
        """
        obs = self.get_obs()
        action, _ = self.other_agent.predict(obs,
                                             action_masks=self.action_masks(),
                                             deterministic=False)
        game_action = Othello.get_decoded_field(action)
        self.game.play_move(game_action)

    def step(self, action):
        """Play the agent's `action`, then let the opponent answer.

        Returns the Gymnasium 5-tuple
        `(observation, reward, terminated, truncated, info)`; `truncated` is
        always False and `info` is always empty.
        """
        game_action = Othello.get_decoded_field(action)
        self.game.play_move(game_action)

        # Self play: the opponent keeps moving for as long as the game is not
        # over and it is not the learning agent's turn (this also covers
        # consecutive opponent moves).
        while self.game.get_winner() is None and self.game.player_turn != self.agent_turn:
            self.other_agent_play_move()

        reward, done = self.check_game_ended()
        info = {}
        truncated = False

        # Return step information
        return self.get_obs(), reward, done, truncated, info

    def reset(self, *args, seed=None, options=None, **kwargs):
        """Start a new game and return `(observation, info)`.

        The learning agent's colour alternates between episodes; when it
        plays second (colour 2) the opponent makes the opening move first.
        `seed` is forwarded to `gym.Env.reset` so the environment's RNG is
        seeded as the Gymnasium API expects. Extra positional/keyword
        arguments are accepted and ignored, as before.
        """
        super().reset(seed=seed)
        self.game, self.agent_turn = next(self.reset_othello_gen)
        if self.agent_turn == 2:
            self.other_agent_play_move()
        # Gymnasium requires `info` to be a dict (previously None was returned).
        return self.get_obs(), {}

    def action_masks(self):
        """Return a flat boolean mask with True for every legal move.

        The mask has one entry per action of `action_space`, in the flattened
        order of the board array.
        """
        valid_moves = self.game.valid_moves()

        mask = np.zeros(self.game.board.shape, dtype=bool)

        # Set True for each board index in the set of valid moves
        for index in valid_moves:
            mask[index] = True
        # `flatten()` returns a new array; the original code discarded it and
        # returned the 2-D board-shaped mask instead.
        return mask.flatten()

In [25]:
class SelfPlayCallback(EvalCallback):
    """Evaluation callback that drives generational self-play.

    A new best model is only saved when it beats the previous version of
    itself by a mean reward above BEST_THRESHOLD. After each promotion the
    saved model is archived as `history_XXXX.zip`, the best score is reset to
    BEST_THRESHOLD, and the promoted model becomes the opponent in both the
    training and the evaluation environments.

    Parameters
    ----------
    train_env : the (non-vectorised) training environment; its unwrapped env
        must provide `change_to_latest_agent`.
    eval_env : evaluation environment, forwarded to `EvalCallback`.
    *args, **kwargs : forwarded to `EvalCallback`.
    """

    def __init__(self, train_env, eval_env, *args, **kwargs):
        super().__init__(eval_env, *args, **kwargs)
        # Starting from the threshold (instead of -inf) means the parent class
        # only records/saves a "new best" once the threshold is exceeded.
        self.best_mean_reward = BEST_THRESHOLD
        self.generation = 0
        self.train_env = train_env
        # NOTE: `self.eval_env` is intentionally NOT reassigned here.
        # EvalCallback.__init__ already stores it (wrapping a plain env into a
        # VecEnv when needed); overwriting it with the raw argument would undo
        # that wrapping and break the `.envs` access below.

    def _on_step(self) -> bool:
        """Run the regular evaluation step and promote the model if it won.

        Returns the parent's result (False aborts training).
        """
        # Evaluation uses the masked `evaluate_policy` patched into the
        # callbacks module.
        result = super()._on_step()

        if result and self.best_mean_reward > BEST_THRESHOLD:
            self.generation += 1
            print("SELFPLAY: mean_reward achieved:", self.best_mean_reward)
            print("SELFPLAY: new best model, bumping up generation to", self.generation)
            # The parent saves the best model into `best_model_save_path`;
            # fall back to LOGDIR to keep the previous behaviour when unset.
            save_dir = self.best_model_save_path or LOGDIR
            source_file = os.path.join(save_dir, "best_model.zip")
            backup_file = os.path.join(save_dir, "history_"+str(self.generation).zfill(4)+".zip")
            copyfile(source_file, backup_file)
            # Reset the bar so the next generation again has to beat the
            # (new) opponent by the threshold.
            self.best_mean_reward = BEST_THRESHOLD
            # `load` is a classmethod: it returns a fresh, independent model.
            agent = type(self.model).load(source_file)
            self.train_env.unwrapped.change_to_latest_agent(agent)
            # Update every evaluation sub-environment, not only the first one.
            for sub_env in self.eval_env.envs:
                sub_env.unwrapped.change_to_latest_agent(agent)
        return result

In [None]:
# --- Training environment -------------------------------------------------
env = Monitor(env=OthelloEnv())

print(f'CUDA available: {torch.cuda.is_available()}')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

policy_kwargs = {
    'net_arch': {
        'pi': [64, 64, 64, 64],  # Four hidden layers with 64 units each for the policy network
        'vf': [64, 64, 64, 64]   # Four hidden layers with 64 units each for the value network
    }
}

# --- Model ------------------------------------------------------------------
# `seed=SEED` makes the run reproducible; SEED was previously declared in the
# settings cell but never passed to the algorithm.
model = MaskablePPO(policy=MaskableActorCriticPolicy,
                    env=env,
                    device=device,
                    verbose=1,
                    learning_rate=0.001,
                    seed=SEED,
                    policy_kwargs=policy_kwargs)

os.makedirs(LOGDIR, exist_ok=True)
starting_model_filepath = os.path.join(LOGDIR, 'random_start_model')
# To resume from a saved start model instead of a fresh one:
# model = MaskablePPO.load(starting_model_filepath, env=env)
model.save(starting_model_filepath)

# The first opponent is an independent copy of the untrained model
# (`load` is a classmethod and returns a new model instance).
start_model_copy = MaskablePPO.load(starting_model_filepath)
env.unwrapped.change_to_latest_agent(start_model_copy)

# --- Evaluation environment -------------------------------------------------
# Keep the monitored env under its own name: the factory lambda must not close
# over `env_eval`, which is rebound to the DummyVecEnv on the next line.
eval_monitor = Monitor(env=OthelloEnv())
env_eval = DummyVecEnv(env_fns=[lambda: eval_monitor])
env_eval.envs[0].unwrapped.change_to_latest_agent(start_model_copy)

# --- Self-play callback and training ----------------------------------------
eval_callback = SelfPlayCallback(
    env,
    env_eval,
    best_model_save_path=LOGDIR,
    log_path=LOGDIR,
    eval_freq=EVAL_FREQ,
    n_eval_episodes=EVAL_EPISODES,
    deterministic=False
    )


model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

CUDA available: False
Using cpu device
Wrapping the env in a DummyVecEnv.




Ep done - 10.
Ep done - 20.
Ep done - 30.
Ep done - 40.
Ep done - 50.
Ep done - 60.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 0.0294   |
| time/              |          |
|    fps             | 618      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
Ep done - 70.
Ep done - 80.
Ep done - 90.
Ep done - 100.
Ep done - 110.
Ep done - 120.
Ep done - 130.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0          |
| time/                   |            |
|    fps                  | 494        |
|    iterations           | 2          |
|    time_elapsed         | 8          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.03888946 |
|    clip_fraction     

Ep done - 620.
Ep done - 630.
Ep done - 640.
Ep done - 650.
Ep done - 660.
Ep done - 10.
Ep done - 20.
Ep done - 30.
Ep done - 40.
Ep done - 50.
Ep done - 60.
Ep done - 70.
Ep done - 80.
Ep done - 90.
Ep done - 100.
Ep done - 110.
Ep done - 120.
Ep done - 130.
Ep done - 140.
Ep done - 150.
Ep done - 160.
Ep done - 170.
Ep done - 180.
Ep done - 190.
Ep done - 200.
Eval num_timesteps=20000, episode_reward=0.27 +/- 0.95
Episode length: 30.07 +/- 0.62
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 30.1       |
|    mean_reward          | 0.265      |
| time/                   |            |
|    total_timesteps      | 20000      |
| train/                  |            |
|    approx_kl            | 0.09723027 |
|    clip_fraction        | 0.56       |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.77      |
|    explained_variance   | 0.1        |
|    learning_rate        | 0.001      |
|    loss      

Ep done - 1170.
Ep done - 1180.
Ep done - 1190.
Ep done - 1200.
Ep done - 1210.
Ep done - 1220.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | 0.4        |
| time/                   |            |
|    fps                  | 380        |
|    iterations           | 18         |
|    time_elapsed         | 97         |
|    total_timesteps      | 36864      |
| train/                  |            |
|    approx_kl            | 0.11399764 |
|    clip_fraction        | 0.543      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.52      |
|    explained_variance   | -0.0361    |
|    learning_rate        | 0.001      |
|    loss                 | -0.108     |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0879    |
|    value_loss           | 0.0546     |
----------------------------------------
Ep done - 1230.
Ep done - 1240.
Ep done - 1

Ep done - 1710.
Ep done - 1720.
Ep done - 1730.
Ep done - 1740.
Ep done - 1750.
Ep done - 1760.
Ep done - 1770.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.2        |
| time/                   |            |
|    fps                  | 364        |
|    iterations           | 26         |
|    time_elapsed         | 145        |
|    total_timesteps      | 53248      |
| train/                  |            |
|    approx_kl            | 0.13426071 |
|    clip_fraction        | 0.523      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.26      |
|    explained_variance   | 0.158      |
|    learning_rate        | 0.001      |
|    loss                 | -0.103     |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0854    |
|    value_loss           | 0.0731     |
----------------------------------------
Ep done - 1780.
Ep done - 1

Ep done - 2260.
Ep done - 2270.
Ep done - 2280.
Ep done - 2290.
Ep done - 2300.
Ep done - 2310.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30.1      |
|    ep_rew_mean          | 0.3       |
| time/                   |           |
|    fps                  | 357       |
|    iterations           | 34        |
|    time_elapsed         | 194       |
|    total_timesteps      | 69632     |
| train/                  |           |
|    approx_kl            | 0.1672743 |
|    clip_fraction        | 0.51      |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.1      |
|    explained_variance   | 0.169     |
|    learning_rate        | 0.001     |
|    loss                 | -0.12     |
|    n_updates            | 330       |
|    policy_gradient_loss | -0.0847   |
|    value_loss           | 0.0617    |
---------------------------------------
Ep done - 2320.
Ep done - 2330.
Ep done - 2340.
Ep done - 2350.


Ep done - 2800.
Ep done - 2810.
Ep done - 2820.
Ep done - 2830.
Ep done - 2840.
Ep done - 2850.
Ep done - 2860.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.28       |
| time/                   |            |
|    fps                  | 352        |
|    iterations           | 42         |
|    time_elapsed         | 244        |
|    total_timesteps      | 86016      |
| train/                  |            |
|    approx_kl            | 0.18563807 |
|    clip_fraction        | 0.496      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.955     |
|    explained_variance   | 0.0574     |
|    learning_rate        | 0.001      |
|    loss                 | -0.105     |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0816    |
|    value_loss           | 0.0772     |
----------------------------------------
Ep done - 2870.
Ep done - 2

Ep done - 3350.
Ep done - 3360.
Ep done - 3370.
Ep done - 3380.
Ep done - 3390.
Ep done - 3400.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | 0.11       |
| time/                   |            |
|    fps                  | 350        |
|    iterations           | 50         |
|    time_elapsed         | 292        |
|    total_timesteps      | 102400     |
| train/                  |            |
|    approx_kl            | 0.20179561 |
|    clip_fraction        | 0.454      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.842     |
|    explained_variance   | 0.195      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0802    |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0727    |
|    value_loss           | 0.076      |
----------------------------------------
Ep done - 3410.
Ep done - 3420.
Ep done - 3

Ep done - 3960.
Ep done - 3970.
Ep done - 3980.
Ep done - 3990.
Ep done - 1010.
Ep done - 1020.
Ep done - 1030.
Ep done - 1040.
Ep done - 1050.
Ep done - 1060.
Ep done - 1070.
Ep done - 1080.
Ep done - 1090.
Ep done - 1100.
Ep done - 1110.
Ep done - 1120.
Ep done - 1130.
Ep done - 1140.
Ep done - 1150.
Ep done - 1160.
Ep done - 1170.
Ep done - 1180.
Ep done - 1190.
Ep done - 1200.
Eval num_timesteps=120000, episode_reward=0.26 +/- 0.95
Episode length: 29.96 +/- 0.69
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 30         |
|    mean_reward          | 0.26       |
| time/                   |            |
|    total_timesteps      | 120000     |
| train/                  |            |
|    approx_kl            | 0.19773835 |
|    clip_fraction        | 0.446      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.83      |
|    explained_variance   | 0.15       |
|    learning_rate        | 0.001    

Ep done - 4510.
Ep done - 4520.
Ep done - 4530.
Ep done - 4540.
Ep done - 4550.
Ep done - 4560.
Ep done - 4570.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.03      |
| time/                   |           |
|    fps                  | 354       |
|    iterations           | 67        |
|    time_elapsed         | 387       |
|    total_timesteps      | 137216    |
| train/                  |           |
|    approx_kl            | 0.2120718 |
|    clip_fraction        | 0.451     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.831    |
|    explained_variance   | 0.189     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0979   |
|    n_updates            | 660       |
|    policy_gradient_loss | -0.0711   |
|    value_loss           | 0.0954    |
---------------------------------------
Ep done - 4580.
Ep done - 4590.
Ep done - 4600.


Ep done - 5060.
Ep done - 5070.
Ep done - 5080.
Ep done - 5090.
Ep done - 5100.
Ep done - 5110.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.3        |
| time/                   |            |
|    fps                  | 348        |
|    iterations           | 75         |
|    time_elapsed         | 440        |
|    total_timesteps      | 153600     |
| train/                  |            |
|    approx_kl            | 0.27407813 |
|    clip_fraction        | 0.455      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.801     |
|    explained_variance   | 0.279      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0825    |
|    n_updates            | 740        |
|    policy_gradient_loss | -0.0691    |
|    value_loss           | 0.0846     |
----------------------------------------
Ep done - 5120.
Ep done - 5130.
Ep done - 5

Ep done - 5600.
Ep done - 5610.
Ep done - 5620.
Ep done - 5630.
Ep done - 5640.
Ep done - 5650.
Ep done - 5660.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.21       |
| time/                   |            |
|    fps                  | 348        |
|    iterations           | 83         |
|    time_elapsed         | 487        |
|    total_timesteps      | 169984     |
| train/                  |            |
|    approx_kl            | 0.21932086 |
|    clip_fraction        | 0.438      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.744     |
|    explained_variance   | 0.226      |
|    learning_rate        | 0.001      |
|    loss                 | -0.103     |
|    n_updates            | 820        |
|    policy_gradient_loss | -0.0717    |
|    value_loss           | 0.096      |
----------------------------------------
Ep done - 5670.
Ep done - 5

Ep done - 6150.
Ep done - 6160.
Ep done - 6170.
Ep done - 6180.
Ep done - 6190.
Ep done - 6200.
Ep done - 6210.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.16       |
| time/                   |            |
|    fps                  | 347        |
|    iterations           | 91         |
|    time_elapsed         | 536        |
|    total_timesteps      | 186368     |
| train/                  |            |
|    approx_kl            | 0.22262849 |
|    clip_fraction        | 0.407      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.67      |
|    explained_variance   | 0.332      |
|    learning_rate        | 0.001      |
|    loss                 | -0.074     |
|    n_updates            | 900        |
|    policy_gradient_loss | -0.0669    |
|    value_loss           | 0.0937     |
----------------------------------------
Ep done - 6220.
Ep done - 6

Ep done - 6700.
Ep done - 6710.
Ep done - 6720.
Ep done - 6730.
Ep done - 6740.
Ep done - 6750.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | -0.09      |
| time/                   |            |
|    fps                  | 346        |
|    iterations           | 99         |
|    time_elapsed         | 585        |
|    total_timesteps      | 202752     |
| train/                  |            |
|    approx_kl            | 0.26345915 |
|    clip_fraction        | 0.408      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.68      |
|    explained_variance   | 0.424      |
|    learning_rate        | 0.001      |
|    loss                 | -0.104     |
|    n_updates            | 980        |
|    policy_gradient_loss | -0.0658    |
|    value_loss           | 0.0865     |
----------------------------------------
Ep done - 6760.
Ep done - 6770.
Ep done - 6

Ep done - 7310.
Ep done - 7320.
Ep done - 7330.
Ep done - 2010.
Ep done - 2020.
Ep done - 2030.
Ep done - 2040.
Ep done - 2050.
Ep done - 2060.
Ep done - 2070.
Ep done - 2080.
Ep done - 2090.
Ep done - 2100.
Ep done - 2110.
Ep done - 2120.
Ep done - 2130.
Ep done - 2140.
Ep done - 2150.
Ep done - 2160.
Ep done - 2170.
Ep done - 2180.
Ep done - 2190.
Ep done - 2200.
Eval num_timesteps=220000, episode_reward=0.20 +/- 0.97
Episode length: 30.00 +/- 0.59
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 30         |
|    mean_reward          | 0.2        |
| time/                   |            |
|    total_timesteps      | 220000     |
| train/                  |            |
|    approx_kl            | 0.26246417 |
|    clip_fraction        | 0.405      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.687     |
|    explained_variance   | 0.341      |
|    learning_rate        | 0.001      |
|    loss   

Ep done - 7850.
Ep done - 7860.
Ep done - 7870.
Ep done - 7880.
Ep done - 7890.
Ep done - 7900.
Ep done - 7910.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | 0.17       |
| time/                   |            |
|    fps                  | 350        |
|    iterations           | 116        |
|    time_elapsed         | 677        |
|    total_timesteps      | 237568     |
| train/                  |            |
|    approx_kl            | 0.23317206 |
|    clip_fraction        | 0.415      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.682     |
|    explained_variance   | 0.31       |
|    learning_rate        | 0.001      |
|    loss                 | -0.0208    |
|    n_updates            | 1150       |
|    policy_gradient_loss | -0.06      |
|    value_loss           | 0.126      |
----------------------------------------
Ep done - 7920.
Ep done - 7

Ep done - 8400.
Ep done - 8410.
Ep done - 8420.
Ep done - 8430.
Ep done - 8440.
Ep done - 8450.
Ep done - 8460.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30.1      |
|    ep_rew_mean          | 0.23      |
| time/                   |           |
|    fps                  | 348       |
|    iterations           | 124       |
|    time_elapsed         | 728       |
|    total_timesteps      | 253952    |
| train/                  |           |
|    approx_kl            | 0.2738371 |
|    clip_fraction        | 0.42      |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.635    |
|    explained_variance   | 0.378     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0372   |
|    n_updates            | 1230      |
|    policy_gradient_loss | -0.0652   |
|    value_loss           | 0.104     |
---------------------------------------
Ep done - 8470.
Ep done - 8480.
Ep done - 8490.


Ep done - 8950.
Ep done - 8960.
Ep done - 8970.
Ep done - 8980.
Ep done - 8990.
Ep done - 9000.
Ep done - 9010.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.06       |
| time/                   |            |
|    fps                  | 347        |
|    iterations           | 132        |
|    time_elapsed         | 778        |
|    total_timesteps      | 270336     |
| train/                  |            |
|    approx_kl            | 0.27945927 |
|    clip_fraction        | 0.417      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.633     |
|    explained_variance   | 0.171      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0821    |
|    n_updates            | 1310       |
|    policy_gradient_loss | -0.0652    |
|    value_loss           | 0.114      |
----------------------------------------
Ep done - 9020.
Ep done - 9

Ep done - 9490.
Ep done - 9500.
Ep done - 9510.
Ep done - 9520.
Ep done - 9530.
Ep done - 9540.
Ep done - 9550.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.02       |
| time/                   |            |
|    fps                  | 346        |
|    iterations           | 140        |
|    time_elapsed         | 827        |
|    total_timesteps      | 286720     |
| train/                  |            |
|    approx_kl            | 0.27507496 |
|    clip_fraction        | 0.393      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.587     |
|    explained_variance   | 0.107      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0615    |
|    n_updates            | 1390       |
|    policy_gradient_loss | -0.0598    |
|    value_loss           | 0.138      |
----------------------------------------
Ep done - 9560.
Ep done - 9

Ep done - 10040.
Ep done - 10050.
Ep done - 10060.
Ep done - 10070.
Ep done - 10080.
Ep done - 10090.
Ep done - 10100.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.06      |
| time/                   |           |
|    fps                  | 320       |
|    iterations           | 148       |
|    time_elapsed         | 946       |
|    total_timesteps      | 303104    |
| train/                  |           |
|    approx_kl            | 0.2851866 |
|    clip_fraction        | 0.392     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.591    |
|    explained_variance   | 0.0931    |
|    learning_rate        | 0.001     |
|    loss                 | -0.0794   |
|    n_updates            | 1470      |
|    policy_gradient_loss | -0.0638   |
|    value_loss           | 0.13      |
---------------------------------------
Ep done - 10110.
Ep done - 10120.
Ep done

Ep done - 10650.
Ep done - 10660.
Ep done - 3010.
Ep done - 3020.
Ep done - 3030.
Ep done - 3040.
Ep done - 3050.
Ep done - 3060.
Ep done - 3070.
Ep done - 3080.
Ep done - 3090.
Ep done - 3100.
Ep done - 3110.
Ep done - 3120.
Ep done - 3130.
Ep done - 3140.
Ep done - 3150.
Ep done - 3160.
Ep done - 3170.
Ep done - 3180.
Ep done - 3190.
Ep done - 3200.
Eval num_timesteps=320000, episode_reward=0.03 +/- 0.99
Episode length: 30.04 +/- 0.60
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 30        |
|    mean_reward          | 0.025     |
| time/                   |           |
|    total_timesteps      | 320000    |
| train/                  |           |
|    approx_kl            | 0.3261723 |
|    clip_fraction        | 0.422     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.599    |
|    explained_variance   | 0.199     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0617   |

Ep done - 11200.
Ep done - 11210.
Ep done - 11220.
Ep done - 11230.
Ep done - 11240.
Ep done - 11250.
Ep done - 11260.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.01       |
| time/                   |            |
|    fps                  | 296        |
|    iterations           | 165        |
|    time_elapsed         | 1140       |
|    total_timesteps      | 337920     |
| train/                  |            |
|    approx_kl            | 0.28683263 |
|    clip_fraction        | 0.406      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.602     |
|    explained_variance   | 0.0595     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0402    |
|    n_updates            | 1640       |
|    policy_gradient_loss | -0.0619    |
|    value_loss           | 0.13       |
----------------------------------------
Ep done - 11270.
Ep 

Ep done - 11750.
Ep done - 11760.
Ep done - 11770.
Ep done - 11780.
Ep done - 11790.
Ep done - 11800.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.32      |
| time/                   |           |
|    fps                  | 290       |
|    iterations           | 173       |
|    time_elapsed         | 1218      |
|    total_timesteps      | 354304    |
| train/                  |           |
|    approx_kl            | 0.2631287 |
|    clip_fraction        | 0.4       |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.588    |
|    explained_variance   | 0.123     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0451   |
|    n_updates            | 1720      |
|    policy_gradient_loss | -0.0569   |
|    value_loss           | 0.11      |
---------------------------------------
Ep done - 11810.
Ep done - 11820.
Ep done - 11830.
Ep done

Ep done - 12290.
Ep done - 12300.
Ep done - 12310.
Ep done - 12320.
Ep done - 12330.
Ep done - 12340.
Ep done - 12350.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | -0.15     |
| time/                   |           |
|    fps                  | 292       |
|    iterations           | 181       |
|    time_elapsed         | 1265      |
|    total_timesteps      | 370688    |
| train/                  |           |
|    approx_kl            | 0.3029998 |
|    clip_fraction        | 0.425     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.644    |
|    explained_variance   | 0.167     |
|    learning_rate        | 0.001     |
|    loss                 | -0.068    |
|    n_updates            | 1800      |
|    policy_gradient_loss | -0.0623   |
|    value_loss           | 0.15      |
---------------------------------------
Ep done - 12360.
Ep done - 12370.
Ep done

Ep done - 12840.
Ep done - 12850.
Ep done - 12860.
Ep done - 12870.
Ep done - 12880.
Ep done - 12890.
Ep done - 12900.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.08       |
| time/                   |            |
|    fps                  | 294        |
|    iterations           | 189        |
|    time_elapsed         | 1315       |
|    total_timesteps      | 387072     |
| train/                  |            |
|    approx_kl            | 0.26569438 |
|    clip_fraction        | 0.42       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.613     |
|    explained_variance   | 0.0384     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0515    |
|    n_updates            | 1880       |
|    policy_gradient_loss | -0.0665    |
|    value_loss           | 0.152      |
----------------------------------------
Ep done - 12910.
Ep 

Ep done - 13380.
Ep done - 13390.
Ep done - 13400.
Ep done - 13410.
Ep done - 13420.
Ep done - 13430.
Ep done - 13440.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.09       |
| time/                   |            |
|    fps                  | 295        |
|    iterations           | 197        |
|    time_elapsed         | 1365       |
|    total_timesteps      | 403456     |
| train/                  |            |
|    approx_kl            | 0.27051213 |
|    clip_fraction        | 0.411      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.578     |
|    explained_variance   | 0.0372     |
|    learning_rate        | 0.001      |
|    loss                 | -0.00449   |
|    n_updates            | 1960       |
|    policy_gradient_loss | -0.0636    |
|    value_loss           | 0.144      |
----------------------------------------
Ep done - 13450.
Ep 

Ep done - 4010.
Ep done - 4020.
Ep done - 4030.
Ep done - 4040.
Ep done - 4050.
Ep done - 4060.
Ep done - 4070.
Ep done - 4080.
Ep done - 4090.
Ep done - 4100.
Ep done - 4110.
Ep done - 4120.
Ep done - 4130.
Ep done - 4140.
Ep done - 4150.
Ep done - 4160.
Ep done - 4170.
Ep done - 4180.
Ep done - 4190.
Ep done - 4200.
Eval num_timesteps=420000, episode_reward=-0.07 +/- 0.98
Episode length: 29.98 +/- 0.60
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 30        |
|    mean_reward          | -0.065    |
| time/                   |           |
|    total_timesteps      | 420000    |
| train/                  |           |
|    approx_kl            | 0.3702399 |
|    clip_fraction        | 0.394     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.578    |
|    explained_variance   | 0.0853    |
|    learning_rate        | 0.001     |
|    loss                 | -0.053    |
|    n_updates            | 2050

Ep done - 14540.
Ep done - 14550.
Ep done - 14560.
Ep done - 14570.
Ep done - 14580.
Ep done - 14590.
Ep done - 14600.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.01       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 214        |
|    time_elapsed         | 1461       |
|    total_timesteps      | 438272     |
| train/                  |            |
|    approx_kl            | 0.35595667 |
|    clip_fraction        | 0.407      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.568     |
|    explained_variance   | -0.16      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0427    |
|    n_updates            | 2130       |
|    policy_gradient_loss | -0.0618    |
|    value_loss           | 0.156      |
----------------------------------------
Ep done - 14610.
Ep 

Ep done - 15090.
Ep done - 15100.
Ep done - 15110.
Ep done - 15120.
Ep done - 15130.
Ep done - 15140.
Ep done - 15150.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | -0.07      |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 222        |
|    time_elapsed         | 1510       |
|    total_timesteps      | 454656     |
| train/                  |            |
|    approx_kl            | 0.35568744 |
|    clip_fraction        | 0.393      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.531     |
|    explained_variance   | 0.066      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0792    |
|    n_updates            | 2210       |
|    policy_gradient_loss | -0.0621    |
|    value_loss           | 0.149      |
----------------------------------------
Ep done - 15160.
Ep 

Ep done - 15640.
Ep done - 15650.
Ep done - 15660.
Ep done - 15670.
Ep done - 15680.
Ep done - 15690.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | -0.03      |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 230        |
|    time_elapsed         | 1568       |
|    total_timesteps      | 471040     |
| train/                  |            |
|    approx_kl            | 0.31100065 |
|    clip_fraction        | 0.425      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.604     |
|    explained_variance   | 0.0114     |
|    learning_rate        | 0.001      |
|    loss                 | -0.032     |
|    n_updates            | 2290       |
|    policy_gradient_loss | -0.0612    |
|    value_loss           | 0.166      |
----------------------------------------
Ep done - 15700.
Ep done - 15710.
Ep 

Ep done - 16180.
Ep done - 16190.
Ep done - 16200.
Ep done - 16210.
Ep done - 16220.
Ep done - 16230.
Ep done - 16240.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | -0.04      |
| time/                   |            |
|    fps                  | 301        |
|    iterations           | 238        |
|    time_elapsed         | 1618       |
|    total_timesteps      | 487424     |
| train/                  |            |
|    approx_kl            | 0.32108203 |
|    clip_fraction        | 0.418      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.586     |
|    explained_variance   | -0.0445    |
|    learning_rate        | 0.001      |
|    loss                 | -0.0348    |
|    n_updates            | 2370       |
|    policy_gradient_loss | -0.0566    |
|    value_loss           | 0.168      |
----------------------------------------
Ep done - 16250.
Ep 

Ep done - 16730.
Ep done - 16740.
Ep done - 16750.
Ep done - 16760.
Ep done - 16770.
Ep done - 16780.
Ep done - 16790.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.9      |
|    ep_rew_mean          | 0.02      |
| time/                   |           |
|    fps                  | 301       |
|    iterations           | 246       |
|    time_elapsed         | 1672      |
|    total_timesteps      | 503808    |
| train/                  |           |
|    approx_kl            | 0.2951672 |
|    clip_fraction        | 0.377     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.521    |
|    explained_variance   | 0.0698    |
|    learning_rate        | 0.001     |
|    loss                 | -0.0353   |
|    n_updates            | 2450      |
|    policy_gradient_loss | -0.0555   |
|    value_loss           | 0.155     |
---------------------------------------
Ep done - 16800.
Ep done - 16810.
Ep done

Ep done - 17340.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 29.9     |
|    ep_rew_mean     | 0.01     |
| time/              |          |
|    fps             | 301      |
|    iterations      | 254      |
|    time_elapsed    | 1723     |
|    total_timesteps | 520192   |
---------------------------------
Ep done - 17350.
Ep done - 17360.
Ep done - 17370.
Ep done - 17380.
Ep done - 17390.
Ep done - 17400.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | -0.04      |
| time/                   |            |
|    fps                  | 302        |
|    iterations           | 255        |
|    time_elapsed         | 1728       |
|    total_timesteps      | 522240     |
| train/                  |            |
|    approx_kl            | 0.36240673 |
|    clip_fraction        | 0.41       |
|    clip_range           | 0.2        |
|    ent

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.9      |
|    ep_rew_mean          | 0.02      |
| time/                   |           |
|    fps                  | 304       |
|    iterations           | 263       |
|    time_elapsed         | 1767      |
|    total_timesteps      | 538624    |
| train/                  |           |
|    approx_kl            | 0.3492192 |
|    clip_fraction        | 0.407     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.537    |
|    explained_variance   | 0.109     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0735   |
|    n_updates            | 2620      |
|    policy_gradient_loss | -0.0577   |
|    value_loss           | 0.14      |
---------------------------------------
Ep done - 17960.
Ep done - 17970.
Ep done - 17980.
Ep done - 17990.
Ep done - 18000.
Ep done - 5210.
Ep done - 5220.
Ep done - 5230.
Ep done - 5240.
Ep done - 5

Ep done - 18440.
Ep done - 18450.
Ep done - 18460.
Ep done - 18470.
Ep done - 18480.
Ep done - 18490.
Ep done - 18500.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.06       |
| time/                   |            |
|    fps                  | 304        |
|    iterations           | 271        |
|    time_elapsed         | 1822       |
|    total_timesteps      | 555008     |
| train/                  |            |
|    approx_kl            | 0.33068103 |
|    clip_fraction        | 0.392      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.518     |
|    explained_variance   | -0.0208    |
|    learning_rate        | 0.001      |
|    loss                 | -0.0289    |
|    n_updates            | 2700       |
|    policy_gradient_loss | -0.0608    |
|    value_loss           | 0.142      |
----------------------------------------
Ep done - 18510.
Ep 

Ep done - 18990.
Ep done - 19000.
Ep done - 19010.
Ep done - 19020.
Ep done - 19030.
Ep done - 19040.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.05      |
| time/                   |           |
|    fps                  | 304       |
|    iterations           | 279       |
|    time_elapsed         | 1873      |
|    total_timesteps      | 571392    |
| train/                  |           |
|    approx_kl            | 0.3046077 |
|    clip_fraction        | 0.387     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.518    |
|    explained_variance   | -0.193    |
|    learning_rate        | 0.001     |
|    loss                 | -0.0364   |
|    n_updates            | 2780      |
|    policy_gradient_loss | -0.0569   |
|    value_loss           | 0.18      |
---------------------------------------
Ep done - 19050.
Ep done - 19060.
Ep done - 19070.
Ep done

Ep done - 19530.
Ep done - 19540.
Ep done - 19550.
Ep done - 19560.
Ep done - 19570.
Ep done - 19580.
Ep done - 19590.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.26       |
| time/                   |            |
|    fps                  | 301        |
|    iterations           | 287        |
|    time_elapsed         | 1947       |
|    total_timesteps      | 587776     |
| train/                  |            |
|    approx_kl            | 0.28694725 |
|    clip_fraction        | 0.373      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.501     |
|    explained_variance   | -0.0733    |
|    learning_rate        | 0.001      |
|    loss                 | -0.0549    |
|    n_updates            | 2860       |
|    policy_gradient_loss | -0.0552    |
|    value_loss           | 0.155      |
----------------------------------------
Ep done - 19600.
Ep 

Ep done - 20080.
Ep done - 20090.
Ep done - 20100.
Ep done - 20110.
Ep done - 20120.
Ep done - 20130.
Ep done - 20140.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.28       |
| time/                   |            |
|    fps                  | 302        |
|    iterations           | 295        |
|    time_elapsed         | 2000       |
|    total_timesteps      | 604160     |
| train/                  |            |
|    approx_kl            | 0.32507297 |
|    clip_fraction        | 0.4        |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.504     |
|    explained_variance   | 0.187      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0451    |
|    n_updates            | 2940       |
|    policy_gradient_loss | -0.0589    |
|    value_loss           | 0.157      |
----------------------------------------
Ep done - 20150.
Ep 

Ep done - 20670.
Ep done - 20680.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 29.8     |
|    ep_rew_mean     | 0.23     |
| time/              |          |
|    fps             | 302      |
|    iterations      | 303      |
|    time_elapsed    | 2054     |
|    total_timesteps | 620544   |
---------------------------------
Ep done - 20690.
Ep done - 20700.
Ep done - 20710.
Ep done - 20720.
Ep done - 20730.
Ep done - 20740.
Ep done - 20750.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.8      |
|    ep_rew_mean          | 0.16      |
| time/                   |           |
|    fps                  | 301       |
|    iterations           | 304       |
|    time_elapsed         | 2061      |
|    total_timesteps      | 622592    |
| train/                  |           |
|    approx_kl            | 0.3423319 |
|    clip_fraction        | 0.355     |
|    clip_range           |

Ep done - 21300.
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 30       |
|    ep_rew_mean          | 0.32     |
| time/                   |          |
|    fps                  | 303      |
|    iterations           | 312      |
|    time_elapsed         | 2106     |
|    total_timesteps      | 638976   |
| train/                  |          |
|    approx_kl            | 0.373478 |
|    clip_fraction        | 0.386    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.495   |
|    explained_variance   | 0.0105   |
|    learning_rate        | 0.001    |
|    loss                 | -0.0368  |
|    n_updates            | 3110     |
|    policy_gradient_loss | -0.0555  |
|    value_loss           | 0.137    |
--------------------------------------
Ep done - 21310.
Ep done - 21320.
Ep done - 21330.
Ep done - 6210.
Ep done - 6220.
Ep done - 6230.
Ep done - 6240.
Ep done - 6250.
Ep done - 6260.
Ep done - 6270.
E

Ep done - 21790.
Ep done - 21800.
Ep done - 21810.
Ep done - 21820.
Ep done - 21830.
Ep done - 21840.
Ep done - 21850.
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 30       |
|    ep_rew_mean          | 0.24     |
| time/                   |          |
|    fps                  | 303      |
|    iterations           | 320      |
|    time_elapsed         | 2160     |
|    total_timesteps      | 655360   |
| train/                  |          |
|    approx_kl            | 0.356812 |
|    clip_fraction        | 0.387    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.486   |
|    explained_variance   | 0.114    |
|    learning_rate        | 0.001    |
|    loss                 | -0.0349  |
|    n_updates            | 3190     |
|    policy_gradient_loss | -0.0571  |
|    value_loss           | 0.151    |
--------------------------------------
Ep done - 21860.
Ep done - 21870.
Ep done - 21880.
Ep done - 2

Ep done - 22330.
Ep done - 22340.
Ep done - 22350.
Ep done - 22360.
Ep done - 22370.
Ep done - 22380.
Ep done - 22390.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | -0.01      |
| time/                   |            |
|    fps                  | 302        |
|    iterations           | 328        |
|    time_elapsed         | 2222       |
|    total_timesteps      | 671744     |
| train/                  |            |
|    approx_kl            | 0.31056988 |
|    clip_fraction        | 0.401      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.525     |
|    explained_variance   | 0.0403     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0325    |
|    n_updates            | 3270       |
|    policy_gradient_loss | -0.058     |
|    value_loss           | 0.175      |
----------------------------------------
Ep done - 22400.
Ep 

Ep done - 22880.
Ep done - 22890.
Ep done - 22900.
Ep done - 22910.
Ep done - 22920.
Ep done - 22930.
Ep done - 22940.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.23       |
| time/                   |            |
|    fps                  | 301        |
|    iterations           | 336        |
|    time_elapsed         | 2282       |
|    total_timesteps      | 688128     |
| train/                  |            |
|    approx_kl            | 0.31561175 |
|    clip_fraction        | 0.363      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.459     |
|    explained_variance   | 0.133      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0281    |
|    n_updates            | 3350       |
|    policy_gradient_loss | -0.0485    |
|    value_loss           | 0.18       |
----------------------------------------
Ep done - 22950.
Ep 

Ep done - 23430.
Ep done - 23440.
Ep done - 23450.
Ep done - 23460.
Ep done - 23470.
Ep done - 23480.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.09      |
| time/                   |           |
|    fps                  | 301       |
|    iterations           | 344       |
|    time_elapsed         | 2340      |
|    total_timesteps      | 704512    |
| train/                  |           |
|    approx_kl            | 0.2936527 |
|    clip_fraction        | 0.367     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.468    |
|    explained_variance   | 0.0873    |
|    learning_rate        | 0.001     |
|    loss                 | -0.0164   |
|    n_updates            | 3430      |
|    policy_gradient_loss | -0.0554   |
|    value_loss           | 0.166     |
---------------------------------------
Ep done - 23490.
Ep done - 23500.
Ep done - 23510.
Ep done

Ep done - 24010.
Ep done - 24020.
Ep done - 24030.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.21     |
| time/              |          |
|    fps             | 300      |
|    iterations      | 352      |
|    time_elapsed    | 2400     |
|    total_timesteps | 720896   |
---------------------------------
Ep done - 24040.
Ep done - 24050.
Ep done - 24060.
Ep done - 24070.
Ep done - 24080.
Ep done - 24090.
Ep done - 24100.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.05      |
| time/                   |           |
|    fps                  | 300       |
|    iterations           | 353       |
|    time_elapsed         | 2407      |
|    total_timesteps      | 722944    |
| train/                  |           |
|    approx_kl            | 0.3995583 |
|    clip_fraction        | 0.36      |
|    clip_

Ep done - 24590.
Ep done - 24600.
Ep done - 24610.
Ep done - 24620.
Ep done - 24630.
Ep done - 24640.
Ep done - 24650.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0          |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 361        |
|    time_elapsed         | 2461       |
|    total_timesteps      | 739328     |
| train/                  |            |
|    approx_kl            | 0.32945016 |
|    clip_fraction        | 0.396      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.538     |
|    explained_variance   | 0.0615     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0316    |
|    n_updates            | 3600       |
|    policy_gradient_loss | -0.0561    |
|    value_loss           | 0.194      |
----------------------------------------
Ep done - 24660.
Ep 

Ep done - 25130.
Ep done - 25140.
Ep done - 25150.
Ep done - 25160.
Ep done - 25170.
Ep done - 25180.
Ep done - 25190.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | 0.25       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 369        |
|    time_elapsed         | 2520       |
|    total_timesteps      | 755712     |
| train/                  |            |
|    approx_kl            | 0.28012043 |
|    clip_fraction        | 0.376      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.472     |
|    explained_variance   | 0.0253     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0696    |
|    n_updates            | 3680       |
|    policy_gradient_loss | -0.0556    |
|    value_loss           | 0.162      |
----------------------------------------
Ep done - 25200.
Ep 

Ep done - 25680.
Ep done - 25690.
Ep done - 25700.
Ep done - 25710.
Ep done - 25720.
Ep done - 25730.
Ep done - 25740.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.03       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 377        |
|    time_elapsed         | 2577       |
|    total_timesteps      | 772096     |
| train/                  |            |
|    approx_kl            | 0.34528542 |
|    clip_fraction        | 0.35       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.448     |
|    explained_variance   | 0.039      |
|    learning_rate        | 0.001      |
|    loss                 | 0.00998    |
|    n_updates            | 3760       |
|    policy_gradient_loss | -0.0518    |
|    value_loss           | 0.162      |
----------------------------------------
Ep done - 25750.
Ep 

Ep done - 26230.
Ep done - 26240.
Ep done - 26250.
Ep done - 26260.
Ep done - 26270.
Ep done - 26280.
Ep done - 26290.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.1        |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 385        |
|    time_elapsed         | 2635       |
|    total_timesteps      | 788480     |
| train/                  |            |
|    approx_kl            | 0.34660548 |
|    clip_fraction        | 0.391      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.526     |
|    explained_variance   | 0.15       |
|    learning_rate        | 0.001      |
|    loss                 | -0.0176    |
|    n_updates            | 3840       |
|    policy_gradient_loss | -0.0549    |
|    value_loss           | 0.181      |
----------------------------------------
Ep done - 26300.
Ep 

Ep done - 26770.
Ep done - 26780.
Ep done - 26790.
Ep done - 26800.
Ep done - 26810.
Ep done - 26820.
Ep done - 26830.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | -0.01      |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 393        |
|    time_elapsed         | 2690       |
|    total_timesteps      | 804864     |
| train/                  |            |
|    approx_kl            | 0.29564908 |
|    clip_fraction        | 0.389      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.52      |
|    explained_variance   | 0.131      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0208    |
|    n_updates            | 3920       |
|    policy_gradient_loss | -0.0573    |
|    value_loss           | 0.162      |
----------------------------------------
Ep done - 26840.
Ep 

Ep done - 27350.
Ep done - 27360.
Ep done - 27370.
Ep done - 27380.
Ep done - 27390.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 29.4     |
|    ep_rew_mean     | -0.12    |
| time/              |          |
|    fps             | 299      |
|    iterations      | 401      |
|    time_elapsed    | 2744     |
|    total_timesteps | 821248   |
---------------------------------
Ep done - 27400.
Ep done - 27410.
Ep done - 27420.
Ep done - 27430.
Ep done - 27440.
Ep done - 27450.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.7      |
|    ep_rew_mean          | -0.18     |
| time/                   |           |
|    fps                  | 299       |
|    iterations           | 402       |
|    time_elapsed         | 2750      |
|    total_timesteps      | 823296    |
| train/                  |           |
|    approx_kl            | 0.3935948 |
|    clip_fraction        | 0.383

Ep done - 27950.
Ep done - 27960.
Ep done - 27970.
Ep done - 27980.
Ep done - 27990.
Ep done - 28000.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | -0.02      |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 410        |
|    time_elapsed         | 2798       |
|    total_timesteps      | 839680     |
| train/                  |            |
|    approx_kl            | 0.34794158 |
|    clip_fraction        | 0.374      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.463     |
|    explained_variance   | 0.193      |
|    learning_rate        | 0.001      |
|    loss                 | -0.028     |
|    n_updates            | 4090       |
|    policy_gradient_loss | -0.0554    |
|    value_loss           | 0.148      |
----------------------------------------
Ep done - 28010.
Ep done - 8210.
Ep d

Ep done - 28490.
Ep done - 28500.
Ep done - 28510.
Ep done - 28520.
Ep done - 28530.
Ep done - 28540.
Ep done - 28550.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | -0.02      |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 418        |
|    time_elapsed         | 2856       |
|    total_timesteps      | 856064     |
| train/                  |            |
|    approx_kl            | 0.32668537 |
|    clip_fraction        | 0.385      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.458     |
|    explained_variance   | 0.0439     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0543    |
|    n_updates            | 4170       |
|    policy_gradient_loss | -0.0569    |
|    value_loss           | 0.179      |
----------------------------------------
Ep done - 28560.
Ep 

Ep done - 29040.
Ep done - 29050.
Ep done - 29060.
Ep done - 29070.
Ep done - 29080.
Ep done - 29090.
Ep done - 29100.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.08       |
| time/                   |            |
|    fps                  | 298        |
|    iterations           | 426        |
|    time_elapsed         | 2925       |
|    total_timesteps      | 872448     |
| train/                  |            |
|    approx_kl            | 0.34910363 |
|    clip_fraction        | 0.365      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.447     |
|    explained_variance   | -0.119     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0268    |
|    n_updates            | 4250       |
|    policy_gradient_loss | -0.054     |
|    value_loss           | 0.185      |
----------------------------------------
Ep done - 29110.
Ep 

Ep done - 29590.
Ep done - 29600.
Ep done - 29610.
Ep done - 29620.
Ep done - 29630.
Ep done - 29640.
Ep done - 29650.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.7      |
|    ep_rew_mean          | 0.01      |
| time/                   |           |
|    fps                  | 298       |
|    iterations           | 434       |
|    time_elapsed         | 2982      |
|    total_timesteps      | 888832    |
| train/                  |           |
|    approx_kl            | 0.3457752 |
|    clip_fraction        | 0.356     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.432    |
|    explained_variance   | 0.0531    |
|    learning_rate        | 0.001     |
|    loss                 | -0.0493   |
|    n_updates            | 4330      |
|    policy_gradient_loss | -0.0497   |
|    value_loss           | 0.17      |
---------------------------------------
Ep done - 29660.
Ep done - 29670.
Ep done

Ep done - 30140.
Ep done - 30150.
Ep done - 30160.
Ep done - 30170.
Ep done - 30180.
Ep done - 30190.
Ep done - 30200.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.7      |
|    ep_rew_mean          | 0.08      |
| time/                   |           |
|    fps                  | 298       |
|    iterations           | 442       |
|    time_elapsed         | 3036      |
|    total_timesteps      | 905216    |
| train/                  |           |
|    approx_kl            | 0.3448125 |
|    clip_fraction        | 0.345     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.42     |
|    explained_variance   | 0.0147    |
|    learning_rate        | 0.001     |
|    loss                 | 0.00572   |
|    n_updates            | 4410      |
|    policy_gradient_loss | -0.0495   |
|    value_loss           | 0.187     |
---------------------------------------
Ep done - 30210.
Ep done - 30220.
Ep done

Ep done - 30700.
Ep done - 30710.
Ep done - 30720.
Ep done - 30730.
Ep done - 30740.
Ep done - 30750.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 0.03     |
| time/              |          |
|    fps             | 298      |
|    iterations      | 450      |
|    time_elapsed    | 3090     |
|    total_timesteps | 921600   |
---------------------------------
Ep done - 30760.
Ep done - 30770.
Ep done - 30780.
Ep done - 30790.
Ep done - 30800.
Ep done - 30810.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.07      |
| time/                   |           |
|    fps                  | 298       |
|    iterations           | 451       |
|    time_elapsed         | 3095      |
|    total_timesteps      | 923648    |
| train/                  |           |
|    approx_kl            | 0.4092118 |
|    clip_fracti

Ep done - 31310.
Ep done - 31320.
Ep done - 31330.
Ep done - 31340.
Ep done - 31350.
Ep done - 31360.
Ep done - 9210.
Ep done - 9220.
Ep done - 9230.
Ep done - 9240.
Ep done - 9250.
Ep done - 9260.
Ep done - 9270.
Ep done - 9280.
Ep done - 9290.
Ep done - 9300.
Ep done - 9310.
Ep done - 9320.
Ep done - 9330.
Ep done - 9340.
Ep done - 9350.
Ep done - 9360.
Ep done - 9370.
Ep done - 9380.
Ep done - 9390.
Ep done - 9400.
Eval num_timesteps=940000, episode_reward=-0.16 +/- 0.97
Episode length: 29.86 +/- 0.60
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 29.9       |
|    mean_reward          | -0.16      |
| time/                   |            |
|    total_timesteps      | 940000     |
| train/                  |            |
|    approx_kl            | 0.35001242 |
|    clip_fraction        | 0.389      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.468     |
|    explained_variance   | 0.101      

Ep done - 31850.
Ep done - 31860.
Ep done - 31870.
Ep done - 31880.
Ep done - 31890.
Ep done - 31900.
Ep done - 31910.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.26       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 467        |
|    time_elapsed         | 3194       |
|    total_timesteps      | 956416     |
| train/                  |            |
|    approx_kl            | 0.34326154 |
|    clip_fraction        | 0.38       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.47      |
|    explained_variance   | -0.0176    |
|    learning_rate        | 0.001      |
|    loss                 | -0.0277    |
|    n_updates            | 4660       |
|    policy_gradient_loss | -0.0507    |
|    value_loss           | 0.2        |
----------------------------------------
Ep done - 31920.
Ep 

Ep done - 32400.
Ep done - 32410.
Ep done - 32420.
Ep done - 32430.
Ep done - 32440.
Ep done - 32450.
Ep done - 32460.
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 29.9     |
|    ep_rew_mean          | -0.03    |
| time/                   |          |
|    fps                  | 299      |
|    iterations           | 475      |
|    time_elapsed         | 3251     |
|    total_timesteps      | 972800   |
| train/                  |          |
|    approx_kl            | 0.378995 |
|    clip_fraction        | 0.355    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.444   |
|    explained_variance   | 0.0181   |
|    learning_rate        | 0.001    |
|    loss                 | -0.0247  |
|    n_updates            | 4740     |
|    policy_gradient_loss | -0.0511  |
|    value_loss           | 0.185    |
--------------------------------------
Ep done - 32470.
Ep done - 32480.
Ep done - 32490.
Ep done - 3

Ep done - 32950.
Ep done - 32960.
Ep done - 32970.
Ep done - 32980.
Ep done - 32990.
Ep done - 33000.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.15       |
| time/                   |            |
|    fps                  | 298        |
|    iterations           | 483        |
|    time_elapsed         | 3310       |
|    total_timesteps      | 989184     |
| train/                  |            |
|    approx_kl            | 0.40753093 |
|    clip_fraction        | 0.364      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.431     |
|    explained_variance   | 0.0224     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0309    |
|    n_updates            | 4820       |
|    policy_gradient_loss | -0.0478    |
|    value_loss           | 0.178      |
----------------------------------------
Ep done - 33010.
Ep done - 33020.
Ep 

Ep done - 33490.
Ep done - 33500.
Ep done - 33510.
Ep done - 33520.
Ep done - 33530.
Ep done - 33540.
Ep done - 33550.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.08       |
| time/                   |            |
|    fps                  | 298        |
|    iterations           | 491        |
|    time_elapsed         | 3364       |
|    total_timesteps      | 1005568    |
| train/                  |            |
|    approx_kl            | 0.40383422 |
|    clip_fraction        | 0.346      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.418     |
|    explained_variance   | 0.00546    |
|    learning_rate        | 0.001      |
|    loss                 | -0.0158    |
|    n_updates            | 4900       |
|    policy_gradient_loss | -0.0447    |
|    value_loss           | 0.196      |
----------------------------------------
Ep done - 33560.
Ep 

Ep done - 34040.
Ep done - 34050.
Ep done - 34060.
Ep done - 34070.
Ep done - 34080.
Ep done - 34090.
Ep done - 34100.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 29.9     |
|    ep_rew_mean     | 0.07     |
| time/              |          |
|    fps             | 298      |
|    iterations      | 499      |
|    time_elapsed    | 3419     |
|    total_timesteps | 1021952  |
---------------------------------
Ep done - 34110.
Ep done - 34120.
Ep done - 34130.
Ep done - 34140.
Ep done - 34150.
Ep done - 34160.
Ep done - 34170.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | -0.04      |
| time/                   |            |
|    fps                  | 298        |
|    iterations           | 500        |
|    time_elapsed         | 3425       |
|    total_timesteps      | 1024000    |
| train/                  |            |
|    approx_

Ep done - 34660.
Ep done - 34670.
Ep done - 34680.
Ep done - 34690.
Ep done - 34700.
Ep done - 10210.
Ep done - 10220.
Ep done - 10230.
Ep done - 10240.
Ep done - 10250.
Ep done - 10260.
Ep done - 10270.
Ep done - 10280.
Ep done - 10290.
Ep done - 10300.
Ep done - 10310.
Ep done - 10320.
Ep done - 10330.
Ep done - 10340.
Ep done - 10350.
Ep done - 10360.
Ep done - 10370.
Ep done - 10380.
Ep done - 10390.
Ep done - 10400.
Eval num_timesteps=1040000, episode_reward=-0.03 +/- 0.98
Episode length: 29.93 +/- 0.55
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 29.9       |
|    mean_reward          | -0.025     |
| time/                   |            |
|    total_timesteps      | 1040000    |
| train/                  |            |
|    approx_kl            | 0.36788893 |
|    clip_fraction        | 0.346      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.406     |
|    explained_variance   | -0.158 

Ep done - 35200.
Ep done - 35210.
Ep done - 35220.
Ep done - 35230.
Ep done - 35240.
Ep done - 35250.
Ep done - 35260.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | -0.07      |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 516        |
|    time_elapsed         | 3524       |
|    total_timesteps      | 1056768    |
| train/                  |            |
|    approx_kl            | 0.32991642 |
|    clip_fraction        | 0.349      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.384     |
|    explained_variance   | 0.193      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0152    |
|    n_updates            | 5150       |
|    policy_gradient_loss | -0.0471    |
|    value_loss           | 0.161      |
----------------------------------------
Ep done - 35270.
Ep 

Ep done - 35750.
Ep done - 35760.
Ep done - 35770.
Ep done - 35780.
Ep done - 35790.
Ep done - 35800.
Ep done - 35810.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.9      |
|    ep_rew_mean          | 0.05      |
| time/                   |           |
|    fps                  | 299       |
|    iterations           | 524       |
|    time_elapsed         | 3577      |
|    total_timesteps      | 1073152   |
| train/                  |           |
|    approx_kl            | 0.3238592 |
|    clip_fraction        | 0.326     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.382    |
|    explained_variance   | -0.125    |
|    learning_rate        | 0.001     |
|    loss                 | 0.0269    |
|    n_updates            | 5230      |
|    policy_gradient_loss | -0.0351   |
|    value_loss           | 0.205     |
---------------------------------------
Ep done - 35820.
Ep done - 35830.
Ep done

Ep done - 36300.
Ep done - 36310.
Ep done - 36320.
Ep done - 36330.
Ep done - 36340.
Ep done - 36350.
Ep done - 36360.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.25       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 532        |
|    time_elapsed         | 3635       |
|    total_timesteps      | 1089536    |
| train/                  |            |
|    approx_kl            | 0.35040253 |
|    clip_fraction        | 0.329      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.395     |
|    explained_variance   | 0.0102     |
|    learning_rate        | 0.001      |
|    loss                 | -0.0302    |
|    n_updates            | 5310       |
|    policy_gradient_loss | -0.0453    |
|    value_loss           | 0.182      |
----------------------------------------
Ep done - 36370.
Ep 

Ep done - 36840.
Ep done - 36850.
Ep done - 36860.
Ep done - 36870.
Ep done - 36880.
Ep done - 36890.
Ep done - 36900.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.9      |
|    ep_rew_mean          | 0.11      |
| time/                   |           |
|    fps                  | 300       |
|    iterations           | 540       |
|    time_elapsed         | 3684      |
|    total_timesteps      | 1105920   |
| train/                  |           |
|    approx_kl            | 0.3207149 |
|    clip_fraction        | 0.328     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.372    |
|    explained_variance   | 0.209     |
|    learning_rate        | 0.001     |
|    loss                 | -0.00671  |
|    n_updates            | 5390      |
|    policy_gradient_loss | -0.0473   |
|    value_loss           | 0.192     |
---------------------------------------
Ep done - 36910.
Ep done - 36920.
Ep done

Ep done - 37390.
Ep done - 37400.
Ep done - 37410.
Ep done - 37420.
Ep done - 37430.
Ep done - 37440.
Ep done - 37450.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.22       |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 548        |
|    time_elapsed         | 3740       |
|    total_timesteps      | 1122304    |
| train/                  |            |
|    approx_kl            | 0.32252112 |
|    clip_fraction        | 0.333      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.409     |
|    explained_variance   | 0.158      |
|    learning_rate        | 0.001      |
|    loss                 | 0.00257    |
|    n_updates            | 5470       |
|    policy_gradient_loss | -0.046     |
|    value_loss           | 0.185      |
----------------------------------------
Ep done - 37460.
Ep 

Ep done - 38010.
Ep done - 38020.
Ep done - 38030.
Ep done - 38040.
Ep done - 11210.
Ep done - 11220.
Ep done - 11230.
Ep done - 11240.
Ep done - 11250.
Ep done - 11260.
Ep done - 11270.
Ep done - 11280.
Ep done - 11290.
Ep done - 11300.
Ep done - 11310.
Ep done - 11320.
Ep done - 11330.
Ep done - 11340.
Ep done - 11350.
Ep done - 11360.
Ep done - 11370.
Ep done - 11380.
Ep done - 11390.
Ep done - 11400.
Eval num_timesteps=1140000, episode_reward=0.03 +/- 0.97
Episode length: 29.93 +/- 0.62
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 29.9       |
|    mean_reward          | 0.03       |
| time/                   |            |
|    total_timesteps      | 1140000    |
| train/                  |            |
|    approx_kl            | 0.40468025 |
|    clip_fraction        | 0.378      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.446     |
|    explained_variance   | 0.224      |
|    learnin

Ep done - 38560.
Ep done - 38570.
Ep done - 38580.
Ep done - 38590.
Ep done - 38600.
Ep done - 38610.
Ep done - 38620.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.9      |
|    ep_rew_mean          | -0.06     |
| time/                   |           |
|    fps                  | 301       |
|    iterations           | 565       |
|    time_elapsed         | 3843      |
|    total_timesteps      | 1157120   |
| train/                  |           |
|    approx_kl            | 0.3410691 |
|    clip_fraction        | 0.349     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.421    |
|    explained_variance   | 0.133     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0401   |
|    n_updates            | 5640      |
|    policy_gradient_loss | -0.0512   |
|    value_loss           | 0.168     |
---------------------------------------
Ep done - 38630.
Ep done - 38640.
Ep done

Ep done - 39100.
Ep done - 39110.
Ep done - 39120.
Ep done - 39130.
Ep done - 39140.
Ep done - 39150.
Ep done - 39160.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.2       |
| time/                   |           |
|    fps                  | 301       |
|    iterations           | 573       |
|    time_elapsed         | 3897      |
|    total_timesteps      | 1173504   |
| train/                  |           |
|    approx_kl            | 0.3630631 |
|    clip_fraction        | 0.344     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.394    |
|    explained_variance   | 0.143     |
|    learning_rate        | 0.001     |
|    loss                 | 0.002     |
|    n_updates            | 5720      |
|    policy_gradient_loss | -0.0474   |
|    value_loss           | 0.173     |
---------------------------------------
Ep done - 39170.
Ep done - 39180.
Ep done

Ep done - 39650.
Ep done - 39660.
Ep done - 39670.
Ep done - 39680.
Ep done - 39690.
Ep done - 39700.
Ep done - 39710.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.15       |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 581        |
|    time_elapsed         | 3954       |
|    total_timesteps      | 1189888    |
| train/                  |            |
|    approx_kl            | 0.32950014 |
|    clip_fraction        | 0.343      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.406     |
|    explained_variance   | 0.199      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0437    |
|    n_updates            | 5800       |
|    policy_gradient_loss | -0.0427    |
|    value_loss           | 0.174      |
----------------------------------------
Ep done - 39720.
Ep 

Ep done - 40200.
Ep done - 40210.
Ep done - 40220.
Ep done - 40230.
Ep done - 40240.
Ep done - 40250.
Ep done - 40260.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.33       |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 589        |
|    time_elapsed         | 4011       |
|    total_timesteps      | 1206272    |
| train/                  |            |
|    approx_kl            | 0.30270642 |
|    clip_fraction        | 0.322      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.389     |
|    explained_variance   | 0.0234     |
|    learning_rate        | 0.001      |
|    loss                 | -0.00434   |
|    n_updates            | 5880       |
|    policy_gradient_loss | -0.0399    |
|    value_loss           | 0.16       |
----------------------------------------
Ep done - 40270.
Ep 

Ep done - 40750.
Ep done - 40760.
Ep done - 40770.
Ep done - 40780.
Ep done - 40790.
Ep done - 40800.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.36       |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 597        |
|    time_elapsed         | 4067       |
|    total_timesteps      | 1222656    |
| train/                  |            |
|    approx_kl            | 0.33571446 |
|    clip_fraction        | 0.349      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.403     |
|    explained_variance   | 0.058      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0516    |
|    n_updates            | 5960       |
|    policy_gradient_loss | -0.0443    |
|    value_loss           | 0.176      |
----------------------------------------
Ep done - 40810.
Ep done - 40820.
Ep 

Ep done - 41360.
Ep done - 41370.
Ep done - 41380.
Ep done - 12210.
Ep done - 12220.
Ep done - 12230.
Ep done - 12240.
Ep done - 12250.
Ep done - 12260.
Ep done - 12270.
Ep done - 12280.
Ep done - 12290.
Ep done - 12300.
Ep done - 12310.
Ep done - 12320.
Ep done - 12330.
Ep done - 12340.
Ep done - 12350.
Ep done - 12360.
Ep done - 12370.
Ep done - 12380.
Ep done - 12390.
Ep done - 12400.
Eval num_timesteps=1240000, episode_reward=0.16 +/- 0.98
Episode length: 29.89 +/- 0.63
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 29.9       |
|    mean_reward          | 0.16       |
| time/                   |            |
|    total_timesteps      | 1240000    |
| train/                  |            |
|    approx_kl            | 0.31603044 |
|    clip_fraction        | 0.357      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.431     |
|    explained_variance   | 0.216      |
|    learning_rate        | 0

Ep done - 41910.
Ep done - 41920.
Ep done - 41930.
Ep done - 41940.
Ep done - 41950.
Ep done - 41960.
Ep done - 41970.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.32       |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 614        |
|    time_elapsed         | 4184       |
|    total_timesteps      | 1257472    |
| train/                  |            |
|    approx_kl            | 0.30790648 |
|    clip_fraction        | 0.327      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.381     |
|    explained_variance   | 0.173      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0263    |
|    n_updates            | 6130       |
|    policy_gradient_loss | -0.0444    |
|    value_loss           | 0.157      |
----------------------------------------
Ep done - 41980.
Ep 

Ep done - 42450.
Ep done - 42460.
Ep done - 42470.
Ep done - 42480.
Ep done - 42490.
Ep done - 42500.
Ep done - 42510.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.23       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 622        |
|    time_elapsed         | 4250       |
|    total_timesteps      | 1273856    |
| train/                  |            |
|    approx_kl            | 0.45315617 |
|    clip_fraction        | 0.32       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.384     |
|    explained_variance   | 0.296      |
|    learning_rate        | 0.001      |
|    loss                 | 0.00561    |
|    n_updates            | 6210       |
|    policy_gradient_loss | -0.0432    |
|    value_loss           | 0.151      |
----------------------------------------
Ep done - 42520.
Ep 

Ep done - 43000.
Ep done - 43010.
Ep done - 43020.
Ep done - 43030.
Ep done - 43040.
Ep done - 43050.
Ep done - 43060.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.09      |
| time/                   |           |
|    fps                  | 299       |
|    iterations           | 630       |
|    time_elapsed         | 4307      |
|    total_timesteps      | 1290240   |
| train/                  |           |
|    approx_kl            | 0.3295138 |
|    clip_fraction        | 0.359     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.434    |
|    explained_variance   | 0.31      |
|    learning_rate        | 0.001     |
|    loss                 | 0.0257    |
|    n_updates            | 6290      |
|    policy_gradient_loss | -0.0439   |
|    value_loss           | 0.156     |
---------------------------------------
Ep done - 43070.
Ep done - 43080.
Ep done

Ep done - 43550.
Ep done - 43560.
Ep done - 43570.
Ep done - 43580.
Ep done - 43590.
Ep done - 43600.
Ep done - 43610.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.24       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 638        |
|    time_elapsed         | 4362       |
|    total_timesteps      | 1306624    |
| train/                  |            |
|    approx_kl            | 0.36900288 |
|    clip_fraction        | 0.301      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.35      |
|    explained_variance   | 0.271      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0127    |
|    n_updates            | 6370       |
|    policy_gradient_loss | -0.0483    |
|    value_loss           | 0.171      |
----------------------------------------
Ep done - 43620.
Ep 

Ep done - 44100.
Ep done - 44110.
Ep done - 44120.
Ep done - 44130.
Ep done - 44140.
Ep done - 44150.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.12       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 646        |
|    time_elapsed         | 4421       |
|    total_timesteps      | 1323008    |
| train/                  |            |
|    approx_kl            | 0.49123308 |
|    clip_fraction        | 0.354      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.391     |
|    explained_variance   | 0.0993     |
|    learning_rate        | 0.001      |
|    loss                 | 0.00817    |
|    n_updates            | 6450       |
|    policy_gradient_loss | -0.0468    |
|    value_loss           | 0.203      |
----------------------------------------
Ep done - 44160.
Ep done - 44170.
Ep 

Ep done - 44710.
Ep done - 44720.
Ep done - 13210.
Ep done - 13220.
Ep done - 13230.
Ep done - 13240.
Ep done - 13250.
Ep done - 13260.
Ep done - 13270.
Ep done - 13280.
Ep done - 13290.
Ep done - 13300.
Ep done - 13310.
Ep done - 13320.
Ep done - 13330.
Ep done - 13340.
Ep done - 13350.
Ep done - 13360.
Ep done - 13370.
Ep done - 13380.
Ep done - 13390.
Ep done - 13400.
Eval num_timesteps=1340000, episode_reward=0.10 +/- 0.98
Episode length: 29.92 +/- 0.46
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 29.9      |
|    mean_reward          | 0.095     |
| time/                   |           |
|    total_timesteps      | 1340000   |
| train/                  |           |
|    approx_kl            | 0.3513152 |
|    clip_fraction        | 0.335     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.4      |
|    explained_variance   | 0.233     |
|    learning_rate        | 0.001     |
|    loss         

Ep done - 45260.
Ep done - 45270.
Ep done - 45280.
Ep done - 45290.
Ep done - 45300.
Ep done - 45310.
Ep done - 45320.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.24       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 663        |
|    time_elapsed         | 4528       |
|    total_timesteps      | 1357824    |
| train/                  |            |
|    approx_kl            | 0.33716178 |
|    clip_fraction        | 0.328      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.397     |
|    explained_variance   | 0.19       |
|    learning_rate        | 0.001      |
|    loss                 | 0.00968    |
|    n_updates            | 6620       |
|    policy_gradient_loss | -0.0391    |
|    value_loss           | 0.173      |
----------------------------------------
Ep done - 45330.
Ep 

Ep done - 45810.
Ep done - 45820.
Ep done - 45830.
Ep done - 45840.
Ep done - 45850.
Ep done - 45860.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.23       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 671        |
|    time_elapsed         | 4582       |
|    total_timesteps      | 1374208    |
| train/                  |            |
|    approx_kl            | 0.32131004 |
|    clip_fraction        | 0.333      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.422     |
|    explained_variance   | 0.237      |
|    learning_rate        | 0.001      |
|    loss                 | -0.00355   |
|    n_updates            | 6700       |
|    policy_gradient_loss | -0.0485    |
|    value_loss           | 0.197      |
----------------------------------------
Ep done - 45870.
Ep done - 45880.
Ep 

Ep done - 46350.
Ep done - 46360.
Ep done - 46370.
Ep done - 46380.
Ep done - 46390.
Ep done - 46400.
Ep done - 46410.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.15       |
| time/                   |            |
|    fps                  | 299        |
|    iterations           | 679        |
|    time_elapsed         | 4636       |
|    total_timesteps      | 1390592    |
| train/                  |            |
|    approx_kl            | 0.38772437 |
|    clip_fraction        | 0.329      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.404     |
|    explained_variance   | 0.205      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0118    |
|    n_updates            | 6780       |
|    policy_gradient_loss | -0.0437    |
|    value_loss           | 0.166      |
----------------------------------------
Ep done - 46420.
Ep 

Ep done - 46900.
Ep done - 46910.
Ep done - 46920.
Ep done - 46930.
Ep done - 46940.
Ep done - 46950.
Ep done - 46960.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.15      |
| time/                   |           |
|    fps                  | 299       |
|    iterations           | 687       |
|    time_elapsed         | 4690      |
|    total_timesteps      | 1406976   |
| train/                  |           |
|    approx_kl            | 0.3567779 |
|    clip_fraction        | 0.356     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.429    |
|    explained_variance   | 0.165     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0172   |
|    n_updates            | 6860      |
|    policy_gradient_loss | -0.047    |
|    value_loss           | 0.188     |
---------------------------------------
Ep done - 46970.
Ep done - 46980.
Ep done

Ep done - 47450.
Ep done - 47460.
Ep done - 47470.
Ep done - 47480.
Ep done - 47490.
Ep done - 47500.
Ep done - 47510.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.9      |
|    ep_rew_mean          | 0.39      |
| time/                   |           |
|    fps                  | 299       |
|    iterations           | 695       |
|    time_elapsed         | 4745      |
|    total_timesteps      | 1423360   |
| train/                  |           |
|    approx_kl            | 0.4066128 |
|    clip_fraction        | 0.311     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.384    |
|    explained_variance   | 0.206     |
|    learning_rate        | 0.001     |
|    loss                 | -0.00668  |
|    n_updates            | 6940      |
|    policy_gradient_loss | -0.0372   |
|    value_loss           | 0.182     |
---------------------------------------
Ep done - 47520.
Ep done - 47530.
Ep done

Ep done - 48060.
Ep done - 14210.
Ep done - 14220.
Ep done - 14230.
Ep done - 14240.
Ep done - 14250.
Ep done - 14260.
Ep done - 14270.
Ep done - 14280.
Ep done - 14290.
Ep done - 14300.
Ep done - 14310.
Ep done - 14320.
Ep done - 14330.
Ep done - 14340.
Ep done - 14350.
Ep done - 14360.
Ep done - 14370.
Ep done - 14380.
Ep done - 14390.
Ep done - 14400.
Eval num_timesteps=1440000, episode_reward=0.22 +/- 0.97
Episode length: 30.01 +/- 0.65
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 30        |
|    mean_reward          | 0.22      |
| time/                   |           |
|    total_timesteps      | 1440000   |
| train/                  |           |
|    approx_kl            | 0.3462218 |
|    clip_fraction        | 0.36      |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.419    |
|    explained_variance   | 0.0952    |
|    learning_rate        | 0.001     |
|    loss                 | 0.0114 

Ep done - 48610.
Ep done - 48620.
Ep done - 48630.
Ep done - 48640.
Ep done - 48650.
Ep done - 48660.
Ep done - 48670.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.8       |
|    ep_rew_mean          | 0.16       |
| time/                   |            |
|    fps                  | 300        |
|    iterations           | 712        |
|    time_elapsed         | 4849       |
|    total_timesteps      | 1458176    |
| train/                  |            |
|    approx_kl            | 0.29309225 |
|    clip_fraction        | 0.313      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.358     |
|    explained_variance   | 0.191      |
|    learning_rate        | 0.001      |
|    loss                 | 0.0231     |
|    n_updates            | 7110       |
|    policy_gradient_loss | -0.0437    |
|    value_loss           | 0.203      |
----------------------------------------
Ep done - 48680.
Ep 

Ep done - 49160.
Ep done - 49170.
Ep done - 49180.
Ep done - 49190.
Ep done - 49200.
Ep done - 49210.
Ep done - 49220.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 29.9      |
|    ep_rew_mean          | 0.27      |
| time/                   |           |
|    fps                  | 300       |
|    iterations           | 720       |
|    time_elapsed         | 4903      |
|    total_timesteps      | 1474560   |
| train/                  |           |
|    approx_kl            | 0.3168323 |
|    clip_fraction        | 0.294     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.333    |
|    explained_variance   | 0.254     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0383   |
|    n_updates            | 7190      |
|    policy_gradient_loss | -0.0421   |
|    value_loss           | 0.165     |
---------------------------------------
Ep done - 49230.
Ep done - 49240.
Ep done

Ep done - 49710.
Ep done - 49720.
Ep done - 49730.
Ep done - 49740.
Ep done - 49750.
Ep done - 49760.
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 30       |
|    ep_rew_mean          | 0.29     |
| time/                   |          |
|    fps                  | 301      |
|    iterations           | 728      |
|    time_elapsed         | 4952     |
|    total_timesteps      | 1490944  |
| train/                  |          |
|    approx_kl            | 0.309111 |
|    clip_fraction        | 0.377    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.449   |
|    explained_variance   | 0.303    |
|    learning_rate        | 0.001    |
|    loss                 | -0.0315  |
|    n_updates            | 7270     |
|    policy_gradient_loss | -0.0481  |
|    value_loss           | 0.172    |
--------------------------------------
Ep done - 49770.
Ep done - 49780.
Ep done - 49790.
Ep done - 49800.
Ep done - 4

Ep done - 50250.
Ep done - 50260.
Ep done - 50270.
Ep done - 50280.
Ep done - 50290.
Ep done - 50300.
Ep done - 50310.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.38      |
| time/                   |           |
|    fps                  | 301       |
|    iterations           | 736       |
|    time_elapsed         | 4998      |
|    total_timesteps      | 1507328   |
| train/                  |           |
|    approx_kl            | 0.3398344 |
|    clip_fraction        | 0.317     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.387    |
|    explained_variance   | 0.175     |
|    learning_rate        | 0.001     |
|    loss                 | -0.0183   |
|    n_updates            | 7350      |
|    policy_gradient_loss | -0.0432   |
|    value_loss           | 0.177     |
---------------------------------------
Ep done - 50320.
Ep done - 50330.
Ep done

Ep done - 50800.
Ep done - 50810.
Ep done - 50820.
Ep done - 50830.
Ep done - 50840.
Ep done - 50850.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | -0.05      |
| time/                   |            |
|    fps                  | 302        |
|    iterations           | 744        |
|    time_elapsed         | 5044       |
|    total_timesteps      | 1523712    |
| train/                  |            |
|    approx_kl            | 0.31429437 |
|    clip_fraction        | 0.349      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.413     |
|    explained_variance   | 0.343      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0412    |
|    n_updates            | 7430       |
|    policy_gradient_loss | -0.0465    |
|    value_loss           | 0.173      |
----------------------------------------
Ep done - 50860.
Ep done - 50870.
Ep 

Ep done - 15400.
Eval num_timesteps=1540000, episode_reward=0.15 +/- 0.98
Episode length: 30.03 +/- 0.61
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 30         |
|    mean_reward          | 0.155      |
| time/                   |            |
|    total_timesteps      | 1540000    |
| train/                  |            |
|    approx_kl            | 0.28961203 |
|    clip_fraction        | 0.326      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.413     |
|    explained_variance   | 0.144      |
|    learning_rate        | 0.001      |
|    loss                 | 0.0482     |
|    n_updates            | 7510       |
|    policy_gradient_loss | -0.0429    |
|    value_loss           | 0.209      |
----------------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 0.23     |
| time/              

Ep done - 51960.
Ep done - 51970.
Ep done - 51980.
Ep done - 51990.
Ep done - 52000.
Ep done - 52010.
Ep done - 52020.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.31       |
| time/                   |            |
|    fps                  | 303        |
|    iterations           | 761        |
|    time_elapsed         | 5136       |
|    total_timesteps      | 1558528    |
| train/                  |            |
|    approx_kl            | 0.32553625 |
|    clip_fraction        | 0.333      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.4       |
|    explained_variance   | 0.0294     |
|    learning_rate        | 0.001      |
|    loss                 | 0.0127     |
|    n_updates            | 7600       |
|    policy_gradient_loss | -0.0425    |
|    value_loss           | 0.206      |
----------------------------------------
Ep done - 52030.
Ep 

Ep done - 52500.
Ep done - 52510.
Ep done - 52520.
Ep done - 52530.
Ep done - 52540.
Ep done - 52550.
Ep done - 52560.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.44      |
| time/                   |           |
|    fps                  | 303       |
|    iterations           | 769       |
|    time_elapsed         | 5196      |
|    total_timesteps      | 1574912   |
| train/                  |           |
|    approx_kl            | 0.3285213 |
|    clip_fraction        | 0.304     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.368    |
|    explained_variance   | 0.268     |
|    learning_rate        | 0.001     |
|    loss                 | 0.0001    |
|    n_updates            | 7680      |
|    policy_gradient_loss | -0.0405   |
|    value_loss           | 0.155     |
---------------------------------------
Ep done - 52570.
Ep done - 52580.
Ep done

Ep done - 53050.
Ep done - 53060.
Ep done - 53070.
Ep done - 53080.
Ep done - 53090.
Ep done - 53100.
Ep done - 53110.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | -0.07      |
| time/                   |            |
|    fps                  | 302        |
|    iterations           | 777        |
|    time_elapsed         | 5257       |
|    total_timesteps      | 1591296    |
| train/                  |            |
|    approx_kl            | 0.39696074 |
|    clip_fraction        | 0.35       |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.434     |
|    explained_variance   | 0.239      |
|    learning_rate        | 0.001      |
|    loss                 | -0.016     |
|    n_updates            | 7760       |
|    policy_gradient_loss | -0.0475    |
|    value_loss           | 0.184      |
----------------------------------------
Ep done - 53120.
Ep 

Ep done - 53590.
Ep done - 53600.
Ep done - 53610.
Ep done - 53620.
Ep done - 53630.
Ep done - 53640.
Ep done - 53650.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.9       |
|    ep_rew_mean          | 0.02       |
| time/                   |            |
|    fps                  | 302        |
|    iterations           | 785        |
|    time_elapsed         | 5308       |
|    total_timesteps      | 1607680    |
| train/                  |            |
|    approx_kl            | 0.38654244 |
|    clip_fraction        | 0.323      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.4       |
|    explained_variance   | 0.236      |
|    learning_rate        | 0.001      |
|    loss                 | 0.0258     |
|    n_updates            | 7840       |
|    policy_gradient_loss | -0.0384    |
|    value_loss           | 0.205      |
----------------------------------------
Ep done - 53660.
Ep 

Ep done - 54140.
Ep done - 54150.
Ep done - 54160.
Ep done - 54170.
Ep done - 54180.
Ep done - 54190.
Ep done - 54200.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.35       |
| time/                   |            |
|    fps                  | 303        |
|    iterations           | 793        |
|    time_elapsed         | 5359       |
|    total_timesteps      | 1624064    |
| train/                  |            |
|    approx_kl            | 0.29875377 |
|    clip_fraction        | 0.313      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.387     |
|    explained_variance   | 0.164      |
|    learning_rate        | 0.001      |
|    loss                 | -0.00138   |
|    n_updates            | 7920       |
|    policy_gradient_loss | -0.0397    |
|    value_loss           | 0.181      |
----------------------------------------
Ep done - 54210.
Ep 

Ep done - 54740.
Ep done - 54750.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 0.29     |
| time/              |          |
|    fps             | 303      |
|    iterations      | 801      |
|    time_elapsed    | 5409     |
|    total_timesteps | 1640448  |
---------------------------------
Ep done - 54760.
Ep done - 54770.
Ep done - 54780.
Ep done - 54790.
Ep done - 54800.
Ep done - 54810.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | 0.34       |
| time/                   |            |
|    fps                  | 303        |
|    iterations           | 802        |
|    time_elapsed         | 5415       |
|    total_timesteps      | 1642496    |
| train/                  |            |
|    approx_kl            | 0.31688622 |
|    clip_fraction        | 0.331      |
|    clip_range           | 0.2 

Ep done - 55360.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.41       |
| time/                   |            |
|    fps                  | 303        |
|    iterations           | 810        |
|    time_elapsed         | 5461       |
|    total_timesteps      | 1658880    |
| train/                  |            |
|    approx_kl            | 0.33053112 |
|    clip_fraction        | 0.291      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.343     |
|    explained_variance   | 0.102      |
|    learning_rate        | 0.001      |
|    loss                 | -0.0304    |
|    n_updates            | 8090       |
|    policy_gradient_loss | -0.0373    |
|    value_loss           | 0.18       |
----------------------------------------
Ep done - 55370.
Ep done - 55380.
Ep done - 55390.
Ep done - 55400.
Ep done - 16410.
Ep done - 16420.
Ep done - 16430.
Ep 

Ep done - 55850.
Ep done - 55860.
Ep done - 55870.
Ep done - 55880.
Ep done - 55890.
Ep done - 55900.
Ep done - 55910.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30        |
|    ep_rew_mean          | 0.15      |
| time/                   |           |
|    fps                  | 303       |
|    iterations           | 818       |
|    time_elapsed         | 5518      |
|    total_timesteps      | 1675264   |
| train/                  |           |
|    approx_kl            | 0.3139801 |
|    clip_fraction        | 0.319     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.407    |
|    explained_variance   | 0.0795    |
|    learning_rate        | 0.001     |
|    loss                 | -0.0139   |
|    n_updates            | 8170      |
|    policy_gradient_loss | -0.0447   |
|    value_loss           | 0.204     |
---------------------------------------
Ep done - 55920.
Ep done - 55930.
Ep done

In [130]:
# Fetch the current observation straight from the base (unwrapped) environment
# and display it; the output below shows a dict with 'board' and 'player' entries.
obs = env.unwrapped.get_obs()
obs

OrderedDict([('board',
              array([[0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 1, 2, 0, 0, 0],
                     [0, 0, 0, 2, 1, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0]])),
             ('player', 1)])

In [131]:
# Flatten the dict observation into a single 1-D array using the base
# environment's observation space, then display the result.
new_obs = spaces.flatten(env.unwrapped.observation_space, obs)
new_obs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [133]:
# Query the model 15 times on the same flattened observation and print each
# result tuple returned by predict().
# NOTE(review): predict() is called here without action_masks and without
# deterministic=True, so the printed actions differ between calls and may not
# be legal moves for this position — confirm whether a mask should be passed.
for i in range(15):
    print(model.predict(new_obs))

(array(22), None)
(array(29), None)
(array(41), None)
(array(28), None)
(array(5), None)
(array(46), None)
(array(24), None)
(array(13), None)
(array(40), None)
(array(59), None)
(array(13), None)
(array(45), None)
(array(18), None)
(array(62), None)
(array(51), None)


In [115]:
# Display the flattened observation currently held in new_obs.
new_obs

array([0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 2, 2, 2, 1, 0, 1, 0, 2, 2, 2, 2, 2,
       1, 2, 1, 0, 2, 2, 1, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0, 0, 1, 0,
       2, 2, 1, 1, 0, 2, 1, 0, 2, 2, 2, 1, 2, 0, 0, 0, 2, 0, 2, 0, 0, 1])

In [147]:
# Build the evaluation environment in one expression: a fresh OthelloEnv,
# wrapped in Monitor, then wrapped in FlattenObservation.
env_eval = FlattenObservation(Monitor(env=OthelloEnv()))

# env_eval = DummyVecEnv(env_fns=[lambda: env_eval])

In [161]:
# Load two saved MaskablePPO checkpoints from disk (paths are relative to the
# notebook's working directory).
# NOTE(review): despite its name, model_random is loaded from a saved history
# checkpoint rather than built as a random policy — confirm the naming intent.
model1 = MaskablePPO.load('history_00000385.zip')
model_random = MaskablePPO.load('ppo_masked_selfplay_2/history_00000218.zip')

In [162]:
# Hand model1 to the base environment via change_to_latest_agent.
# NOTE(review): presumably this installs model1 as the opponent the env plays
# against — confirm against the OthelloEnv implementation.
# Variant for when env_eval is wrapped in a vectorised env:
# env_eval.envs[0].unwrapped.change_to_latest_agent(model1)
env_eval.unwrapped.change_to_latest_agent(model1)

In [37]:
# Bind the mask-aware evaluator to the name `evaluate_policy`.
evaluate_policy = masked_evaluate_policy

# Run 100 deterministic evaluation episodes of model_random on env_eval and
# keep the per-episode rewards and lengths.
episode_rewards, episode_lengths = evaluate_policy(
    model_random,
    env_eval,
    n_eval_episodes=100,
    deterministic=True,
    return_episode_rewards=True,
    warn=True,
)

In [38]:
# Average the per-episode rewards returned by the evaluation run.
np.mean(episode_rewards)

0.33

In [163]:
def my_play(env, model, episodes=100, deterministic=False):
    """Play `episodes` full games with `model` on `env` and count the wins.

    Parameters
    ----------
    env :
        A Gymnasium-style environment (optionally wrapped). Its base
        environment (`env.unwrapped`) must provide `action_masks()`,
        `game.get_winner()` and `agent_turn`.
    model :
        A policy exposing `predict(obs, action_masks=..., deterministic=...)`
        that returns an `(action, state)` pair.
    episodes : int, default 100
        Number of games to play. Zero (or a negative number) plays no games.
    deterministic : bool, default False
        Forwarded to `model.predict`. The default keeps the original
        behaviour of sampling actions.

    Returns
    -------
    int
        Number of episodes in which `game.get_winner()` equalled the
        environment's `agent_turn` when the episode finished.
    """
    # Read custom attributes from the base env rather than through the wrapper
    # stack: Gymnasium wrappers no longer forward arbitrary attribute access.
    base_env = env.unwrapped

    wins = 0
    for _ in range(episodes):
        obs, _info = env.reset()
        done = False

        while not done:
            action, _state = model.predict(
                obs,
                action_masks=base_env.action_masks(),
                deterministic=deterministic,
            )
            obs, _reward, terminated, truncated, _info = env.step(action)
            # An episode is over when it terminates OR is truncated; ignoring
            # `truncated` would spin forever on a time-limited episode.
            done = terminated or truncated

        if base_env.game.get_winner() == base_env.agent_turn:
            wins += 1
    return wins


In [166]:
# Count how many of 100 games model_random wins on env_eval.
my_play(env_eval, model_random, 100)

51

In [176]:
# Scratch check: flatten the current board to 1-D and append two extra
# values (3 and 4) to the end of the resulting array.
np.append(env_eval.game.board.flatten(), [3, 4])

array([0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4])