In [1]:
import sys, os
# sys.path.append('/home/user/PycharmProjects/reversi-game/')
source_dir = os.path.abspath(os.path.join(os.getcwd(), '../../../'))
sys.path.append(source_dir)

In [2]:
import numpy as np

In [3]:
import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor

import stable_baselines3.common.callbacks as callbacks_module
from sb3_contrib.common.maskable.evaluation import evaluate_policy as masked_evaluate_policy

# Modify the namespace of EvalCallback directly
callbacks_module.evaluate_policy = masked_evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback

# from sb3_contrib.common.maskable.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
# from stable_baselines3.common.callbacks import EvalCallback

from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy, MaskableMultiInputActorCriticPolicy
from sb3_contrib.ppo_mask import MaskablePPO

from shutil import copyfile # keep track of generations
from collections import OrderedDict

from gymnasium.spaces import Discrete, Box, Dict, MultiDiscrete
from gymnasium.wrappers import FlattenObservation
import gymnasium.spaces as spaces
from game_logic import Othello
import numpy as np
import os, math
from itertools import cycle

In [4]:
import torch as th

In [5]:
# Settings
SEED = 19
NUM_TIMESTEPS = int(30_000_000)
EVAL_FREQ = int(10_000)
EVAL_EPISODES = int(100)
BEST_THRESHOLD = 0.25 # must achieve a mean score above this to replace prev best self

RENDER_MODE = False # set this to false if you plan on running for full 1000 trials.

LOGDIR = "models/delete"
# LOGDIR = "delete_me"

In [6]:
class OthelloEnv(gym.Env):   
    def __init__(self):
        self.game = Othello()
        self.agent_turn = 1
        shape = self.game.board.shape  
        self.action_mapping = self.am(shape)
        self.action_space = Discrete(shape[0] * shape[1])  # sample - [x, y]
        self.observation_space = Dict({
                                        'board' : Box(0, 2, shape=shape, dtype=int),
                                        'chips' : MultiDiscrete([65, 65]),
                                        'player': Discrete(2, start=1)
                                      })        
        self.other_agent = None
        # self.reset_othello_gen = self.reset_othello()   
        
        self.episodes = 0    
        # self.global_reward = 0
    

    def am(self, shape):
        x, y = shape
        return [(n//x, n%x) for n in range(x * y)]
        

    # def reset_othello(self):
    #     '''resets game to starting position 
    #        and also changes starting player alternatively'''
    #     infinite_player_turn = cycle([1]*10 + [2]*10)
    #     while True:
    #         game = Othello()
    #         model_turn = next(infinite_player_turn)
    #         yield game, model_turn
    
    def change_to_latest_agent(self, agent):
        self.other_agent = agent

    def get_obs(self):        
        return OrderedDict({
            'board' : self.game.board,
            'chips' : np.array(self.game.chips),
            'player': self.game.player_turn
        })

    def get_chips_diff(self): #  from agent perspective
        idx = self.agent_turn - 1  # map [1, 2] to [0, 1]
        diff = self.game.chips[idx] - self.game.chips[1 - idx]
        return diff
        
    def check_game_ended(self):
        reward = 0
        done = False
        winner = self.game.get_winner()
        
        if winner is not None:
            self.episodes += 1
            if self.episodes % 10 == 0:
                print(f'Ep done - {self.episodes}.')
                # print(f'global_reward -- {self.global_reward}, -- won: {winner == self.agent_turn}')
            
            done = True
            if winner == self.agent_turn:
                # reward = max(abs(self.global_reward)*2, 1000)
                reward = 1
            elif winner == 3 - self.agent_turn: #  other agent turn/figure
                # reward = min(-abs(self.global_reward)*2, -1000)
                reward = -1
        return reward, done
    
    def render(self):  # todo 
        pass

    def close(self):  # todo
        pass

    def other_agent_play_move(self): 
        obs = self.get_obs()
        obs = spaces.flatten(self.observation_space, obs)#  need to flatten observation         
        action, _ = self.other_agent.predict(obs,
                                             action_masks=self.action_masks(),
                                             deterministic=False) 
        game_action = self.action_mapping[action]
        self.game.play_move(game_action)

    def step(self, action):
        # diff_chips_before = self.get_chips_diff()
        
        game_action = self.action_mapping[action]  
        self.game.play_move(game_action)

        # inner agent plays
        while self.game.get_winner() is None and self.game.player_turn != self.agent_turn: #  if game hasnt ended do moves if opponent doesnt have one 
            self.other_agent_play_move()

        # diff_chips_after = self.get_chips_diff()
        
        reward, done = self.check_game_ended()

        # turn = self.game.turn
        # if turn <= 58: #  not sure... feel like at the end theres no more moves to choose and high reward would be bad            
        #     factor = (turn // 10) + 1
        #     step_reward = factor * (diff_chips_after - diff_chips_before)
        #     self.global_reward += step_reward
        #     reward += factor * (diff_chips_after - diff_chips_before)

        
        info = {}
        truncated = False

                
        # Return step information
        return self.get_obs(), reward, done, truncated, info
    
    def reset(self, *args, **kwargs):
        # self.global_reward = 0
        self.game = Othello() # self.game, self.agent_turn = next(self.reset_othello_gen)
        if self.agent_turn == 2:
            self.other_agent_play_move()
        return self.get_obs(), None

    def action_masks(self):        
        valid_moves = self.game.valid_moves()
    
        mask = np.zeros(self.game.board.shape, dtype=bool)
        
        # Set True for each index in the set
        for index in valid_moves:
            mask[index] = True
        mask.flatten()
        return mask

In [7]:
class SelfPlayCallback(EvalCallback):
    # hacked it to only save new version offrom gymnasium.wrappers import FlattenObservation best model if beats prev self by BEST_THRESHOLD score
    # after saving model, resets the best score to be BEST_THRESHOLD
    def __init__(self, eval_env, *args, **kwargs):
        super().__init__(eval_env, *args, **kwargs)
        self.best_mean_reward = BEST_THRESHOLD
        self.generation = 0        
    def _on_step(self) -> bool:
        # result = super()._on_step() #  eval needs to be masked, its less efficient 
        result = super()._on_step()
        
        if result and self.best_mean_reward > BEST_THRESHOLD:
            self.generation += 1
            print("SELFPLAY: mean_reward achieved:", self.best_mean_reward)
            print("SELFPLAY: new best model, bumping up generation to", self.generation)            
            source_file = os.path.join(LOGDIR, "best_model.zip")
            backup_file = os.path.join(LOGDIR, "history_"+str(self.generation).zfill(8)+".zip")
            copyfile(source_file, backup_file)
            self.best_mean_reward = BEST_THRESHOLD
            agent = self.model.load(source_file)
                
            self.training_env.env_method("change_to_latest_agent", agent)           
            self.eval_env.env_method("change_to_latest_agent", agent) #  .env_method("method_name", args1, args2, kwargs1=kwargs1) 
        return result    

In [37]:
env = OthelloEnv()
env = Monitor(env=env)
env = FlattenObservation(env)

In [38]:
# policy_kwargs = dict(activation_fn=th.nn.ReLU,
#                      net_arch=dict(pi=[128, 64], vf=[64, 64]))

In [39]:
# model = MaskablePPO(policy=MaskableActorCriticPolicy, 
#                     env=env, 
#                     verbose=1,
#                     # policy_kwargs=policy_kwargs#,
#                     # learning_rate=1e-5, 
#                     # n_steps=6144
#                    )

In [45]:
model = MaskablePPO.load("models/history_00000329",
                         env=env,
                         learning_rate=0.007, 
                         n_steps=6144) # load existing model

Wrapping the env in a DummyVecEnv.


In [46]:
starting_model_filepath = 'models/history_00000329'
start_model_copy = MaskablePPO.load(starting_model_filepath)
env.unwrapped.change_to_latest_agent(start_model_copy)

In [47]:
# starting_model_filepath = LOGDIR + '/start_model'
# model.save(starting_model_filepath)
# start_model_copy = model.load(starting_model_filepath)
# env.unwrapped.change_to_latest_agent(start_model_copy)

In [48]:
env_eval = OthelloEnv()
env_eval = Monitor(env=env_eval)
env_eval = FlattenObservation(env_eval)

env_eval = DummyVecEnv(env_fns=[lambda: env_eval])
env_eval.envs[0].unwrapped.change_to_latest_agent(start_model_copy)

In [None]:
eval_callback = SelfPlayCallback(    
    env_eval,
    best_model_save_path=LOGDIR,
    log_path=LOGDIR,
    eval_freq=EVAL_FREQ,
    n_eval_episodes=EVAL_EPISODES,
    deterministic=False 
    )



model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

Ep done - 7550.
Ep done - 7560.
Ep done - 7570.
Ep done - 7580.
Ep done - 7590.
Ep done - 7600.
Ep done - 7610.
Ep done - 7620.
Ep done - 7630.
Ep done - 7640.
Ep done - 7650.
Ep done - 7660.
Ep done - 7670.
Ep done - 7680.
Ep done - 7690.
Ep done - 7700.
Ep done - 7710.
Ep done - 7720.
Ep done - 7730.
Ep done - 7740.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.55     |
| time/              |          |
|    fps             | 675      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 6144     |
---------------------------------
Ep done - 7750.
Ep done - 7760.
Ep done - 7770.
Ep done - 7780.
Ep done - 7790.
Ep done - 7800.
Ep done - 7810.
Ep done - 7820.
Ep done - 7830.
Ep done - 7840.
Ep done - 7850.
Ep done - 7860.
Ep done - 7870.
Ep done - 10.
Ep done - 20.
Ep done - 30.
Ep done - 40.
Ep done - 50.
Ep done - 60.
Ep done - 70.
Ep done - 80.
Ep done - 90.
Ep don

New best mean reward!
SELFPLAY: mean_reward achieved: 0.28
SELFPLAY: new best model, bumping up generation to 36
Ep done - 15570.
Ep done - 15580.
Ep done - 15590.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.2     |
|    ep_rew_mean     | 0.31     |
| time/              |          |
|    fps             | 153      |
|    iterations      | 230      |
|    time_elapsed    | 3060     |
|    total_timesteps | 471040   |
---------------------------------
Ep done - 15600.
Ep done - 15610.
Ep done - 15620.
Ep done - 15630.
Ep done - 15640.
Ep done - 15650.
Ep done - 15660.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.3        |
|    ep_rew_mean          | 0.42        |
| time/                   |             |
|    fps                  | 154         |
|    iterations           | 231         |
|    time_elapsed         | 3069        |
|    total_timesteps      | 473088      |

Ep done - 16150.
Ep done - 16160.
Ep done - 16170.
Ep done - 16180.
Ep done - 16190.
Ep done - 16200.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.26        |
| time/                   |             |
|    fps                  | 155         |
|    iterations           | 239         |
|    time_elapsed         | 3152        |
|    total_timesteps      | 489472      |
| train/                  |             |
|    approx_kl            | 0.029371463 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.448      |
|    explained_variance   | 0.185       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0133     |
|    n_updates            | 2380        |
|    policy_gradient_loss | -0.034      |
|    value_loss           | 0.15        |
-----------------------------------------
Ep done - 16210.

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.54        |
| time/                   |             |
|    fps                  | 155         |
|    iterations           | 246         |
|    time_elapsed         | 3235        |
|    total_timesteps      | 503808      |
| train/                  |             |
|    approx_kl            | 0.036805235 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.391      |
|    explained_variance   | 0.373       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0109     |
|    n_updates            | 2450        |
|    policy_gradient_loss | -0.035      |
|    value_loss           | 0.112       |
-----------------------------------------
Ep done - 16690.
Ep done - 16700.
Ep done - 16710.
Ep done - 16720.
Ep done - 16730.
Ep done - 16740.
----------------

Ep done - 17160.
Ep done - 17170.
Ep done - 17180.
Ep done - 17190.
Ep done - 17200.
Ep done - 17210.
Ep done - 5110.
Ep done - 5120.
Ep done - 5130.
Ep done - 5140.
Ep done - 5150.
Ep done - 5160.
Ep done - 5170.
Ep done - 5180.
Ep done - 5190.
Ep done - 5200.
Eval num_timesteps=520000, episode_reward=0.57 +/- 0.80
Episode length: 30.28 +/- 0.47
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30.3        |
|    mean_reward          | 0.57        |
| time/                   |             |
|    total_timesteps      | 520000      |
| train/                  |             |
|    approx_kl            | 0.029928576 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.375      |
|    explained_variance   | 0.275       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00441     |
|    n_updates            | 2530        |
|    policy_gradient_

Ep done - 17680.
Ep done - 17690.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.4       |
|    ep_rew_mean          | 0.73       |
| time/                   |            |
|    fps                  | 157        |
|    iterations           | 261        |
|    time_elapsed         | 3403       |
|    total_timesteps      | 534528     |
| train/                  |            |
|    approx_kl            | 0.03239207 |
|    clip_fraction        | 0.159      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.389     |
|    explained_variance   | 0.471      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0359    |
|    n_updates            | 2600       |
|    policy_gradient_loss | -0.0298    |
|    value_loss           | 0.0558     |
----------------------------------------
Ep done - 17700.
Ep done - 17710.
Ep done - 17720.
Ep done - 17730.
Ep done - 17740.
Ep done - 17750.
Ep 

Ep done - 18180.
Ep done - 18190.
Ep done - 18200.
Ep done - 18210.
Ep done - 5410.
Ep done - 5420.
Ep done - 5430.
Ep done - 5440.
Ep done - 5450.
Ep done - 5460.
Ep done - 5470.
Ep done - 5480.
Ep done - 5490.
Ep done - 5500.
Eval num_timesteps=550000, episode_reward=0.65 +/- 0.74
Episode length: 30.22 +/- 0.50
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 30.2       |
|    mean_reward          | 0.65       |
| time/                   |            |
|    total_timesteps      | 550000     |
| train/                  |            |
|    approx_kl            | 0.03591458 |
|    clip_fraction        | 0.166      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.373     |
|    explained_variance   | 0.341      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0262     |
|    n_updates            | 2680       |
|    policy_gradient_loss | -0.0293    |
|    value_loss           | 0

Ep done - 18720.
Ep done - 18730.
Ep done - 18740.
Ep done - 18750.
Ep done - 18760.
Ep done - 18770.
Ep done - 18780.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.69        |
| time/                   |             |
|    fps                  | 158         |
|    iterations           | 277         |
|    time_elapsed         | 3577        |
|    total_timesteps      | 567296      |
| train/                  |             |
|    approx_kl            | 0.032685764 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.357      |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00193     |
|    n_updates            | 2760        |
|    policy_gradient_loss | -0.0253     |
|    value_loss           | 0.0791      |
-----------------------------------------

New best mean reward!
SELFPLAY: mean_reward achieved: 0.73
SELFPLAY: new best model, bumping up generation to 47
Ep done - 19210.
Ep done - 19220.
Ep done - 19230.
Ep done - 19240.
Ep done - 19250.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.61     |
| time/              |          |
|    fps             | 158      |
|    iterations      | 284      |
|    time_elapsed    | 3660     |
|    total_timesteps | 581632   |
---------------------------------
Ep done - 19260.
Ep done - 19270.
Ep done - 19280.
Ep done - 19290.
Ep done - 19300.
Ep done - 19310.
Ep done - 19320.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.3        |
|    ep_rew_mean          | 0.57        |
| time/                   |             |
|    fps                  | 159         |
|    iterations           | 285         |
|    time_elapsed         | 3669        |
|    to

Ep done - 19810.
Ep done - 19820.
Ep done - 19830.
Ep done - 19840.
Ep done - 19850.
Ep done - 19860.
Ep done - 5910.
Ep done - 5920.
Ep done - 5930.
Ep done - 5940.
Ep done - 5950.
Ep done - 5960.
Ep done - 5970.
Ep done - 5980.
Ep done - 5990.
Ep done - 6000.
Eval num_timesteps=600000, episode_reward=0.49 +/- 0.85
Episode length: 30.28 +/- 0.51
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30.3        |
|    mean_reward          | 0.49        |
| time/                   |             |
|    total_timesteps      | 600000      |
| train/                  |             |
|    approx_kl            | 0.044609495 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.325      |
|    explained_variance   | 0.349       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0364      |
|    n_updates            | 2920        |
|    policy_gradient_

Ep done - 20350.
Ep done - 20360.
Ep done - 20370.
Ep done - 20380.
Ep done - 20390.
Ep done - 20400.
Ep done - 20410.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.54       |
| time/                   |            |
|    fps                  | 160        |
|    iterations           | 301        |
|    time_elapsed         | 3844       |
|    total_timesteps      | 616448     |
| train/                  |            |
|    approx_kl            | 0.03403146 |
|    clip_fraction        | 0.142      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.294     |
|    explained_variance   | 0.258      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.00674    |
|    n_updates            | 3000       |
|    policy_gradient_loss | -0.0262    |
|    value_loss           | 0.125      |
----------------------------------------
Ep done - 20420.
Ep 

New best mean reward!
SELFPLAY: mean_reward achieved: 0.5
SELFPLAY: new best model, bumping up generation to 52
Ep done - 20860.
Ep done - 20870.
Ep done - 20880.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.3     |
|    ep_rew_mean     | 0.64     |
| time/              |          |
|    fps             | 160      |
|    iterations      | 308      |
|    time_elapsed    | 3927     |
|    total_timesteps | 630784   |
---------------------------------
Ep done - 20890.
Ep done - 20900.
Ep done - 20910.
Ep done - 20920.
Ep done - 20930.
Ep done - 20940.
Ep done - 20950.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.63        |
| time/                   |             |
|    fps                  | 160         |
|    iterations           | 309         |
|    time_elapsed         | 3936        |
|    total_timesteps      | 632832      |


Ep done - 21430.
Ep done - 21440.
Ep done - 21450.
Ep done - 21460.
Ep done - 21470.
Ep done - 21480.
Ep done - 21490.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.43        |
| time/                   |             |
|    fps                  | 161         |
|    iterations           | 317         |
|    time_elapsed         | 4019        |
|    total_timesteps      | 649216      |
| train/                  |             |
|    approx_kl            | 0.044958454 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.307      |
|    explained_variance   | 0.325       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0311      |
|    n_updates            | 3160        |
|    policy_gradient_loss | -0.0268     |
|    value_loss           | 0.13        |
-----------------------------------------

Ep done - 21960.
Ep done - 21970.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.3       |
|    ep_rew_mean          | 0.65       |
| time/                   |            |
|    fps                  | 161        |
|    iterations           | 324        |
|    time_elapsed         | 4101       |
|    total_timesteps      | 663552     |
| train/                  |            |
|    approx_kl            | 0.03811264 |
|    clip_fraction        | 0.143      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.3       |
|    explained_variance   | 0.298      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0053    |
|    n_updates            | 3230       |
|    policy_gradient_loss | -0.0248    |
|    value_loss           | 0.131      |
----------------------------------------
Ep done - 21980.
Ep done - 21990.
Ep done - 22000.
Ep done - 22010.
Ep done - 22020.
Ep done - 22030.
---

Ep done - 22450.
Ep done - 22460.
Ep done - 22470.
Ep done - 22480.
Ep done - 22490.
Ep done - 22500.
Ep done - 22510.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.71        |
| time/                   |             |
|    fps                  | 162         |
|    iterations           | 332         |
|    time_elapsed         | 4184        |
|    total_timesteps      | 679936      |
| train/                  |             |
|    approx_kl            | 0.041124206 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.291      |
|    explained_variance   | 0.3         |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0357      |
|    n_updates            | 3310        |
|    policy_gradient_loss | -0.022      |
|    value_loss           | 0.124       |
-----------------------------------------

Ep done - 22990.
Ep done - 23000.
Ep done - 23010.
Ep done - 23020.
Ep done - 23030.
Ep done - 23040.
Ep done - 23050.
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 30.1      |
|    ep_rew_mean          | 0.65      |
| time/                   |           |
|    fps                  | 162       |
|    iterations           | 340       |
|    time_elapsed         | 4276      |
|    total_timesteps      | 696320    |
| train/                  |           |
|    approx_kl            | 0.0529887 |
|    clip_fraction        | 0.152     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.278    |
|    explained_variance   | 0.263     |
|    learning_rate        | 0.0003    |
|    loss                 | 0.0262    |
|    n_updates            | 3390      |
|    policy_gradient_loss | -0.026    |
|    value_loss           | 0.124     |
---------------------------------------
Ep done - 23060.
Ep done - 23070.
Ep done

New best mean reward!
SELFPLAY: mean_reward achieved: 0.69
SELFPLAY: new best model, bumping up generation to 60
Ep done - 23520.
Ep done - 23530.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.59     |
| time/              |          |
|    fps             | 163      |
|    iterations      | 347      |
|    time_elapsed    | 4358     |
|    total_timesteps | 710656   |
---------------------------------
Ep done - 23540.
Ep done - 23550.
Ep done - 23560.
Ep done - 23570.
Ep done - 23580.
Ep done - 23590.
Ep done - 23600.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.68       |
| time/                   |            |
|    fps                  | 163        |
|    iterations           | 348        |
|    time_elapsed         | 4367       |
|    total_timesteps      | 712704     |
| train/                 

Ep done - 24080.
Ep done - 24090.
Ep done - 24100.
Ep done - 24110.
Ep done - 24120.
Ep done - 24130.
Ep done - 24140.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.63        |
| time/                   |             |
|    fps                  | 163         |
|    iterations           | 356         |
|    time_elapsed         | 4450        |
|    total_timesteps      | 729088      |
| train/                  |             |
|    approx_kl            | 0.037916128 |
|    clip_fraction        | 0.151       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.286      |
|    explained_variance   | 0.316       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0591      |
|    n_updates            | 3550        |
|    policy_gradient_loss | -0.0254     |
|    value_loss           | 0.152       |
-----------------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.54        |
| time/                   |             |
|    fps                  | 163         |
|    iterations           | 363         |
|    time_elapsed         | 4533        |
|    total_timesteps      | 743424      |
| train/                  |             |
|    approx_kl            | 0.041021675 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.307      |
|    explained_variance   | 0.266       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0171      |
|    n_updates            | 3620        |
|    policy_gradient_loss | -0.0251     |
|    value_loss           | 0.144       |
-----------------------------------------
Ep done - 24620.
Ep done - 24630.
Ep done - 24640.
Ep done - 24650.
Ep done - 24660.
Ep done - 24670.
Ep done - 24680.

Ep done - 25100.
Ep done - 25110.
Ep done - 25120.
Ep done - 25130.
Ep done - 25140.
Ep done - 25150.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.68        |
| time/                   |             |
|    fps                  | 164         |
|    iterations           | 371         |
|    time_elapsed         | 4616        |
|    total_timesteps      | 759808      |
| train/                  |             |
|    approx_kl            | 0.045327045 |
|    clip_fraction        | 0.152       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.28       |
|    explained_variance   | 0.316       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0351      |
|    n_updates            | 3700        |
|    policy_gradient_loss | -0.0265     |
|    value_loss           | 0.116       |
-----------------------------------------
Ep done - 25160.

Ep done - 25630.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | 0.56       |
| time/                   |            |
|    fps                  | 164        |
|    iterations           | 378        |
|    time_elapsed         | 4699       |
|    total_timesteps      | 774144     |
| train/                  |            |
|    approx_kl            | 0.05117338 |
|    clip_fraction        | 0.151      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.284     |
|    explained_variance   | 0.495      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.00643   |
|    n_updates            | 3770       |
|    policy_gradient_loss | -0.0268    |
|    value_loss           | 0.125      |
----------------------------------------
Ep done - 25640.
Ep done - 25650.
Ep done - 25660.
Ep done - 25670.
Ep done - 25680.
Ep done - 25690.
Ep done - 25700.
---

Ep done - 26110.
Ep done - 26120.
Ep done - 26130.
Ep done - 26140.
Ep done - 26150.
Ep done - 7810.
Ep done - 7820.
Ep done - 7830.
Ep done - 7840.
Ep done - 7850.
Ep done - 7860.
Ep done - 7870.
Ep done - 7880.
Ep done - 7890.
Ep done - 7900.
Eval num_timesteps=790000, episode_reward=0.56 +/- 0.82
Episode length: 30.19 +/- 0.48
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 30.2       |
|    mean_reward          | 0.56       |
| time/                   |            |
|    total_timesteps      | 790000     |
| train/                  |            |
|    approx_kl            | 0.04302481 |
|    clip_fraction        | 0.149      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.249     |
|    explained_variance   | 0.393      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0273     |
|    n_updates            | 3850       |
|    policy_gradient_loss | -0.0255    |
|    value_l

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.45        |
| time/                   |             |
|    fps                  | 165         |
|    iterations           | 393         |
|    time_elapsed         | 4865        |
|    total_timesteps      | 804864      |
| train/                  |             |
|    approx_kl            | 0.041192144 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.254      |
|    explained_variance   | 0.316       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0264      |
|    n_updates            | 3920        |
|    policy_gradient_loss | -0.0221     |
|    value_loss           | 0.18        |
-----------------------------------------
Ep done - 26660.
Ep done - 26670.
Ep done - 26680.
Ep done - 26690.
Ep done - 26700.
Ep done - 26710.
----------------

Ep done - 27130.
Ep done - 27140.
Ep done - 27150.
Ep done - 8110.
Ep done - 8120.
Ep done - 8130.
Ep done - 8140.
Ep done - 8150.
Ep done - 8160.
Ep done - 8170.
Ep done - 8180.
Ep done - 8190.
Ep done - 8200.
Eval num_timesteps=820000, episode_reward=0.62 +/- 0.77
Episode length: 30.22 +/- 0.54
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30.2        |
|    mean_reward          | 0.62        |
| time/                   |             |
|    total_timesteps      | 820000      |
| train/                  |             |
|    approx_kl            | 0.040978573 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.245      |
|    explained_variance   | 0.323       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0687      |
|    n_updates            | 4000        |
|    policy_gradient_loss | -0.0219     |
|    value_loss           | 0.

Ep done - 27670.
Ep done - 27680.
Ep done - 27690.
Ep done - 27700.
Ep done - 27710.
Ep done - 27720.
Ep done - 27730.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.78       |
| time/                   |            |
|    fps                  | 166        |
|    iterations           | 409        |
|    time_elapsed         | 5041       |
|    total_timesteps      | 837632     |
| train/                  |            |
|    approx_kl            | 0.04047399 |
|    clip_fraction        | 0.145      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.262     |
|    explained_variance   | 0.316      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0153    |
|    n_updates            | 4080       |
|    policy_gradient_loss | -0.0229    |
|    value_loss           | 0.0849     |
----------------------------------------
Ep done - 27740.
Ep 

New best mean reward!
SELFPLAY: mean_reward achieved: 0.71
SELFPLAY: new best model, bumping up generation to 74
Ep done - 28150.
Ep done - 28160.
Ep done - 28170.
Ep done - 28180.
Ep done - 28190.
Ep done - 28200.
Ep done - 28210.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.2     |
|    ep_rew_mean     | 0.64     |
| time/              |          |
|    fps             | 166      |
|    iterations      | 416      |
|    time_elapsed    | 5126     |
|    total_timesteps | 851968   |
---------------------------------
Ep done - 28220.
Ep done - 28230.
Ep done - 28240.
Ep done - 28250.
Ep done - 28260.
Ep done - 28270.
Ep done - 28280.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.43        |
| time/                   |             |
|    fps                  | 166         |
|    iterations           | 417         |
|    time_elaps

Ep done - 28750.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.71       |
| time/                   |            |
|    fps                  | 166        |
|    iterations           | 424        |
|    time_elapsed         | 5209       |
|    total_timesteps      | 868352     |
| train/                  |            |
|    approx_kl            | 0.04622655 |
|    clip_fraction        | 0.133      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.22      |
|    explained_variance   | 0.209      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.00651   |
|    n_updates            | 4230       |
|    policy_gradient_loss | -0.0186    |
|    value_loss           | 0.106      |
----------------------------------------
Ep done - 28760.
Ep done - 28770.
Ep done - 28780.
Ep done - 28790.
Ep done - 28800.
Ep done - 28810.
Ep done - 8610.
Ep d

Ep done - 29240.
Ep done - 29250.
Ep done - 29260.
Ep done - 29270.
Ep done - 29280.
Ep done - 29290.
Ep done - 29300.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29.6        |
|    ep_rew_mean          | 0.35        |
| time/                   |             |
|    fps                  | 166         |
|    iterations           | 432         |
|    time_elapsed         | 5302        |
|    total_timesteps      | 884736      |
| train/                  |             |
|    approx_kl            | 0.037188195 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.228      |
|    explained_variance   | 0.228       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0351      |
|    n_updates            | 4310        |
|    policy_gradient_loss | -0.0213     |
|    value_loss           | 0.175       |
-----------------------------------------

Ep done - 29800.
Ep done - 29810.
Ep done - 29820.
Ep done - 8910.
Ep done - 8920.
Ep done - 8930.
Ep done - 8940.
Ep done - 8950.
Ep done - 8960.
Ep done - 8970.
Ep done - 8980.
Ep done - 8990.
Ep done - 9000.
Eval num_timesteps=900000, episode_reward=0.57 +/- 0.82
Episode length: 28.61 +/- 5.41
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 28.6        |
|    mean_reward          | 0.57        |
| time/                   |             |
|    total_timesteps      | 900000      |
| train/                  |             |
|    approx_kl            | 0.044796735 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.214      |
|    explained_variance   | 0.364       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0423      |
|    n_updates            | 4390        |
|    policy_gradient_loss | -0.0212     |
|    value_loss           | 0.

Ep done - 30340.
Ep done - 30350.
Ep done - 30360.
Ep done - 30370.
Ep done - 30380.
Ep done - 30390.
Ep done - 30400.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30         |
|    ep_rew_mean          | 0.34       |
| time/                   |            |
|    fps                  | 167        |
|    iterations           | 448        |
|    time_elapsed         | 5479       |
|    total_timesteps      | 917504     |
| train/                  |            |
|    approx_kl            | 0.03981182 |
|    clip_fraction        | 0.143      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.255     |
|    explained_variance   | 0.337      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0227     |
|    n_updates            | 4470       |
|    policy_gradient_loss | -0.025     |
|    value_loss           | 0.216      |
----------------------------------------
Ep done - 30410.
Ep 

New best mean reward!
SELFPLAY: mean_reward achieved: 0.39
SELFPLAY: new best model, bumping up generation to 82
Ep done - 30830.
Ep done - 30840.
Ep done - 30850.
Ep done - 30860.
Ep done - 30870.
Ep done - 30880.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 29.8     |
|    ep_rew_mean     | 0.43     |
| time/              |          |
|    fps             | 167      |
|    iterations      | 455      |
|    time_elapsed    | 5562     |
|    total_timesteps | 931840   |
---------------------------------
Ep done - 30890.
Ep done - 30900.
Ep done - 30910.
Ep done - 30920.
Ep done - 30930.
Ep done - 30940.
Ep done - 30950.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 29.6       |
|    ep_rew_mean          | 0.34       |
| time/                   |            |
|    fps                  | 167        |
|    iterations           | 456        |
|    time_elapsed         | 5572       

Ep done - 31430.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.52       |
| time/                   |            |
|    fps                  | 167        |
|    iterations           | 463        |
|    time_elapsed         | 5646       |
|    total_timesteps      | 948224     |
| train/                  |            |
|    approx_kl            | 0.03817923 |
|    clip_fraction        | 0.144      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.256     |
|    explained_variance   | 0.248      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0538     |
|    n_updates            | 4620       |
|    policy_gradient_loss | -0.0255    |
|    value_loss           | 0.207      |
----------------------------------------
Ep done - 31440.
Ep done - 31450.
Ep done - 31460.
Ep done - 31470.
Ep done - 31480.
Ep done - 9410.
Ep done - 9420.
Ep do

Ep done - 31910.
Ep done - 31920.
Ep done - 31930.
Ep done - 31940.
Ep done - 31950.
Ep done - 31960.
Ep done - 31970.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.3       |
|    ep_rew_mean          | 0.55       |
| time/                   |            |
|    fps                  | 168        |
|    iterations           | 471        |
|    time_elapsed         | 5739       |
|    total_timesteps      | 964608     |
| train/                  |            |
|    approx_kl            | 0.04042045 |
|    clip_fraction        | 0.142      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.26      |
|    explained_variance   | 0.384      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0404     |
|    n_updates            | 4700       |
|    policy_gradient_loss | -0.0268    |
|    value_loss           | 0.136      |
----------------------------------------
Ep done - 31980.
Ep 

Ep done - 32450.
Ep done - 32460.
Ep done - 32470.
Ep done - 32480.
Ep done - 9710.
Ep done - 9720.
Ep done - 9730.
Ep done - 9740.
Ep done - 9750.
Ep done - 9760.
Ep done - 9770.
Ep done - 9780.
Ep done - 9790.
Ep done - 9800.
Eval num_timesteps=980000, episode_reward=0.58 +/- 0.80
Episode length: 30.17 +/- 0.45
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30.2        |
|    mean_reward          | 0.58        |
| time/                   |             |
|    total_timesteps      | 980000      |
| train/                  |             |
|    approx_kl            | 0.045215447 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.241      |
|    explained_variance   | 0.385       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0432      |
|    n_updates            | 4780        |
|    policy_gradient_loss | -0.0244     |
|    value_lo

Ep done - 32990.
Ep done - 33000.
Ep done - 33010.
Ep done - 33020.
Ep done - 33030.
Ep done - 33040.
Ep done - 33050.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.44        |
| time/                   |             |
|    fps                  | 168         |
|    iterations           | 487         |
|    time_elapsed         | 5915        |
|    total_timesteps      | 997376      |
| train/                  |             |
|    approx_kl            | 0.039672844 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.272      |
|    explained_variance   | 0.427       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0303      |
|    n_updates            | 4860        |
|    policy_gradient_loss | -0.027      |
|    value_loss           | 0.126       |
-----------------------------------------

New best mean reward!
SELFPLAY: mean_reward achieved: 0.61
SELFPLAY: new best model, bumping up generation to 90
Ep done - 33480.
Ep done - 33490.
Ep done - 33500.
Ep done - 33510.
Ep done - 33520.
Ep done - 33530.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.53     |
| time/              |          |
|    fps             | 168      |
|    iterations      | 494      |
|    time_elapsed    | 5998     |
|    total_timesteps | 1011712  |
---------------------------------
Ep done - 33540.
Ep done - 33550.
Ep done - 33560.
Ep done - 33570.
Ep done - 33580.
Ep done - 33590.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.49        |
| time/                   |             |
|    fps                  | 168         |
|    iterations           | 495         |
|    time_elapsed         | 6007        |
|    to

Ep done - 34080.
Ep done - 34090.
Ep done - 34100.
Ep done - 34110.
Ep done - 34120.
Ep done - 34130.
Ep done - 10210.
Ep done - 10220.
Ep done - 10230.
Ep done - 10240.
Ep done - 10250.
Ep done - 10260.
Ep done - 10270.
Ep done - 10280.
Ep done - 10290.
Ep done - 10300.
Eval num_timesteps=1030000, episode_reward=0.68 +/- 0.72
Episode length: 30.23 +/- 0.49
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 30.2       |
|    mean_reward          | 0.68       |
| time/                   |            |
|    total_timesteps      | 1030000    |
| train/                  |            |
|    approx_kl            | 0.03874775 |
|    clip_fraction        | 0.141      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.252     |
|    explained_variance   | 0.176      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.00963    |
|    n_updates            | 5020       |
|    policy_gradient_loss

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.53        |
| time/                   |             |
|    fps                  | 169         |
|    iterations           | 510         |
|    time_elapsed         | 6175        |
|    total_timesteps      | 1044480     |
| train/                  |             |
|    approx_kl            | 0.045740172 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.238      |
|    explained_variance   | 0.3         |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00204    |
|    n_updates            | 5090        |
|    policy_gradient_loss | -0.022      |
|    value_loss           | 0.129       |
-----------------------------------------
Ep done - 34620.
Ep done - 34630.
Ep done - 34640.
Ep done - 34650.
Ep done - 34660.
Ep done - 34670.
Ep done - 34680.

Ep done - 35100.
Ep done - 35110.
Ep done - 35120.
Ep done - 35130.
Ep done - 10510.
Ep done - 10520.
Ep done - 10530.
Ep done - 10540.
Ep done - 10550.
Ep done - 10560.
Ep done - 10570.
Ep done - 10580.
Ep done - 10590.
Ep done - 10600.
Eval num_timesteps=1060000, episode_reward=0.50 +/- 0.85
Episode length: 30.11 +/- 0.47
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30.1        |
|    mean_reward          | 0.5         |
| time/                   |             |
|    total_timesteps      | 1060000     |
| train/                  |             |
|    approx_kl            | 0.060327254 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.255      |
|    explained_variance   | 0.433       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0271      |
|    n_updates            | 5170        |
|    policy_gradient_loss | -0.0266     |
| 

Ep done - 35630.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.46        |
| time/                   |             |
|    fps                  | 169         |
|    iterations           | 525         |
|    time_elapsed         | 6343        |
|    total_timesteps      | 1075200     |
| train/                  |             |
|    approx_kl            | 0.050280638 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.224      |
|    explained_variance   | 0.224       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0228      |
|    n_updates            | 5240        |
|    policy_gradient_loss | -0.0223     |
|    value_loss           | 0.137       |
-----------------------------------------
Ep done - 35640.
Ep done - 35650.
Ep done - 35660.
Ep done - 35670.
Ep done - 35680.
Ep done - 35690.

Ep done - 36120.
Ep done - 10810.
Ep done - 10820.
Ep done - 10830.
Ep done - 10840.
Ep done - 10850.
Ep done - 10860.
Ep done - 10870.
Ep done - 10880.
Ep done - 10890.
Ep done - 10900.
Eval num_timesteps=1090000, episode_reward=0.80 +/- 0.58
Episode length: 30.15 +/- 0.46
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30.1        |
|    mean_reward          | 0.8         |
| time/                   |             |
|    total_timesteps      | 1090000     |
| train/                  |             |
|    approx_kl            | 0.050621454 |
|    clip_fraction        | 0.0956      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.145      |
|    explained_variance   | 0.291       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0304      |
|    n_updates            | 5320        |
|    policy_gradient_loss | -0.0183     |
|    value_loss           | 0.11        |
-----------

Ep done - 36650.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.62        |
| time/                   |             |
|    fps                  | 169         |
|    iterations           | 540         |
|    time_elapsed         | 6512        |
|    total_timesteps      | 1105920     |
| train/                  |             |
|    approx_kl            | 0.052182376 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.195      |
|    explained_variance   | 0.185       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0805      |
|    n_updates            | 5390        |
|    policy_gradient_loss | -0.0158     |
|    value_loss           | 0.128       |
-----------------------------------------
Ep done - 36660.
Ep done - 36670.
Ep done - 36680.
Ep done - 36690.
Ep done - 36700.
Ep done - 36710.

New best mean reward!
SELFPLAY: mean_reward achieved: 0.78
SELFPLAY: new best model, bumping up generation to 101
Ep done - 37130.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.61     |
| time/              |          |
|    fps             | 169      |
|    iterations      | 547      |
|    time_elapsed    | 6595     |
|    total_timesteps | 1120256  |
---------------------------------
Ep done - 37140.
Ep done - 37150.
Ep done - 37160.
Ep done - 37170.
Ep done - 37180.
Ep done - 37190.
Ep done - 37200.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.73        |
| time/                   |             |
|    fps                  | 169         |
|    iterations           | 548         |
|    time_elapsed         | 6605        |
|    total_timesteps      | 1122304     |
| train/                  |     

Ep done - 37680.
Ep done - 37690.
Ep done - 37700.
Ep done - 37710.
Ep done - 37720.
Ep done - 37730.
Ep done - 37740.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.54       |
| time/                   |            |
|    fps                  | 170        |
|    iterations           | 556        |
|    time_elapsed         | 6689       |
|    total_timesteps      | 1138688    |
| train/                  |            |
|    approx_kl            | 0.02916764 |
|    clip_fraction        | 0.0876     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.128     |
|    explained_variance   | 0.247      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0247     |
|    n_updates            | 5550       |
|    policy_gradient_loss | -0.0157    |
|    value_loss           | 0.096      |
----------------------------------------
Ep done - 37750.
Ep 

Ep done - 38220.
Ep done - 38230.
Ep done - 38240.
Ep done - 38250.
Ep done - 38260.
Ep done - 38270.
Ep done - 38280.
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.7        |
| time/                   |            |
|    fps                  | 170        |
|    iterations           | 564        |
|    time_elapsed         | 6782       |
|    total_timesteps      | 1155072    |
| train/                  |            |
|    approx_kl            | 0.03573143 |
|    clip_fraction        | 0.109      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.151     |
|    explained_variance   | -0.0294    |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0723     |
|    n_updates            | 5630       |
|    policy_gradient_loss | -0.0183    |
|    value_loss           | 0.119      |
----------------------------------------
Ep done - 38290.
Ep 

Ep done - 38770.
Ep done - 38780.
Ep done - 11610.
Ep done - 11620.
Ep done - 11630.
Ep done - 11640.
Ep done - 11650.
Ep done - 11660.
Ep done - 11670.
Ep done - 11680.
Ep done - 11690.
Ep done - 11700.
Eval num_timesteps=1170000, episode_reward=0.44 +/- 0.89
Episode length: 30.13 +/- 0.56
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 30.1        |
|    mean_reward          | 0.44        |
| time/                   |             |
|    total_timesteps      | 1170000     |
| train/                  |             |
|    approx_kl            | 0.047705427 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.198      |
|    explained_variance   | 0.14        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0237      |
|    n_updates            | 5710        |
|    policy_gradient_loss | -0.0235     |
|    value_loss           | 0.16    

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.1       |
|    ep_rew_mean          | 0.6        |
| time/                   |            |
|    fps                  | 170        |
|    iterations           | 579        |
|    time_elapsed         | 6951       |
|    total_timesteps      | 1185792    |
| train/                  |            |
|    approx_kl            | 0.03711496 |
|    clip_fraction        | 0.118      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.176     |
|    explained_variance   | 0.318      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0127     |
|    n_updates            | 5780       |
|    policy_gradient_loss | -0.0205    |
|    value_loss           | 0.109      |
----------------------------------------
Ep done - 39310.
Ep done - 39320.
Ep done - 39330.
Ep done - 39340.
Ep done - 39350.
Ep done - 39360.
Ep done - 39370.
--------------------

New best mean reward!
SELFPLAY: mean_reward achieved: 0.79
SELFPLAY: new best model, bumping up generation to 109
Ep done - 39780.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.78     |
| time/              |          |
|    fps             | 170      |
|    iterations      | 586      |
|    time_elapsed    | 7035     |
|    total_timesteps | 1200128  |
---------------------------------
Ep done - 39790.
Ep done - 39800.
Ep done - 39810.
Ep done - 39820.
Ep done - 39830.
Ep done - 39840.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.87        |
| time/                   |             |
|    fps                  | 170         |
|    iterations           | 587         |
|    time_elapsed         | 7044        |
|    total_timesteps      | 1202176     |
| train/                  |             |
|    ap

Ep done - 40330.
Ep done - 40340.
Ep done - 40350.
Ep done - 40360.
Ep done - 40370.
Ep done - 40380.
Ep done - 40390.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.55        |
| time/                   |             |
|    fps                  | 170         |
|    iterations           | 595         |
|    time_elapsed         | 7128        |
|    total_timesteps      | 1218560     |
| train/                  |             |
|    approx_kl            | 0.075213075 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.178      |
|    explained_variance   | 0.464       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0644      |
|    n_updates            | 5940        |
|    policy_gradient_loss | -0.0249     |
|    value_loss           | 0.164       |
-----------------------------------------

Ep done - 40860.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.1        |
|    ep_rew_mean          | 0.66        |
| time/                   |             |
|    fps                  | 170         |
|    iterations           | 602         |
|    time_elapsed         | 7212        |
|    total_timesteps      | 1232896     |
| train/                  |             |
|    approx_kl            | 0.041216135 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.164      |
|    explained_variance   | 0.11        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0357      |
|    n_updates            | 6010        |
|    policy_gradient_loss | -0.0209     |
|    value_loss           | 0.112       |
-----------------------------------------
Ep done - 40870.
Ep done - 40880.
Ep done - 40890.
Ep done - 40900.
Ep done - 40910.
Ep done - 40920.

Ep done - 41350.
Ep done - 41360.
Ep done - 41370.
Ep done - 41380.
Ep done - 41390.
Ep done - 41400.
Ep done - 41410.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.69        |
| time/                   |             |
|    fps                  | 171         |
|    iterations           | 610         |
|    time_elapsed         | 7296        |
|    total_timesteps      | 1249280     |
| train/                  |             |
|    approx_kl            | 0.040091306 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.154      |
|    explained_variance   | 0.339       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0225      |
|    n_updates            | 6090        |
|    policy_gradient_loss | -0.0176     |
|    value_loss           | 0.121       |
-----------------------------------------

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 30.2       |
|    ep_rew_mean          | 0.68       |
| time/                   |            |
|    fps                  | 171        |
|    iterations           | 617        |
|    time_elapsed         | 7380       |
|    total_timesteps      | 1263616    |
| train/                  |            |
|    approx_kl            | 0.04153005 |
|    clip_fraction        | 0.124      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.225     |
|    explained_variance   | 0.233      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0283     |
|    n_updates            | 6160       |
|    policy_gradient_loss | -0.0192    |
|    value_loss           | 0.119      |
----------------------------------------
Ep done - 41890.
Ep done - 41900.
Ep done - 41910.
Ep done - 41920.
Ep done - 41930.
Ep done - 41940.
Ep done - 41950.
--------------------

In [None]:
env

(4, 1)

In [145]:
# start_model_copy.predict(env.reset()[0], action_masks=env.action_masks())
# model.predict(env.reset()[0], action_masks=env.action_masks())

In [134]:
env.reset()[0]

OrderedDict([('board',
              array([[0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 1, 2, 0, 0, 0],
                     [0, 0, 0, 2, 1, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0]])),
             ('player', 1)])

In [None]:
obs = env.unwrapped.get_obs()
obs

In [131]:
new_obs = spaces.flatten(env.unwrapped.observation_space, obs)
new_obs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [133]:
for i in range(15):
    print(model.predict(new_obs))

(array(22), None)
(array(29), None)
(array(41), None)
(array(28), None)
(array(5), None)
(array(46), None)
(array(24), None)
(array(13), None)
(array(40), None)
(array(59), None)
(array(13), None)
(array(45), None)
(array(18), None)
(array(62), None)
(array(51), None)


In [115]:
new_obs

array([0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 2, 2, 2, 1, 0, 1, 0, 2, 2, 2, 2, 2,
       1, 2, 1, 0, 2, 2, 1, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0, 0, 1, 0,
       2, 2, 1, 1, 0, 2, 1, 0, 2, 2, 2, 1, 2, 0, 0, 0, 2, 0, 2, 0, 0, 1])

In [142]:
env_eval = OthelloEnv()
env_eval = Monitor(env=env_eval)
env_eval = FlattenObservation(env_eval)

env_eval = DummyVecEnv(env_fns=[lambda: env_eval])

In [147]:
model1 = MaskablePPO.load('ppo_masked_selfplay/history_00000385.zip')
model_random = MaskablePPO.load('ppo_masked_selfplay/history_00000170.zip')

In [148]:
env_eval.envs[0].unwrapped.change_to_latest_agent(model_random)

In [153]:
episode_rewards, episode_lengths = evaluate_policy(
                model1,
                env_eval,
                n_eval_episodes=100,                
                deterministic=True,
                return_episode_rewards=True,
                warn=True
            )

Ep done - 310.
Ep done - 320.
Ep done - 330.
Ep done - 340.
Ep done - 350.
Ep done - 360.
Ep done - 370.
Ep done - 380.
Ep done - 390.
Ep done - 400.


In [154]:
np.mean(episode_rewards)

-0.12