In [1]:
from stable_baselines3.common.env_checker import check_env
from gym_reversi import ReversiEnv

# Build a default 8x8 game (agent plays black against the random opponent) and
# let Stable-Baselines3 validate it: check_env reports problems with the custom
# environment and emits additional warnings if needed.
env = ReversiEnv(
    player_color='black',
    opponent="random",
    observation_type="numpy3c",
    illegal_place_mode="lose",
    board_size=8,
)
check_env(env)




In [3]:
import os
import time
import gymnasium as gym
import numpy as np
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecFrameStack
from stable_baselines3 import PPO
from gym_reversi import ReversiEnv


class ReversiEnvWrapper(gym.Wrapper):
    """Gymnasium wrapper that constructs its own ``ReversiEnv`` from keyword arguments.

    Because the wrapper builds the environment itself, the class can be passed
    straight to ``make_vec_env`` together with ``env_kwargs`` (as done in the
    training code of this notebook).
    """

    def __init__(self, opponent="random", is_train=True, board_size=8, is_finished_reward=True, verbose=0):
        """Create the underlying ``ReversiEnv`` and wrap it.

        Every argument is forwarded unchanged to the ``ReversiEnv`` constructor.
        """
        env = ReversiEnv(opponent=opponent, is_train=is_train, board_size=board_size,
                         is_finished_reward=is_finished_reward, verbose=verbose)
        # gym.Wrapper.__init__ already stores the wrapped env as ``self.env``,
        # so no explicit re-assignment is needed afterwards.
        super().__init__(env)

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment and return ``(observation, info)``.

        ``seed`` is forwarded to the wrapped env so that seeded resets are
        honoured (previously it was silently replaced by ``None``).
        ``options`` is accepted for Gymnasium API compatibility but is not
        forwarded: within this notebook ``ReversiEnv.reset`` is only ever
        called with ``seed`` -- TODO confirm whether it supports ``options``.
        """
        observation, info = self.env.reset(seed=seed)
        return observation, info

    def step(self, action):
        """Apply ``action`` and return the wrapped env's 5-tuple unchanged."""
        observation, reward, terminated, truncated, info = self.env.step(action)
        return observation, reward, terminated, truncated, info

    def render(self):
        """Render the wrapped env and hand back whatever it returns."""
        return self.env.render()

    def close(self):
        """Close the wrapped env."""
        self.env.close()

# Raises an error when run; not used for now.
def make_vectorized_env(env_wrapper, dumm, n):
    """Build a vectorized environment from ``n`` copies of ``env_wrapper``.

    Args:
        env_wrapper: Callable passed (repeated ``n`` times) as the list of
            environment factories.
        dumm: Truthy selects ``DummyVecEnv``; falsy selects ``SubprocVecEnv``.
        n: Number of environment factories to put in the list.

    Returns:
        The constructed ``DummyVecEnv`` or ``SubprocVecEnv`` instance.
    """
    vec_env_cls = DummyVecEnv if dumm else SubprocVecEnv
    return vec_env_cls([env_wrapper] * n)


class ReversiModelTrain(object):
    """Train a PPO agent on ``ReversiEnv`` in checkpointed chunks and play evaluation games.

    The constructor only records configuration; environments and models are
    created inside the individual methods.
    """

    def __init__(self, board_size=8, check_point_timesteps=100000, n_envs=16, model_path=None,
                 opponent_model_path="random", tensorboard_log=None):
        """Store the training configuration.

        Args:
            board_size: Board edge length passed to ``ReversiEnv``.
            check_point_timesteps: Maximum number of timesteps trained between
                two saves of the model.
            n_envs: Number of parallel environments; values above 1 use
                ``make_vec_env`` with ``ReversiEnvWrapper``.
            model_path: Path used for both ``PPO.load`` and ``model.save``.
            opponent_model_path: ``"random"`` or a path to a saved PPO model
                that acts as the opponent.
            tensorboard_log: Directory handed to PPO as ``tensorboard_log``.
        """
        self.board_size = board_size
        self.check_point_timesteps = check_point_timesteps
        self.n_envs = n_envs
        self.model_path = model_path
        self.opponent_model_path = opponent_model_path
        self.tensorboard_log = tensorboard_log

    @staticmethod
    def _load_opponent(opponent_model_path):
        """Return the opponent for ``ReversiEnv``: the string "random" or a loaded PPO model."""
        if opponent_model_path == "random":
            return "random"
        return PPO.load(opponent_model_path)

    def reversi_model_train_step(self, check_point_timesteps):
        """Train for ``check_point_timesteps`` steps and save the model to ``self.model_path``.

        An existing model at ``self.model_path`` is resumed when it can be
        loaded; otherwise a fresh PPO model is created.
        """
        opponent_model = self._load_opponent(self.opponent_model_path)

        if self.n_envs > 1:
            # multi-worker training (n_envs=4 => 4 environments)
            vec_env = make_vec_env(ReversiEnvWrapper, n_envs=self.n_envs, seed=None,
                                   env_kwargs={
                                       "opponent": opponent_model,
                                       "is_train": True,
                                       "board_size": self.board_size,
                                       "is_finished_reward": True,
                                       "verbose": 0},
                                   )
        else:
            # Only build the single plain environment when it is actually used.
            vec_env = ReversiEnv(opponent=opponent_model, is_train=True, board_size=self.board_size,
                                 is_finished_reward=True, verbose=0)

        try:
            model = PPO.load(self.model_path, env=vec_env)
        except Exception as err:
            # Best-effort resume: any load failure falls back to a new model,
            # but the cause is reported so a broken checkpoint is not silently
            # replaced by the save below.
            print(f"load model from self.model_path: {self.model_path} error: {err!r}")
            model = PPO('MlpPolicy', vec_env,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        learning_rate=2.5e-4,
                        ent_coef=0.01,
                        n_steps=64,  # n_steps=128,
                        n_epochs=4,
                        batch_size=32,  # batch_size=256,
                        gamma=0.99,
                        gae_lambda=0.95,
                        clip_range=0.1,
                        vf_coef=0.5,
                        verbose=1,
                        tensorboard_log=self.tensorboard_log)

        t0 = time.time()
        model.learn(total_timesteps=check_point_timesteps)
        model.save(self.model_path)
        print(f"train time: {time.time()-t0}")

    def reversi_model_train(self, total_timesteps=1000000):
        """Train for ``total_timesteps`` steps, saving after every checkpoint chunk.

        Each chunk is at most ``self.check_point_timesteps`` long; the final
        chunk is shortened so the requested total is not exceeded by a whole
        extra checkpoint.

        Raises:
            ValueError: If ``self.check_point_timesteps`` is smaller than 1.
        """
        chunk_size = int(self.check_point_timesteps)
        if chunk_size < 1:
            raise ValueError(
                f"check_point_timesteps must be >= 1, got {self.check_point_timesteps!r}")

        remaining = int(total_timesteps)
        while remaining > 0:
            chunk = min(remaining, chunk_size)
            self.reversi_model_train_step(chunk)
            remaining -= chunk

    def game_play(self, model_path, opponent_model_path="random", player_color='black', max_round=100):
        """Play ``max_round`` games with the model at ``model_path`` and print a running tally.

        Args:
            model_path: Path of the PPO model that chooses the agent's moves.
            opponent_model_path: ``"random"`` or a path to a saved PPO opponent.
            player_color: Colour passed to ``ReversiEnv`` for the agent.
            max_round: Number of finished games to play.
        """
        # Use the argument (not self.opponent_model_path) to decide how to
        # build the opponent, so the caller's choice is respected.
        opponent_model = self._load_opponent(opponent_model_path)

        env = ReversiEnv(opponent=opponent_model, is_train=False, board_size=self.board_size, player_color=player_color,
                         is_finished_reward=True, verbose=0)

        model = PPO.load(model_path)

        total_round = 0
        total_win = 0
        total_failure = 0
        total_equal = 0

        t0 = time.time()
        obs, info = env.reset()
        while total_round < max_round:
            action, _states = model.predict(obs, deterministic=False)
            obs, reward, terminated, truncated, info = env.step(action)

            # An episode is over on termination *or* truncation; stepping a
            # truncated episode further without a reset is not valid.
            if terminated or truncated:
                print(f"---- round:{total_round} --------")
                obs, info = env.reset()
                total_round += 1
                # The sign of the final step's reward decides the outcome.
                if reward > 0:
                    total_win += 1
                elif reward < 0:
                    total_failure += 1
                else:
                    total_equal += 1

                print(f"total_win:{total_win}, total_failure: {total_failure}, total_equal:{total_equal}\n")

        # The label says "train time" but this is the wall-clock time of the games above.
        print(f"train time: {time.time() - t0}")


# if __name__ == '__main__':

#     board_size = 8
#     check_point_timesteps = 100000
#     n_envs = 8
#     tensorboard_log = f"models/Reversi_ppo_{board_size}x{board_size}/"
#     if not os.path.isdir(tensorboard_log):
#         os.makedirs(tensorboard_log)
#     model_path = os.path.join(tensorboard_log, "model")
#     opponent_model_path = "random"

#     train_obj = ReversiModelTrain(board_size=board_size,
#                                   check_point_timesteps=check_point_timesteps,
#                                   n_envs=n_envs,
#                                   model_path=model_path,
#                                   opponent_model_path=opponent_model_path,
#                                   tensorboard_log=tensorboard_log)

#     t0 = time.time()
#     total_timesteps = 1000000
#     train_obj.reversi_model_train(total_timesteps)
#     print(f"total train time: {time.time() - t0}")


In [5]:
# Quick 4x4 training run against the random opponent.
board_size = 4
check_point_timesteps = 10000
n_envs = 8
tensorboard_log = f"models/Reversi_ppo_{board_size}x{board_size}/"
# exist_ok=True replaces the check-then-create pair, so re-running the cell
# (or a concurrent run) cannot fail on an already existing directory.
os.makedirs(tensorboard_log, exist_ok=True)
model_path = os.path.join(tensorboard_log, "model")
opponent_model_path = "random"

train_obj = ReversiModelTrain(board_size=board_size,
                              check_point_timesteps=check_point_timesteps,
                              n_envs=n_envs,
                              model_path=model_path,
                              opponent_model_path=opponent_model_path,
                              tensorboard_log=tensorboard_log)

t0 = time.time()
total_timesteps = 10000
train_obj.reversi_model_train(total_timesteps)
print(f"total train time: {time.time() - t0}")


Logging to models/Reversi_ppo_4x4/PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.87     |
|    ep_rew_mean     | -0.96    |
| time/              |          |
|    fps             | 1122     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 512      |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.08        |
|    ep_rew_mean          | -0.8        |
| time/                   |             |
|    fps                  | 613         |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 1024        |
| train/                  |             |
|    approx_kl            | 0.010857185 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.798      |
|    explained_variance   | -0.1

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.22         |
|    ep_rew_mean          | -0.68        |
| time/                   |              |
|    fps                  | 460          |
|    iterations           | 11           |
|    time_elapsed         | 12           |
|    total_timesteps      | 5632         |
| train/                  |              |
|    approx_kl            | 0.0069761574 |
|    clip_fraction        | 0.213        |
|    clip_range           | 0.1          |
|    entropy_loss         | -0.748       |
|    explained_variance   | 0.522        |
|    learning_rate        | 0.00025      |
|    loss                 | -0.0201      |
|    n_updates            | 908          |
|    policy_gradient_loss | -0.00304     |
|    value_loss           | 0.137        |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

train time: 23.397716999053955
total train time: 23.456395149230957


In [None]:
# import sys
import os
# current_path = os.getcwd()
# sys.path.append(current_path)

from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3 import PPO
import time
from gym_reversi import ReversiEnv

# Reference only (SB3 atari helpers, not used for Reversi):
# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multi-worker training (n_envs=4 => 4 environments)
# vec_env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0)
# env = make_atari_env("BreakoutNoFrameskip-v4", seed=0)
# vec_env = make_atari_env("BreakoutNoFrameskip-v4", n_envs=4,
# #                          seed=0
#                         )
# # Frame-stacking with 4 frames
# vec_env = VecFrameStack(vec_env, n_stack=4)

board_size = 4

PolicyModel = PPO
tensorboard_log = f"models/Reversi_ppo_{board_size}x{board_size}/"
os.makedirs(tensorboard_log, exist_ok=True)
model_path = os.path.join(tensorboard_log, "model")
opponent_model_path = "random"
# opponent_model_path=os.path.join(tensorboard_log, "opponent_model")
n_envs = 4

# This cell runs at module level, so it must use its own variables; the
# former `self.` references raised NameError whenever those branches ran.
if opponent_model_path != "random":
    opponent_model = PolicyModel.load(opponent_model_path)
else:
    opponent_model = "random"

if n_envs > 1:
    # multi-worker training (n_envs=4 => 4 environments)
    vec_env = make_vec_env(ReversiEnvWrapper, n_envs=n_envs, seed=None,
                           env_kwargs={
                               "opponent": opponent_model,
                               "is_train": True,
                               "board_size": board_size,
                               "is_finished_reward": True,
                               "verbose": 0},
                           )
else:
    # Only build the single plain environment when it is actually used.
    vec_env = ReversiEnv(opponent=opponent_model, is_train=True, board_size=board_size,
                         is_finished_reward=True, verbose=0)

try:
    model = PolicyModel.load(model_path, env=vec_env)
except Exception as err:
    # Best-effort resume: fall back to a fresh model, but report why the load
    # failed so a broken checkpoint is not overwritten unnoticed.
    print(f"load model from model_path: {model_path} error: {err!r}")
    model = PolicyModel('MlpPolicy', vec_env,
                        policy_kwargs=dict(net_arch=[256, 256]),
                        learning_rate=2.5e-4,
                        ent_coef=0.01,
                        n_steps=64,  # n_steps=128,
                        n_epochs=4,
                        batch_size=32,  # batch_size=256,
                        gamma=0.99,
                        gae_lambda=0.95,
                        clip_range=0.1,
                        vf_coef=0.5,
                        verbose=1,
                        tensorboard_log=tensorboard_log)

# Time only the learning + saving phase.
t0 = time.time()
model.learn(total_timesteps=100_000)
model.save(model_path)
print(f"train time: {time.time()-t0}")

# vec_env = make_atari_env("BreakoutNoFrameskip-v4", n_envs=1)
# vec_env = VecFrameStack(vec_env, n_stack=4)


Logging to models/Reversi_ppo_4x4/PPO_7
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.3      |
|    ep_rew_mean     | -0.614   |
| time/              |          |
|    fps             | 84       |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 256      |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.31         |
|    ep_rew_mean          | -0.63        |
| time/                   |              |
|    fps                  | 139          |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 512          |
| train/                  |              |
|    approx_kl            | 0.0055270786 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.1          |
|    entropy_loss         | -0.658       |
|    explained_var

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.25        |
|    ep_rew_mean          | -0.6        |
| time/                   |             |
|    fps                  | 303         |
|    iterations           | 11          |
|    time_elapsed         | 9           |
|    total_timesteps      | 2816        |
| train/                  |             |
|    approx_kl            | 0.008001774 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.657      |
|    explained_variance   | 0.614       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.166       |
|    n_updates            | 988         |
|    policy_gradient_loss | -0.00358    |
|    value_loss           | 0.301       |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.28

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.96        |
|    ep_rew_mean          | -0.66       |
| time/                   |             |
|    fps                  | 337         |
|    iterations           | 21          |
|    time_elapsed         | 15          |
|    total_timesteps      | 5376        |
| train/                  |             |
|    approx_kl            | 0.011891709 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.629      |
|    explained_variance   | 0.723       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.0773      |
|    n_updates            | 1028        |
|    policy_gradient_loss | -0.00667    |
|    value_loss           | 0.217       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.78  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.08        |
|    ep_rew_mean          | -0.52       |
| time/                   |             |
|    fps                  | 355         |
|    iterations           | 31          |
|    time_elapsed         | 22          |
|    total_timesteps      | 7936        |
| train/                  |             |
|    approx_kl            | 0.011365094 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.622      |
|    explained_variance   | 0.636       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.14        |
|    n_updates            | 1068        |
|    policy_gradient_loss | -0.00636    |
|    value_loss           | 0.291       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.91  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.44       |
|    ep_rew_mean          | -0.44      |
| time/                   |            |
|    fps                  | 364        |
|    iterations           | 41         |
|    time_elapsed         | 28         |
|    total_timesteps      | 10496      |
| train/                  |            |
|    approx_kl            | 0.00603879 |
|    clip_fraction        | 0.0918     |
|    clip_range           | 0.1        |
|    entropy_loss         | -0.464     |
|    explained_variance   | 0.604      |
|    learning_rate        | 0.00025    |
|    loss                 | 0.124      |
|    n_updates            | 1108       |
|    policy_gradient_loss | -0.00118   |
|    value_loss           | 0.344      |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.24        |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.39        |
|    ep_rew_mean          | -0.38       |
| time/                   |             |
|    fps                  | 367         |
|    iterations           | 51          |
|    time_elapsed         | 35          |
|    total_timesteps      | 13056       |
| train/                  |             |
|    approx_kl            | 0.010901889 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.578      |
|    explained_variance   | 0.779       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.136       |
|    n_updates            | 1148        |
|    policy_gradient_loss | -0.00176    |
|    value_loss           | 0.19        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.19  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.23        |
|    ep_rew_mean          | -0.42       |
| time/                   |             |
|    fps                  | 371         |
|    iterations           | 61          |
|    time_elapsed         | 42          |
|    total_timesteps      | 15616       |
| train/                  |             |
|    approx_kl            | 0.013141552 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.617      |
|    explained_variance   | 0.681       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.109       |
|    n_updates            | 1188        |
|    policy_gradient_loss | -0.00819    |
|    value_loss           | 0.259       |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.1 

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.25       |
|    ep_rew_mean          | -0.28      |
| time/                   |            |
|    fps                  | 373        |
|    iterations           | 71         |
|    time_elapsed         | 48         |
|    total_timesteps      | 18176      |
| train/                  |            |
|    approx_kl            | 0.02467528 |
|    clip_fraction        | 0.132      |
|    clip_range           | 0.1        |
|    entropy_loss         | -0.563     |
|    explained_variance   | 0.649      |
|    learning_rate        | 0.00025    |
|    loss                 | 0.144      |
|    n_updates            | 1228       |
|    policy_gradient_loss | -0.0037    |
|    value_loss           | 0.307      |
----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.23         |
|    ep_re

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.26        |
|    ep_rew_mean          | -0.32       |
| time/                   |             |
|    fps                  | 373         |
|    iterations           | 81          |
|    time_elapsed         | 55          |
|    total_timesteps      | 20736       |
| train/                  |             |
|    approx_kl            | 0.012541132 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.717      |
|    explained_variance   | 0.845       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.092       |
|    n_updates            | 1268        |
|    policy_gradient_loss | -0.00619    |
|    value_loss           | 0.125       |
-----------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 4.17      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.06        |
|    ep_rew_mean          | -0.48       |
| time/                   |             |
|    fps                  | 375         |
|    iterations           | 91          |
|    time_elapsed         | 61          |
|    total_timesteps      | 23296       |
| train/                  |             |
|    approx_kl            | 0.014139438 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.1         |
|    entropy_loss         | -0.721      |
|    explained_variance   | 0.689       |
|    learning_rate        | 0.00025     |
|    loss                 | 0.137       |
|    n_updates            | 1308        |
|    policy_gradient_loss | 0.00288     |
|    value_loss           | 0.266       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.15  

In [3]:
tensorboard --logdir ./reversi/models/Reversi_ppo/PPO_7/ --port=6016

8

In [4]:
25000/3600

6.944444444444445

In [5]:
import time
from stable_baselines3 import PPO
from gym_reversi import ReversiEnv

# Evaluate a saved PPO model as white against the random opponent on a 6x6 board.
PolicyModel = PPO

env = ReversiEnv(opponent="random", board_size=6, player_color='white',
                 is_finished_reward=True, verbose=0)

model = PolicyModel.load("models/Reversi_ppo/model", env=env)

max_round = 1000

total_round = 0
total_win = 0
total_failure = 0
total_equal = 0

t0 = time.time()
obs, info = env.reset()
while total_round < max_round:
    action, _states = model.predict(obs, deterministic=False)
    obs, reward, terminated, truncated, info = env.step(action)

    # Debugging aid: call env.render("human") here to watch individual moves.

    # An episode ends on termination *or* truncation; a truncated episode
    # must be reset too instead of being stepped further.
    if terminated or truncated:
        print(f"\n\n---- round:{total_round} --------")
        obs, info = env.reset()
        total_round += 1
        # The sign of the final step's reward decides the outcome.
        if reward > 0:
            total_win += 1
        elif reward < 0:
            total_failure += 1
        else:
            total_equal += 1

        print(f"total_win:{total_win}, total_failure: {total_failure}, total_equal:{total_equal}\n\n")

# The label says "train time" but this is the wall-clock time of the games above.
print(f"train time: {time.time()-t0}")


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


---- round:0 --------
total_win:1, total_failure: 0, total_equal:0




---- round:1 --------
total_win:2, total_failure: 0, total_equal:0




---- round:2 --------
total_win:3, total_failure: 0, total_equal:0




---- round:3 --------
total_win:4, total_failure: 0, total_equal:0




---- round:4 --------
total_win:5, total_failure: 0, total_equal:0




---- round:5 --------
total_win:6, total_failure: 0, total_equal:0




---- round:6 --------
total_win:7, total_failure: 0, total_equal:0




---- round:7 --------
total_win:8, total_failure: 0, total_equal:0




---- round:8 --------
total_win:9, total_failure: 0, total_equal:0




---- round:9 --------
total_win:10, total_failure: 0, total_equal:0




---- round:10 --------
total_win:11, total_failure: 0, total_equal:0




---- round:11 --------
total_win:12, total_failure: 0, total_equal:0




---- round:12 --------
total_win:12, total_failure: 1, total_equ



---- round:112 --------
total_win:111, total_failure: 1, total_equal:1




---- round:113 --------
total_win:112, total_failure: 1, total_equal:1




---- round:114 --------
total_win:113, total_failure: 1, total_equal:1




---- round:115 --------
total_win:114, total_failure: 1, total_equal:1




---- round:116 --------
total_win:115, total_failure: 1, total_equal:1




---- round:117 --------
total_win:116, total_failure: 1, total_equal:1




---- round:118 --------
total_win:117, total_failure: 1, total_equal:1




---- round:119 --------
total_win:118, total_failure: 1, total_equal:1




---- round:120 --------
total_win:119, total_failure: 1, total_equal:1




---- round:121 --------
total_win:120, total_failure: 1, total_equal:1




---- round:122 --------
total_win:121, total_failure: 1, total_equal:1




---- round:123 --------
total_win:122, total_failure: 1, total_equal:1




---- round:124 --------
total_win:123, total_failure: 1, total_equal:1




---- round:125 --------



---- round:228 --------
total_win:221, total_failure: 7, total_equal:1




---- round:229 --------
total_win:222, total_failure: 7, total_equal:1




---- round:230 --------
total_win:223, total_failure: 7, total_equal:1




---- round:231 --------
total_win:223, total_failure: 8, total_equal:1




---- round:232 --------
total_win:224, total_failure: 8, total_equal:1




---- round:233 --------
total_win:224, total_failure: 9, total_equal:1




---- round:234 --------
total_win:225, total_failure: 9, total_equal:1




---- round:235 --------
total_win:226, total_failure: 9, total_equal:1




---- round:236 --------
total_win:227, total_failure: 9, total_equal:1




---- round:237 --------
total_win:228, total_failure: 9, total_equal:1




---- round:238 --------
total_win:229, total_failure: 9, total_equal:1




---- round:239 --------
total_win:230, total_failure: 9, total_equal:1




---- round:240 --------
total_win:231, total_failure: 9, total_equal:1




---- round:241 --------



---- round:346 --------
total_win:330, total_failure: 14, total_equal:3




---- round:347 --------
total_win:331, total_failure: 14, total_equal:3




---- round:348 --------
total_win:332, total_failure: 14, total_equal:3




---- round:349 --------
total_win:333, total_failure: 14, total_equal:3




---- round:350 --------
total_win:334, total_failure: 14, total_equal:3




---- round:351 --------
total_win:335, total_failure: 14, total_equal:3




---- round:352 --------
total_win:336, total_failure: 14, total_equal:3




---- round:353 --------
total_win:337, total_failure: 14, total_equal:3




---- round:354 --------
total_win:338, total_failure: 14, total_equal:3




---- round:355 --------
total_win:339, total_failure: 14, total_equal:3




---- round:356 --------
total_win:340, total_failure: 14, total_equal:3




---- round:357 --------
total_win:341, total_failure: 14, total_equal:3




---- round:358 --------
total_win:342, total_failure: 14, total_equal:3




---- round



---- round:459 --------
total_win:435, total_failure: 21, total_equal:4




---- round:460 --------
total_win:436, total_failure: 21, total_equal:4




---- round:461 --------
total_win:437, total_failure: 21, total_equal:4




---- round:462 --------
total_win:438, total_failure: 21, total_equal:4




---- round:463 --------
total_win:439, total_failure: 21, total_equal:4




---- round:464 --------
total_win:440, total_failure: 21, total_equal:4




---- round:465 --------
total_win:441, total_failure: 21, total_equal:4




---- round:466 --------
total_win:441, total_failure: 21, total_equal:5




---- round:467 --------
total_win:442, total_failure: 21, total_equal:5




---- round:468 --------
total_win:443, total_failure: 21, total_equal:5




---- round:469 --------
total_win:444, total_failure: 21, total_equal:5




---- round:470 --------
total_win:445, total_failure: 21, total_equal:5




---- round:471 --------
total_win:446, total_failure: 21, total_equal:5




---- round



---- round:573 --------
total_win:543, total_failure: 26, total_equal:5




---- round:574 --------
total_win:544, total_failure: 26, total_equal:5




---- round:575 --------
total_win:545, total_failure: 26, total_equal:5




---- round:576 --------
total_win:546, total_failure: 26, total_equal:5




---- round:577 --------
total_win:547, total_failure: 26, total_equal:5




---- round:578 --------
total_win:548, total_failure: 26, total_equal:5




---- round:579 --------
total_win:549, total_failure: 26, total_equal:5




---- round:580 --------
total_win:550, total_failure: 26, total_equal:5




---- round:581 --------
total_win:551, total_failure: 26, total_equal:5




---- round:582 --------
total_win:552, total_failure: 26, total_equal:5




---- round:583 --------
total_win:553, total_failure: 26, total_equal:5




---- round:584 --------
total_win:554, total_failure: 26, total_equal:5




---- round:585 --------
total_win:555, total_failure: 26, total_equal:5




---- round



---- round:693 --------
total_win:660, total_failure: 28, total_equal:6




---- round:694 --------
total_win:661, total_failure: 28, total_equal:6




---- round:695 --------
total_win:662, total_failure: 28, total_equal:6




---- round:696 --------
total_win:663, total_failure: 28, total_equal:6




---- round:697 --------
total_win:664, total_failure: 28, total_equal:6




---- round:698 --------
total_win:665, total_failure: 28, total_equal:6




---- round:699 --------
total_win:666, total_failure: 28, total_equal:6




---- round:700 --------
total_win:667, total_failure: 28, total_equal:6




---- round:701 --------
total_win:668, total_failure: 28, total_equal:6




---- round:702 --------
total_win:669, total_failure: 28, total_equal:6




---- round:703 --------
total_win:670, total_failure: 28, total_equal:6




---- round:704 --------
total_win:671, total_failure: 28, total_equal:6




---- round:705 --------
total_win:672, total_failure: 28, total_equal:6




---- round



---- round:816 --------
total_win:772, total_failure: 37, total_equal:8




---- round:817 --------
total_win:773, total_failure: 37, total_equal:8




---- round:818 --------
total_win:774, total_failure: 37, total_equal:8




---- round:819 --------
total_win:775, total_failure: 37, total_equal:8




---- round:820 --------
total_win:776, total_failure: 37, total_equal:8




---- round:821 --------
total_win:777, total_failure: 37, total_equal:8




---- round:822 --------
total_win:778, total_failure: 37, total_equal:8




---- round:823 --------
total_win:779, total_failure: 37, total_equal:8




---- round:824 --------
total_win:780, total_failure: 37, total_equal:8




---- round:825 --------
total_win:781, total_failure: 37, total_equal:8




---- round:826 --------
total_win:782, total_failure: 37, total_equal:8




---- round:827 --------
total_win:783, total_failure: 37, total_equal:8




---- round:828 --------
total_win:784, total_failure: 37, total_equal:8




---- round



---- round:934 --------
total_win:885, total_failure: 40, total_equal:10




---- round:935 --------
total_win:886, total_failure: 40, total_equal:10




---- round:936 --------
total_win:887, total_failure: 40, total_equal:10




---- round:937 --------
total_win:888, total_failure: 40, total_equal:10




---- round:938 --------
total_win:889, total_failure: 40, total_equal:10




---- round:939 --------
total_win:890, total_failure: 40, total_equal:10




---- round:940 --------
total_win:891, total_failure: 40, total_equal:10




---- round:941 --------
total_win:892, total_failure: 40, total_equal:10




---- round:942 --------
total_win:893, total_failure: 40, total_equal:10




---- round:943 --------
total_win:894, total_failure: 40, total_equal:10




---- round:944 --------
total_win:895, total_failure: 40, total_equal:10




---- round:945 --------
total_win:896, total_failure: 40, total_equal:10




---- round:946 --------
total_win:897, total_failure: 40, total_equal:10



1

In [3]:
from gymnasium.utils import seeding


In [4]:
seeding.np_random??

In [5]:
import numpy.random as npr
# Draw a (3, 4) array of uniform samples between 4 and 7 from an unseeded
# generator and show it with the self-documenting f-string form.
size = (3, 4)
rng = npr.default_rng()
C = rng.uniform(low=4, high=7, size=size)
print(f"{C=}")


C=array([[5.43019184, 6.40250697, 6.1004084 , 5.13867582],
       [6.29431658, 5.80315648, 5.16903573, 4.87083518],
       [4.42687125, 4.45198253, 5.73728282, 6.59846583]])


In [10]:
rng.integers(2)

0

In [None]:
# Generator cannot be built without arguments; it wraps an explicit
# BitGenerator (np.random.default_rng() is the usual shortcut).
np.random.Generator(np.random.PCG64())