In [1]:
import os
import sys

# Directory that contains the project-local modules imported below (e.g. game_logic).
# Override with the REVERSI_PROJECT_ROOT environment variable on other machines;
# the default is the original author's checkout location.
PROJECT_ROOT = os.environ.get('REVERSI_PROJECT_ROOT', '/home/rasa/PycharmProjects/reversiProject/')

# Guard against stacking duplicate entries when the cell is re-run.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [16]:
import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv, sync_envs_normalization

import stable_baselines3.common.callbacks as callbacks_module
from sb3_contrib.common.maskable.evaluation import evaluate_policy as masked_evaluate_policy

# Modify the namespace of EvalCallback directly
callbacks_module.evaluate_policy = masked_evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback

# from sb3_contrib.common.maskable.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
# from stable_baselines3.common.callbacks import EvalCallback

from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.ppo_mask import MaskablePPO

from shutil import copyfile # keep track of generations
from collections import OrderedDict

from gymnasium.spaces import Discrete, Box, Dict, MultiDiscrete
from gymnasium.wrappers import FlattenObservation
import gymnasium.spaces as spaces
from game_logic import Othello
import numpy as np
import os, math
from itertools import cycle

In [17]:
# Settings -- every value a reader might want to tune lives in this cell.
SEED = 19

# Training budget and evaluation cadence (both counted in environment steps).
NUM_TIMESTEPS = 30_000_000
EVAL_FREQ = 20_000
EVAL_EPISODES = 200

# Mean evaluation reward the current model must exceed before it replaces
# the previous best version of itself as the self-play opponent.
BEST_THRESHOLD = 0.30

# Keep this False if you plan on running the full 1000 trials.
RENDER_MODE = False

# Directory that receives best-model checkpoints and evaluation logs.
LOGDIR = "ppo_masked_selfplay_3"

In [21]:
class OthelloEnv(gym.Env):
    """Gymnasium environment that lets one learning agent play Othello against a fixed opponent.

    The learning agent only ever sees its own turns: after each agent move the
    environment lets ``other_agent`` (installed via ``change_to_latest_agent``)
    move until it is the agent's turn again or the game is over.

    Observation: the flattened board followed by ``game.player_turn``
    (declared bounds: board cells in [0, 2], player in [1, 2]).
    Action: an index into the flattened board, translated to a board
    coordinate through ``action_mapping``.
    Reward: +1 / -1 on the step that ends the game with the agent winning /
    losing, 0 otherwise.
    """

    def __init__(self):
        self.game = Othello()
        self.agent_turn = 1  # which player id the learning agent controls this episode
        shape = self.game.board.shape
        n_cells = shape[0] * shape[1]
        self.action_mapping = self.am(shape)
        self.action_space = Discrete(n_cells)  # one action per board cell
        # Board cells followed by the id of the player to move.
        self.observation_space = Box(
            low=np.array([0] * n_cells + [1]),
            high=np.array([2] * (n_cells + 1)),
            dtype=int,
        )
        self.other_agent = None  # opponent policy; must be set before the first reset/step
        self.reset_othello_gen = self.reset_othello()
        self.episodes = 0  # number of finished games, used for progress printing

    def am(self, shape):
        """Build the action-index -> (row, col) table for a board of ``shape``.

        The table follows row-major order so that it agrees with
        ``board.flatten()`` and the flattened action mask, including on
        non-square boards.
        """
        rows, cols = shape
        return [divmod(n, cols) for n in range(rows * cols)]

    def reset_othello(self):
        """Yield ``(fresh_game, agent_player_id)`` forever.

        Every item is a new game in the starting position; the player id
        assigned to the learning agent alternates 1, 2, 1, 2, ...
        """
        infinite_player_turn = cycle([1, 2])
        while True:
            game = Othello()
            model_turn = next(infinite_player_turn)
            yield game, model_turn

    def change_to_latest_agent(self, agent):
        """Install ``agent`` as the opponent used for all subsequent opponent moves."""
        self.other_agent = agent

    def get_obs(self):
        """Return the flattened board with the id of the player to move appended."""
        flattened_board = self.game.board.flatten()
        return np.append(flattened_board, self.game.player_turn)

    def check_game_ended(self):
        """Return ``(reward, done)`` for the current game state.

        ``done`` is True once ``game.get_winner()`` is not None. The reward is
        +1 if the winner is the learning agent, -1 if it is the opponent and 0
        for any other winner value (presumably a draw -- confirm against
        ``Othello.get_winner``).

        Side effect: a finished game increments ``self.episodes`` and prints a
        progress line every 10th episode, so call this once per step only.
        """
        reward = 0
        done = False
        winner = self.game.get_winner()
        if winner is not None:
            self.episodes += 1
            if self.episodes % 10 == 0:
                print(f'Ep done - {self.episodes}.')

            done = True
            if winner == self.agent_turn:
                reward = 1
            elif winner == 3 - self.agent_turn:  # the opponent's player id
                reward = -1
        return reward, done

    def render(self):  # todo
        pass

    def close(self):  # todo
        pass

    def other_agent_play_move(self):
        """Let the opponent pick (deterministically, with action masking) and play one move.

        Raises:
            RuntimeError: if no opponent has been installed yet.
        """
        if self.other_agent is None:
            raise RuntimeError(
                "OthelloEnv has no opponent: call change_to_latest_agent(agent) "
                "before resetting or stepping the environment."
            )
        obs = self.get_obs()
        action, _ = self.other_agent.predict(obs,
                                             action_masks=self.action_masks(),
                                             deterministic=True)
        game_action = self.action_mapping[int(action)]
        self.game.play_move(game_action)

    def step(self, action):
        """Play the agent's ``action``, then let the opponent answer.

        Returns the Gymnasium 5-tuple ``(obs, reward, terminated, truncated, info)``;
        ``truncated`` is always False and ``info`` is always empty.
        """
        game_action = self.action_mapping[int(action)]
        self.game.play_move(game_action)

        # Self play: the opponent keeps moving for as long as the game is running
        # and it is not the agent's turn (this also covers several opponent moves
        # in a row, presumably when the agent has no move available).
        while self.game.get_winner() is None and self.game.player_turn != self.agent_turn:
            self.other_agent_play_move()

        reward, done = self.check_game_ended()
        info = {}
        truncated = False

        return self.get_obs(), reward, done, truncated, info

    def reset(self, *args, **kwargs):
        """Start a new game and return ``(obs, info)``.

        Accepts the Gymnasium keyword arguments (``seed``, ``options``); the
        seed is forwarded to ``gym.Env.reset`` so ``self.np_random`` is seeded.
        If the agent is assigned player id 2 for this game, the opponent plays
        the first move before the observation is returned.
        """
        super().reset(seed=kwargs.get("seed"))
        self.game, self.agent_turn = next(self.reset_othello_gen)
        if self.agent_turn == 2:
            self.other_agent_play_move()
        # Gymnasium requires the info element to be a dict, not None.
        return self.get_obs(), {}

    def action_masks(self):
        """Return a flat boolean mask over the action space; True marks a legal move.

        The mask is built on the 2-D board (``game.valid_moves()`` is indexed
        with board coordinates) and then flattened so that entry ``n`` lines
        up with action ``n``.
        """
        valid_moves = self.game.valid_moves()

        mask = np.zeros(self.game.board.shape, dtype=bool)
        for index in valid_moves:
            mask[index] = True
        # ndarray.flatten() returns a new array -- its result must be returned.
        return mask.flatten()

In [22]:
class SelfPlayCallback(EvalCallback):
    """EvalCallback variant that promotes the current model to be its own opponent.

    A new best model is only accepted when its mean evaluation reward beats
    BEST_THRESHOLD. When that happens the saved best model is archived as
    ``history_<generation>.zip``, loaded, and installed as the opponent in
    both the training and the evaluation environment. The best score is then
    reset to BEST_THRESHOLD, so the next generation has to beat the new
    opponent by the same margin.

    Args:
        train_env: the (non-vectorised) training env; its ``unwrapped`` env
            must provide ``change_to_latest_agent``.
        eval_env: evaluation env, forwarded to ``EvalCallback``.
        *args, **kwargs: forwarded to ``EvalCallback``.
    """

    def __init__(self, train_env, eval_env, *args, **kwargs):
        super().__init__(eval_env, *args, **kwargs)
        # EvalCallback starts from -inf; start from the threshold instead so that
        # "new best" means "beats the current opponent by BEST_THRESHOLD".
        self.best_mean_reward = BEST_THRESHOLD
        self.generation = 0
        self.train_env = train_env
        # self.eval_env is deliberately not reassigned: EvalCallback.__init__ has
        # already stored it (vectorised), which is what `.envs[0]` below relies on.

    def _on_step(self) -> bool:
        # The parent runs the (masked) evaluation and saves best_model.zip
        # whenever the mean reward exceeds self.best_mean_reward.
        result = super()._on_step()

        if result and self.best_mean_reward > BEST_THRESHOLD:
            self.generation += 1
            print("SELFPLAY: mean_reward achieved:", self.best_mean_reward)
            print("SELFPLAY: new best model, bumping up generation to", self.generation)
            # Read the checkpoint from where the parent saved it; fall back to
            # LOGDIR when no explicit save path was configured.
            save_dir = self.best_model_save_path or LOGDIR
            source_file = os.path.join(save_dir, "best_model.zip")
            backup_file = os.path.join(save_dir, "history_"+str(self.generation).zfill(4)+".zip")
            copyfile(source_file, backup_file)
            self.best_mean_reward = BEST_THRESHOLD
            # Load an independent copy so the opponent stays frozen while training continues.
            agent = self.model.load(source_file)
            self.train_env.unwrapped.change_to_latest_agent(agent)
            self.eval_env.envs[0].unwrapped.change_to_latest_agent(agent)
        return result

In [None]:
# Training environment (Monitor records episode rewards and lengths).
env = OthelloEnv()
env = Monitor(env=env)

# Seed the model so that runs are reproducible; SEED comes from the settings cell.
model = MaskablePPO(policy=MaskableActorCriticPolicy, env=env, verbose=1, seed=SEED)

# Save the untrained model inside LOGDIR; a frozen copy of it is the first opponent.
starting_model_filepath = os.path.join(LOGDIR, 'random_start_model')
# To resume from an existing checkpoint instead of starting from scratch:
# model = MaskablePPO.load(starting_model_filepath, env=env)
model.save(starting_model_filepath)

start_model_copy = MaskablePPO.load(starting_model_filepath)
env.unwrapped.change_to_latest_agent(start_model_copy)

# Separate evaluation environment, vectorised as EvalCallback expects.
# The lambda closes over its own name so rebinding env_eval cannot affect it.
eval_monitor = Monitor(env=OthelloEnv())
env_eval = DummyVecEnv(env_fns=[lambda: eval_monitor])
env_eval.envs[0].unwrapped.change_to_latest_agent(start_model_copy)

eval_callback = SelfPlayCallback(
    env,
    env_eval,
    best_model_save_path=LOGDIR,
    log_path=LOGDIR,
    eval_freq=EVAL_FREQ,
    n_eval_episodes=EVAL_EPISODES,
    deterministic=False
    )

model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

Using cpu device
Wrapping the env in a DummyVecEnv.
Ep done - 10.
Ep done - 20.
Ep done - 30.
Ep done - 40.
Ep done - 50.
Ep done - 60.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.1     |
|    ep_rew_mean     | 0.338    |
| time/              |          |
|    fps             | 509      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
Ep done - 70.
Ep done - 80.
Ep done - 90.
Ep done - 100.
Ep done - 110.
Ep done - 120.
Ep done - 130.
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.2        |
|    ep_rew_mean          | 0.46        |
| time/                   |             |
|    fps                  | 163         |
|    iterations           | 2           |
|    time_elapsed         | 25          |
|    total_timesteps      | 4096        |
| train/                  |             |
| 

In [130]:
# Inspect the current observation of the training env (bypassing the Monitor wrapper).
# NOTE(review): the saved output below is an OrderedDict with 'board'/'player' keys,
# whereas get_obs() as defined in this notebook returns a flat array -- the output
# looks stale; re-run the cell to refresh it.
obs = env.unwrapped.get_obs()
obs

OrderedDict([('board',
              array([[0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 1, 2, 0, 0, 0],
                     [0, 0, 0, 2, 1, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0]])),
             ('player', 1)])

In [131]:
# Flatten the observation according to the env's declared observation_space.
# NOTE(review): the saved output has 66 entries, but the Box observation_space
# declared in OthelloEnv has 65 -- the output appears to predate the current
# class definition; re-run to confirm.
new_obs = spaces.flatten(env.unwrapped.observation_space, obs)
new_obs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [133]:
# Query the policy 15 times on the same observation to eyeball the spread of its actions.
# NOTE(review): no action_masks are passed, so the predictions are not restricted to
# legal moves; pass action_masks=env.unwrapped.action_masks() for masked predictions.
for i in range(15):
    print(model.predict(new_obs))

(array(22), None)
(array(29), None)
(array(41), None)
(array(28), None)
(array(5), None)
(array(46), None)
(array(24), None)
(array(13), None)
(array(40), None)
(array(59), None)
(array(13), None)
(array(45), None)
(array(18), None)
(array(62), None)
(array(51), None)


In [115]:
# Display new_obs.
# NOTE(review): this cell's execution count (115) is lower than that of the cells
# above, so the saved output reflects an earlier kernel state -- re-run in order.
new_obs

array([0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 2, 2, 2, 1, 0, 1, 0, 2, 2, 2, 2, 2,
       1, 2, 1, 0, 2, 2, 1, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0, 0, 1, 0,
       2, 2, 1, 1, 0, 2, 1, 0, 2, 2, 2, 1, 2, 0, 0, 0, 2, 0, 2, 0, 0, 1])

In [147]:
# Fresh evaluation env for the manual matches below: OthelloEnv wrapped in
# Monitor and then FlattenObservation, built in a single expression.
env_eval = FlattenObservation(Monitor(env=OthelloEnv()))

# Left un-vectorised on purpose; the vectorised variant would be:
# env_eval = DummyVecEnv(env_fns=[lambda: env_eval])

In [161]:
# Load two saved checkpoints to play against each other.
# NOTE(review): both paths are hard-coded and run-specific. `model_random` is
# loaded from a history checkpoint, so the name may be misleading -- confirm.
model1 = MaskablePPO.load('history_00000385.zip')
model_random = MaskablePPO.load('ppo_masked_selfplay_2/history_00000218.zip')

In [162]:
# Install model1 as the built-in opponent of the evaluation env.
# Variant for a vectorised env_eval:
# env_eval.envs[0].unwrapped.change_to_latest_agent(model1)
env_eval.unwrapped.change_to_latest_agent(model1)

In [37]:
# Score model_random over 100 deterministic, action-masked episodes in env_eval,
# playing against whichever opponent is currently installed in that env.
evaluate_policy = masked_evaluate_policy
episode_rewards, episode_lengths = evaluate_policy(
    model_random,
    env_eval,
    n_eval_episodes=100,
    deterministic=True,
    return_episode_rewards=True,
    warn=True,
)

In [38]:
# Mean per-episode reward of the evaluation above. OthelloEnv only rewards the
# final step (+1 win, -1 loss, 0 otherwise), so this is presumably
# (wins - losses) / episodes.
np.mean(episode_rewards)

0.33

In [163]:
def my_play(env, model, episodes = 100, deterministic = False):
    """Play ``episodes`` games with ``model`` in ``env`` and count the games it wins.

    Args:
        env: a (possibly wrapped) environment whose ``unwrapped`` env exposes
            ``action_masks()``, ``game.get_winner()`` and ``agent_turn``.
        model: policy with ``predict(obs, action_masks=..., deterministic=...)``.
        episodes: number of games to play.
        deterministic: forwarded to ``model.predict``; the default (False)
            samples actions, as the original implementation did.

    Returns:
        The number of games in which the winner equals the agent's player id.
    """
    # Game state lives on the base env; reaching it through `unwrapped` does not
    # depend on wrappers forwarding unknown attributes.
    base_env = env.unwrapped
    wins = 0
    for _ in range(episodes):
        obs, _info = env.reset()
        done = False

        while not done:
            # env.render()
            action, _state = model.predict(
                obs,
                action_masks=base_env.action_masks(),
                deterministic=deterministic,
            )
            obs, _reward, terminated, truncated, _info = env.step(action)
            # Stop on truncation as well, otherwise a truncated episode would loop forever.
            done = terminated or truncated

        if base_env.game.get_winner() == base_env.agent_turn:
            wins += 1
    return wins


In [166]:
# Number of games (out of 100) won by model_random against the opponent installed in env_eval.
my_play(env_eval, model_random, 100)

51

In [176]:
# Scratch check: np.append returns the flattened board with the extra values added at the end.
# NOTE(review): `game` is read through the wrappers here; env_eval.unwrapped.game is the
# explicit form.
np.append(env_eval.game.board.flatten(), [3, 4])

array([0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4])