In [1]:
import gymnasium as gym
from gymnasium.envs.registration import register
import numpy as np
from pp_utils import pp_utils

# Helper object from the project's pp_utils module; later cells call
# plot_probs_heatmap on it.
pp_utils_obj = pp_utils.PP_Utils()

# Register the custom environment under the id that gym.make() is called with
# further down in this notebook.
register(id='PileupPoker-v0', entry_point='pileup_poker.pileup_poker:PileupPokerEnv')

In [None]:
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
from sb3_contrib.common.maskable.policies import MaskableMultiInputActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback

class RewardLoggingCallback(BaseCallback):
    """Callback that periodically prints the mean of the latest step rewards.

    Every ``check_freq`` calls of ``_on_step`` it averages the ``'rewards'``
    entry of ``self.locals`` and prints it next to ``self.num_timesteps``.
    """

    def __init__(self, check_freq, verbose=1):
        """Remember the reporting interval and initialise the base callback.

        Args:
            check_freq: Number of ``_on_step`` calls between two reports.
            verbose: Verbosity level forwarded unchanged to ``BaseCallback``.
        """
        super().__init__(verbose)
        self.check_freq = check_freq

    def _on_step(self) -> bool:
        """Print a reward report when the call counter hits the interval.

        Returns:
            Always ``True``.
        """
        # n_calls / num_timesteps / locals are not set in this class, so they
        # are expected to be maintained by BaseCallback.
        is_report_step = self.n_calls % self.check_freq == 0
        if not is_report_step:
            return True

        step_reward_mean = np.mean(self.locals['rewards'])
        print(f"Step: {self.num_timesteps}, Mean Reward: {step_reward_mean}")
        return True

def mask_fn(env: gym.Env) -> np.ndarray:
    """Return the current action mask of ``env`` for use with ``ActionMasker``.

    The environment handed to this function is the one that was wrapped by
    ``ActionMasker``, which in this notebook comes from ``gym.make`` and is
    therefore itself wrapped. Recent Gymnasium wrappers do not forward
    unknown attributes to the inner environment, so the mask is read from the
    unwrapped base environment instead of from the wrapper.

    Args:
        env: The (possibly wrapped) environment whose base environment
            provides ``get_action_mask()``.

    Returns:
        Whatever ``get_action_mask()`` of the base environment returns.
    """
    return env.unwrapped.get_action_mask()

# Learning-rate schedule that decays linearly with the remaining progress
def linear_schedule(initial_value):
    """Build a schedule that scales ``initial_value`` by the remaining progress.

    Args:
        initial_value: Value the schedule yields when ``progress_remaining``
            is 1.

    Returns:
        A one-argument callable mapping ``progress_remaining`` to
        ``progress_remaining * initial_value``.
    """
    def scaled_by_progress(progress_remaining):
        # progress_remaining is expected to go from 1 (start) down to 0 (end),
        # so the result shrinks from initial_value towards 0.
        current_value = progress_remaining * initial_value
        return current_value

    return scaled_by_progress

TIMESTEPS_TO_RUN = 1600000
# Integer division: the callback frequencies are step counts and are compared
# against integer call counters, so keep this an int rather than a float.
CHECK_FREQ = TIMESTEPS_TO_RUN // 10
TEST_NAME = "UsualTrainingFull_linearSchedule"

# Set up callbacks
checkpoint_callback = CheckpointCallback(save_freq=CHECK_FREQ, save_path='./models/'+TEST_NAME, name_prefix=TEST_NAME)
reward_logging_callback = RewardLoggingCallback(check_freq=CHECK_FREQ)

env = gym.make('PileupPoker-v0')
# Wrap to enable masking
env = ActionMasker(env, mask_fn)
# Masking PPO reduces the training time given there are only a limited number of good actions
# out of the total possible actions, as the game approaches an end state
model = MaskablePPO(MaskableMultiInputActorCriticPolicy, env, tensorboard_log="./ppo_poker_tensorboard", learning_rate=linear_schedule(3e-2))

# Evaluate on a separate environment instance: evaluating on the training env
# would reset and step it in the middle of a training rollout.
eval_env = ActionMasker(gym.make('PileupPoker-v0'), mask_fn)

# MaskableEvalCallback applies the action masks during evaluation; the plain
# EvalCallback evaluates without them.
eval_callback = MaskableEvalCallback(
    eval_env,
    best_model_save_path='./logs/'+ TEST_NAME +'/best_model/',
    log_path='./logs/' + TEST_NAME + '/results/',
    eval_freq=CHECK_FREQ,
    deterministic=True,
    render=False
)

model.learn(total_timesteps=TIMESTEPS_TO_RUN, callback=[reward_logging_callback, checkpoint_callback, eval_callback], tb_log_name=TEST_NAME)

# Play one game with the trained model (stochastic action selection).
obs, info = env.reset()
done = False
while not done:
    # ActionMasker.action_masks() returns mask_fn applied to the wrapped env.
    action, _states = model.predict(obs, action_masks=env.action_masks())
    # Gymnasium's step() returns (obs, reward, terminated, truncated, info);
    # the episode is over when either flag is set.
    obs, rewards, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    env.render()

print("Final reward: ",  rewards)
# Read the custom attribute from the base environment rather than relying on
# the wrappers to forward it.
print("Num moves made: ", len(env.unwrapped.actions_made_this_game))

In [None]:
# Test with random decks and deterministic output
obs, info = env.reset()
done = False
while not done:
    # ActionMasker.action_masks() returns mask_fn applied to the wrapped env.
    action, _states = model.predict(obs, action_masks=env.action_masks(), deterministic=True)
    # Gymnasium's step() returns (obs, reward, terminated, truncated, info);
    # the episode is over when either flag is set.
    obs, rewards, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    env.render()
    # NOTE(review): presumably the action probabilities left on the policy by
    # the predict() call above — confirm against sb3-contrib internals.
    pp_utils_obj.plot_probs_heatmap(model.policy.action_dist.distribution.probs[0])
print("Final reward: ",  rewards)