In [5]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda if torch.cuda.is_available() else "Not Installed")
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


CUDA Available: True
CUDA Version: 12.6
GPU Name: NVIDIA GeForce RTX 2060 SUPER


In [7]:
import numpy as np
import time
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback, CallbackList
from stable_baselines3.common.logger import configure
import sbx
from sbx import TD3 as TD3_sbx
from sbx import CrossQ as CrossQ_sbx
import hockey.hockey_env as h_env
from importlib import reload

In [8]:
np.set_printoptions(suppress=True)
reload(h_env)

<module 'hockey.hockey_env' from 'c:\\Users\\Raul\\Desktop\\hockey-env\\hockey\\hockey_env.py'>

In [9]:
class IgnoreSeedResetWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)

    def reset(self, **kwargs):
        # Discard 'seed', 'options', etc. and just call the original reset().
        return self.env.reset()
env = h_env.HockeyEnv()
env = IgnoreSeedResetWrapper(env)

## Train

In [5]:
env = h_env.HockeyEnv_BasicOpponent(weak_opponent=False)
env = IgnoreSeedResetWrapper(env)


In [6]:
model = TD3("MlpPolicy", env, verbose=1)
start = time.time()
model.learn(total_timesteps=100000)
print("Training time: ", time.time() - start)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 251      |
|    ep_rew_mean     | -25.8    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 112      |
|    time_elapsed    | 8        |
|    total_timesteps | 1004     |
| train/             |          |
|    actor_loss      | 0.506    |
|    critic_loss     | 0.00195  |
|    learning_rate   | 0.001    |
|    n_updates       | 903      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 190      |
|    ep_rew_mean     | -20.9    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 109      |
|    time_elapsed    | 13       |
|    total_timesteps | 1518     |
| train/             |          |
|    actor_loss      | 0.714    |
|    critic_loss     

In [11]:
model = TD3.load("models/td3/td3_hockey_v0.1.2")

In [12]:
eval_env = h_env.HockeyEnv_BasicOpponent(weak_opponent=False)
eval_callback = EvalCallback(
    eval_env, 
    best_model_save_path='./logs/',
    log_path='./logs/', 
    eval_freq=10000,
    deterministic=True, 
    render=False,
    n_eval_episodes=10)

checkpoint_callback = CheckpointCallback(
    save_freq=100000,             
    save_path='./checkpoints/',   
    name_prefix='td3_hockey_v0.1.3',
    verbose=1
)
callback = CallbackList([checkpoint_callback, eval_callback])

# Set up TensorBoard logging
new_logger = configure("./logs/tensorboard/", ["tensorboard"])

model.set_logger(new_logger)

In [9]:
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

In [None]:
#model = TD3.load("models/td3/td3_hockey_v0.1.1", env=env)
model.action_noise = action_noise
model.learning_rate = 0.0001
model.buffer_size = 1000000
model.learning_starts = 1000

model.learn(total_timesteps=2000000, callback=checkpoint_callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 176      |
|    ep_rew_mean     | -16.1    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 2186     |
|    time_elapsed    | 0        |
|    total_timesteps | 705      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -10.6    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 263      |
|    time_elapsed    | 6        |
|    total_timesteps | 1602     |
| train/             |          |
|    actor_loss      | 9.84     |
|    critic_loss     | 0.992    |
|    learning_rate   | 0.0001   |
|    n_updates       | 299501   |
---------------------------------
---------------------------------
| rollout/           |          |
|    

<stable_baselines3.td3.td3.TD3 at 0x202c45970e0>

In [10]:
#save model
model.save("models/td3/td3_hockey_v0.1.2")
model.save_replay_buffer("models/td3/td3_hockey_v0.1.2_replay_buffer")

### Train Defense

In [None]:
env_defense = h_env.HockeyEnv_BasicOpponent(mode=h_env.HockeyEnv.TRAIN_DEFENSE, weak_opponent=True)
env_defense = IgnoreSeedResetWrapper(env_defense)
model.action_noise = NormalActionNoise(mean=np.zeros(4), sigma=0.1 * np.ones(4))
model.set_env(env_defense)
model.learning_rate = 0.001
model.buffer_size = 1000000
model.learning_starts = 1000
model.learn(total_timesteps=1000000, callback=checkpoint_callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


## Test

### HockeyEnv_BasicOpponent

In [158]:
env = h_env.HockeyEnv_BasicOpponent(weak_opponent=False)

In [None]:
#model2 = CrossQ_sbx.load("model", env=env)
#model2 = TD3_sbx.load("model", env=env)	

In [7]:
model2 = TD3.load("td3_hockey_v0.1", env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [164]:
obs, _ = env.reset()
done = False

while not done:
    env.render()
    action, _ = model.predict(obs, deterministic=True)  # Predict action for agent 1
    #action = player2.act(obs)
    obs, reward, done, _, _ = env.step(action)

### HockeEnv 2 Agents

In [37]:
class HockeyEnv_2Agents(h_env.HockeyEnv):
  def __init__(self, agent2_model, mode=h_env.HockeyEnv.NORMAL):
    super().__init__(mode=mode, keep_mode=True)
    self.action_space = spaces.Box(-1.0, 1.0, (4,), dtype=np.float32)
    self.agent2_model = agent2_model
  
  def step(self, action_agent1):
    obs2 = self.obs_agent_two()
    action_agent2, _ = self.agent2_model.predict(obs2, deterministic=True)
    action = np.hstack([action_agent1, action_agent2])
    return super().step(action)
  
  def reset(self, one_starts=None):
    obs = super().reset(one_starts)
    return obs
    

In [165]:
model2=TD3.load("models/td3/td3_hockey_v0.1.2")

In [38]:
opponent_model = TD3.load("models/td3/td3_hockey_v0.1.1")
env = HockeyEnv_2Agents(agent2_model=opponent_model, mode=h_env.HockeyEnv.NORMAL)

In [None]:
#model1 = TD3.load("td3_hockey", env=env)
#model2 = TD3_sbx.load("model", env=env)	

In [189]:
obs, info = env.reset()
done = False
while not done:
    action, _ = model2.predict(obs, deterministic=False)
    obs, r, done, t, info = env.step(action)
    done = done or t
    env.render()

In [101]:
env.close()

## Evaluation

### Success Rate

In [102]:
def evaluate_agents(agent_a, agent_b, env, n_eval_episodes=100, deterministic=True):
    """
    Runs `n_eval_episodes` episodes of Agent A vs. Agent B in `env`.
    Returns the success rate (fraction of episodes Agent A wins).
    """
    wins = 0
    for _ in range(n_eval_episodes):
        obs, info = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            obs_a = obs
            obs_b = env.obs_agent_two() 
            # Predict actions
            action_a, _ = agent_a.predict(obs_a, deterministic=deterministic)
            action_b, _ = agent_b.predict(obs_b, deterministic=deterministic)
            obs, r, d, t, info = env.step(np.hstack([action_a, action_b]))
            done = d or t
            episode_reward += r
        if episode_reward > 0:
            wins += 1

    success_rate = wins / n_eval_episodes
    return success_rate

In [132]:
def evaluate_agent_vs_basic(agent_model, weak=False, n_eval_episodes=10, deterministic=True):
    """
    Runs `n_eval_episodes` episodes of `agent_model` vs. a built-in basic opponent.
    Returns the success rate (fraction of episodes in which your agent 'wins').
    
    Assumes:
      - The environment is single-agent from SB3's perspective (action_space = 4D).
      - The environment automatically handles the basic opponent logic in step().
      - A 'win' is determined by final episode reward > 0 (or you can adapt).
    """
    env = h_env.HockeyEnv_BasicOpponent(weak_opponent=weak)
    wins = 0
    for _ in range(n_eval_episodes):
        obs, info = env.reset()
        done = False
        episode_reward = 0
        
        while not done:
            action, _ = agent_model.predict(obs, deterministic=deterministic)
            obs, r, d, t, info = env.step(action)
            done = d or t
            episode_reward += r
        
        if episode_reward > 0:
            wins += 1
    
    success_rate = wins / n_eval_episodes
    return success_rate

In [140]:
str(True)

'True'

In [143]:
sr = evaluate_agent_vs_basic(model, weak=True, n_eval_episodes=20)
print(f"Success Rate vs. Basic Opponent: {sr*100:.2f}%")

Success Rate vs. Basic Opponent: 45.00%
