In [1]:
import os
from datetime import datetime

from envs.arpod_HCW import HCWSE2Env
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import is_wrapped

# --- Run bookkeeping: one timestamped output directory per notebook run ---
run_specs = "20kg_256x1024x64"
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = os.path.join("runs", f"sac_arpod_{run_id}_{run_specs}")
best_dir = os.path.join(out_dir, "best_models")
logs_dir = os.path.join(out_dir, "logs")
for directory in (out_dir, best_dir, logs_dir):
    os.makedirs(directory, exist_ok=True)

# --- Training environments ---
n_envs = 12
seed = 42
spawn_radius = (2.5, 5.0)


def make_train_env(rank):
    """Return a factory that builds the `rank`-th monitored training env.

    Each env gets its own Monitor file prefix (logs_dir/train_<rank>). Pointing every
    Monitor at the bare directory makes them all open, and truncate, the same
    logs_dir/monitor.csv.
    The constructor seed is offset by `rank` so the copies do not start from the same
    random state (assumes HCWSE2Env seeds its RNG from `seed` -- TODO confirm).
    """

    def _init():
        return Monitor(
            HCWSE2Env(seed=seed + rank, spawn_radius=spawn_radius),
            os.path.join(logs_dir, f"train_{rank}"),
        )

    return _init


env = DummyVecEnv([make_train_env(rank) for rank in range(n_envs)])
# env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

# --- SAC agent: separate actor (pi) and critic (qf) MLPs with the same layer sizes ---
policy_kwargs = dict(
    net_arch=dict(pi=[256, 1024, 64], qf=[256, 1024, 64])
)
model = SAC(
    policy="MlpPolicy",
    env=env,
    learning_rate=3e-4,
    buffer_size=int(1e6),
    learning_starts=20_000,
    batch_size=256,
    tau=0.005,
    gamma=0.999,
    train_freq=(64, "step"),
    gradient_steps=128,
    ent_coef="auto_0.1",
    target_entropy="auto",
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log=logs_dir,
    seed=seed,
)
# Snapshot of the untrained agent. Note it is written to best_dir, whereas the
# per-stage checkpoints saved by later cells go to out_dir.
model.save(os.path.join(best_dir, "model_0M"))

# model_path = os.path.join(best_dir, "model_0M.zip")

Using cuda device


In [2]:

# First curriculum stage: train for 1M steps with the initial spawn radius.
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_1M"),
)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

try:
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback])
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_1M"))



Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.27e+03 |
|    ep_rew_mean     | -717     |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 10       |
|    fps             | 2773     |
|    time_elapsed    | 10       |
|    total_timesteps | 27900    |
| train/             |          |
|    actor_loss      | 6.43     |
|    critic_loss     | 0.365    |
|    ent_coef        | 0.0692   |
|    ent_coef_loss   | -8.96    |
|    learning_rate   | 0.0003   |
|    n_updates       | 1280     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.35e+03 |
|    ep_rew_mean     | -800     |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 20       |
|    fps             | 1907     |
|    time_elapsed    | 18       |
|    total_t

In [3]:

# Stage after 1M steps: widen the spawn radius to (2.5, 10.0) and train 1M more steps.
spawn_radius = (2.5, 10.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_2M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Apply the new radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
# Report once, after the loop, instead of once per environment.
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_2M"))




Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Increasing spawn radius to  (2.5, 10.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 256      |
|    ep_rew_mean     | 4.01     |
|    success_rate    | 0.23     |
| time/              |          |
|    episodes        | 1440     |
|    fps             | 930      |
|    time_elapsed    | 1        |
|    total_timesteps | 1002384  |
| train/             |          |
|    actor_loss      | 143      |
|    critic_loss     | 20.1     |
|    ent

In [4]:

# Stage after 2M steps: widen the spawn radius to (2.5, 20.0) and train 1M more steps.
spawn_radius = (2.5, 20.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_3M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Apply the new radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_3M"))




Increasing spawn radius to  (2.5, 20.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 153      |
|    ep_rew_mean     | 194      |
|    success_rate    | 1        |
| time/              |          |
|    episodes        | 6580     |
|    fps             | 6855     |
|    time_elapsed    | 0        |
|    total_timesteps | 2001456  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 152      |
|    ep_rew_mean     | 194      |
|    success_rate    | 1        |
| time/              |          |
|    episodes        | 6590     |
|    fps             | 1176     |
|    time_elapsed    | 1        |
|    total_timesteps | 2003448  |
| train/             |          |
|    actor_loss      | -144     |
|    critic_loss     | 18.2     |
|    ent_coef        | 0.0442   |
|    ent_coef_loss   | 0.000257 |
|    l

In [5]:

# Stage after 3M steps: widen the spawn radius to (2.5, 40.0) and train 1M more steps.
spawn_radius = (2.5, 40.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_4M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Apply the new radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_4M"))



Increasing spawn radius to  (2.5, 40.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 205      |
|    ep_rew_mean     | 196      |
|    success_rate    | 0.99     |
| time/              |          |
|    episodes        | 11180    |
|    fps             | 13383    |
|    time_elapsed    | 0        |
|    total_timesteps | 3002568  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 208      |
|    ep_rew_mean     | 196      |
|    success_rate    | 0.99     |
| time/              |          |
|    episodes        | 11190    |
|    fps             | 992      |
|    time_elapsed    | 4        |
|    total_timesteps | 3006120  |
| train/             |          |
|    actor_loss      | -150     |
|    critic_loss     | 6.13     |
|    ent_coef        | 0.0186   |
|    ent_coef_loss   | 0.0764   |
|    l

In [6]:
# Stage after 4M steps: widen the spawn radius to (2.5, 60.0) and train 1M more steps.
spawn_radius = (2.5, 60.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_5M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Apply the new radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_5M"))



Increasing spawn radius to  (2.5, 60.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 301      |
|    ep_rew_mean     | 210      |
|    success_rate    | 1        |
| time/              |          |
|    episodes        | 13390    |
|    fps             | 953      |
|    time_elapsed    | 1        |
|    total_timesteps | 4004532  |
| train/             |          |
|    actor_loss      | 222      |
|    critic_loss     | 72.3     |
|    ent_coef        | 0.0935   |
|    ent_coef_loss   | -0.203   |
|    learning_rate   | 0.0003   |
|    n_updates       | 664064   |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 311      |
|    ep_rew_mean     | 211      |
|    success_rate    | 1        |
| time/              |          |
|    episodes        | 13400    |
|    fps             | 895      |
|    t

In [7]:

# Stage after 5M steps: keep the spawn radius at (2.5, 60.0), unchanged from the
# previous stage, and train 1M more steps.
spawn_radius = (2.5, 60.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_6M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_6M"))


Increasing spawn radius to  (2.5, 60.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 267      |
|    ep_rew_mean     | 212      |
|    success_rate    | 0.99     |
| time/              |          |
|    episodes        | 16610    |
|    fps             | 1077     |
|    time_elapsed    | 0        |
|    total_timesteps | 5004492  |
| train/             |          |
|    actor_loss      | -152     |
|    critic_loss     | 7.67     |
|    ent_coef        | 0.0214   |
|    ent_coef_loss   | 0.0115   |
|    learning_rate   | 0.0003   |
|    n_updates       | 830720   |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 264      |
|    ep_rew_mean     | 211      |
|    success_rate    | 0.99     |
| time/              |          |
|    episodes        | 16620    |
|    fps             | 1071     |
|    t

In [8]:

# Stage after 6M steps: widen the spawn radius to (2.5, 80.0) and train 1M more steps.
spawn_radius = (2.5, 80.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_7M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Apply the new radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_7M"))


Increasing spawn radius to  (2.5, 80.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 282      |
|    ep_rew_mean     | 218      |
|    success_rate    | 0.99     |
| time/              |          |
|    episodes        | 20120    |
|    fps             | 13888    |
|    time_elapsed    | 0        |
|    total_timesteps | 6004608  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 279      |
|    ep_rew_mean     | 215      |
|    success_rate    | 0.98     |
| time/              |          |
|    episodes        | 20130    |
|    fps             | 978      |
|    time_elapsed    | 3        |
|    total_timesteps | 6007440  |
| train/             |          |
|    actor_loss      | -144     |
|    critic_loss     | 14.2     |
|    ent_coef        | 0.0179   |
|    ent_coef_loss   | -0.0754  |
|    l

In [9]:

# Stage after 7M steps: widen the spawn radius to (2.5, 100.0) and train 1M more steps.
spawn_radius = (2.5, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_8M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Apply the new radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_8M"))


Increasing spawn radius to  (2.5, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 271      |
|    ep_rew_mean     | 215      |
|    success_rate    | 0.96     |
| time/              |          |
|    episodes        | 23400    |
|    fps             | 1304     |
|    time_elapsed    | 1        |
|    total_timesteps | 7007088  |
| train/             |          |
|    actor_loss      | -172     |
|    critic_loss     | 3.42     |
|    ent_coef        | 0.00728  |
|    ent_coef_loss   | -0.0703  |
|    learning_rate   | 0.0003   |
|    n_updates       | 1164416  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 276      |
|    ep_rew_mean     | 215      |
|    success_rate    | 0.96     |
| time/              |          |
|    episodes        | 23410    |
|    fps             | 1060     |
|    

In [10]:
# Stage after 8M steps: hold the spawn radius at (2.5, 100.0) and train 1M more steps.
spawn_radius = (2.5, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_9M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_9M"))


Increasing spawn radius to  (2.5, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 362      |
|    ep_rew_mean     | 227      |
|    success_rate    | 0.95     |
| time/              |          |
|    episodes        | 25960    |
|    fps             | 13527    |
|    time_elapsed    | 0        |
|    total_timesteps | 8006052  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 362      |
|    ep_rew_mean     | 222      |
|    success_rate    | 0.93     |
| time/              |          |
|    episodes        | 25970    |
|    fps             | 1022     |
|    time_elapsed    | 4        |
|    total_timesteps | 8009892  |
| train/             |          |
|    actor_loss      | -88      |
|    critic_loss     | 28.3     |
|    ent_coef        | 0.0322   |
|    ent_coef_loss   | 0.494    |
|    

In [11]:
# Stage after 9M steps: hold the spawn radius at (2.5, 100.0) and train 1M more steps.
spawn_radius = (2.5, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_10M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_10M"))

Increasing spawn radius to  (2.5, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 360      |
|    ep_rew_mean     | 164      |
|    success_rate    | 0.93     |
| time/              |          |
|    episodes        | 28690    |
|    fps             | 1027     |
|    time_elapsed    | 2        |
|    total_timesteps | 9008916  |
| train/             |          |
|    actor_loss      | 84.9     |
|    critic_loss     | 524      |
|    ent_coef        | 0.107    |
|    ent_coef_loss   | 0.0744   |
|    learning_rate   | 0.0003   |
|    n_updates       | 1498112  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 373      |
|    ep_rew_mean     | 162      |
|    success_rate    | 0.92     |
| time/              |          |
|    episodes        | 28700    |
|    fps             | 991      |
|    

In [12]:
# Stage after 10M steps: raise the minimum spawn radius from 2.5 to 10 (maximum stays
# at 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_11M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Apply the new radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_11M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 365      |
|    ep_rew_mean     | 208      |
|    success_rate    | 0.9      |
| time/              |          |
|    episodes        | 31430    |
|    fps             | 1616     |
|    time_elapsed    | 0        |
|    total_timesteps | 10008372 |
| train/             |          |
|    actor_loss      | 207      |
|    critic_loss     | 341      |
|    ent_coef        | 0.122    |
|    ent_coef_loss   | 0.0507   |
|    learning_rate   | 0.0003   |
|    n_updates       | 1664640  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 364      |
|    ep_rew_mean     | 208      |
|    success_rate    | 0.9      |
| time/              |          |
|    episodes        | 31440    |
|    fps             | 996      |
|    t

In [13]:
# Stage after 11M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_12M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_12M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 386      |
|    ep_rew_mean     | 139      |
|    success_rate    | 0.62     |
| time/              |          |
|    episodes        | 34140    |
|    fps             | 1508     |
|    time_elapsed    | 0        |
|    total_timesteps | 11008992 |
| train/             |          |
|    actor_loss      | 24.1     |
|    critic_loss     | 85.3     |
|    ent_coef        | 0.0985   |
|    ent_coef_loss   | 0.0198   |
|    learning_rate   | 0.0003   |
|    n_updates       | 1831424  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 385      |
|    ep_rew_mean     | 127      |
|    success_rate    | 0.57     |
| time/              |          |
|    episodes        | 34150    |
|    fps             | 952      |
|    t

In [14]:
# Stage after 12M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_13M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_13M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 499      |
|    ep_rew_mean     | 146      |
|    success_rate    | 0.62     |
| time/              |          |
|    episodes        | 36830    |
|    fps             | 1097     |
|    time_elapsed    | 2        |
|    total_timesteps | 12011112 |
| train/             |          |
|    actor_loss      | -25.6    |
|    critic_loss     | 79.3     |
|    ent_coef        | 0.0879   |
|    ent_coef_loss   | -0.261   |
|    learning_rate   | 0.0003   |
|    n_updates       | 1998464  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 495      |
|    ep_rew_mean     | 163      |
|    success_rate    | 0.7      |
| time/              |          |
|    episodes        | 36840    |
|    fps             | 954      |
|    t

In [15]:
# Stage after 13M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_14M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_14M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 373      |
|    ep_rew_mean     | 97.2     |
|    success_rate    | 0.86     |
| time/              |          |
|    episodes        | 39420    |
|    fps             | 1040     |
|    time_elapsed    | 0        |
|    total_timesteps | 13009968 |
| train/             |          |
|    actor_loss      | 157      |
|    critic_loss     | 477      |
|    ent_coef        | 0.119    |
|    ent_coef_loss   | -0.00724 |
|    learning_rate   | 0.0003   |
|    n_updates       | 2164992  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 359      |
|    ep_rew_mean     | 153      |
|    success_rate    | 0.86     |
| time/              |          |
|    episodes        | 39430    |
|    fps             | 1140     |
|    t

In [16]:
# Stage after 14M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_15M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_15M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 362      |
|    ep_rew_mean     | 226      |
|    success_rate    | 0.96     |
| time/              |          |
|    episodes        | 42050    |
|    fps             | 982      |
|    time_elapsed    | 2        |
|    total_timesteps | 14012304 |
| train/             |          |
|    actor_loss      | 15.2     |
|    critic_loss     | 94.1     |
|    ent_coef        | 0.0823   |
|    ent_coef_loss   | -0.215   |
|    learning_rate   | 0.0003   |
|    n_updates       | 2332032  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 353      |
|    ep_rew_mean     | 225      |
|    success_rate    | 0.96     |
| time/              |          |
|    episodes        | 42060    |
|    fps             | 942      |
|    t

In [17]:
# Stage after 15M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_16M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_16M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 398      |
|    ep_rew_mean     | 184      |
|    success_rate    | 0.83     |
| time/              |          |
|    episodes        | 44790    |
|    fps             | 1012     |
|    time_elapsed    | 2        |
|    total_timesteps | 15013068 |
| train/             |          |
|    actor_loss      | 210      |
|    critic_loss     | 351      |
|    ent_coef        | 0.16     |
|    ent_coef_loss   | 0.018    |
|    learning_rate   | 0.0003   |
|    n_updates       | 2498816  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 393      |
|    ep_rew_mean     | 179      |
|    success_rate    | 0.82     |
| time/              |          |
|    episodes        | 44800    |
|    fps             | 1015     |
|    t

In [18]:
# Stage after 16M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_17M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_17M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 320      |
|    ep_rew_mean     | 148      |
|    success_rate    | 0.62     |
| time/              |          |
|    episodes        | 47770    |
|    fps             | 1380     |
|    time_elapsed    | 0        |
|    total_timesteps | 16012440 |
| train/             |          |
|    actor_loss      | -36.4    |
|    critic_loss     | 33.5     |
|    ent_coef        | 0.0892   |
|    ent_coef_loss   | -0.0213  |
|    learning_rate   | 0.0003   |
|    n_updates       | 2665344  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 315      |
|    ep_rew_mean     | 151      |
|    success_rate    | 0.64     |
| time/              |          |
|    episodes        | 47780    |
|    fps             | 966      |
|    t

In [19]:
# Stage after 17M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_18M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_18M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 396      |
|    ep_rew_mean     | 236      |
|    success_rate    | 0.98     |
| time/              |          |
|    episodes        | 50530    |
|    fps             | 985      |
|    time_elapsed    | 4        |
|    total_timesteps | 17016660 |
| train/             |          |
|    actor_loss      | 87.3     |
|    critic_loss     | 377      |
|    ent_coef        | 0.102    |
|    ent_coef_loss   | -0.0467  |
|    learning_rate   | 0.0003   |
|    n_updates       | 2832768  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 387      |
|    ep_rew_mean     | 237      |
|    success_rate    | 0.98     |
| time/              |          |
|    episodes        | 50540    |
|    fps             | 961      |
|    t

In [20]:
# Stage after 18M steps: hold the spawn radius at (10, 100.0) and train 1M more steps.
spawn_radius = (10, 100.0)
eval_env = Monitor(
    HCWSE2Env(seed=seed, render_mode="rgb_array", render_folder=logs_dir, spawn_radius=spawn_radius),
    # Stage-specific file prefix: a bare directory would resolve to logs_dir/monitor.csv,
    # which every other Monitor given that directory also opens and truncates.
    os.path.join(logs_dir, "eval_19M"),
)
eval_callback = EvalCallback(
    eval_env,
    # NOTE: a fresh EvalCallback starts with no best score, so this stage's first
    # evaluation overwrites any best_model saved by an earlier stage.
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # eval_freq counts callback calls, and each call advances env.num_envs timesteps;
    # divide so evaluation happens roughly every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)
# Re-apply the radius to the live training envs (unwrapped looks through Monitor).
for train_env in env.envs:
    train_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)
try:
    # reset_num_timesteps=False continues the global step counter from the previous stage.
    model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback], reset_num_timesteps=False)
finally:
    # Release the evaluation env (and its monitor file) even if training is interrupted.
    eval_env.close()
model.save(os.path.join(out_dir, "model_19M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 370      |
|    ep_rew_mean     | 234      |
|    success_rate    | 0.98     |
| time/              |          |
|    episodes        | 53110    |
|    fps             | 1355     |
|    time_elapsed    | 1        |
|    total_timesteps | 18014928 |
| train/             |          |
|    actor_loss      | 21.4     |
|    critic_loss     | 44.6     |
|    ent_coef        | 0.0827   |
|    ent_coef_loss   | -0.0478  |
|    learning_rate   | 0.0003   |
|    n_updates       | 2999040  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 374      |
|    ep_rew_mean     | 236      |
|    success_rate    | 0.98     |
| time/              |          |
|    episodes        | 53120    |
|    fps             | 1052     |
|    t

In [21]:
# > 19M steps: same spawn radius (10, 100.0); one more 1M-timestep segment.
spawn_radius = (10, 100.0)

# Evaluation env is recreated with the radius used for training in this segment.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_folder=logs_dir,
        render_mode="rgb_array",
        spawn_radius=spawn_radius,
    ),
    logs_dir,
)
eval_callback = EvalCallback(
    eval_env,
    log_path=logs_dir,
    best_model_save_path=best_dir,
    n_eval_episodes=3,
    eval_freq=20_000,
    render=True,
    deterministic=True,
    verbose=1,
)

# Set the radius directly on each base training env, bypassing wrappers.
for e in env.envs:
    setattr(e.unwrapped, "spawn_radius", spawn_radius)
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)

# Continue the existing timestep count rather than starting from zero.
model.learn(
    total_timesteps=1_000_000,
    reset_num_timesteps=False,
    log_interval=10,
    callback=[eval_callback],
)
model.save(os.path.join(out_dir, "model_20M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 341      |
|    ep_rew_mean     | 202      |
|    success_rate    | 0.84     |
| time/              |          |
|    episodes        | 55900    |
|    fps             | 1106     |
|    time_elapsed    | 1        |
|    total_timesteps | 19015188 |
| train/             |          |
|    actor_loss      | -144     |
|    critic_loss     | 5.77     |
|    ent_coef        | 0.0297   |
|    ent_coef_loss   | -0.01    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3165824  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 345      |
|    ep_rew_mean     | 202      |
|    success_rate    | 0.84     |
| time/              |          |
|    episodes        | 55910    |
|    fps             | 949      |
|    t

In [28]:
# Inspect the entropy target the SAC model is currently using (this run printed -2.0).
print(model.target_entropy)

-2.0


In [22]:
# > 20M steps: resume from the 20M checkpoint, keep the spawn radius at (10, 100.0),
# and train for 5M more timesteps.

# `load` is a classmethod that RETURNS a new model; it does not modify the instance
# it is called on, so the result has to be rebound to `model`. The 20M checkpoint
# was written to out_dir (not best_dir) by the preceding training cell.
# NOTE: the replay buffer is not stored in the .zip checkpoint, so the reloaded model
# starts with an empty buffer unless one is restored separately.
model = SAC.load(os.path.join(out_dir, "model_20M.zip"), env=env)

# Must be a number: SAC adds target_entropy to the log-prob tensor when computing the
# entropy-coefficient loss, so a string such as "-2" fails at the first gradient step.
model.target_entropy = -2.0

spawn_radius = (10, 100.0)

# Evaluation env/callback for this segment, built with the same spawn radius.
# NOTE(review): eval_freq is counted in callback calls, i.e. once per vectorized
# step of the training env, not per individual env timestep - confirm the cadence.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_mode="rgb_array",
        render_folder=logs_dir,
        spawn_radius=spawn_radius,
    ),
    logs_dir,
)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    eval_freq=20_000,
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

# Write the radius onto each base training env (`.unwrapped` skips the Monitor wrapper).
for e in env.envs:
    e.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)

# reset_num_timesteps=False continues the timestep counter restored from the checkpoint.
model.learn(
    total_timesteps=int(5e6),
    log_interval=10,
    callback=[eval_callback],
    reset_num_timesteps=False,
)
model.save(os.path.join(out_dir, "model_25M"))

Increasing spawn radius to  (10, 100.0)
Logging to runs/sac_arpod_20250912_001326_20kg_256x1024x64/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 407      |
|    ep_rew_mean     | 232      |
|    success_rate    | 0.97     |
| time/              |          |
|    episodes        | 58480    |
|    fps             | 1096     |
|    time_elapsed    | 0        |
|    total_timesteps | 20015100 |
| train/             |          |
|    actor_loss      | 340      |
|    critic_loss     | 2.63e+03 |
|    ent_coef        | 0.168    |
|    ent_coef_loss   | 0.416    |
|    learning_rate   | 0.0003   |
|    n_updates       | 3332480  |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 410      |
|    ep_rew_mean     | 233      |
|    success_rate    | 0.97     |
| time/              |          |
|    episodes        | 58490    |
|    fps             | 957      |
|    t