In [None]:
import os
from datetime import datetime

from envs.arpod_HCW import HCWSE2Env
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

def _make_train_env(rank, seed, logs_dir):
    """Return a zero-argument factory that builds the ``rank``-th training env.

    A factory (rather than a repeated lambda) gives every worker its own
    ``rank`` so each one can get a distinct seed and a distinct Monitor file.

    Args:
        rank: Index of the environment inside the vectorized env.
        seed: Base seed; the env is constructed with ``seed + rank``.
        logs_dir: Directory that receives ``train_<rank>.monitor.csv``.
    """

    def _init():
        # A per-rank file prefix keeps the Monitor wrappers from all opening
        # (and truncating) the same ``<logs_dir>/monitor.csv``.
        monitor_path = os.path.join(logs_dir, f"train_{rank}")
        # NOTE(review): assumes the ``seed`` kwarg seeds the env's own RNG, so
        # offsetting it decorrelates the parallel rollouts -- confirm in
        # HCWSE2Env.
        return Monitor(HCWSE2Env(seed=seed + rank), monitor_path)

    return _init


def main(n_envs=12, seed=42, total_timesteps=int(100e6), eval_freq=10_000):
    """Train a SAC agent on ``HCWSE2Env`` and save the result under ``runs/``.

    Creates ``runs/sac_arpod_<timestamp>/`` with ``best_models/`` and ``logs/``
    sub-directories, trains with periodic evaluation, and writes
    ``final_model`` into the run directory. The model is saved and the
    training env is closed even if training is interrupted (for example by
    ``KeyboardInterrupt``); the exception then propagates to the caller.

    Args:
        n_envs: Number of training environments stepped in one ``DummyVecEnv``.
        seed: Base random seed for the envs and the SAC model.
        total_timesteps: Total number of environment steps to train for.
        eval_freq: Desired evaluation period, in environment timesteps.

    Returns:
        Tuple ``(model, eval_env, logs_dir, env, best_dir)``. ``env`` has
        already been closed when this function returns.

    Raises:
        ValueError: If ``n_envs`` is smaller than 1.
    """
    if n_envs < 1:
        raise ValueError(f"n_envs must be >= 1, got {n_envs}")

    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = os.path.join("runs", f"sac_arpod_{run_id}")
    best_dir = os.path.join(out_dir, "best_models")
    logs_dir = os.path.join(out_dir, "logs")
    # makedirs creates the intermediate ``out_dir`` as well.
    os.makedirs(best_dir, exist_ok=True)
    os.makedirs(logs_dir, exist_ok=True)

    env = DummyVecEnv(
        [_make_train_env(rank, seed, logs_dir) for rank in range(n_envs)]
    )
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    # The evaluation env gets its own Monitor file and a seed outside the
    # range used by the training envs.
    eval_env = Monitor(
        HCWSE2Env(
            seed=seed + n_envs,
            render_mode="rgb_array",
            render_folder=logs_dir,
        ),
        os.path.join(logs_dir, "eval"),
        allow_early_resets=True,
    )

    policy_kwargs = dict(
        net_arch=dict(pi=[256, 256, 64], qf=[256, 256, 64])
    )
    # NOTE(review): with a vectorized env, ``train_freq`` counts vectorized
    # steps, so each rollout presumably gathers 512 * n_envs transitions for
    # 512 gradient steps -- confirm this ratio is intended.
    model = SAC(
        policy="MlpPolicy",
        env=env,
        learning_rate=3e-4,
        buffer_size=int(1e6),
        learning_starts=20_000,
        batch_size=512,
        tau=0.01,
        gamma=0.999,
        train_freq=(512, "step"),
        gradient_steps=512,
        ent_coef="auto_0.1",
        policy_kwargs=policy_kwargs,
        verbose=1,
        tensorboard_log=logs_dir,
        seed=seed,
    )

    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path=best_dir,
        log_path=logs_dir,
        # The callback counts calls, and every call advances ``n_envs``
        # timesteps, so divide to keep the period at ``eval_freq`` timesteps.
        eval_freq=max(eval_freq // n_envs, 1),
        n_eval_episodes=3,
        deterministic=True,
        render=True,
        verbose=1,
    )

    try:
        model.learn(
            total_timesteps=total_timesteps,
            log_interval=10,
            callback=[eval_callback],
        )
    finally:
        # Persist whatever was learned and release the envs even when the run
        # is cut short; a failure while saving must not skip the close.
        try:
            model.save(os.path.join(out_dir, "final_model"))
        finally:
            env.close()

    return model, eval_env, logs_dir, env, best_dir
    
    
    
if __name__ == "__main__":
    # Run training and keep the results as globals so later notebook cells
    # can inspect the model, the envs and the output directories.
    (
        model,
        eval_env,
        logs_dir,
        env,
        best_dir,
    ) = main()



Using cuda device
Logging to runs/sac_arpod_20250909_013218/logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 703      |
|    ep_rew_mean     | -662     |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 10       |
|    fps             | 17647    |
|    time_elapsed    | 1        |
|    total_timesteps | 22200    |
---------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 888       |
|    ep_rew_mean     | -1.01e+03 |
|    success_rate    | 0         |
| time/              |           |
|    episodes        | 20        |
|    fps             | 5697      |
|    time_elapsed    | 5         |
|    total_timesteps | 33840     |
| train/             |           |
|    actor_loss      | 6.08      |
|    critic_loss     | 4.81      |
|    ent_coef        | 0.0894    |
|    ent_coef_loss   | -8        |
|    learning_rate   | 0.0003 

KeyboardInterrupt: 