In [None]:
import os
from datetime import datetime

from envs.arpod_HCW import HCWSE2Env
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import is_wrapped

# --- Run directories ---------------------------------------------------------
# Each run gets its own timestamped folder so repeated runs never overwrite
# one another's checkpoints or logs.
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = os.path.join("runs", f"sac_arpod_{run_id}")
best_dir = os.path.join(out_dir, "best_models")
logs_dir = os.path.join(out_dir, "logs")
for directory in (out_dir, best_dir, logs_dir):
    os.makedirs(directory, exist_ok=True)

# --- Training environments ---------------------------------------------------
n_envs = 12
seed = 42
spawn_radius = (2.5, 5.0)


def make_train_env(rank):
    """Return a factory that builds the ``rank``-th monitored training env.

    Each sub-environment gets
    * its own seed (``seed + rank``) so the parallel envs are not all
      constructed with the same seed, and
    * its own Monitor file (``train_<rank>.monitor.csv``). A Monitor that is
      handed a directory writes to ``<dir>/monitor.csv``, so giving every env
      the bare ``logs_dir`` makes them all truncate and share a single file.
    """

    def _init():
        return Monitor(
            HCWSE2Env(seed=seed + rank, spawn_radius=spawn_radius),
            os.path.join(logs_dir, f"train_{rank}"),
        )

    return _init


env = DummyVecEnv([make_train_env(rank) for rank in range(n_envs)])
# Optional observation/reward normalisation (currently disabled):
# env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

# --- SAC agent ---------------------------------------------------------------
# Actor (pi) and critic (qf) use the same three-layer MLP layout.
policy_kwargs = dict(
    net_arch=dict(pi=[256, 256, 64], qf=[256, 256, 64])
)
model = SAC(
    policy="MlpPolicy",
    env=env,
    learning_rate=3e-4,
    buffer_size=int(1e6),
    learning_starts=20_000,      # no gradient updates before this many timesteps
    batch_size=256,
    tau=0.005,
    gamma=0.999,
    train_freq=(64, "step"),     # run an update phase every 64 vec-env steps
    gradient_steps=128,          # gradient steps per update phase
    ent_coef="auto_0.1",         # learned entropy coefficient, initial value 0.1
    target_entropy="auto",
    policy_kwargs=policy_kwargs,
    verbose=1,
    tensorboard_log=logs_dir,
    seed=seed,
)



In [None]:

# Stage 1 (0 -> 1M timesteps): train with the initial spawn radius.
#
# A Monitor that is handed a directory writes to `<dir>/monitor.csv`, so the
# evaluation env is given an explicit file name to keep its episode log
# separate from any other Monitor writing into `logs_dir`.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_mode="rgb_array",
        render_folder=logs_dir,
        spawn_radius=spawn_radius,
    ),
    os.path.join(logs_dir, "eval_1M"),
)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # EvalCallback counts calls to the vectorized `step()`, each of which
    # advances `num_envs` timesteps; divide so evaluation really happens
    # about every 20k timesteps.
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

model.learn(total_timesteps=int(1e6), log_interval=10, callback=[eval_callback])
model.save(os.path.join(out_dir, "model_1M"))
# Release the evaluation env (and its Monitor file handle) once this stage is done.
eval_env.close()



In [None]:

# Stage 2 (1M -> 2M timesteps): widen the spawn radius and keep training.
spawn_radius = (2.5, 10.0)

# A Monitor that is handed a directory writes to `<dir>/monitor.csv`, so the
# evaluation env is given an explicit, stage-specific file name.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_mode="rgb_array",
        render_folder=logs_dir,
        spawn_radius=spawn_radius,
    ),
    os.path.join(logs_dir, "eval_2M"),
)
# NOTE(review): a fresh EvalCallback restarts its best-score tracking, so the
# best model / evaluation log saved under the same paths by an earlier stage
# will be overwritten — confirm that is intended.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # EvalCallback counts vectorized steps (num_envs timesteps each).
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

# Push the new radius into every already-running training env. The attribute
# is set on the unwrapped env because `env.envs` holds the Monitor wrappers.
for sub_env in env.envs:
    sub_env.unwrapped.spawn_radius = spawn_radius
# Report once, after the loop, rather than once per sub-environment.
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)

# reset_num_timesteps=False continues the global step counter and log curves.
model.learn(
    total_timesteps=int(1e6),
    log_interval=10,
    callback=[eval_callback],
    reset_num_timesteps=False,
)
model.save(os.path.join(out_dir, "model_2M"))
eval_env.close()




In [None]:

# Stage 3 (2M -> 3M timesteps): widen the spawn radius and keep training.
spawn_radius = (2.5, 20.0)

# A Monitor that is handed a directory writes to `<dir>/monitor.csv`, so the
# evaluation env is given an explicit, stage-specific file name.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_mode="rgb_array",
        render_folder=logs_dir,
        spawn_radius=spawn_radius,
    ),
    os.path.join(logs_dir, "eval_3M"),
)
# NOTE(review): a fresh EvalCallback restarts its best-score tracking, so the
# best model / evaluation log saved under the same paths by an earlier stage
# will be overwritten — confirm that is intended.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # EvalCallback counts vectorized steps (num_envs timesteps each).
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

# Push the new radius into every already-running training env. The attribute
# is set on the unwrapped env because `env.envs` holds the Monitor wrappers.
for sub_env in env.envs:
    sub_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)

# reset_num_timesteps=False continues the global step counter and log curves.
model.learn(
    total_timesteps=int(1e6),
    log_interval=10,
    callback=[eval_callback],
    reset_num_timesteps=False,
)
model.save(os.path.join(out_dir, "model_3M"))
eval_env.close()




In [None]:

# Stage 4 (3M -> 4M timesteps): widen the spawn radius and keep training.
spawn_radius = (2.5, 40.0)

# A Monitor that is handed a directory writes to `<dir>/monitor.csv`, so the
# evaluation env is given an explicit, stage-specific file name.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_mode="rgb_array",
        render_folder=logs_dir,
        spawn_radius=spawn_radius,
    ),
    os.path.join(logs_dir, "eval_4M"),
)
# NOTE(review): a fresh EvalCallback restarts its best-score tracking, so the
# best model / evaluation log saved under the same paths by an earlier stage
# will be overwritten — confirm that is intended.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # EvalCallback counts vectorized steps (num_envs timesteps each).
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

# Push the new radius into every already-running training env. The attribute
# is set on the unwrapped env because `env.envs` holds the Monitor wrappers.
for sub_env in env.envs:
    sub_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)

# reset_num_timesteps=False continues the global step counter and log curves.
model.learn(
    total_timesteps=int(1e6),
    log_interval=10,
    callback=[eval_callback],
    reset_num_timesteps=False,
)
model.save(os.path.join(out_dir, "model_4M"))
eval_env.close()



In [None]:
# Stage 5 (4M -> 5M timesteps): widen the spawn radius and keep training.
spawn_radius = (2.5, 60.0)

# A Monitor that is handed a directory writes to `<dir>/monitor.csv`, so the
# evaluation env is given an explicit, stage-specific file name.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_mode="rgb_array",
        render_folder=logs_dir,
        spawn_radius=spawn_radius,
    ),
    os.path.join(logs_dir, "eval_5M"),
)
# NOTE(review): a fresh EvalCallback restarts its best-score tracking, so the
# best model / evaluation log saved under the same paths by an earlier stage
# will be overwritten — confirm that is intended.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # EvalCallback counts vectorized steps (num_envs timesteps each).
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

# Push the new radius into every already-running training env. The attribute
# is set on the unwrapped env because `env.envs` holds the Monitor wrappers.
for sub_env in env.envs:
    sub_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)

# reset_num_timesteps=False continues the global step counter and log curves.
model.learn(
    total_timesteps=int(1e6),
    log_interval=10,
    callback=[eval_callback],
    reset_num_timesteps=False,
)
model.save(os.path.join(out_dir, "model_5M"))
eval_env.close()



In [None]:

# Stage 6 (5M -> 6M timesteps): continue training.
# NOTE(review): the original comment said "increase spawn radius", but this
# value equals the (2.5, 60.0) used by the preceding cell — confirm whether a
# larger radius was intended here.
spawn_radius = (2.5, 60.0)

# A Monitor that is handed a directory writes to `<dir>/monitor.csv`, so the
# evaluation env is given an explicit, stage-specific file name.
eval_env = Monitor(
    HCWSE2Env(
        seed=seed,
        render_mode="rgb_array",
        render_folder=logs_dir,
        spawn_radius=spawn_radius,
    ),
    os.path.join(logs_dir, "eval_6M"),
)
# NOTE(review): a fresh EvalCallback restarts its best-score tracking, so the
# best model / evaluation log saved under the same paths by an earlier stage
# will be overwritten — confirm that is intended.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=best_dir,
    log_path=logs_dir,
    # EvalCallback counts vectorized steps (num_envs timesteps each).
    eval_freq=max(20_000 // env.num_envs, 1),
    n_eval_episodes=3,
    deterministic=True,
    render=True,
    verbose=1,
)

# Push the radius into every already-running training env. The attribute is
# set on the unwrapped env because `env.envs` holds the Monitor wrappers.
for sub_env in env.envs:
    sub_env.unwrapped.spawn_radius = spawn_radius
print("Increasing spawn radius to ", env.envs[0].unwrapped.spawn_radius)

# reset_num_timesteps=False continues the global step counter and log curves.
model.learn(
    total_timesteps=int(1e6),
    log_interval=10,
    callback=[eval_callback],
    reset_num_timesteps=False,
)
model.save(os.path.join(out_dir, "model_6M"))
eval_env.close()
