# Hyperparameter tuning with Optuna for PPO and SAC

In [None]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE
from Hyperparameters import HYPERPARAMS as params
from SAC_Distillation.DistilledSACAgent import DistilledSAC
from SAC_Distillation.Trajectories import SAC_ExperienceBuffer
import numpy as np
import torch
import optuna
import optunahub
import pandas as pd

# Global settings
# Location of the Unity environment build — presumably what UnityEnvironment loads; TODO confirm against its use.
ENV_ID = "Env/DroneFlightv1"
# Run torch on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ModuleNotFoundError: No module named 'hydra'

In [2]:
def relocate_agents(env):
    """Return a fresh list holding the agents currently exposed by ``env.agents``.

    The result is a new list on every call, so callers may mutate it without
    touching the environment's own collection.
    """
    # simplified: just snapshot whatever the environment reports
    return [*env.agents]

def get_agent_obs(obs, agent):
    """Pull the two observation components used for training out of ``obs[agent]``.

    Args:
        obs: mapping from agent id to that agent's observation container.
        agent: key of the agent to look up.

    Returns:
        A 2-tuple of numpy arrays built from elements 1 and 2 of the agent's
        observation (callers treat them as the camera and position
        observations, respectively). Element 0 is ignored.
    """
    entry = obs[agent]
    first_component = np.array(entry[1])
    second_component = np.array(entry[2])
    return first_component, second_component


# SAC Hyperparameter Optimization

In [None]:
def sac_objective(trial, total_steps=150_000):
    """Optuna objective: train a DistilledSAC agent and score the sampled hyperparameters.

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyperparameters.
        total_steps: environment-step budget for the trial. The default matches
            the value that used to be hard-coded in the training loop.

    Returns:
        Mean of the clipped per-agent rewards collected during the last rollout
        that produced any transitions. NaN if no transition was ever collected
        (NOTE(review): Optuna is expected to mark such a trial as failed — confirm).
    """
    import copy

    # Work on a private copy so sampled values do not leak into the shared
    # HYPERPARAMS entry (and so `params` is never rebound as a local, which
    # previously raised UnboundLocalError on the first line).
    hp = copy.copy(params['sac_distilled'])

    # Hyperparameters to optimize. suggest_float replaces the deprecated
    # suggest_loguniform / suggest_uniform with identical ranges.
    hp.actor_lr = trial.suggest_float("actor_lr", 1e-6, 5e-5, log=True)
    hp.critic_lr = trial.suggest_float("critic_lr", 1e-5, 5e-4, log=True)
    hp.n_step = trial.suggest_categorical("n_step", [1, 3, 5])
    hp.policy_delay = trial.suggest_int("policy_delay", 1, 3)
    hp.noise_std = trial.suggest_float("noise_std", 0.1, 0.3)
    hp.smooth_clip = trial.suggest_float("smooth_clip", 0.3, 1.0)

    mean_reward = float("nan")

    env = UE(file_name=ENV_ID, seed=1, side_channels=[])
    try:
        env = UPZBE(env)

        # Build the agent and replay buffer from the first agent's spaces.
        first_agent = relocate_agents(env)[0]
        obs_space = env.observation_space(first_agent)
        camera_shape = obs_space[1].shape
        position_shape = obs_space[2].shape
        action_shape = env.action_space(first_agent).shape
        brain = DistilledSAC(camera_shape, position_shape, action_shape, 1, hp)
        Buffer = SAC_ExperienceBuffer(camera_shape, position_shape, action_shape, hp)

        # Training loop
        obs = env.reset()
        steps = 0
        done_flags = {agent: False for agent in env.agents}
        while steps < total_steps:
            episode_rewards = []
            t = 0
            # An empty or fully-terminated flag set means the episode is over.
            if all(done_flags.values()):
                obs = env.reset()
                done_flags = {agent: False for agent in env.agents}

            # NOTE(review): `n_steps` (rollout length) is read from the
            # hyperparameters and is not the tuned `n_step` — confirm both exist.
            while not all(done_flags.values()) and t < hp.n_steps:
                actions = {}

                for agent in env.agents:
                    if agent not in obs:
                        continue

                    camera_obs, pos_obs = get_agent_obs(obs, agent)
                    action_tensor = brain.get_action(camera_obs, pos_obs, train=False)
                    # NOTE(review): the critic values are not used in this loop; the
                    # call is kept in case get_values has step-dependent side
                    # effects — confirm and remove if it has none.
                    brain.get_values(camera_obs, pos_obs, action_tensor, steps + t)

                    actions[agent] = action_tensor.cpu().numpy()[0]

                # Step environment with the actions of the agents that had observations
                next_obs, rewards, dones, info = env.step(actions)
                for agent in actions:
                    if agent not in next_obs:
                        continue

                    camera_obs, pos_obs = get_agent_obs(obs, agent)
                    next_camera_obs, next_pos_obs = get_agent_obs(next_obs, agent)
                    # Clip the combined individual + group reward; previously the
                    # group reward was added and then silently overwritten.
                    reward = np.clip(rewards[agent] + info[agent]['group_reward'], -1.0, 1.0)
                    done = dones[agent]
                    Buffer.store(
                        camera_obs,
                        pos_obs,
                        actions[agent],
                        reward,
                        next_camera_obs,
                        next_pos_obs,
                        done,
                    )

                    episode_rewards.append(reward)
                    done_flags[agent] = done
                obs = next_obs
                t += 1

                # Drop terminated agents so only live agents keep the rollout going
                done_flags = {agent: flag for agent, flag in done_flags.items() if not flag}

            if t == 0:
                # No environment step was possible (no agents, or a non-positive
                # rollout length); stop instead of spinning forever.
                break

            # Keep the previous score when this rollout stored no transition,
            # rather than taking the mean of an empty list.
            if episode_rewards:
                mean_reward = float(np.mean(episode_rewards))

            steps += t

            # Perform training step
            brain.train(Buffer, steps)
    finally:
        # Always release the Unity process, even when the trial fails mid-way.
        env.close()
    return mean_reward

# Run the Optuna hyperparameter tuning for PPO and SAC

In [4]:
# Load the `samplers/auto_sampler` package from OptunaHub; it provides the AutoSampler used below.
module = optunahub.load_module(package='samplers/auto_sampler')
# Not used anywhere in this cell.
best_params = {}
# In-memory study that maximizes the value returned by sac_objective (higher mean reward is better).
sac_study = optuna.create_study(direction='maximize', sampler=module.AutoSampler(), study_name='sac_distillation')
# Run 20 trials. The captured log below shows roughly eight hours between trial completions.
sac_study.optimize(sac_objective, n_trials=20)
# Parameters of the best trial, kept for the display and save cells that follow.
sac_best_params = sac_study.best_params

[I 2025-04-15 09:22:24,347] A new study created in memory with name: sac_distillation
[I 2025-04-15 17:24:08,099] Trial 0 finished with value: -0.02549417889735681 and parameters: {'gamma': 0.9515870969435453, 'tau': 0.004970699455526257, 'actor_lr': 1.895952178642781e-05, 'critic_lr': 5.9882690993486226e-05, 'alpha_lr': 0.0008080835286256991, 'entropy_lr': 2.703696089275268e-05, 'distill_lr': 0.000333026459067705, 'distill_coef': 0.4378099226820733}. Best is trial 0 with value: -0.02549417889735681.
  return GPSampler(seed=seed)
[I 2025-04-16 01:25:43,861] Trial 1 finished with value: -0.023493190020053588 and parameters: {'gamma': 0.9601751071276555, 'tau': 0.0031046435256995354, 'actor_lr': 0.0001273754763414145, 'critic_lr': 0.0003034934029270385, 'alpha_lr': 0.0005881194460954781, 'entropy_lr': 0.0002020590656339279, 'distill_lr': 1.6896941017587673e-05, 'distill_coef': 0.24886211888447513}. Best is trial 1 with value: -0.023493190020053588.
[I 2025-04-16 09:38:02,580] Trial 2 fin

# Print the best results obtained

In [5]:

# Build a two-column table (Parameter, Value) with one row per entry of sac_best_params
sac_best_params_df = pd.DataFrame(list(sac_best_params.items()), columns=['Parameter', 'Value'])

# Show a plain-text heading, then the table via the notebook's display()

print("Best SAC Parameters:")
display(sac_best_params_df)

Best SAC Parameters:


Unnamed: 0,Parameter,Value
0,gamma,0.960715
1,tau,0.001991
2,actor_lr,2.2e-05
3,critic_lr,0.00014
4,alpha_lr,0.000212
5,entropy_lr,1e-05
6,distill_lr,6.3e-05
7,distill_coef,0.129177


In [None]:
# Save the best SAC parameters.
# setattr stores each value under the attribute named by `param`; plain
# `.param = value` would overwrite a single attribute literally called "param".
for param, value in sac_best_params.items():
    setattr(params['sac_distilled'], param, value)

# Save the updated parameters to the Hyperparameters module.
# repr() renders SimpleNamespace objects as `namespace(...)`, so the generated
# file aliases SimpleNamespace to that name to remain importable.
# NOTE(review): this overwrites the whole file and assumes every stored value
# has a repr that is valid Python source — confirm before running.
with open('Hyperparameters.py', 'w') as f:
    f.write("from types import SimpleNamespace as namespace\n\n")
    f.write(f"HYPERPARAMS = {params!r}\n")

TypeError: 'types.SimpleNamespace' object does not support item assignment