# Hyperparameter tuning with Optuna for PPO and SAC

In [None]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE
from PPO_Distillation.DistilledPPOAgent import DistilledPPO
from PPO_Distillation.Trajectories import PPO_ExperienceBuffer
from Hyperparameters import HYPERPARAMS as params
from SAC_Distillation.DistilledSACAgent import DistilledSAC
from SAC_Distillation.Trajectories import SAC_ExperienceBuffer
import numpy as np
import torch
import wandb
import optuna
import optunahub
import pandas as pd

In [None]:
def relocate_agents(env):
    """Return a new list holding the agents currently listed in ``env.agents``.

    This is the simplified version: it only snapshots ``env.agents`` into a
    fresh list, so callers can iterate it independently of the environment.
    """
    current_agents = [*env.agents]
    return current_agents

# New helper to extract observation data for an agent
def get_agent_obs(obs, agent):
    """Return the two observation components stored for ``agent``.

    Looks up ``obs[agent]`` and converts its entries at index 1 and index 2
    to NumPy arrays.

    Args:
        obs: Mapping from agent identifier to that agent's observation entry.
        agent: Key of the agent whose observation is wanted.

    Returns:
        A ``(first, second)`` tuple of ``np.ndarray`` built from entries 1
        and 2 of the agent's observation.
    """
    entry = obs[agent]
    first_component = np.array(entry[1])
    second_component = np.array(entry[2])
    return first_component, second_component


## PPO Hyperparameter Optimization

In [None]:
def ppo_objective(trial):
    """Optuna objective: train a distilled PPO agent and score the trial.

    Samples ``lr``, ``gamma``, ``entropy_coef``, ``value_loss_coef``,
    ``clip_grad_norm`` and ``action_std`` from ``trial``, writes them onto the
    module-level ``params['ppo_distilled']`` config (the mutation persists
    after the trial), then trains on the ``DroneFlightv1`` Unity build until
    the step counter reaches 100000.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``np.mean`` of the per-agent rewards from the last environment step
        of the last rollout.
    """
    config = params['ppo_distilled']

    # Sample every hyperparameter first, then apply them to the shared config.
    sampled = {
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),
        'gamma': trial.suggest_float('gamma', 0.9, 0.999, step=0.001),
        'entropy_coef': trial.suggest_float('entropy_coef', 0.01, 0.1, step=0.01),
        'value_loss_coef': trial.suggest_float('value_loss_coef', 0.1, 1.0, step=0.1),
        'clip_grad_norm': trial.suggest_float('clip_grad_norm', 0.1, 1.0, step=0.1),
        'action_std': trial.suggest_float('action_std', 0.1, 1.0, step=0.1),
    }
    for name, value in sampled.items():
        setattr(config, name, value)

    env = UE(file_name="DroneFlightv1", seed=1, side_channels=[], no_graphics_monitor=True, no_graphics=True)
    # Everything after the Unity process is launched runs under try/finally so
    # that a failing trial still shuts the environment down; otherwise the
    # leaked process can prevent the next trial from starting its own.
    try:
        env = UPZBE(env)
        agents = relocate_agents(env)

        # All agents are assumed to share the spaces of the first one.
        obs_space = env.observation_space(agents[0])
        action_shape = env.action_space(agents[0]).shape
        brain = DistilledPPO(obs_space[1].shape, obs_space[2].shape, action_shape, config)
        buffer = PPO_ExperienceBuffer(obs_space[1].shape, obs_space[2].shape, action_shape, config)

        # Training loop
        steps = 0
        while steps < 100000:
            obs = env.reset()
            done = [False for _ in env.agents]
            t = 0
            # NOTE(review): with `or`, the rollout only ends once every agent
            # is done AND at least n_steps actions were taken -- confirm that
            # `and` was not intended.
            while not all(done) or t < config.n_steps:
                actions, log_probs, values = {}, {}, {}
                agents = relocate_agents(env)
                for agent in agents:
                    if agent not in obs:
                        continue
                    obs1, obs2 = get_agent_obs(obs, agent)
                    actions[agent], log_probs[agent], values[agent] = brain.get_action(obs1, obs2)
                    t += 1  # counts one step per acting agent

                obs, reward, done, _ = env.step(actions)
                for agent in agents:
                    if agent not in obs:
                        continue
                    # NOTE(review): the observation stored here is the one
                    # returned by step(), paired with the action chosen before
                    # it -- confirm this matches what the buffer expects.
                    obs1, obs2 = get_agent_obs(obs, agent)
                    buffer.add(obs1, obs2, actions[agent], reward[agent], done[agent],
                               log_prob=log_probs[agent], value=values[agent])
                done = [done[agent] for agent in agents if agent in done]
                tot_reward = [reward[agent] for agent in agents if agent in reward]

            # Bootstrap from the last agent key present in the final observation.
            final_obs = obs[list(obs.keys())[-1]]
            _, _, last_values = brain.get_action(final_obs[1], final_obs[2])
            buffer.add_final_state(final_obs[1], final_obs[2], last_values)
            mean_reward = np.mean(tot_reward)

            steps += t

            # NOTE(review): train() runs before compute_advantages_and_returns();
            # confirm this ordering is intended.
            brain.train(steps, buffer)
            buffer.compute_advantages_and_returns()
            brain.optimizer = brain.improv_lr(brain.optimizer, config.lr, steps, config.n_steps)
            brain.optimizer_distill = brain.improv_lr(brain.optimizer_distill, config.lr, steps, config.n_steps)

        return mean_reward
    finally:
        env.close()


## SAC Hyperparameter Optimization

In [None]:
def sac_objective(trial):
    """Optuna objective: train a distilled SAC agent and score the trial.

    Samples ``lr``, ``gamma``, ``entropy_coef``, ``value_loss_coef``,
    ``clip_grad_norm`` and ``action_std`` from ``trial``, writes them onto the
    module-level ``params['sac_distilled']`` config (the mutation persists
    after the trial), then trains on the ``DroneFlightv1`` Unity build until
    the step counter reaches 100000.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``np.mean`` of the per-agent rewards from the last environment step
        of the last rollout.
    """
    config = params['sac_distilled']

    # Sample every hyperparameter first, then apply them to the shared config.
    sampled = {
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),
        'gamma': trial.suggest_float('gamma', 0.9, 0.999, step=0.001),
        'entropy_coef': trial.suggest_float('entropy_coef', 0.01, 0.1, step=0.01),
        'value_loss_coef': trial.suggest_float('value_loss_coef', 0.1, 1.0, step=0.1),
        'clip_grad_norm': trial.suggest_float('clip_grad_norm', 0.1, 1.0, step=0.1),
        'action_std': trial.suggest_float('action_std', 0.1, 1.0, step=0.1),
    }
    for name, value in sampled.items():
        setattr(config, name, value)

    env = UE(file_name="DroneFlightv1", seed=1, side_channels=[], no_graphics_monitor=True, no_graphics=True)
    # Everything after the Unity process is launched runs under try/finally so
    # that a failing trial still shuts the environment down; otherwise the
    # leaked process can prevent the next trial from starting its own.
    try:
        env = UPZBE(env)
        agents = relocate_agents(env)

        # All agents are assumed to share the spaces of the first one.
        obs_space = env.observation_space(agents[0])
        action_shape = env.action_space(agents[0]).shape

        # Use the SAC agent and buffer: this objective previously instantiated
        # the PPO classes, so the "SAC" study was silently tuning PPO.
        # NOTE(review): assumes DistilledSAC / SAC_ExperienceBuffer take the
        # same constructor arguments and expose the same methods used below
        # as their PPO counterparts -- confirm against SAC_Distillation.
        brain = DistilledSAC(obs_space[1].shape, obs_space[2].shape, action_shape, config)
        buffer = SAC_ExperienceBuffer(obs_space[1].shape, obs_space[2].shape, action_shape, config)

        # Training loop
        steps = 0
        while steps < 100000:
            obs = env.reset()
            done = [False for _ in env.agents]
            t = 0
            # NOTE(review): with `or`, the rollout only ends once every agent
            # is done AND at least n_steps actions were taken -- confirm that
            # `and` was not intended.
            while not all(done) or t < config.n_steps:
                actions, log_probs, values = {}, {}, {}
                agents = relocate_agents(env)
                for agent in agents:
                    if agent not in obs:
                        continue
                    obs1, obs2 = get_agent_obs(obs, agent)
                    actions[agent], log_probs[agent], values[agent] = brain.get_action(obs1, obs2)
                    t += 1  # counts one step per acting agent

                obs, reward, done, _ = env.step(actions)
                for agent in agents:
                    if agent not in obs:
                        continue
                    # NOTE(review): the observation stored here is the one
                    # returned by step(), paired with the action chosen before
                    # it -- confirm this matches what the buffer expects.
                    obs1, obs2 = get_agent_obs(obs, agent)
                    buffer.add(obs1, obs2, actions[agent], reward[agent], done[agent],
                               log_prob=log_probs[agent], value=values[agent])
                done = [done[agent] for agent in agents if agent in done]
                tot_reward = [reward[agent] for agent in agents if agent in reward]

            # Bootstrap from the last agent key present in the final observation.
            final_obs = obs[list(obs.keys())[-1]]
            _, _, last_values = brain.get_action(final_obs[1], final_obs[2])
            buffer.add_final_state(final_obs[1], final_obs[2], last_values)
            mean_reward = np.mean(tot_reward)

            steps += t

            # NOTE(review): train() runs before compute_advantages_and_returns();
            # confirm this ordering is intended.
            brain.train(steps, buffer)
            buffer.compute_advantages_and_returns()
            brain.optimizer = brain.improv_lr(brain.optimizer, config.lr, steps, config.n_steps)
            brain.optimizer_distill = brain.improv_lr(brain.optimizer_distill, config.lr, steps, config.n_steps)

        return mean_reward
    finally:
        env.close()

# Run hyperparameter tuning with Optuna for PPO and SAC

In [None]:
# Load the AutoSampler package from OptunaHub.
module = optunahub.load_module(package='samplers/auto_sampler')
best_params = {}

# One maximizing study per algorithm, each with its own AutoSampler instance.
ppo_study = optuna.create_study(
    study_name='ppo_distillation',
    sampler=module.AutoSampler(),
    direction='maximize',
)
sac_study = optuna.create_study(
    study_name='sac_distillation',
    sampler=module.AutoSampler(),
    direction='maximize',
)

# Tune PPO first, then SAC (10 trials each), keeping each study's best
# hyperparameters for the cells below.
ppo_study.optimize(ppo_objective, n_trials=10)
ppo_best_params = ppo_study.best_params

sac_study.optimize(sac_objective, n_trials=10)
sac_best_params = sac_study.best_params

# Print the best results obtained

In [None]:

# Tabulate each study's best hyperparameters as (Parameter, Value) rows.
ppo_best_params_df = pd.DataFrame(
    [*ppo_best_params.items()],
    columns=['Parameter', 'Value'],
)
sac_best_params_df = pd.DataFrame(
    [*sac_best_params.items()],
    columns=['Parameter', 'Value'],
)

# Show both tables; `display` is the rich-output helper provided by the
# notebook environment.
print("Best PPO Parameters:")
display(ppo_best_params_df)

print("Best SAC Parameters:")
display(sac_best_params_df)

In [None]:
# Write the best values found by each study back into its config section:
# PPO first, then SAC.
for section, best in (
    ('ppo_distilled', ppo_best_params),
    ('sac_distilled', sac_best_params),
):
    for param, value in best.items():
        params[section][param] = value

# Persist the updated configuration by rewriting the Hyperparameters module.
# NOTE(review): this replaces the whole of Hyperparameters.py with a single
# `HYPERPARAMS = <str(params)>` line; confirm that the string form of `params`
# is valid, importable Python before relying on the rewritten module.
with open('Hyperparameters.py', 'w') as f:
    f.write(f"HYPERPARAMS = {params}")