In [1]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE
from PPO_Distillation.DistilledPPOAgent import DistilledPPO
from PPO_Distillation.Trajectories import ExperienceBuffer
from PPO_Distillation.Hyperparameters import HYPERPARAMS as params
import numpy as np
import torch
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wandb.init(project="PPO_Distillation", entity="fede-")
wandb.config.update(params['ppo_distilled'])
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
wandb.config.update({"device": device})

[34m[1mwandb[0m: Currently logged in as: [33mrullofederico16[0m ([33mfede-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [3]:
def relocate_agents(env):
    return list(env.agents)  # simplified

# New helper to extract observation data for an agent
def get_agent_obs(obs, agent):
    agent_data = obs[agent]
    return np.array(agent_data[1]), np.array(agent_data[2])


## Hyperparameter Search

In [4]:
import optuna

def objective(trial):
    # Define the hyperparameters to optimize
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    gamma = trial.suggest_float('gamma', 0.9, 0.999, step=0.001)
    entropy_coef = trial.suggest_float('entropy_coef', 0.01, 0.1, step=0.01)
    value_loss_coef = trial.suggest_float('value_loss_coef', 0.1, 1.0, step=0.1)
    clip_grad_norm = trial.suggest_float('clip_grad_norm', 0.1, 1.0, step=0.1)
    action_std = trial.suggest_float('action_std', 0.1, 1.0, step=0.1)

    # Update the hyperparameters in the configuration
    params['ppo_distilled'].lr = lr
    params['ppo_distilled'].gamma = gamma
    params['ppo_distilled'].entropy_coef = entropy_coef
    params['ppo_distilled'].value_loss_coef = value_loss_coef
    params['ppo_distilled'].clip_grad_norm = clip_grad_norm
    params['ppo_distilled'].action_std = action_std

    # Initialize the agent
    env = UE(file_name="DroneFlightv1", seed=1, side_channels=[], no_graphics_monitor=True, no_graphics=True)
    env = UPZBE(env)
    agents = relocate_agents(env)
    brain = DistilledPPO(env.observation_space(agents[0])[1].shape, env.observation_space(agents[0])[2].shape, env.action_space(agents[0]).shape, params['ppo_distilled'])
    Buffer = ExperienceBuffer(env.observation_space(agents[0])[1].shape, env.observation_space(agents[0])[2].shape, env.action_space(agents[0]).shape, params['ppo_distilled'])

    # Training loop
    steps = 0
    while steps < 100000:
        obs, done, t = env.reset(), [False for _ in env.agents], 0
        while not all(done) or t < params['ppo_distilled'].n_steps:
            actions, log_probs, values = {}, {}, {}
            agents = relocate_agents(env)
            for agent in agents:
                if agent not in obs:
                    continue
                obs1, obs2 = get_agent_obs(obs, agent)
                actions[agent], log_probs[agent], values[agent] = brain.get_action(obs1, obs2)
                t += 1

            obs, reward, done, _ = env.step(actions)
            for agent in agents:
                if agent not in obs:
                    continue
                obs1, obs2 = get_agent_obs(obs, agent)
                Buffer.add(obs1, obs2, actions[agent], reward[agent], done[agent], log_prob=log_probs[agent], value=values[agent])
            done = [done[agent] for agent in agents if agent in done]
            tot_reward = [reward[agent] for agent in agents if agent in reward]
        
        obs_keys = list(obs.keys())
        _, _, last_values = brain.get_action(obs[obs_keys[-1]][1], obs[obs_keys[-1]][2])
        Buffer.add_final_state(obs[obs_keys[-1]][1], obs[obs_keys[-1]][2], last_values)
        mean_reward = np.mean(tot_reward)
        
        steps += t

        brain.train(steps, Buffer)
        Buffer.compute_advantages_and_returns()
        brain.optimizer = brain.improv_lr(brain.optimizer, params['ppo_distilled'].lr, steps, params['ppo_distilled'].n_steps)
        brain.optimizer_distill = brain.improv_lr(brain.optimizer_distill, params['ppo_distilled'].lr, steps, params['ppo_distilled'].n_steps)
    
    env.close()
    return mean_reward


In [5]:
import optunahub

module = optunahub.load_module(package='samplers/auto_sampler')
best_params = {}
study = optuna.create_study(direction='maximize', sampler=module.AutoSampler())
study.optimize(objective, n_trials=10)
best_params = study.best_params

[I 2025-02-19 23:57:09,305] A new study created in memory with name: no-name-9de0f2bb-bff9-4e4c-97df-c7a52bb97d32
  context_layer = torch.nn.functional.scaled_dot_product_attention(
[I 2025-02-20 00:18:28,236] Trial 0 finished with value: -0.29206669330596924 and parameters: {'lr': 1.4235847482036186e-05, 'gamma': 0.909, 'entropy_coef': 0.09999999999999999, 'value_loss_coef': 1.0, 'clip_grad_norm': 0.8, 'action_std': 0.8}. Best is trial 0 with value: -0.29206669330596924.
  return GPSampler(seed=seed)
[I 2025-02-20 00:39:56,797] Trial 1 finished with value: -0.29206669330596924 and parameters: {'lr': 0.0006193430342324773, 'gamma': 0.97, 'entropy_coef': 0.09999999999999999, 'value_loss_coef': 0.6, 'clip_grad_norm': 0.1, 'action_std': 0.9}. Best is trial 0 with value: -0.29206669330596924.
[I 2025-02-20 01:01:53,822] Trial 2 finished with value: -0.16706667840480804 and parameters: {'lr': 2.0351082291802278e-05, 'gamma': 0.907, 'entropy_coef': 0.04, 'value_loss_coef': 0.2, 'clip_grad_no



[W 2025-02-20 08:51:30,161] Trial 4 failed with parameters: {'lr': 0.0007267685565859478, 'gamma': 0.9610000000000001, 'entropy_coef': 0.02, 'value_loss_coef': 0.8, 'clip_grad_norm': 0.9, 'action_std': 0.30000000000000004} because of the following error: UnityTimeOutException('The Unity environment took too long to respond. Make sure that :\n\t The environment does not need user interaction to launch\n\t The Agents\' Behavior Parameters > Behavior Type is set to "Default"\n\t The environment and the Python interface have compatible versions.\n\t If you\'re running on a headless server without graphics support, turn off display by either passing --no-graphics option or build your Unity executable as server build.').
Traceback (most recent call last):
  File "c:\Users\rullo\anaconda3\envs\ml_agents\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\rullo\AppData\Local\Temp\ipykernel_7720\514747813.py", line 21, in obje

UnityTimeOutException: The Unity environment took too long to respond. Make sure that :
	 The environment does not need user interaction to launch
	 The Agents' Behavior Parameters > Behavior Type is set to "Default"
	 The environment and the Python interface have compatible versions.
	 If you're running on a headless server without graphics support, turn off display by either passing --no-graphics option or build your Unity executable as server build.

In [4]:
env = UE(file_name="DroneFlightv1", seed=1, side_channels=[], no_graphics_monitor=True, no_graphics=True)
env = UPZBE(env)

In [5]:
agents = relocate_agents(env)
print(agents)

['Drone?team=0?agent_id=0', 'Drone?team=0?agent_id=1', 'Drone?team=0?agent_id=10', 'Drone?team=0?agent_id=11', 'Drone?team=0?agent_id=2', 'Drone?team=0?agent_id=3', 'Drone?team=0?agent_id=4', 'Drone?team=0?agent_id=5', 'Drone?team=0?agent_id=6', 'Drone?team=0?agent_id=7', 'Drone?team=0?agent_id=8', 'Drone?team=0?agent_id=9']


In [6]:
# obs = env.reset()
# print(obs[agents[0]][1])
# possible_actions = env.action_space(agents[0]).sample()
# print(f"Possible actions: {possible_actions}")
# print(env.action_space(agents[0]).shape)
# print(env.observation_space(agents[0])[1].shape)
# print(env.observation_space(agents[0])[2].shape)

In [7]:
Buffer = ExperienceBuffer(env.observation_space(agents[0])[1].shape, env.observation_space(agents[0])[2].shape,env.action_space(agents[0]).shape, params['ppo_distilled'])

In [8]:
brain = DistilledPPO(env.observation_space(agents[0])[1].shape, env.observation_space(agents[0])[2].shape, env.action_space(agents[0]).shape, params['ppo_distilled'])

In [9]:
for s in range(1, params['ppo_distilled'].seed_episodes + 1):
    obs, done, t = env.reset(), [False for _ in env.agents], 0
    while not all(done) or t < params['ppo_distilled'].n_steps_random_exploration:
        actions = {}
        log_probs = {}
        values = {}
        agents = relocate_agents(env)
        for agent in agents:
            # actions[agent] = env.action_space(agent).sample()
            if agent not in obs.keys():
                continue
            obs1, obs2 = get_agent_obs(obs, agent)
            actions[agent], log_probs[agent], values[agent] = brain.get_action(obs1, obs2)
            t+=1

        obs, reward, done, _ = env.step(actions)
        for agent in agents:
            if agent not in obs.keys():
                continue
            obs1, obs2 = get_agent_obs(obs, agent)
            Buffer.add(obs1, obs2, actions[agent], reward[agent], done[agent], log_prob=log_probs[agent], value=values[agent])
        done = [done[agent] for agent in agents if agent in done.keys()]
    print(f'Finished episode {s}')

Buffer.compute_advantages_and_returns()
print("Finished Rnd Exploration")
# env.close()

Finished episode 1
Finished episode 2
Finished episode 3
Finished episode 4
Finished episode 5
Finished Rnd Exploration


In [10]:
brain = DistilledPPO(env.observation_space(agents[0])[1].shape, env.observation_space(agents[0])[2].shape, env.action_space(agents[0]).shape, params['ppo_distilled'])

In [11]:
brain.fine_tune_teacher(Buffer)

  context_layer = torch.nn.functional.scaled_dot_product_attention(


In [None]:
steps = 0
best_mean_reward = -np.inf
not_improved = 0
while steps < params['ppo_distilled'].max_steps:
    obs, done, t = env.reset(), [False for _ in env.agents], 0
    episode_reward = 0
    while not all(done) or t < params['ppo_distilled'].n_steps:
        actions = {}
        log_probs = {}
        values = {}
        agents = relocate_agents(env)
        for agent in agents:
            # actions[agent] = env.action_space(agent).sample()
            if agent not in obs.keys():
                continue
            obs1, obs2 = get_agent_obs(obs, agent)
            actions[agent], log_probs[agent], values[agent] = brain.get_action(obs1, obs2)
            t += 1

        obs, reward, done, _ = env.step(actions)
        for agent in agents:
            if agent not in obs.keys():
                continue
            obs1, obs2 = get_agent_obs(obs, agent)
            Buffer.add(obs1, obs2, actions[agent], reward[agent], done[agent], log_prob=log_probs[agent], value=values[agent])
        done = [done[agent] for agent in agents if agent in done.keys()]
        tot_reward = [reward[agent] for agent in agents if agent in reward.keys()]
    obs_keys = list(obs.keys())
    _, _, last_values = brain.get_action(obs[obs_keys[-1]][1], obs[obs_keys[-1]][2])
    Buffer.add_final_state(obs[obs_keys[-1]][1], obs[obs_keys[-1]][2], last_values)
    mean_reward = np.mean(tot_reward)
    
    steps += t

    brain.train(steps, Buffer) 
    
    Buffer.compute_advantages_and_returns()
    brain.optimizer = brain.improv_lr(brain.optimizer, params['ppo_distilled'].lr,steps, params['ppo_distilled'].n_steps)
    brain.optimizer_distill = brain.improv_lr(brain.optimizer_distill, params['ppo_distilled'].lr,steps, params['ppo_distilled'].n_steps)
    wandb.log({"Mean Reward": mean_reward, "Steps": steps})

In [None]:
env.close()

In [None]:
# torch.save(brain.net.state_dict(), "PPO_distilled_checkpoint.pth")
# print("Checkpoint saved successfully.")
# env.close()

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# brain.net.to(device)

# # Ensure the model is in evaluation mode
# brain.net.eval()

# # Create dummy input matching the expected input format of the model
# dummy_input_1 = torch.randn(1, *env.observation_space(agents[0])[1].shape).to(device)
# dummy_input_2 = torch.randn(1, *env.observation_space(agents[0])[2].shape).to(device)

# # Export the model to ONNX format
# torch.onnx.export(
#     brain.net,
#     (dummy_input_1, dummy_input_2),
#     "PPO_distilled.onnx",
#     export_params=True,
#     opset_version=10,
#     do_constant_folding=True,
#     input_names=["observation1", "observation2"],
#     output_names=["action"],
# )
# print("Model exported to ONNX format successfully.")

# # Dispose of the dummy input tensors
# del dummy_input_1
# del dummy_input_2
# torch.cuda.empty_cache()