In [1]:
import os

import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE
from PPO_Distillation.DistilledPPOAgent import DistilledPPO
from PPO_Distillation.Trajectories import PPO_ExperienceBuffer
from Hyperparameters import HYPERPARAMS as params
import numpy as np
import torch
import wandb

In [2]:
# Open the Weights & Biases run for this experiment and record the
# PPO-distillation hyperparameters in its config.
wandb.init(
    project="PPO_Distillation",
    entity="fede-",
)
wandb.config.update(params['ppo_distilled'])

# Use the first CUDA device when one is available, otherwise the CPU, and
# store the choice in the run config next to the hyperparameters.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
wandb.config.update({"device": device})

[34m[1mwandb[0m: Currently logged in as: [33mrullofederico16[0m ([33mfede-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [3]:
def relocate_agents(env):
    """Return a new list holding the entries of ``env.agents``.

    Args:
        env: Any object exposing an iterable ``agents`` attribute.

    Returns:
        list: The entries of ``env.agents`` in iteration order. The list is a
        copy, so changing it leaves ``env.agents`` untouched.
    """
    return [agent_id for agent_id in env.agents]

# New helper to extract observation data for an agent
def get_agent_obs(obs, agent):
    """Fetch entries 1 and 2 of one agent's observation as NumPy arrays.

    Args:
        obs: Mapping from agent id to that agent's observation container.
        agent: Key to look up in ``obs``.

    Returns:
        tuple: ``(np.array(entry[1]), np.array(entry[2]))`` where ``entry`` is
        ``obs[agent]``.

    Raises:
        KeyError: If ``agent`` is not a key of ``obs``.
    """
    entry = obs[agent]
    first, second = (np.array(entry[index]) for index in (1, 2))
    return first, second


In [4]:
# Launch the "DroneFlightv1" Unity build with a fixed seed, no side channels
# and graphics disabled.
env = UE(
    file_name="DroneFlightv1",
    seed=1,
    side_channels=[],
    no_graphics=True,
    no_graphics_monitor=True,
)
# Re-wrap the raw Unity environment in the parallel-env adapter; from here on
# `env` refers to the wrapper.
env = UPZBE(env)

In [5]:
# Take a snapshot of the agent ids currently exposed by the environment and
# print them for inspection.
agents = relocate_agents(env)
print(agents)

['Drone?team=0?agent_id=0', 'Drone?team=0?agent_id=1', 'Drone?team=0?agent_id=10', 'Drone?team=0?agent_id=11', 'Drone?team=0?agent_id=2', 'Drone?team=0?agent_id=3', 'Drone?team=0?agent_id=4', 'Drone?team=0?agent_id=5', 'Drone?team=0?agent_id=6', 'Drone?team=0?agent_id=7', 'Drone?team=0?agent_id=8', 'Drone?team=0?agent_id=9']


In [6]:
# Scratch cell (disabled): ad-hoc inspection of one agent's observation,
# a sampled action, and the action/observation space shapes.
# obs = env.reset()
# print(obs[agents[0]][1])
# possible_actions = env.action_space(agents[0]).sample()
# print(f"Possible actions: {possible_actions}")
# print(env.action_space(agents[0]).shape)
# print(env.observation_space(agents[0])[1].shape)
# print(env.observation_space(agents[0])[2].shape)

In [7]:
# Experience buffer built from the shapes of observation components 1 and 2
# and of the action space of the first listed agent, plus the PPO
# hyperparameters.
Buffer = PPO_ExperienceBuffer(
    env.observation_space(agents[0])[1].shape,
    env.observation_space(agents[0])[2].shape,
    env.action_space(agents[0]).shape,
    params['ppo_distilled'],
)

In [8]:
# Policy used for the warm-up rollouts, constructed from the same
# observation/action shapes (taken from the first listed agent) and the PPO
# hyperparameters.
brain = DistilledPPO(
    env.observation_space(agents[0])[1].shape,
    env.observation_space(agents[0])[2].shape,
    env.action_space(agents[0]).shape,
    params['ppo_distilled'],
)

In [9]:
# Warm-up phase: roll out the current `brain` for `seed_episodes` episodes and
# fill `Buffer` with the resulting transitions before the first training pass.
for s in range(1, params['ppo_distilled'].seed_episodes + 1):
    obs, done, t = env.reset(), [False for _ in env.agents], 0
    # Keep stepping while some agent is still not done OR fewer than
    # n_steps_random_exploration actions have been taken. `t` is incremented
    # once per agent action, not once per environment step.
    while not all(done) or t < params['ppo_distilled'].n_steps_random_exploration:
        actions = {}
        log_probs = {}
        values = {}
        # Observation each action was computed from, so the stored transition
        # pairs the action/log-prob/value with the state that produced them
        # instead of with the state returned by the following env.step().
        acted_on = {}
        agents = relocate_agents(env)
        for agent in agents:
            # actions[agent] = env.action_space(agent).sample()
            if agent not in obs:
                continue
            obs1, obs2 = get_agent_obs(obs, agent)
            actions[agent], log_probs[agent], values[agent] = brain.get_action(obs1, obs2)
            acted_on[agent] = (obs1, obs2)
            t += 1

        obs, reward, done, _ = env.step(actions)
        # Only agents that actually acted this step are stored; iterating over
        # `acted_on` avoids a KeyError for agents that first show up in the
        # post-step observations.
        for agent, (obs1, obs2) in acted_on.items():
            if agent not in reward or agent not in done:
                continue
            Buffer.add(
                obs1,
                obs2,
                actions[agent],
                reward[agent],
                done[agent],
                log_prob=log_probs[agent],
                value=values[agent],
            )
        # Collapse the per-agent done mapping into a list for the loop test.
        done = [done[agent] for agent in agents if agent in done]
    print(f'Finished episode {s}')

Buffer.compute_advantages_and_returns()
print("Finished Rnd Exploration")

Finished episode 1
Finished episode 2
Finished episode 3
Finished episode 4
Finished episode 5
Finished Rnd Exploration


In [10]:
# Build a fresh DistilledPPO instance and rebind `brain` to it.
# NOTE(review): this discards the instance that produced the log-probs and
# values stored in Buffer during the warm-up rollouts — confirm that training
# a newly initialised network on that data is intended.
brain = DistilledPPO(
    env.observation_space(agents[0])[1].shape,
    env.observation_space(agents[0])[2].shape,
    env.action_space(agents[0]).shape,
    params['ppo_distilled'],
)

In [11]:
# First training pass on the warm-up data. The step argument is the product
# seed_episodes * n_steps_random_exploration.
brain.train(
    params['ppo_distilled'].seed_episodes * params['ppo_distilled'].n_steps_random_exploration,
    Buffer,
)

  context_layer = torch.nn.functional.scaled_dot_product_attention(


In [12]:
# Main training loop: alternate between collecting a rollout with the current
# policy and training on it until `max_steps` agent actions have been taken.
steps = 0
best_mean_reward = -np.inf  # initialised here but not updated in this cell
not_improved = 0  # initialised here but not updated in this cell
while steps < params['ppo_distilled'].max_steps:
    obs, done, t = env.reset(), [False for _ in env.agents], 0
    # Every per-agent reward stored during this rollout; used for the logged
    # mean so the metric covers the whole rollout, not just its last step.
    rollout_rewards = []
    # Keep stepping while some agent is still not done OR fewer than n_steps
    # actions have been taken. `t` counts one per agent action.
    while not all(done) or t < params['ppo_distilled'].n_steps:
        actions = {}
        log_probs = {}
        values = {}
        # Observation each action was computed from, so the stored transition
        # pairs the action/log-prob/value with the state that produced them
        # instead of with the state returned by the following env.step().
        acted_on = {}
        agents = relocate_agents(env)
        for agent in agents:
            # actions[agent] = env.action_space(agent).sample()
            if agent not in obs:
                continue
            obs1, obs2 = get_agent_obs(obs, agent)
            actions[agent], log_probs[agent], values[agent] = brain.get_action(obs1, obs2)
            acted_on[agent] = (obs1, obs2)
            t += 1

        obs, reward, done, _ = env.step(actions)
        # Only agents that actually acted this step are stored; iterating over
        # `acted_on` avoids a KeyError for agents that first show up in the
        # post-step observations.
        for agent, (obs1, obs2) in acted_on.items():
            if agent not in reward or agent not in done:
                continue
            Buffer.add(
                obs1,
                obs2,
                actions[agent],
                reward[agent],
                done[agent],
                log_prob=log_probs[agent],
                value=values[agent],
            )
            rollout_rewards.append(reward[agent])
        # Collapse the per-agent done mapping into a list for the loop test.
        done = [done[agent] for agent in agents if agent in done]

    # Hand the buffer the last observation (of the last key in `obs`) together
    # with the value the policy assigns to it. The observation goes through
    # get_agent_obs like every other get_action call in this notebook.
    last_agent = list(obs.keys())[-1]
    last_obs1, last_obs2 = get_agent_obs(obs, last_agent)
    _, _, last_values = brain.get_action(last_obs1, last_obs2)
    Buffer.add_final_state(last_obs1, last_obs2, last_values)
    mean_reward = np.mean(rollout_rewards) if rollout_rewards else 0.0

    steps += t

    # Compute advantages/returns for the freshly collected rollout BEFORE
    # training on it, matching the order used for the warm-up data.
    Buffer.compute_advantages_and_returns()
    brain.train(steps, Buffer)

    # Refresh both optimizers through the agent's learning-rate helper.
    brain.optimizer = brain.improv_lr(brain.optimizer, params['ppo_distilled'].lr, steps, params['ppo_distilled'].n_steps)
    brain.optimizer_distill = brain.improv_lr(brain.optimizer_distill, params['ppo_distilled'].lr, steps, params['ppo_distilled'].n_steps)
    wandb.log({"Mean Reward": mean_reward, "Steps": steps})

In [13]:
# Close the environment now that the training loop has finished.
env.close()

In [14]:
# Persist the trained network weights.
# The environment is deliberately not closed here: the preceding cell already
# calls env.close(), and a second close on an already-closed environment is at
# best redundant.
os.makedirs("SavedModels", exist_ok=True)  # torch.save does not create missing directories
torch.save(brain.net.state_dict(), "SavedModels/PPO_distilled_checkpoint.pth")
print("Checkpoint saved successfully.")

Checkpoint saved successfully.


In [15]:
# Export the trained network to ONNX.
# Pick CUDA when available, otherwise the CPU, and move the network there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
brain.net.to(device)

# Put the network in evaluation mode before exporting.
brain.net.eval()

# Random example inputs with a leading dimension of 1, shaped like observation
# components 1 and 2 of the first listed agent.
dummy_input_1, dummy_input_2 = (
    torch.randn(1, *env.observation_space(agents[0])[component].shape).to(device)
    for component in (1, 2)
)

# Write the ONNX file next to the checkpoint, naming the two inputs and the
# single declared output.
torch.onnx.export(
    brain.net,
    (dummy_input_1, dummy_input_2),
    "SavedModels/PPO_distilled.onnx",
    input_names=["observation1", "observation2"],
    output_names=["action"],
    opset_version=10,
    export_params=True,
    do_constant_folding=True,
)
print("Model exported to ONNX format successfully.")

# Drop the example tensors and release cached CUDA memory.
del dummy_input_1, dummy_input_2
torch.cuda.empty_cache()

Model exported to ONNX format successfully.
