In [1]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE
from PPO_Distillation.DistilledPPOAgent import DistilledPPO
from PPO_Distillation.Trajectories import ExperienceBuffer
from PPO_Distillation.Hyperparameters import HYPERPARAMS as params
import numpy as np
import torch
import cv2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
env = UE(file_name="DroneFlightv1", seed=1, side_channels=[], no_graphics_monitor=True, no_graphics=True)
env = UPZBE(env)

In [3]:
def relocate_agents():
    return [agent for agent in env.agents]

In [4]:
agents = relocate_agents()
print(agents)

['Drone?team=0?agent_id=0', 'Drone?team=0?agent_id=1', 'Drone?team=0?agent_id=10', 'Drone?team=0?agent_id=11', 'Drone?team=0?agent_id=2', 'Drone?team=0?agent_id=3', 'Drone?team=0?agent_id=4', 'Drone?team=0?agent_id=5', 'Drone?team=0?agent_id=6', 'Drone?team=0?agent_id=7', 'Drone?team=0?agent_id=8', 'Drone?team=0?agent_id=9']


In [5]:
obs = env.reset()
print(obs[agents[0]][1])
possible_actions = env.action_space(agents[0]).sample()
print(f"Possible actions: {possible_actions}")
print(env.action_space(agents[0]).shape)
print(env.observation_space(agents[0])[1].shape)
print(env.observation_space(agents[0])[2].shape)

[[[0.8039215 0.8039215 0.8039215 ... 0.8039215 0.8039215 0.8039215]
  [0.8039215 0.8039215 0.8039215 ... 0.8039215 0.8039215 0.8039215]
  [0.8039215 0.8039215 0.8039215 ... 0.8039215 0.8039215 0.8039215]
  ...
  [0.8039215 0.8039215 0.8039215 ... 0.8039215 0.8039215 0.8039215]
  [0.8039215 0.8039215 0.8039215 ... 0.8039215 0.8039215 0.8039215]
  [0.8039215 0.8039215 0.8039215 ... 0.8039215 0.8039215 0.8039215]]]
Possible actions: [ 0  0 -1 -1 -1  0]
(6,)
(1, 84, 84)
(24,)


In [6]:
Buffer = ExperienceBuffer(env.observation_space(agents[0])[1].shape, env.observation_space(agents[0])[2].shape,env.action_space(agents[0]).shape, params['ppo_distilled'])

In [7]:
brain = DistilledPPO(env.observation_space(agents[0])[1].shape, env.observation_space(agents[0])[2].shape, env.action_space(agents[0]).shape, params['ppo_distilled'])

In [8]:
for s in range(1, params['ppo_distilled'].seed_episodes + 1):
    obs, done, t = env.reset(), [False for _ in env.agents], 0
    while not all(done) or t < params['ppo_distilled'].n_steps_random_exploration:
        actions = {}
        log_probs = {}
        values = {}
        agents = relocate_agents()
        for agent in agents:
            # actions[agent] = env.action_space(agent).sample()
            if agent not in obs.keys():
                continue
            if isinstance(obs[agent], list):
                actions[agent], log_probs[agent], values[agent] = brain.get_action(obs[agent][1], obs[agent][2])
            else:
                actions[agent], log_probs[agent], values[agent] = brain.get_action(obs[agent]['observation'][1], obs[agent]['observation'][2])
            t+=1

        obs, reward, done, _ = env.step(actions)
        for agent in agents:
            if agent not in obs.keys():
                continue
            if isinstance(obs[agent], list):
                Buffer.add(np.array(obs[agent][1]), np.array(obs[agent][2]), actions[agent], reward[agent], done[agent], log_prob=log_probs[agent], value=values[agent])
            else:
                Buffer.add(np.array(obs[agent]['observation'][1]),np.array(obs[agent]['observation'][2]), actions[agent], reward[agent], done[agent], log_prob=log_probs[agent], value=values[agent])
        done = [done[agent] for agent in agents if agent in done.keys()]
    print(f'Finished episode {s}')

Buffer.compute_advantages_and_returns()
print("Finished Rnd Exploration")
env.close()

Finished episode 1
Finished episode 2
Finished episode 3
Finished episode 4
Finished episode 5
Finished Rnd Exploration


In [9]:
env.close()
env = UE(file_name="DroneFlightv1", seed=1, side_channels=[], no_graphics_monitor=True, no_graphics=True)
env = UPZBE(env)
agents = relocate_agents()

In [10]:
agents = relocate_agents()
brain.train(1, Buffer)

  context_layer = torch.nn.functional.scaled_dot_product_attention(


In [13]:
steps = 0
while steps < params['ppo_distilled'].max_steps:
    obs, done, t = env.reset(), [False for _ in env.agents], 0
    episode_reward = 0
    while not all(done) or t < params['ppo_distilled'].n_steps:
        actions = {}
        log_probs = {}
        values = {}
        agents = relocate_agents()
        for agent in agents:
            # actions[agent] = env.action_space(agent).sample()
            if agent not in obs.keys():
                continue
            if isinstance(obs[agent], list):
                actions[agent], log_probs[agent], values[agent] = brain.get_action(obs[agent][1], obs[agent][2])
            else:
                actions[agent], log_probs[agent], values[agent] = brain.get_action(obs[agent]['observation'][1], obs[agent]['observation'][2])
            t += 1

        obs, reward, done, _ = env.step(actions)
        for agent in agents:
            if agent not in obs.keys():
                continue
            if isinstance(obs[agent], list):
                Buffer.add(np.array(obs[agent][1]), np.array(obs[agent][2]), actions[agent], reward[agent], done[agent], log_prob=log_probs[agent], value=values[agent])
            else:
                Buffer.add(np.array(obs[agent]['observation'][1]),np.array(obs[agent]['observation'][2]), actions[agent], reward[agent], done[agent], log_prob=log_probs[agent], value=values[agent])
        done = [done[agent] for agent in agents if agent in done.keys()]
        tot_reward = [reward[agent] for agent in agents if agent in reward.keys()]
    obs_keys = list(obs.keys())
    _, _, last_values = brain.get_action(obs[obs_keys[-1]][1], obs[obs_keys[-1]][2])
    Buffer.add_final_state(obs[obs_keys[-1]][1], obs[obs_keys[-1]][2], last_values)
    mean_reward = np.mean(tot_reward)
    
    steps += t

    brain.train(steps, Buffer)
    Buffer.compute_advantages_and_returns()
    brain.optimizer = brain.improv_lr(brain.optimizer, params['ppo_distilled'].lr,steps, params['ppo_distilled'].n_steps)
    # brain.optimizer_distill = brain.improv_lr(brain.optimizer_distill, params['ppo_distilled'].lr,steps, params['ppo_distilled'].n_steps)
    print(f'Finished episode {steps}')
    print(f"Mean reward: {mean_reward}")

Finished episode 18926
Mean reward: -0.16706667840480804
Finished episode 42926
Mean reward: -0.2920667231082916
Finished episode 66926
Mean reward: -0.2920667231082916
Finished episode 90926
Mean reward: -0.25040003657341003
Finished episode 114926
Mean reward: -0.12539999186992645
Finished episode 138926
Mean reward: -0.20873336493968964
Finished episode 162926
Mean reward: -0.16706667840480804
Finished episode 186926
Mean reward: -0.12540000677108765
Finished episode 210926
Mean reward: -0.41706669330596924
Finished episode 234926
Mean reward: -0.2920667231082916
Finished episode 258926
Mean reward: -0.25040003657341003
Finished episode 282926
Mean reward: -0.2920667231082916
Finished episode 306926
Mean reward: -0.29206669330596924
Finished episode 330926
Mean reward: -0.16706670820713043
Finished episode 354926
Mean reward: -0.16706669330596924
Finished episode 378926
Mean reward: -0.25040003657341003
Finished episode 402926
Mean reward: -0.16706669330596924
Finished episode 42692

In [15]:
torch.save(brain.net.state_dict(), "PPO_distilled_checkpoint.pth")
print("Checkpoint saved successfully.")
env.close()

Checkpoint saved successfully.


In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# brain.net.to(device)

# # Ensure the model is in evaluation mode
# brain.net.eval()

# # Create dummy input matching the expected input format of the model
# dummy_input_1 = torch.randn(1, *env.observation_space(agents[0])[1].shape).to(device)
# dummy_input_2 = torch.randn(1, *env.observation_space(agents[0])[2].shape).to(device)

# # Export the model to ONNX format
# torch.onnx.export(
#     brain.net,
#     (dummy_input_1, dummy_input_2),
#     "PPO_distilled.onnx",
#     export_params=True,
#     opset_version=10,
#     do_constant_folding=True,
#     input_names=["observation1", "observation2"],
#     output_names=["action"],
# )
# print("Model exported to ONNX format successfully.")

# # Dispose of the dummy input tensors
# del dummy_input_1
# del dummy_input_2
# torch.cuda.empty_cache()

Model exported to ONNX format successfully.
