In [1]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE
from Dreamer.agent import Dreamer
from Dreamer.memory import ExperienceReplay
import numpy as np
import torch
import cv2

In [2]:
# Start the "DroneFlightv1" Unity build with graphics disabled, then wrap the
# raw environment in the parallel multi-agent interface used by later cells.
unity_env = UE(
    file_name="DroneFlightv1",
    seed=1,
    side_channels=[],
    no_graphics_monitor=True,
    no_graphics=True,
)
env = UPZBE(unity_env)

In [3]:
# Snapshot the agent ids the environment currently exposes.
agents = list(env.agents)
print(agents)

['Drone?team=0?agent_id=0', 'Drone?team=0?agent_id=1']


In [4]:
# Reset once and inspect what the first agent sees: the raw values of sensor 2,
# the action-space shape, and the shapes of sensors 1 and 2.
obs = env.reset()
first_agent = agents[0]
print(obs[first_agent][2])
print(env.action_space(first_agent).shape)
# print(env.observation_space(first_agent)[0].shape)
for sensor_idx in (1, 2):
    print(env.observation_space(first_agent)[sensor_idx].shape)

[ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.   10.    0.   -8.95 20.   25.77  0.   10.    0.   -8.95 20.   25.77]
(6,)
(1, 84, 84)
(24,)


In [5]:
from types import SimpleNamespace

# All tunables for the Dreamer run, exposed as attributes (params.belief_size, ...).
HYPERPARAMS = {
    'dreamer': SimpleNamespace(
        # Model sizes and activations.
        belief_size=200,
        state_size=30,
        hidden_size=200,
        embedding_size=1024,
        dense_act='relu',
        symbolic=False,
        # Shapes read from the live environment (sensor 1 = camera, sensor 2 = vector).
        camera_obs=env.observation_space(agents[0])[1].shape,
        vector_obs=env.observation_space(agents[0])[2].shape[0],
        action_size=env.action_space(agents[0]).shape[0],
        cnn_act='relu',
        pcont=True,
        # Optimisation settings.
        world_lr=1e-3,
        actor_lr=8e-5,
        value_lr=8e-5,
        free_nats=3.0,
        device='cuda',
        reward_scale=1.0,
        pcont_scale=10.0,
        discount=0.99,
        disclam=0.95,
        grad_clip_norm=100.0,
        planning_horizon=15,
        expl_amount=0.3,
        with_logprob=False,
        temp=0.1,
        # Replay buffer and sampling settings.
        batch_size=50,
        bit_depth=5,
        experience_size=100000,
        seed_episodes=10,
        collect_interval=100,
        chunk_size=50,
    ),
}

# Loop lengths used by the exploration and training cells.
n_episodes = 1000
n_steps = 10000
n_steps_rnd_exploration = 10
net_update = 100

In [6]:
params = HYPERPARAMS['dreamer']

# Constructor arguments for the agent: most come from the shared hyperparameter
# namespace, the rest are fixed literals for this run.
dreamer_kwargs = dict(
    algo_name='dreamer',
    deter_dim=params.belief_size,
    stoc_dim=params.state_size,
    mlp_dim=params.hidden_size,
    embedding_dim=params.embedding_size,
    obs_shape=params.camera_obs,
    action_dim=params.action_size,
    mlp_layer=2,
    world_lr=params.world_lr,
    actor_lr=params.actor_lr,
    value_lr=params.value_lr,
    grad_clip_norm=params.grad_clip_norm,
    weight_decay=0.0,
    actor_ent=3.0,
    free_nats=params.free_nats,
    coef_pred=1.0,
    coef_dyn=1.0,
    coef_rep=1.0,
    imag_length=params.planning_horizon,
    device=params.device,
)
brain = Dreamer(**dreamer_kwargs)

In [7]:
# Replay buffer configured from the Dreamer hyperparameters
# (positional order kept exactly as ExperienceReplay is called in this notebook).
replay_args = (
    params.experience_size,
    params.symbolic,
    params.camera_obs,
    params.vector_obs,
    params.action_size,
    params.bit_depth,
    params.device,
)
D = ExperienceReplay(*replay_args)

In [8]:
def relocate_agents():
    """Return a fresh list of the agent ids currently held by the global ``env``."""
    return list(env.agents)

In [9]:
# Fill the replay buffer with transitions produced by uniformly sampled actions.
# The full schedule would be range(1, params.seed_episodes + 1); a single
# episode is collected here.
for s in range(1, 2):
    obs = env.reset()
    done = [False, False, False]
    t = 0
    # Stop only once every agent reports done AND the minimum step count is reached.
    while not (all(done) and t >= n_steps_rnd_exploration):
        agents = relocate_agents()
        actions = {agent: env.action_space(agent).sample() for agent in agents}
        obs, reward, done, _ = env.step(actions)
        for agent in agents:
            if agent not in obs.keys():
                continue
            # An agent's observation is either the sensor list itself or a dict
            # carrying that list under 'observation'.
            agent_obs = obs[agent]
            sensors = agent_obs if isinstance(agent_obs, list) else agent_obs['observation']
            D.add(np.array(sensors[1]), np.array(sensors[2]), actions[agent], reward[agent], done[agent])
        t += 1
        # Collapse the per-agent done dict to a list for the loop condition.
        done = [done[agent] for agent in agents if agent in done.keys()]
    print(f'Finished episode {s}')

print("Finished Rnd Exploration")
env.close()

Finished episode 1
Finished Rnd Exploration


In [10]:
# The exploration cell closed its environment, so launch a new instance of the
# same build for training and refresh the agent list.
unity_env = UE(
    file_name="DroneFlightv1",
    seed=1,
    side_channels=[],
    no_graphics_monitor=True,
    no_graphics=True,
)
env = UPZBE(unity_env)
agents = relocate_agents()

In [11]:
def replay_generator(D, batch_size, chunk_size):
    """Endlessly yield batches drawn from the replay buffer.

    Each ``next()`` performs exactly one ``D.sample(batch_size, chunk_size)``
    call and yields its result; nothing is sampled until the first ``next()``.
    """
    while True:
        batch = D.sample(batch_size, chunk_size)
        yield batch

replay_iter = replay_generator(D, params.batch_size, params.chunk_size)

# Sensor slots inside one agent's observation list. The model's obs_shape and
# the replay buffer shapes are taken from observation_space[1] / [2], and the
# random-exploration cell stores sensors 1 and 2, so the same slots are used here.
# NOTE(review): this cell previously read sensor 0 as the camera - confirm that
# sensor 1 is the intended camera input.
CAMERA_IDX = 1
VECTOR_IDX = 2


def _agent_sensors(agent_obs):
    """Return the per-sensor observation list for a single agent.

    The environment may return either the sensor list itself or a dict that
    carries the list under the 'observation' key.
    """
    if isinstance(agent_obs, list):
        return agent_obs
    return agent_obs['observation']


def _initial_latent():
    """Return zeroed (belief, posterior_state, previous_action) tensors for one agent."""
    return (
        torch.zeros(1, params.belief_size, device=params.device),
        torch.zeros(1, params.state_size, device=params.device),
        torch.zeros(1, params.action_size, device=params.device),
    )


# NOTE(review): the recorded output of this cell is a TypeError raised from
# TransitionModel.forward() inside the Dreamer package. That call site is not
# visible in this notebook, so it is not addressed by the changes below.
for episode in range(n_episodes):
    agents = relocate_agents()
    # Update the agent from replayed chunks before collecting new experience.
    metrics = brain.update(replay_iter)
    losses = tuple(zip(*metrics))  # kept for interactive inspection
    with torch.no_grad():
        obs, tot_reward = env.reset(), 0
        # Recurrent state is tracked per agent: sharing a single belief/posterior
        # would make each agent's inference start from another agent's state.
        latents = {}

        for step in range(n_steps):
            agents = relocate_agents()
            actions = {}
            for agent in agents:
                # Same guard as the exploration cell: skip agents with no observation.
                if agent not in obs:
                    continue
                if agent not in latents:
                    latents[agent] = _initial_latent()
                belief, posterior_state, action = latents[agent]
                camera_obs = _agent_sensors(obs[agent])[CAMERA_IDX]
                belief, posterior_state = brain.infer_state(camera_obs, action, belief, posterior_state)
                action = brain.select_action((belief, posterior_state), deterministic=True)
                latents[agent] = (belief, posterior_state, action)
                # The action space has shape (action_size,), so pass the whole
                # action vector rather than a single argmax index.
                # NOTE(review): assumes select_action returns one action per call
                # (action_size elements in total) - confirm.
                actions[agent] = action.detach().cpu().numpy().reshape(-1)

            next_obs, reward, done, _ = env.step(actions)
            tot_reward += sum(reward.values())

            # Store one transition for every agent that acted this step. Iterating
            # the agents that acted (not the post-step agent list) keeps the final
            # transition of agents that are no longer listed after finishing.
            for agent, agent_action in actions.items():
                if agent not in reward or agent not in done:
                    continue
                sensors = _agent_sensors(obs[agent])
                # Same five-argument form as the exploration cell:
                # camera, vector, action, reward, done.
                D.add(
                    np.array(sensors[CAMERA_IDX]),
                    np.array(sensors[VECTOR_IDX]),
                    agent_action,
                    reward[agent],
                    done[agent],
                )

            # Reset when every reported agent is done, regardless of how many
            # agents finished on this step.
            if done and all(done.values()):
                next_obs = env.reset()
                latents.clear()  # new environment episode: drop stale recurrent state
            obs = next_obs

    print(f"Episode {episode} finished with reward {tot_reward}")

TypeError: TransitionModel.forward() missing 2 required positional arguments: 'observations' and 'nonterminals'