In [1]:
import mlagents
from mlagents_envs.environment import UnityEnvironment as UE
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv as UPZBE
from SAC_Distillation.DistilledSACAgent import DistilledSAC
from SAC_Distillation.Trajectories import SAC_ExperienceBuffer
from Hyperparameters import HYPERPARAMS as params
import numpy as np
import torch
import wandb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"]="1" # for debugging purposes

In [3]:
def relocate_agents(env):
    agents = list(set(env.agents))
    return agents  # simplified

# New helper to extract observation data for an agent
def get_agent_obs(obs, agent, *, cam_key=1, vec_keys=[0,2]):
    data = obs[agent]
    if isinstance(data, dict) and "observation" in data:
        data = data["observation"]
        cam, vec = np.asarray(data[cam_key]), np.concatenate((np.asarray(data[vec_keys[0]]), np.asarray(data[vec_keys[1]])))
    else:
        cam, vec = np.asarray(data[cam_key]), np.concatenate((np.asarray(data[vec_keys[0]]), np.asarray(data[vec_keys[1]])))

    assert cam.ndim == 3, "Camera observation should be 3D array"
    assert vec.ndim in (1,2), "Vector observation should be 1D or 2D array"
    return cam, vec

In [4]:
env = UE(file_name="Env/Level1/DroneFlightv1", seed=1)
env = UPZBE(env)

In [5]:
agents = list(set(env.agents))
print(agents)
print(env.action_spaces)
print(env._env.behavior_specs.values())

['Drone?team=0?agent_id=3', 'Drone?team=0?agent_id=1', 'Drone?team=0?agent_id=0', 'Drone?team=0?agent_id=2']
{'Drone?team=0?agent_id=3': Box(-1.0, 1.0, (4,), float32), 'Drone?team=0?agent_id=1': Box(-1.0, 1.0, (4,), float32), 'Drone?team=0?agent_id=0': Box(-1.0, 1.0, (4,), float32), 'Drone?team=0?agent_id=2': Box(-1.0, 1.0, (4,), float32)}
ValuesView(<mlagents_envs.base_env.BehaviorMapping object at 0x00000285FFBEFE20>)


In [6]:
obs = env.reset()
print("obs", obs.keys())
print("obs", env.observation_space(agents[0]))
# print("obs", obs['Drone?team=0?agent_id=1'][0].shape)
# print("obs", obs['Drone?team=0?agent_id=1'][1].shape)
# print("obs", obs['Drone?team=0?agent_id=1'][2].shape)
print(env.observation_space(agents[0])[0].shape)
print(env.observation_space(agents[0])[1].shape)
print(env.observation_space(agents[0])[2].shape)
print(env.action_space(agents[0]).shape)
# print({a: env.action_space(a).sample() for a in agents})
# print(env.step({a: env.action_space(a).sample() for a in agents}))

obs dict_keys(['Drone?team=0?agent_id=0', 'Drone?team=0?agent_id=1', 'Drone?team=0?agent_id=2', 'Drone?team=0?agent_id=3'])
obs Tuple(Box(-inf, inf, (36,), float32), Box(-inf, inf, (4, 84, 84), float32), Box(-inf, inf, (48,), float32))
(36,)
(4, 84, 84)
(48,)
(4,)


In [7]:
cam_shape = env.observation_space(agents[0])[1].shape
vec_dim = env.observation_space(agents[0])[0].shape[0] + env.observation_space(agents[0])[2].shape[0]
vec_shape = (vec_dim,)
action_shape = env.action_space(agents[0]).shape

In [8]:
print("cam_shape", cam_shape)
print("vec_shape", vec_shape)
print("action_shape", action_shape)

cam_shape (4, 84, 84)
vec_shape (84,)
action_shape (4,)


In [9]:
replay_buffer = SAC_ExperienceBuffer(cam_shape, vec_shape,action_shape, params['sac_distilled'])

In [10]:
agent = DistilledSAC(cam_shape, vec_shape, action_shape,len(agents), params['sac_distilled'])
agent.model.convolution_pipeline.load_state_dict(torch.load("SavedModels/feature_extractor_contrastive_init.pth"))
agent.model.convolution_pipeline.to(device)

FeatureExtractionNet(
  (convolutional_pipeline): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4), padding=(2, 2))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (distilled_converter): Linear(in_features=6400, out_features=12800, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [11]:
N_AGENTS = len(agents)
cfg = params['sac_distilled']
RAND_STEPS = cfg.get("n_steps_random_exploration", 10_000)
SEED_EPISODES = cfg.get("seed_episodes", 5)
blank_cam = np.zeros(cam_shape, dtype=np.float32)
blank_vec = np.zeros(vec_shape, dtype=np.float32)

In [12]:
# import onnxruntime as ort # Import the onnxruntime library

# # --- 1. Load your ONNX model and create an inference session ---
# # Replace "path/to/your/model.onnx" with the actual path to your ONNX file.
# ort_session = ort.InferenceSession(r"C:\Users\Fede\Desktop\MasterThesis\PPO_Unity\results\PPO_Unity\Drone\Drone-3000093.onnx")
# input_metas = ort_session.get_inputs()
# input_names = [inp.name for inp in input_metas]
# print("model requires inputs:", input_names)
# # Get the names of the input nodes. You might need to inspect your model
# # to find these names. Common names are "obs_0", "obs_1" etc.
# # We assume the first input is for camera observations and the second is for vector observations.
# input_names = [inp.name for inp in ort_session.get_inputs()]
# camera_input_name = input_names[2]
# vector_input_name = input_names[1]
# recurrent_in_meta = next((meta for meta in input_metas if meta.name == "recurrent_in"), None)

In [13]:
# tot_steps = RAND_STEPS * SEED_EPISODES   # e.g. 5 120 000

# obs_dict = env.reset()                   # one reset BEFORE the loop

# for step in range(tot_steps):
#     if not obs_dict:
#         obs_dict = env.reset()
#         continue
#     agents = relocate_agents(env)  # get the current agents in the environment
#     # --- draw a random joint action ---------------------------------
#     act_dict = {a: env.action_space(a).sample() for a in agents}
#     cam_now  = np.empty((N_AGENTS, *cam_shape),   dtype=np.float32)
#     vect_now = np.empty((N_AGENTS, *vec_shape),   dtype=np.float32)
#     act_now  = np.empty((N_AGENTS, *action_shape),dtype=np.float32)
    
#     for i, a in enumerate(agents):
#         cam, vec = get_agent_obs(obs_dict, a) if a in obs_dict else (blank_cam, blank_vec)
#         cam_now[i], vect_now[i], act_now[i] = cam, vec, act_dict[a]

#     # --- pack the current joint observation into arrays -------------
    
#     # obs_0 = env.observation_space(agents[0])[0].shape
#     # obs_3 = env.observation_space(agents[0])[3].shape
#     # obs_1 = env.observation_space(agents[0])[1].shape
#     # obs_2 = env.observation_space(agents[0])[2].shape
#     # recurrent_in_shape = recurrent_in_meta.shape
#     # recurrent_in_shape[0] = N_AGENTS  # Adjust the batch size for recurrent input
#     # obs_0_now = np.empty((N_AGENTS, *obs_0), dtype=np.float32)
#     # obs_3_now = np.empty((N_AGENTS, *obs_3), dtype=np.float32)
#     # obs_1_now = np.empty((N_AGENTS, *obs_1), dtype=np.float32)
#     # obs_2_now = np.empty((N_AGENTS, *obs_2), dtype=np.float32)
#     # cam_now  = np.empty((N_AGENTS, *cam_shape),   dtype=np.uint8)
#     # vect_now = np.empty((N_AGENTS, *vec_shape),   dtype=np.float32)
#     # act_now  = np.empty((N_AGENTS, *action_shape),dtype=np.float32)

#     # for i, a in enumerate(agents):
#     #     if a in obs_dict:
#     #         obs_0_now[i] = obs_dict[a][0]
#     #         obs_3_now[i] = obs_dict[a][3]
#     #         cam_n, vec_n = get_agent_obs(obs_dict, a) if a in obs_dict else (blank_cam, blank_vec)
#     #         cam_now[i], vect_now[i] = cam_n, vec_n
#     #         obs_1_now[i] = obs_dict[a][1]
#     #         obs_2_now[i] = obs_dict[a][2]
#     #     else:
#     #         obs_0_now[i] = np.zeros(obs_0, dtype=np.float32)
#     #         obs_3_now[i] = np.zeros(obs_3, dtype=np.float32)
#     #         cam_now[i], vect_now[i] = blank_cam, blank_vec
#     #         obs_1_now[i] = np.zeros(obs_1, dtype=np.float32)
#     #         obs_2_now[i] = np.zeros(obs_2, dtype=np.float32)

#     # model_input = {
#     #     "obs_0": obs_0_now,  # Placeholder for first observation input
#     #     "obs_3": obs_3_now,  # Placeholder for third observation input
#     #     "obs_2": obs_2_now,  # Normalize camera input
#     #     "obs_1": obs_1_now,  # Vector input
#     #     "recurrent_in": np.zeros(recurrent_in_shape, dtype=np.float32)  # Placeholder for recurrent input
#     # }
    
#     # model_output = ort_session.run(None, model_input)
#     # act_dict = {a:model_output[2][i] for i, a in enumerate(agents)}

#     # --- take one step ----------------------------------------------
#     next_obs, rew_dict, done_dict, _ = env.step(act_dict)

#     # --- pack the next‐state tensors --------------------------------
#     cam_next  = np.empty_like(cam_now)
#     vect_next = np.empty_like(vect_now)
#     rew_now   = np.zeros((N_AGENTS, 1), dtype=np.float32)
#     done_now  = np.zeros((N_AGENTS, 1), dtype=np.float32)

#     for i, a in enumerate(agents):
#         cam_n, vec_n = get_agent_obs(next_obs, a) if a in next_obs else (blank_cam, blank_vec)
#         cam_next[i], vect_next[i] = cam_n, vec_n
#         rew_now[i, 0]  = rew_dict.get(a, 0.0)
#         done_now[i, 0] = float(done_dict.get(a, False))

#     replay_buffer.store_joint(cam_now, vect_now, act_now,
#                               rew_now, cam_next, vect_next, done_now)

#     obs_dict = next_obs

#     # if the whole team is done, start a new episode
#     if all(done_dict.values()):
#         obs_dict = env.reset()

# print("Finished collecting random steps")


In [14]:
# from pathlib import Path


# epochs = 3
# batch_size = 128
# distill_lr = cfg.get('distill_lr', 1e-4)
# temperature = cfg.get('temperature', 0.07)
# num_frames = cfg.get('num_frames', 4_000)

# assert len(replay_buffer) >= batch_size, "Not enough data in the buffer to start training"
# print(f"Starting distillation training for {epochs} epochs with batch size {batch_size}")
# agent.offline_distill(
#     frame_buffer=replay_buffer,
#     epochs=epochs,
#     batch_size=batch_size,
#     lr=distill_lr,
#     temperature=temperature,
#     num_frames=num_frames,
# )
# out_dir = Path("SavedModels")
# out_dir.mkdir(exist_ok=True)


In [15]:
# torch.save(agent.model.convolution_pipeline.state_dict(), out_dir / "student_latest.pth")

In [16]:
# agent.model.convolution_pipeline.load_state_dict(torch.load("SavedModels/student_latest.pth"))

In [17]:
# agent.train(Buffer,step = params['sac_distilled'].seed_episodes*params['sac_distilled'].n_steps_random_exploration)

In [18]:
total_updates = 0
train_every = 4_096
log_every = 4_096
print_every = 10_000
max_steps = cfg.get("max_steps", 5_000_000)
goal_reached = 0.0
crashed = 0.0


ema_reward = 0.0
last_ema_reward = -np.inf
ema_alpha = cfg.get("ema_alpha", 0.01)


obs = env.reset()
steps=0

In [None]:
while steps < max_steps//2:
    if not obs:
        obs = env.reset()
        continue
    agents = relocate_agents(env)
    cam_now = np.zeros((N_AGENTS, *cam_shape), dtype=np.float32)
    vect_now = np.zeros((N_AGENTS, *vec_shape), dtype=np.float32)

    for i, aid in enumerate(agents):
        if aid in obs:
            cam, vec = get_agent_obs(obs, aid)
        else:
            cam, vec = blank_cam, blank_vec
        cam_now[i]  = cam
        vect_now[i] = vec
        
    cam_t = torch.from_numpy(np.stack(cam_now)).float().to(device)
    vec_t = torch.from_numpy(np.stack(vect_now)).float().to(device)

    if cam_t.isnan().any() or vec_t.isnan().any():
        cam_t = torch.nan_to_num(cam_t)
        vec_t = torch.nan_to_num(vec_t)
    with torch.no_grad():
        step_fraction = steps / max_steps
        act_t = agent.act(cam_t, vec_t)
    

    # act_np = torch.round(act_t).clamp(-1,1).cpu().numpy()
    act_np = act_t.cpu().numpy()
    actions = {aid: action for aid, action in zip(agents, act_np)}
    # print("Actions:", actions)
    # print(actions)
    

    next_obs, rew_dict, done_dict, infos = env.step(actions)
    steps += 1

    cam_next = np.zeros_like(cam_now)
    vect_next = np.zeros_like(vect_now)
    rew_now = np.zeros((N_AGENTS, 1), dtype=np.float32)
    done_now = np.zeros((N_AGENTS, 1), dtype=np.float32)

    

    for i, aid in enumerate(agents):
        if aid in next_obs:
            cam_n, vec_n = get_agent_obs(next_obs, aid)
        else:
            cam_n, vec_n = blank_cam, blank_vec
        cam_next[i] = cam_n
        vect_next[i] = vec_n

        r = rew_dict.get(aid, 0.0) + infos.get(aid, {}).get('reward', 0.0)
        
        rew_now[i, 0] = r
        done_now[i, 0] = done_dict.get(aid, False)

        goal_reached += 1 if r > 99 else 0
        crashed += 1 if r < -99 else 0

    replay_buffer.store_joint(
        cam_now, vect_now, act_np,
        rew_now,
        cam_next, vect_next,
        done_now
    )

    mean_r = np.mean(rew_now)
    ema_reward = ema_reward * (1- ema_alpha) + mean_r * ema_alpha

    if steps % train_every == 0:
        a_loss, c_loss, intrinsic_rew, icm_loss = agent.train(replay_buffer, step=steps)
        total_updates += 1

        # if c_loss > 1e6:
        #     agent.load("SavedModels/SAC_distilled_trained_level1.pth")
        #     print("Critic loss exploded, reloading model")
        print("Step:", steps)
        print(f"Goal Reached: {goal_reached}, Crashes: {crashed}")
        print(f"Agent Losses: Actor={a_loss:.4f}, Critic={c_loss:.4f}, RND={icm_loss:.4f}, Intrinsic Reward={intrinsic_rew:.4f}")
        print(f"Mean Reward: {mean_r:.4f}")
        
        if ema_reward > last_ema_reward:
            last_ema_reward = ema_reward
            agent.save("SavedModels/SAC_distilled_trained_level1.pth")
            print(f"New best EMA reward: {last_ema_reward:.2f}")

        # wandb.log({
        #     "EMA Reward": ema_reward,
        #     "Mean Reward": mean_r,
        #     "Actor Loss": a_loss,
        #     "Critic Loss": c_loss,
        #     "Steps": steps,
        #     "Goal Reached": goal_reached,
        #     "Crashes": crashed,
        #     "Intrinsic Reward": intrinsic_rew,
        #     "ICM Loss": icm_loss,
        # }, step=steps)
        goal_reached = 0.0
        crashed = 0.0

    obs = next_obs

env.close()

-9.999999747378752e-05
-9.999999747378752e-05
-9.999999747378752e-05
-9.999999747378752e-05
-0.13519082963466644
0.11440481245517731
0.024292128160595894
0.06231226027011871
0.05338872969150543
-0.12476621568202972
0.05105127394199371
-0.113914355635643
0.1286260038614273
-0.1507633775472641
0.08729566633701324
-0.11165891587734222
-0.07938753068447113
-0.10789667069911957
0.02115453965961933
0.06254495680332184
-0.12201391160488129
0.12166227400302887
-0.11277280747890472
0.024020330980420113
0.12237085402011871
-9.999999747378752e-05
-0.11577211320400238
-0.12235437333583832
-0.14128004014492035
0.036743298172950745
0.11344636976718903
0.08671678602695465
0.13745130598545074
-0.09819589555263519
-0.11860929429531097
-0.15423665940761566
0.08360971510410309
-0.18371771275997162
0.04749692976474762
-9.999999747378752e-05
0.03818906843662262
-0.31107741594314575
-0.02440057508647442
0.10691846907138824
-0.10932336747646332
-0.17138658463954926
0.1025926023721695
-0.14120851457118988
0.1

In [None]:
env = UE(file_name="Env/FinalLevel/DroneFlightv1", seed=1)
env = UPZBE(env)

In [None]:
agents = list(set(env.agents))
print(agents)
print(env.action_spaces)
print(env._env.behavior_specs.values())

['Drone?team=0?agent_id=3', 'Drone?team=0?agent_id=2', 'Drone?team=0?agent_id=0', 'Drone?team=0?agent_id=1']
{'Drone?team=0?agent_id=3': Box(-1.0, 1.0, (4,), float32), 'Drone?team=0?agent_id=2': Box(-1.0, 1.0, (4,), float32), 'Drone?team=0?agent_id=0': Box(-1.0, 1.0, (4,), float32), 'Drone?team=0?agent_id=1': Box(-1.0, 1.0, (4,), float32)}
ValuesView(<mlagents_envs.base_env.BehaviorMapping object at 0x00000251F2E7FA90>)


In [None]:
del replay_buffer
replay_buffer = SAC_ExperienceBuffer(cam_shape, vec_shape, action_shape, params['sac_distilled'])

In [None]:
tot_steps = RAND_STEPS * SEED_EPISODES   # e.g. 5 120 000

obs_dict = env.reset()                   # one reset BEFORE the loop

for step in range(tot_steps):
    if not obs_dict:
        obs_dict = env.reset()
        continue
    agents = relocate_agents(env)  # get the current agents in the environment
    # --- draw a random joint action ---------------------------------
    act_dict = {a: env.action_space(a).sample() for a in agents}

    cam_now  = np.empty((N_AGENTS, *cam_shape),   dtype=np.float32)
    vect_now = np.empty((N_AGENTS, *vec_shape),   dtype=np.float32)
    act_now  = np.empty((N_AGENTS, *action_shape),dtype=np.float32)

    for i, a in enumerate(agents):
        cam, vec = get_agent_obs(obs_dict, a) if a in obs_dict else (blank_cam, blank_vec)
        cam_now[i], vect_now[i], act_now[i] = cam, vec, act_dict[a]

    # --- take one step ----------------------------------------------
    next_obs, rew_dict, done_dict, _ = env.step(act_dict)

    # --- pack the next‐state tensors --------------------------------
    cam_next  = np.empty_like(cam_now)
    vect_next = np.empty_like(vect_now)
    rew_now   = np.zeros((N_AGENTS, 1), dtype=np.float32)
    done_now  = np.zeros((N_AGENTS, 1), dtype=np.float32)

    for i, a in enumerate(agents):
        cam_n, vec_n = get_agent_obs(next_obs, a) if a in next_obs else (blank_cam, blank_vec)
        cam_next[i], vect_next[i] = cam_n, vec_n
        rew_now[i, 0]  = rew_dict.get(a, 0.0)
        done_now[i, 0] = float(done_dict.get(a, False))

    replay_buffer.store_joint(cam_now, vect_now, act_now,
                              rew_now, cam_next, vect_next, done_now)

    obs_dict = next_obs

    # if the whole team is done, start a new episode
    if all(done_dict.values()):
        obs_dict = env.reset()

print("Finished collecting random steps")


Finished collecting random steps


In [None]:
import datetime as dt
run_name = f"sac_distill_{dt.datetime.now():%Y%m%d_%H%M%S}"
wandb.init(
            project=os.getenv("WANDB_PROJECT", "SAC_Distillation"),
            entity =os.getenv("WANDB_ENTITY",  "fede-"),
            name=run_name,
            config = {**params["sac_distilled"], "device": str(device)},
        )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrullofederico16[0m ([33mfede-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
total_updates = 0
train_every = 4_096
log_every = 4_096
print_every = 10_000
max_steps = cfg.get("max_steps", 5_000_000)
goal_reached = 0.0
crashed = 0.0


ema_reward = 0.0
last_ema_reward = -np.inf
ema_alpha = cfg.get("ema_alpha", 0.01)


obs = env.reset()
steps=0

In [None]:
while steps < max_steps:
    if not obs:
        obs = env.reset()
        continue
    agents = relocate_agents(env)
    cam_now = np.zeros((N_AGENTS, *cam_shape), dtype=np.uint8)
    vect_now = np.zeros((N_AGENTS, *vec_shape), dtype=np.float32)

    for i, aid in enumerate(agents):
        if aid in obs:
            cam, vec = get_agent_obs(obs, aid)
        else:
            cam, vec = blank_cam, blank_vec
        cam_now[i]  = cam
        vect_now[i] = vec

    cam_t = torch.from_numpy(np.stack(cam_now)).float().to(device)
    vec_t = torch.from_numpy(np.stack(vect_now)).float().to(device)

    if cam_t.isnan().any() or vec_t.isnan().any():
        cam_t = torch.nan_to_num(cam_t)
        vec_t = torch.nan_to_num(vec_t)
    with torch.no_grad():
        step_fraction = steps / max_steps
        act_t = agent.act(cam_t, vec_t)
    

    # act_np = torch.round(act_t).clamp(-1,1).cpu().numpy()
    act_np = act_t.cpu().numpy()
    actions = {aid: action for aid, action in zip(agents, act_np)}
    # print(actions)
    

    next_obs, rew_dict, done_dict, infos = env.step(actions)
    steps += 1

    cam_next = np.zeros_like(cam_now)
    vect_next = np.zeros_like(vect_now)
    rew_now = np.zeros((N_AGENTS, 1), dtype=np.float32)
    done_now = np.zeros((N_AGENTS, 1), dtype=np.float32)

    

    for i, aid in enumerate(agents):
        if aid in next_obs:
            cam_n, vec_n = get_agent_obs(next_obs, aid)
        else:
            cam_n, vec_n = blank_cam, blank_vec
        cam_next[i] = cam_n
        vect_next[i] = vec_n

        r = rew_dict.get(aid, 0.0) + infos.get(aid, {}).get('reward', 0.0)
        rew_now[i, 0] = r
        done_now[i, 0] = done_dict.get(aid, False)

        goal_reached += 1 if r > 19 else 0
        crashed += 1 if r < -9 else 0

    replay_buffer.store_joint(
        cam_now, vect_now, act_np,
        rew_now,
        cam_next, vect_next,
        done_now
    )

    mean_r = np.mean(rew_now)
    ema_reward = ema_reward * (1- ema_alpha) + mean_r * ema_alpha

    if steps % train_every == 0:
        a_loss, c_loss, intrinsic_rew, rnd_loss = agent.train(replay_buffer, step=steps)
        total_updates += 1

        if c_loss > 1e6:
            agent.load("SavedModels/SAC_distilled_trained.pth")
            print("Critic loss exploded, reloading model")
        
        if ema_reward > last_ema_reward:
            last_ema_reward = ema_reward
            agent.save("SavedModels/SAC_distilled_trained.pth")
            print(f"New best EMA reward: {last_ema_reward:.2f}")

        wandb.log({
            "EMA Reward": ema_reward,
            "Mean Reward": mean_r,
            "Actor Loss": a_loss,
            "Critic Loss": c_loss,
            "Steps": steps,
            "Goal Reached": goal_reached,
            "Crashes": crashed,
            "Intrinsic Reward": intrinsic_rew,
            "RND Loss": rnd_loss,
        }, step=steps)
        goal_reached = 0.0
        crashed = 0.0

    obs = next_obs

env.close()

New best EMA reward: -0.03


KeyboardInterrupt: 