In [1]:
import gym
import numpy as np
import torch
import torchkit.pytorch_utils as ptu
import torchsde
from torch.nn import functional as F
import random as rnd
import copy as cp
# import environments
import envs.pomdp
import pdb
# import recurrent model-free RL (separate architecture)
from policies.models.policy_rnn import ModelFreeOffPolicy_Separate_RNN as Policy_RNN
from policies.models.policy_rnn_shared import ModelFreeOffPolicy_Shared_RNN as Policy_Shared_RNN
from policies.models.policy_mlp import ModelFreeOffPolicy_MLP as Policy_MLP
from tqdm import tqdm
# import the replay buffer
from buffers.seq_replay_buffer_vanilla import SeqReplayBuffer
from buffers.simple_replay_buffer import SimpleReplayBuffer 
from utils import helpers as utl
from typing import Sequence
from read_ini import read_ini
import os

# Location of the experiment .ini file. The default below is an absolute path
# on the original author's machine, so it can be overridden by setting the
# POMDP_CONFIG_FILE environment variable; when the variable is unset the
# previous behaviour (reading the hard-coded path) is kept.
conf = read_ini(
    os.environ.get(
        "POMDP_CONFIG_FILE",
        "C:/Users/alexander.vasilyev/pomdp-baselines-main/configfile.ini",
    )
)

  logger.warn(
  from collections import OrderedDict, Set
  if not hasattr(tensorboard, "__version__") or LooseVersion(


## Build a POMDP environment: Pendulum-V (only observe the velocity)

In [2]:
# ---- device selection -------------------------------------------------------
cuda_id = 0  # -1 if using cpu
ptu.set_gpu_mode(torch.cuda.is_available() and cuda_id >= 0, cuda_id)

# ---- environment and its dimensions -----------------------------------------
env = gym.make(conf["env_name"])
max_trajectory_len = env._max_episode_steps
act_dim = env.action_space.shape[0]
obs_dim = env.observation_space.shape[0]

# ---- architecture switches --------------------------------------------------
# markov: memory-less MLP agent; otherwise a recurrent agent is built, with
# either one shared encoder (shared=True) or the separate-architecture variant.
shared = False
markov = False

if markov:
    agent = Policy_MLP(
        obs_dim=obs_dim,
        action_dim=act_dim,
        algo_name=conf["algo_name"],
        dqn_layers=[128, 128],
        policy_layers=[128, 128],
        lr=3e-4,
        gamma=0.99,
        tau=5e-3,
    ).to(ptu.device)
    encoder = "Nan"
else:
    # Constructor arguments common to both recurrent variants.
    rnn_kwargs = dict(
        obs_dim=obs_dim,
        action_dim=act_dim,
        encoder=conf["encoder"],
        algo_name=conf["algo_name"],
        action_embedding_size=int(conf["action_embedding_size"]),
        observ_embedding_size=int(conf["observ_embedding_size"]),
        reward_embedding_size=int(conf["reward_embedding_size"]),
        rnn_hidden_size=int(conf["hidden_size"]),
        dqn_layers=[128, 128],
        policy_layers=[128, 128],
        lr=float(conf["lr"]),
        gamma=0.9,
        tau=0.005,
        embed=True,
    )
    if shared:
        agent = Policy_Shared_RNN(**rnn_kwargs).to(ptu.device)
    else:
        # The separate architecture additionally takes `radii` and `activation`.
        agent = Policy_RNN(
            radii=40,
            activation=conf["activation"],
            **rnn_kwargs,
        ).to(ptu.device)

print(agent)

# ---- training hyper-parameters read from the config file --------------------
lr = float(conf["lr"])
encoder = conf["encoder"]
num_updates_per_iter = int(conf["num_updates_per_iter"])  # training frequency
sampled_seq_len = int(conf["sampled_seq_len"])  # context length
buffer_size = int(float(conf["buffer_size"]))
batch_size = int(conf["batch_size"])
dropout_rate = float(conf["dropout_rate"])
num_iters = int(conf["num_iters"])
num_init_rollouts_pool = int(conf["num_init_rollouts_pool"])
num_rollouts_per_iter = int(conf["num_rollouts_per_iter"])

# ---- derived rollout / step budget ------------------------------------------
total_rollouts = num_init_rollouts_pool + num_iters * num_rollouts_per_iter
n_env_steps_total = max_trajectory_len * total_rollouts
_n_env_steps_total = 0  # running count of environment steps collected so far
print("total env episodes", total_rollouts, "total env steps", n_env_steps_total)


ModelFreeOffPolicy_Separate_RNN(
  (critic): Critic_RNN(
    (observ_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=32, bias=True)
    )
    (action_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=8, bias=True)
    )
    (reward_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=8, bias=True)
    )
    (rnn): NeuralCDE(
      (func): CDEFunc(
        (linear0): Linear(in_features=72, out_features=72, bias=True)
        (linear1): Linear(in_features=72, out_features=72, bias=True)
        (linear2): Linear(in_features=72, out_features=3528, bias=True)
      )
      (initial): Linear(in_features=49, out_features=72, bias=True)
      (readout): Linear(in_features=72, out_features=72, bias=True)
    )
    (current_shortcut_embedder): FeatureExtractor(
      (fc): Linear(in_features=2, out_features=48, bias=True)
    )
    (qf1): FlattenMlp(
      (fc0): Linear(in_features=120, out_features=128, bias=True)
   

## Build a recurrent model-free RL agent: separate architecture, `oar` policy input space; the encoder (e.g. `lstm` or `ncde`) and the RL algorithm (e.g. `td3`) are read from the config file (context length set later)

## Define other training parameters such as context length and training frequency

## Define key functions: collect rollouts and policy update

In [3]:
def create_ncde_row(obs, next_obs, prev_action, action, prev_reward, reward, steps, init):
    """Assemble the embedded input row(s) handed to the NCDE encoder.

    When ``init`` is true a single row is built from ``obs``, ``prev_action``
    and ``prev_reward`` stamped with time ``steps``. Otherwise two rows are
    stacked along dim 0: the previous quantities at time ``steps`` followed by
    ``next_obs``, ``action`` and ``reward`` at time ``steps + 1``.

    Every quantity is passed through the matching embedder of the global
    ``agent`` (taken from ``agent`` itself when the global ``shared`` flag is
    set, otherwise from ``agent.actor``).

    Returns:
        A tensor whose columns are ordered time, action embedding, observation
        embedding, reward embedding, with an extra leading axis of size 1.
    """
    embedder_owner = agent if shared else agent.actor

    if init:
        raw_obs, raw_rew, raw_act = obs, prev_reward, prev_action
        time_stamps = [[steps]]
    else:
        raw_obs = torch.cat((obs, next_obs), 0)
        raw_rew = torch.cat((prev_reward, reward), 0)
        raw_act = torch.cat((prev_action, action), 0)
        time_stamps = [[steps], [steps + 1]]

    obs_emb = embedder_owner.observ_embedder(raw_obs)
    rew_emb = embedder_owner.reward_embedder(raw_rew)
    act_emb = embedder_owner.action_embedder(raw_act)

    time_col = torch.tensor(time_stamps).to(ptu.device)

    # Prepend a size-1 leading axis to the concatenated (rows, features) block.
    return torch.cat((time_col, act_emb, obs_emb, rew_emb), 1)[None, :]



@torch.no_grad()
def collect_rollouts(
    num_rollouts, random_actions=False, deterministic=True, train_mode=True
):
    """collect num_rollouts of trajectories in task and save into policy buffer
    :param
        random_actions: whether to use policy to sample actions, or randomly sample action space
        deterministic: deterministic action selection?
        train_mode: whether to train (stored to buffer) or test
    """
    if not train_mode:
        assert random_actions == False and deterministic == True

    total_steps = 0
    total_rewards = 0.0
    trewards =[]
    for idx in range(num_rollouts):
        steps = 0
        rewards = 0.0
        energy = 0.0
        print(env.reset())
        obs = ptu.from_numpy(env.reset())
        obs = obs.reshape(1, obs.shape[-1])
        done_rollout = False
        init=True
        # get hidden state at timestep=0, None for mlp
        
        if not markov:
            action, reward, internal_state = agent.get_initial_info()

            if encoder == "ncde":
                internal_state= None
                ncde_row= create_ncde_row(obs, obs, action, action, reward, reward, steps,init)
                prev_action= action.clone()
                prev_reward= reward.clone()
                next_obs= obs.clone()
        
        
        if train_mode:
            # temporary storage
            obs_list, act_list, rew_list, next_obs_list, term_list = (
                [],
                [],
                [],
                [],
                [],
            )
                           

        while not done_rollout:
            if markov: 
                action = agent.act(obs=obs, deterministic=deterministic)[0]
            else:
                if encoder == "ncde":
                    (action,_,_,_), internal_state= agent.ncde_act(ncde_row=ncde_row, prev_internal_state=internal_state, obs=obs,  deterministic=deterministic)
                else:
                    (action, _, _, _), internal_state = agent.act(
                        prev_internal_state=internal_state,
                        prev_action=action,
                        reward=reward,
                        obs=obs,
                        deterministic=deterministic,
                    )
            # observe reward and next obs (B=1, dim)
            #pdb.set_trace()
        
            #print(torch.norm(internal_state))
            next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0.0 else True
            init=False
            
            if not markov:
                if encoder == "ncde":
   
                    ncde_row= create_ncde_row(obs, next_obs, prev_action, action, prev_reward, reward, steps,init)
            
            #switch on/off dropouts
            #drop_trigger=rnd.uniform(0,1)
            #if drop_trigger<dropout_rate:
            #    next_obs=cp.deepcopy(obs)
            # update statistics
           
            rewards += reward.item()
            energy += action*action
           
            # early stopping env: such as rmdp, pomdp, generalize tasks. term ignores timeout
            term = (
                False
                if "TimeLimit.truncated" in info or steps >= max_trajectory_len
                else done_rollout
            )

            if train_mode:
                # append tensors to temporary storage
                obs_list.append(obs)  # (1, dim)
                act_list.append(action)  # (1, dim)
                rew_list.append(reward)  # (1, dim)
                term_list.append(term)  # bool
                next_obs_list.append(next_obs)  # (1, dim)
            steps += 1
            # set: obs <- next_obs
            obs = next_obs.clone()
            prev_reward= reward.clone()
            prev_action= action.clone()
        if train_mode:
            # add collected sequence to buffer
            policy_storage.add_episode(
                observations=ptu.get_numpy(torch.cat(obs_list, dim=0)),  # (L, dim)
                actions=ptu.get_numpy(torch.cat(act_list, dim=0)),  # (L, dim)
                rewards=ptu.get_numpy(torch.cat(rew_list, dim=0)),  # (L, dim)
                terminals=np.array(term_list).reshape(-1, 1),  # (L, 1)
                next_observations=ptu.get_numpy(
                    torch.cat(next_obs_list, dim=0)
                ),  # (L, dim)
            )
        print(
            "Mode:",
            "Train" if train_mode else "Test",
            "env_steps",
            steps,
            "total rewards",
            rewards,
            "total energy",
            energy,
        )
        total_steps += steps
        total_rewards += rewards
        trewards.append(rewards)
    if train_mode:
        return total_steps
    else:
        return total_rewards / num_rollouts, np.std(trewards)


def update(num_updates, factor):
    """Run ``num_updates`` agent updates and return the mean of every loss.

    Each iteration samples ``batch_size`` episodes from the global
    ``policy_storage``, converts the batch with ``ptu.np_to_pytorch_batch`` and
    calls ``agent.update(batch, factor)``.

    :param
        num_updates: number of update steps to perform
        factor: value forwarded unchanged as the second argument of
            ``agent.update``
    :return
        dict mapping each loss name reported by ``agent.update`` to the mean of
        the values reported for it (empty dict when ``num_updates`` is 0)
    """
    rl_losses_agg = {}
    for _ in tqdm(range(num_updates), leave=True):
        # sample random RL batch: in transitions
        batch = ptu.np_to_pytorch_batch(policy_storage.random_episodes(batch_size))
        # RL update
        rl_losses = agent.update(batch, factor)

        # setdefault also copes with keys that first show up after the first
        # iteration, which the "create on iteration 0, append later" scheme
        # would turn into a KeyError.
        for k, v in rl_losses.items():
            rl_losses_agg.setdefault(k, []).append(v)

    # statistics
    return {k: np.mean(values) for k, values in rl_losses_agg.items()}

## Train and Evaluate the agent: only costs < 20 min

In [None]:
# Episode replay buffer shared (as a global) by collect_rollouts() and update().
policy_storage = SeqReplayBuffer(
    max_replay_buffer_size=buffer_size,
    observation_dim=obs_dim,
    action_dim=act_dim,
    sampled_seq_len=sampled_seq_len,
    sample_weight_baseline=0.0,
)

# Fill the buffer with an initial pool of episodes before any update.
# NOTE(review): random_actions=False, so the pool is collected with the
# untrained policy rather than with random actions -- confirm this is intended.
env_steps = collect_rollouts(
    num_rollouts=num_init_rollouts_pool, random_actions=False, train_mode=True
)
_n_env_steps_total += env_steps

# evaluation parameters
# NOTE(review): starting at 10 means no evaluation is run while
# current_num_iters == 10 (see the `!=` test below) -- confirm the start value.
last_eval_num_iters = 10
log_interval = 5  # evaluate when current_num_iters is a multiple of this
eval_num_rollouts = 10  # number of test episodes per evaluation
# Learning curve: x = env steps so far, y = mean test return, z = std of test returns.
learning_curve = {
    "x": [],
    "y": [],
    "z": [],
}
epoch=0
lambda_pat = 0.65

# Alternate between collecting training episodes and updating the agent until
# the total environment-step budget is used up.
while _n_env_steps_total < n_env_steps_total:

    env_steps = collect_rollouts(num_rollouts=num_rollouts_per_iter, train_mode=True)
    _n_env_steps_total += env_steps

    # NOTE(review): `factor` is computed here but never used; the call below
    # passes `lr` as update()'s second argument (which update() forwards to
    # agent.update) -- confirm which value is meant.
    factor= lambda_pat **(epoch )
    # NOTE(review): the update count is hard-coded to 25; an earlier version
    # used update(int(num_updates_per_iter * env_steps)), and the config value
    # num_updates_per_iter is not used in this cell.
    train_stats = update(25, lr)
    
    epoch += 1
    # Collection rounds completed so far, measured in env steps (the initial
    # pool's steps are included in _n_env_steps_total).
    current_num_iters = _n_env_steps_total // (
        num_rollouts_per_iter * max_trajectory_len
    )
    # Evaluate at most once per value of current_num_iters, every log_interval rounds.
    if (
        current_num_iters != last_eval_num_iters
        and current_num_iters % log_interval == 0
    ):
        last_eval_num_iters = current_num_iters
        # Test mode: deterministic policy, nothing is added to the buffer.
        average_returns, std_returns = collect_rollouts(
            num_rollouts=eval_num_rollouts,
            train_mode=False,
            random_actions=False,
            deterministic=True,
        )
        learning_curve["x"].append(_n_env_steps_total)
        learning_curve["y"].append(average_returns)
        learning_curve["z"].append(std_returns)
        print(_n_env_steps_total, average_returns)

buffer RAM usage: 0.02 GB
[-0.18286392]




Mode: Train env_steps 200 total rewards -627.1723878339399 total energy tensor([[0.0009]])
[-0.86359847]
Mode: Train env_steps 200 total rewards -1465.099901676178 total energy tensor([[0.0006]])
[0.74585605]
Mode: Train env_steps 200 total rewards -968.4575101137161 total energy tensor([[0.0010]])
[-0.8119963]
Mode: Train env_steps 200 total rewards -1064.254163146019 total energy tensor([[0.0011]])
[-0.65336865]
Mode: Train env_steps 200 total rewards -1254.5659635066986 total energy tensor([[0.0009]])
[-0.84177446]
Mode: Train env_steps 200 total rewards -1584.4519271850586 total energy tensor([[0.0006]])
[0.05887923]
Mode: Train env_steps 200 total rewards -628.1147714760154 total energy tensor([[0.0009]])
[-0.29377258]
Mode: Train env_steps 200 total rewards -737.794536806643 total energy tensor([[0.0010]])
[-0.8305148]
Mode: Train env_steps 200 total rewards -1557.0814833641052 total energy tensor([[0.0006]])
[0.61149436]
Mode: Train env_steps 200 total rewards -1171.859337568283

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if v.dtype == np.bool:
100%|██████████| 25/25 [01:02<00:00,  2.49s/it]


[0.6731633]
Mode: Train env_steps 200 total rewards -1343.8932387828827 total energy tensor([[31.7528]])
[-0.27208433]
Mode: Train env_steps 200 total rewards -1266.3245940208435 total energy tensor([[32.4656]])
[0.5807534]
Mode: Train env_steps 200 total rewards -1699.2578616142273 total energy tensor([[19.7072]])
[-0.04527098]
Mode: Train env_steps 200 total rewards -1307.411400437355 total energy tensor([[31.1604]])
[-0.97157735]
Mode: Train env_steps 200 total rewards -1351.332904547453 total energy tensor([[32.0476]])


100%|██████████| 25/25 [01:07<00:00,  2.68s/it]


[-0.48463875]
Mode: Train env_steps 200 total rewards -1013.9208480650559 total energy tensor([[3.8427]])
[0.5292008]
Mode: Train env_steps 200 total rewards -905.1960357353091 total energy tensor([[2.5171]])
[-0.27751896]
Mode: Train env_steps 200 total rewards -1270.2195472717285 total energy tensor([[2.2214]])
[0.39900678]
Mode: Train env_steps 200 total rewards -1548.196283340454 total energy tensor([[1.6130]])
[0.55401415]
Mode: Train env_steps 200 total rewards -1056.3510165549815 total energy tensor([[4.3005]])


100%|██████████| 25/25 [01:14<00:00,  2.99s/it]


[0.8541971]
Mode: Train env_steps 200 total rewards -1039.1114730238914 total energy tensor([[6.0354]])
[0.41087466]
Mode: Train env_steps 200 total rewards -1600.6446633338928 total energy tensor([[2.6838]])
[0.76770645]
Mode: Train env_steps 200 total rewards -959.2560813948512 total energy tensor([[5.9694]])
[0.56193846]
Mode: Train env_steps 200 total rewards -1045.8042680621147 total energy tensor([[6.0185]])
[-0.4370459]
Mode: Train env_steps 200 total rewards -1092.2935677319765 total energy tensor([[6.8191]])


100%|██████████| 25/25 [01:10<00:00,  2.83s/it]


[-0.6180618]
Mode: Test env_steps 200 total rewards -1662.0854816436768 total energy tensor([[14.0714]])
[-0.8945591]
Mode: Test env_steps 200 total rewards -1202.432815656066 total energy tensor([[18.3976]])
[0.890028]
Mode: Test env_steps 200 total rewards -1941.72012424469 total energy tensor([[0.5264]])
[0.80951387]
Mode: Test env_steps 200 total rewards -1864.3609809875488 total energy tensor([[5.9019]])
[0.42710203]
Mode: Test env_steps 200 total rewards -1746.466650724411 total energy tensor([[13.6720]])
[0.74520326]
Mode: Test env_steps 200 total rewards -1943.074589729309 total energy tensor([[0.6629]])
[-0.25170442]
Mode: Test env_steps 200 total rewards -1814.7647194862366 total energy tensor([[6.0244]])
[0.27936238]
Mode: Test env_steps 200 total rewards -1629.9115783572197 total energy tensor([[21.4080]])
[-0.00369019]
Mode: Test env_steps 200 total rewards -1777.9158170223236 total energy tensor([[11.5687]])
[0.16060762]
Mode: Test env_steps 200 total rewards -1491.595801

100%|██████████| 25/25 [01:15<00:00,  3.03s/it]


[0.9493371]
Mode: Train env_steps 200 total rewards -1725.1589938402176 total energy tensor([[11.8734]])
[-0.6375915]
Mode: Train env_steps 200 total rewards -1766.9023275375366 total energy tensor([[9.1713]])
[0.1520319]
Mode: Train env_steps 200 total rewards -1849.0722332000732 total energy tensor([[3.8441]])
[-0.89367926]
Mode: Train env_steps 200 total rewards -1626.971121788025 total energy tensor([[16.1817]])
[0.26634613]
Mode: Train env_steps 200 total rewards -1375.0553357303143 total energy tensor([[42.9877]])


100%|██████████| 25/25 [01:09<00:00,  2.79s/it]


[0.77172476]
Mode: Train env_steps 200 total rewards -1404.228477358818 total energy tensor([[56.1248]])
[-0.14430985]
Mode: Train env_steps 200 total rewards -1414.0377424955368 total energy tensor([[56.4431]])
[-0.18019481]
Mode: Train env_steps 200 total rewards -1420.5737288594246 total energy tensor([[57.7075]])
[0.05015166]
Mode: Train env_steps 200 total rewards -1876.0046792030334 total energy tensor([[2.0594]])
[0.8314296]
Mode: Train env_steps 200 total rewards -1534.730565071106 total energy tensor([[3.8204]])


100%|██████████| 25/25 [01:12<00:00,  2.90s/it]


[0.3954042]
Mode: Train env_steps 200 total rewards -1383.6435743272305 total energy tensor([[94.4836]])
[0.20333406]
Mode: Train env_steps 200 total rewards -1355.230950385332 total energy tensor([[81.6843]])
[0.8841586]
Mode: Train env_steps 200 total rewards -1192.585041904822 total energy tensor([[68.4791]])
[0.83319604]
Mode: Train env_steps 200 total rewards -1731.3692646026611 total energy tensor([[5.9787]])
[0.0804089]
Mode: Train env_steps 200 total rewards -908.4952503293753 total energy tensor([[2.1642]])


100%|██████████| 25/25 [01:23<00:00,  3.35s/it]


[0.15499073]
Mode: Train env_steps 200 total rewards -1607.0602875351906 total energy tensor([[163.1991]])
[-0.19695659]
Mode: Train env_steps 200 total rewards -1518.6114336028695 total energy tensor([[137.2252]])
[0.4013462]
Mode: Train env_steps 200 total rewards -1890.2161655426025 total energy tensor([[2.0544]])
[-0.5135966]
Mode: Train env_steps 200 total rewards -1951.1738548278809 total energy tensor([[0.4269]])
[0.1571486]
Mode: Train env_steps 200 total rewards -1874.922622680664 total energy tensor([[1.8949]])


100%|██████████| 25/25 [01:32<00:00,  3.71s/it]


[0.9871652]
Mode: Test env_steps 200 total rewards -1388.7813239395618 total energy tensor([[108.5133]])
[0.48804292]
Mode: Test env_steps 200 total rewards -1611.185109257698 total energy tensor([[168.8188]])
[0.5315943]
Mode: Test env_steps 200 total rewards -1460.1157260090113 total energy tensor([[131.8175]])
[-0.8156318]
Mode: Test env_steps 200 total rewards -1585.284763097763 total energy tensor([[163.9564]])
[0.4804839]
Mode: Test env_steps 200 total rewards -1534.7785720825195 total energy tensor([[142.8596]])
[0.24370894]
Mode: Test env_steps 200 total rewards -1515.9913493543863 total energy tensor([[144.6905]])
[-0.45471397]
Mode: Test env_steps 200 total rewards -1577.2756012380123 total energy tensor([[166.0901]])
[0.6268348]
Mode: Test env_steps 200 total rewards -1913.2778205871582 total energy tensor([[1.8658]])
[0.9688026]
Mode: Test env_steps 200 total rewards -1491.8189459443092 total energy tensor([[130.5346]])
[-0.37145162]
Mode: Test env_steps 200 total rewards -

100%|██████████| 25/25 [01:48<00:00,  4.33s/it]


[-0.1599921]
Mode: Train env_steps 200 total rewards -1633.14479804039 total energy tensor([[180.2180]])
[-0.6888604]
Mode: Train env_steps 200 total rewards -1848.7131152153015 total energy tensor([[2.3004]])
[-0.61524165]
Mode: Train env_steps 200 total rewards -1926.1032905578613 total energy tensor([[1.4972]])
[-0.28260043]
Mode: Train env_steps 200 total rewards -1555.1142466068268 total energy tensor([[156.3283]])
[0.1762983]
Mode: Train env_steps 200 total rewards -1938.098424911499 total energy tensor([[0.9474]])


100%|██████████| 25/25 [01:16<00:00,  3.07s/it]


[0.32413808]
Mode: Train env_steps 200 total rewards -1633.885877341032 total energy tensor([[182.8791]])
[0.43044457]
Mode: Train env_steps 200 total rewards -1645.906468808651 total energy tensor([[183.3582]])
[0.16840471]
Mode: Train env_steps 200 total rewards -1919.4635977745056 total energy tensor([[1.5437]])
[0.99078465]
Mode: Train env_steps 200 total rewards -1890.980372428894 total energy tensor([[2.2630]])
[-0.6927419]
Mode: Train env_steps 200 total rewards -1838.686912536621 total energy tensor([[3.0092]])


100%|██████████| 25/25 [01:16<00:00,  3.06s/it]


[-0.8044674]
Mode: Train env_steps 200 total rewards -1782.8735976219177 total energy tensor([[2.9067]])
[-0.42788523]
Mode: Train env_steps 200 total rewards -1516.7631882429123 total energy tensor([[142.3286]])
[0.6187882]
Mode: Train env_steps 200 total rewards -1560.5180988311768 total energy tensor([[163.9623]])
[0.62228]
Mode: Train env_steps 200 total rewards -1621.3189475536346 total energy tensor([[184.8376]])
[0.4871931]
Mode: Train env_steps 200 total rewards -1531.0628660917282 total energy tensor([[149.9875]])


100%|██████████| 25/25 [01:21<00:00,  3.27s/it]


[0.6548461]
Mode: Train env_steps 200 total rewards -1401.1626613512635 total energy tensor([[140.8096]])
[-0.7380303]
Mode: Train env_steps 200 total rewards -1946.8271017074585 total energy tensor([[0.6389]])
[-0.13667013]
Mode: Train env_steps 200 total rewards -1580.3786436319351 total energy tensor([[171.2686]])
[0.4475782]
Mode: Train env_steps 200 total rewards -1579.7315204441547 total energy tensor([[181.3769]])
[-0.7010048]
Mode: Train env_steps 200 total rewards -1939.3147706985474 total energy tensor([[1.1237]])


100%|██████████| 25/25 [01:18<00:00,  3.15s/it]


[0.7755755]
Mode: Test env_steps 200 total rewards -1583.850512623787 total energy tensor([[170.8460]])
[0.4115116]
Mode: Test env_steps 200 total rewards -1221.2874257480726 total energy tensor([[105.6162]])
[-0.75288486]
Mode: Test env_steps 200 total rewards -1579.1257704626769 total energy tensor([[182.1161]])
[0.18034057]
Mode: Test env_steps 200 total rewards -1568.1845009326935 total energy tensor([[168.7669]])
[0.8122652]
Mode: Test env_steps 200 total rewards -1912.386471748352 total energy tensor([[1.9767]])
[-0.19943881]
Mode: Test env_steps 200 total rewards -1134.3799818726256 total energy tensor([[99.1288]])
[-0.28473204]
Mode: Test env_steps 200 total rewards -1524.179848909378 total energy tensor([[160.4480]])
[-0.8504278]
Mode: Test env_steps 200 total rewards -1924.3445177078247 total energy tensor([[1.8270]])
[-0.9131769]
Mode: Test env_steps 200 total rewards -1921.9585580825806 total energy tensor([[1.9523]])
[-0.27747452]
Mode: Test env_steps 200 total rewards -18

100%|██████████| 25/25 [01:21<00:00,  3.24s/it]


[-0.70642376]
Mode: Train env_steps 200 total rewards -1580.261923134327 total energy tensor([[180.5030]])
[-0.38362578]
Mode: Train env_steps 200 total rewards -1265.0422469377518 total energy tensor([[83.9454]])
[-0.65218663]
Mode: Train env_steps 200 total rewards -1563.4446083307266 total energy tensor([[167.5109]])
[-0.909472]
Mode: Train env_steps 200 total rewards -1813.333912372589 total energy tensor([[3.5617]])
[-0.62521166]
Mode: Train env_steps 200 total rewards -1602.8814917057753 total energy tensor([[187.6884]])


100%|██████████| 25/25 [01:13<00:00,  2.92s/it]


[-0.63980085]
Mode: Train env_steps 200 total rewards -1929.8248805999756 total energy tensor([[1.3611]])
[-0.41099325]
Mode: Train env_steps 200 total rewards -1420.258905224502 total energy tensor([[149.0931]])
[0.79847807]
Mode: Train env_steps 200 total rewards -1762.5496401786804 total energy tensor([[3.6167]])
[0.7797261]
Mode: Train env_steps 200 total rewards -1521.9018638134003 total energy tensor([[150.9355]])
[-0.0791804]
Mode: Train env_steps 200 total rewards -1488.4741319417953 total energy tensor([[129.9081]])


100%|██████████| 25/25 [01:23<00:00,  3.32s/it]


[0.13292982]
Mode: Train env_steps 200 total rewards -1475.902972459793 total energy tensor([[134.6350]])
[-0.6884185]
Mode: Train env_steps 200 total rewards -1633.0113563537598 total energy tensor([[189.8305]])
[-0.61551696]
Mode: Train env_steps 200 total rewards -1462.1281624436378 total energy tensor([[126.4449]])
[-0.1768803]
Mode: Train env_steps 200 total rewards -1543.240952834487 total energy tensor([[162.6967]])
[0.26363453]
Mode: Train env_steps 200 total rewards -1526.7065216004848 total energy tensor([[157.6806]])


100%|██████████| 25/25 [01:21<00:00,  3.25s/it]


[0.4762924]
Mode: Train env_steps 200 total rewards -1623.2258843183517 total energy tensor([[187.1408]])
[0.99629647]
Mode: Train env_steps 200 total rewards -1555.9145169258118 total energy tensor([[164.2038]])
[0.12906049]
Mode: Train env_steps 200 total rewards -1868.6943001747131 total energy tensor([[2.7933]])
[0.64094114]
Mode: Train env_steps 200 total rewards -1558.6559108784422 total energy tensor([[177.5862]])
[-0.6625711]
Mode: Train env_steps 200 total rewards -1693.5302453041077 total energy tensor([[4.9267]])


100%|██████████| 25/25 [01:23<00:00,  3.33s/it]


[0.6380241]
Mode: Test env_steps 200 total rewards -1956.2669343948364 total energy tensor([[0.5630]])
[-0.36989704]
Mode: Test env_steps 200 total rewards -1564.3688442111015 total energy tensor([[165.5622]])
[0.778222]
Mode: Test env_steps 200 total rewards -1294.252350345254 total energy tensor([[82.4476]])
[0.536951]
Mode: Test env_steps 200 total rewards -1560.7814950942993 total energy tensor([[165.5268]])
[0.85153985]
Mode: Test env_steps 200 total rewards -1584.728413619101 total energy tensor([[180.8173]])
[-0.4528105]
Mode: Test env_steps 200 total rewards -1791.3184142112732 total energy tensor([[4.3643]])
[0.02103655]
Mode: Test env_steps 200 total rewards -1622.4556182920933 total energy tensor([[187.3471]])
[0.82927877]
Mode: Test env_steps 200 total rewards -1657.6632115840912 total energy tensor([[189.1216]])
[-0.26068917]
Mode: Test env_steps 200 total rewards -1185.4368878901005 total energy tensor([[20.9354]])
[-0.7090724]
Mode: Test env_steps 200 total rewards -1432

100%|██████████| 25/25 [01:25<00:00,  3.42s/it]


[-0.22729349]
Mode: Train env_steps 200 total rewards -1836.8441700935364 total energy tensor([[4.8747]])
[0.45464116]
Mode: Train env_steps 200 total rewards -1450.4538716003299 total energy tensor([[154.2228]])
[-0.9383594]
Mode: Train env_steps 200 total rewards -1205.8366934508085 total energy tensor([[47.2379]])
[-0.23815139]
Mode: Train env_steps 200 total rewards -1338.94384470582 total energy tensor([[104.3900]])
[-0.76947427]
Mode: Train env_steps 200 total rewards -1949.2370471954346 total energy tensor([[0.5583]])


100%|██████████| 25/25 [01:23<00:00,  3.34s/it]


[-0.7039796]
Mode: Train env_steps 200 total rewards -1859.483868598938 total energy tensor([[3.2718]])
[-0.98048836]
Mode: Train env_steps 200 total rewards -1552.5771242678165 total energy tensor([[164.8958]])
[-0.6726894]
Mode: Train env_steps 200 total rewards -1918.0378684997559 total energy tensor([[2.1549]])
[-0.91917175]
Mode: Train env_steps 200 total rewards -1874.0962281227112 total energy tensor([[2.9727]])
[0.32118738]
Mode: Train env_steps 200 total rewards -1534.6853347420692 total energy tensor([[161.3980]])


100%|██████████| 25/25 [01:18<00:00,  3.12s/it]


[-0.9200177]
Mode: Train env_steps 200 total rewards -1931.9645223617554 total energy tensor([[1.3840]])
[-0.48450485]
Mode: Train env_steps 200 total rewards -1624.0453878641129 total energy tensor([[186.1614]])
[0.61567676]
Mode: Train env_steps 200 total rewards -1513.319035757333 total energy tensor([[161.4876]])
[-0.5664824]
Mode: Train env_steps 200 total rewards -1900.7502689361572 total energy tensor([[2.3817]])
[-0.6554988]
Mode: Train env_steps 200 total rewards -1925.666184425354 total energy tensor([[1.8183]])


100%|██████████| 25/25 [01:20<00:00,  3.23s/it]


[0.24706006]
Mode: Train env_steps 200 total rewards -1648.9352271556854 total energy tensor([[190.8652]])
[0.66996366]
Mode: Train env_steps 200 total rewards -1947.4450283050537 total energy tensor([[1.0803]])
[-0.27475578]
Mode: Train env_steps 200 total rewards -1652.222731411457 total energy tensor([[191.3376]])
[0.30260766]
Mode: Train env_steps 200 total rewards -1611.1025849878788 total energy tensor([[188.0383]])
[0.34630948]
Mode: Train env_steps 200 total rewards -1628.6147825717926 total energy tensor([[189.7363]])


100%|██████████| 25/25 [01:16<00:00,  3.06s/it]


[-0.16545616]
Mode: Test env_steps 200 total rewards -1062.615165702533 total energy tensor([[74.5741]])
[0.8755364]
Mode: Test env_steps 200 total rewards -1339.0814244151115 total energy tensor([[114.6412]])
[0.96283424]
Mode: Test env_steps 200 total rewards -1799.3880281448364 total energy tensor([[5.9255]])
[-0.5272495]
Mode: Test env_steps 200 total rewards -1870.2569093704224 total energy tensor([[4.3140]])
[-0.89746386]
Mode: Test env_steps 200 total rewards -1647.4229617714882 total energy tensor([[191.1246]])
[0.53423387]
Mode: Test env_steps 200 total rewards -1814.1247220039368 total energy tensor([[5.4035]])
[-0.36603174]
Mode: Test env_steps 200 total rewards -1855.7251420021057 total energy tensor([[5.0196]])
[-0.23064801]
Mode: Test env_steps 200 total rewards -1641.6919810771942 total energy tensor([[190.4475]])
[-0.1160769]
Mode: Test env_steps 200 total rewards -1835.3629174232483 total energy tensor([[5.9821]])
[-0.9464604]
Mode: Test env_steps 200 total rewards -18

100%|██████████| 25/25 [01:16<00:00,  3.05s/it]


[-0.8799375]
Mode: Train env_steps 200 total rewards -1637.9219407439232 total energy tensor([[190.1309]])
[0.57347476]
Mode: Train env_steps 200 total rewards -1659.9842394590378 total energy tensor([[191.2721]])
[0.5607218]
Mode: Train env_steps 200 total rewards -1135.4844479858875 total energy tensor([[51.0188]])
[-0.8271978]
Mode: Train env_steps 200 total rewards -1165.7514541344717 total energy tensor([[91.1422]])
[-0.5216405]
Mode: Train env_steps 200 total rewards -1938.1966609954834 total energy tensor([[1.3292]])


100%|██████████| 25/25 [01:22<00:00,  3.30s/it]


[-0.3053096]
Mode: Train env_steps 200 total rewards -1888.2909588813782 total energy tensor([[2.5855]])
[0.9674701]
Mode: Train env_steps 200 total rewards -1622.394239783287 total energy tensor([[187.5777]])
[-0.41176906]
Mode: Train env_steps 200 total rewards -1946.5116176605225 total energy tensor([[1.0062]])
[-0.4179811]
Mode: Train env_steps 200 total rewards -1633.4796619415283 total energy tensor([[190.4261]])
[-0.4677181]
Mode: Train env_steps 200 total rewards -1368.8728082180023 total energy tensor([[23.6058]])


100%|██████████| 25/25 [01:13<00:00,  2.95s/it]


[-0.3864167]
Mode: Train env_steps 200 total rewards -1946.841236114502 total energy tensor([[0.6581]])
[0.37941763]
Mode: Train env_steps 200 total rewards -1945.2846193313599 total energy tensor([[1.0908]])
[-0.7855552]
Mode: Train env_steps 200 total rewards -1625.236055135727 total energy tensor([[188.3865]])
[0.4721376]
Mode: Train env_steps 200 total rewards -1854.2765855789185 total energy tensor([[6.6304]])
[0.20459862]
Mode: Train env_steps 200 total rewards -1401.8805564939976 total energy tensor([[141.4057]])


100%|██████████| 25/25 [01:05<00:00,  2.63s/it]


[0.8268382]
Mode: Train env_steps 200 total rewards -1851.8244032859802 total energy tensor([[3.9174]])
[0.3122738]
Mode: Train env_steps 200 total rewards -1867.585491657257 total energy tensor([[3.1110]])
[-0.9366995]
Mode: Train env_steps 200 total rewards -1650.426581978798 total energy tensor([[191.6896]])
[-0.07240005]
Mode: Train env_steps 200 total rewards -1648.9161781668663 total energy tensor([[190.9430]])
[-0.22405425]
Mode: Train env_steps 200 total rewards -1606.114557981491 total energy tensor([[187.1293]])


100%|██████████| 25/25 [01:03<00:00,  2.53s/it]


[-0.09119865]
Mode: Test env_steps 200 total rewards -1862.539086818695 total energy tensor([[3.2006]])
[0.70383155]
Mode: Test env_steps 200 total rewards -1609.7961013615131 total energy tensor([[188.4602]])
[0.14944714]
Mode: Test env_steps 200 total rewards -1667.314734339714 total energy tensor([[192.5339]])
[0.16318478]
Mode: Test env_steps 200 total rewards -1731.7203001976013 total energy tensor([[12.0179]])
[0.38415682]
Mode: Test env_steps 200 total rewards -1590.2090647220612 total energy tensor([[181.1318]])
[-0.99951184]
Mode: Test env_steps 200 total rewards -1817.8987503051758 total energy tensor([[5.6888]])
[-0.11329564]
Mode: Test env_steps 200 total rewards -1656.6020753383636 total energy tensor([[20.7113]])
[-0.32362702]
Mode: Test env_steps 200 total rewards -1730.8598235845566 total energy tensor([[16.9991]])
[0.6062914]
Mode: Test env_steps 200 total rewards -1815.3972759246826 total energy tensor([[8.8554]])
[0.14368069]
Mode: Test env_steps 200 total rewards -1

100%|██████████| 25/25 [01:02<00:00,  2.51s/it]


[-0.54640865]
Mode: Train env_steps 200 total rewards -1905.9829287528992 total energy tensor([[2.4944]])
[0.16435257]
Mode: Train env_steps 200 total rewards -1803.3828558921814 total energy tensor([[7.0364]])
[-0.10612082]
Mode: Train env_steps 200 total rewards -1739.0047221183777 total energy tensor([[17.2823]])
[-0.643085]
Mode: Train env_steps 200 total rewards -1898.5365929603577 total energy tensor([[2.5747]])
[-0.54642993]
Mode: Train env_steps 200 total rewards -1830.4607901573181 total energy tensor([[7.9311]])


100%|██████████| 25/25 [01:09<00:00,  2.76s/it]


[0.88004434]
Mode: Train env_steps 200 total rewards -1723.6517538428307 total energy tensor([[17.8669]])
[-0.29850534]
Mode: Train env_steps 200 total rewards -1882.1584944725037 total energy tensor([[2.6082]])
[-0.03609039]
Mode: Train env_steps 200 total rewards -1763.0903487205505 total energy tensor([[14.5381]])
[0.9985618]
Mode: Train env_steps 200 total rewards -1918.60142993927 total energy tensor([[2.1187]])
[0.53666717]
Mode: Train env_steps 200 total rewards -1566.0231897830963 total energy tensor([[158.7878]])


100%|██████████| 25/25 [01:12<00:00,  2.91s/it]


[-0.07771582]
Mode: Train env_steps 200 total rewards -1381.9204009100795 total energy tensor([[53.0088]])
[-0.61696076]
Mode: Train env_steps 200 total rewards -1733.5493824481964 total energy tensor([[12.3221]])
[-0.15121719]
Mode: Train env_steps 200 total rewards -1362.736045371741 total energy tensor([[51.9303]])
[0.7536695]
Mode: Train env_steps 200 total rewards -1789.7758421897888 total energy tensor([[12.6767]])
[0.02391289]
Mode: Train env_steps 200 total rewards -1805.379364490509 total energy tensor([[9.9640]])


100%|██████████| 25/25 [01:02<00:00,  2.51s/it]


[-0.8539833]
Mode: Train env_steps 200 total rewards -1388.733128964901 total energy tensor([[51.9785]])
[0.9200382]
Mode: Train env_steps 200 total rewards -1637.1885667741299 total energy tensor([[21.9539]])
[0.8722265]
Mode: Train env_steps 200 total rewards -1871.6549530029297 total energy tensor([[2.7283]])
[0.08376437]
Mode: Train env_steps 200 total rewards -1872.5861763954163 total energy tensor([[4.9484]])
[0.5577852]
Mode: Train env_steps 200 total rewards -1814.4485671520233 total energy tensor([[9.4235]])


100%|██████████| 25/25 [00:59<00:00,  2.37s/it]


[0.39811623]
Mode: Test env_steps 200 total rewards -1929.911904335022 total energy tensor([[1.6947]])
[-0.670191]
Mode: Test env_steps 200 total rewards -1274.0403387956321 total energy tensor([[43.7960]])
[-0.7091089]
Mode: Test env_steps 200 total rewards -1748.2800024747849 total energy tensor([[17.6128]])
[0.2626294]
Mode: Test env_steps 200 total rewards -1916.8864679336548 total energy tensor([[1.9131]])
[-0.6625261]
Mode: Test env_steps 200 total rewards -1378.2875380069017 total energy tensor([[46.9791]])
[-0.63169736]
Mode: Test env_steps 200 total rewards -1937.1730375289917 total energy tensor([[1.1619]])
[-0.6177671]
Mode: Test env_steps 200 total rewards -1667.7905752658844 total energy tensor([[17.6159]])
[0.72285956]
Mode: Test env_steps 200 total rewards -1367.7181482985616 total energy tensor([[46.8426]])
[0.5165133]
Mode: Test env_steps 200 total rewards -1923.3268823623657 total energy tensor([[1.9934]])
[0.17629433]
Mode: Test env_steps 200 total rewards -1381.5630

100%|██████████| 25/25 [01:02<00:00,  2.50s/it]


[0.7783583]
Mode: Train env_steps 200 total rewards -1382.0202741026878 total energy tensor([[48.7553]])
[-0.83982015]
Mode: Train env_steps 200 total rewards -1833.3649485111237 total energy tensor([[7.9838]])
[0.917532]
Mode: Train env_steps 200 total rewards -1381.063308224082 total energy tensor([[49.2090]])
[-0.926766]
Mode: Train env_steps 200 total rewards -1856.5493531227112 total energy tensor([[6.7890]])
[0.45223513]
Mode: Train env_steps 200 total rewards -1788.0245325565338 total energy tensor([[7.9478]])


100%|██████████| 25/25 [01:03<00:00,  2.54s/it]


[-0.69651604]
Mode: Train env_steps 200 total rewards -1804.321132659912 total energy tensor([[7.5655]])
[0.7020163]
Mode: Train env_steps 200 total rewards -1931.436939239502 total energy tensor([[1.6461]])
[0.48916373]
Mode: Train env_steps 200 total rewards -1364.724163889885 total energy tensor([[41.8103]])
[-0.3392886]
Mode: Train env_steps 200 total rewards -1698.2363207936287 total energy tensor([[20.4874]])
[-0.06065166]
Mode: Train env_steps 200 total rewards -1762.9404892921448 total energy tensor([[15.2576]])


100%|██████████| 25/25 [01:03<00:00,  2.56s/it]


[-0.2915301]
Mode: Train env_steps 200 total rewards -1829.1677842140198 total energy tensor([[5.7955]])
[-0.15265566]
Mode: Train env_steps 200 total rewards -1932.0999507904053 total energy tensor([[1.7692]])
[0.24700546]
Mode: Train env_steps 200 total rewards -1934.891053199768 total energy tensor([[1.4986]])
[-0.5331369]
Mode: Train env_steps 200 total rewards -1360.5199131071568 total energy tensor([[42.8052]])
[-0.4311398]
Mode: Train env_steps 200 total rewards -1859.3657264709473 total energy tensor([[6.2059]])


100%|██████████| 25/25 [01:32<00:00,  3.68s/it]


[-0.65123814]
Mode: Train env_steps 200 total rewards -1892.098575592041 total energy tensor([[2.8806]])
[0.61828715]
Mode: Train env_steps 200 total rewards -1846.9166860580444 total energy tensor([[3.8822]])
[0.9394267]
Mode: Train env_steps 200 total rewards -1927.112714767456 total energy tensor([[1.9169]])
[-0.46418917]
Mode: Train env_steps 200 total rewards -1730.5403471589088 total energy tensor([[18.7821]])
[-0.6753547]
Mode: Train env_steps 200 total rewards -1849.8407168388367 total energy tensor([[4.4013]])


100%|██████████| 25/25 [01:31<00:00,  3.65s/it]


[0.3209648]
Mode: Test env_steps 200 total rewards -1369.8837891817093 total energy tensor([[40.4895]])
[0.01294914]
Mode: Test env_steps 200 total rewards -1616.8055165559053 total energy tensor([[22.6415]])
[0.18893333]
Mode: Test env_steps 200 total rewards -1936.345232963562 total energy tensor([[1.5555]])
[0.66590136]
Mode: Test env_steps 200 total rewards -1740.6114857196808 total energy tensor([[17.2951]])
[0.07892353]
Mode: Test env_steps 200 total rewards -1816.2199482917786 total energy tensor([[6.4498]])
[0.3970958]
Mode: Test env_steps 200 total rewards -1804.1469993591309 total energy tensor([[7.2913]])
[-0.10368115]
Mode: Test env_steps 200 total rewards -1768.8801229000092 total energy tensor([[15.9950]])
[0.9485104]
Mode: Test env_steps 200 total rewards -1804.7292461395264 total energy tensor([[7.2870]])
[-0.4863958]
Mode: Test env_steps 200 total rewards -1864.8785257339478 total energy tensor([[3.0738]])
[-0.23219252]
Mode: Test env_steps 200 total rewards -1367.0566

100%|██████████| 25/25 [01:27<00:00,  3.50s/it]


[-0.48512614]
Mode: Train env_steps 200 total rewards -1869.8667387962341 total energy tensor([[4.2495]])
[0.01395647]
Mode: Train env_steps 200 total rewards -1852.211880683899 total energy tensor([[3.9831]])
[0.8005497]
Mode: Train env_steps 200 total rewards -1688.4643464684486 total energy tensor([[19.7121]])
[0.04167321]
Mode: Train env_steps 200 total rewards -1337.6573618352413 total energy tensor([[32.9300]])
[0.88041]
Mode: Train env_steps 200 total rewards -1924.427583694458 total energy tensor([[2.1114]])


100%|██████████| 25/25 [01:28<00:00,  3.52s/it]


[0.7780732]
Mode: Train env_steps 200 total rewards -1344.4447799548507 total energy tensor([[36.4815]])
[0.26227352]
Mode: Train env_steps 200 total rewards -1360.3292890191078 total energy tensor([[36.8430]])
[0.76572955]
Mode: Train env_steps 200 total rewards -1846.6677474975586 total energy tensor([[3.7842]])
[-0.42979556]
Mode: Train env_steps 200 total rewards -1610.7319011762738 total energy tensor([[22.5358]])
[0.92675865]
Mode: Train env_steps 200 total rewards -1813.360577583313 total energy tensor([[6.2907]])


100%|██████████| 25/25 [01:12<00:00,  2.92s/it]


[0.62487334]
Mode: Train env_steps 200 total rewards -1837.3510615825653 total energy tensor([[8.0110]])
[0.01676218]
Mode: Train env_steps 200 total rewards -1826.4575219154358 total energy tensor([[5.6816]])
[-0.6192325]
Mode: Train env_steps 200 total rewards -1691.1335198879242 total energy tensor([[20.2444]])
[-0.46643832]
Mode: Train env_steps 200 total rewards -1621.659099549055 total energy tensor([[22.0492]])
[0.43142658]
Mode: Train env_steps 200 total rewards -1688.535483598709 total energy tensor([[20.9313]])


100%|██████████| 25/25 [01:05<00:00,  2.62s/it]


[0.40483314]
Mode: Train env_steps 200 total rewards -1830.3975019454956 total energy tensor([[5.9931]])
[-0.20725653]
Mode: Train env_steps 200 total rewards -1825.9507327079773 total energy tensor([[5.8567]])
[-0.963758]
Mode: Train env_steps 200 total rewards -1603.7994512729347 total energy tensor([[22.5407]])
[0.97151953]
Mode: Train env_steps 200 total rewards -1922.4610443115234 total energy tensor([[2.0780]])
[0.6505613]
Mode: Train env_steps 200 total rewards -1667.9489595293999 total energy tensor([[21.0821]])


100%|██████████| 25/25 [01:28<00:00,  3.53s/it]


[-0.9294661]
Mode: Test env_steps 200 total rewards -1794.4488258361816 total energy tensor([[12.8175]])
[-0.7312869]
Mode: Test env_steps 200 total rewards -1751.7645089626312 total energy tensor([[17.0941]])
[-0.1943961]
Mode: Test env_steps 200 total rewards -1892.730706691742 total energy tensor([[3.1584]])
[-0.92642784]
Mode: Test env_steps 200 total rewards -1685.2070016562939 total energy tensor([[21.5290]])
[0.15604085]
Mode: Test env_steps 200 total rewards -1212.982266236926 total energy tensor([[52.0378]])
[0.04886438]
Mode: Test env_steps 200 total rewards -1826.43745303154 total energy tensor([[9.4883]])
[0.11433371]
Mode: Test env_steps 200 total rewards -1730.2417036294937 total energy tensor([[18.6700]])
[0.6184923]
Mode: Test env_steps 200 total rewards -1639.8976250737906 total energy tensor([[22.8584]])
[0.27098387]
Mode: Test env_steps 200 total rewards -1358.5969622135162 total energy tensor([[37.3145]])
[-0.90113866]
Mode: Test env_steps 200 total rewards -1365.36

100%|██████████| 25/25 [01:14<00:00,  2.99s/it]


[-0.03215145]
Mode: Train env_steps 200 total rewards -1763.5642105340958 total energy tensor([[15.9738]])
[0.37968466]
Mode: Train env_steps 200 total rewards -1594.9742490947247 total energy tensor([[23.2374]])
[-0.9605344]
Mode: Train env_steps 200 total rewards -1669.8235552310944 total energy tensor([[17.7961]])
[-0.98656315]
Mode: Train env_steps 200 total rewards -1687.0672160685062 total energy tensor([[21.9719]])
[0.9384674]
Mode: Train env_steps 200 total rewards -1870.1859722137451 total energy tensor([[3.5319]])


100%|██████████| 25/25 [01:22<00:00,  3.31s/it]


[0.19892587]
Mode: Train env_steps 200 total rewards -1921.2106113433838 total energy tensor([[2.7217]])
[-0.49968842]
Mode: Train env_steps 200 total rewards -1800.105731010437 total energy tensor([[8.1243]])
[0.27044618]
Mode: Train env_steps 200 total rewards -1752.942475438118 total energy tensor([[17.9294]])
[0.44853464]
Mode: Train env_steps 200 total rewards -1766.607182264328 total energy tensor([[15.5104]])
[0.6325918]
Mode: Train env_steps 200 total rewards -1346.0390987992287 total energy tensor([[33.7969]])


100%|██████████| 25/25 [01:25<00:00,  3.43s/it]


[-0.15470915]
Mode: Train env_steps 200 total rewards -1754.5353248119354 total energy tensor([[18.1794]])
[-0.11353115]
Mode: Train env_steps 200 total rewards -1636.1345685720444 total energy tensor([[19.7632]])
[0.8591406]
Mode: Train env_steps 200 total rewards -1781.519285917282 total energy tensor([[14.9460]])
[0.46451038]
Mode: Train env_steps 200 total rewards -1765.3818624019623 total energy tensor([[16.0401]])
[-0.29733798]
Mode: Train env_steps 200 total rewards -1673.1782869398594 total energy tensor([[22.4755]])


100%|██████████| 25/25 [01:11<00:00,  2.87s/it]


[0.40083376]
Mode: Train env_steps 200 total rewards -1817.3572297096252 total energy tensor([[6.6563]])
[-0.38210467]
Mode: Train env_steps 200 total rewards -1808.561184644699 total energy tensor([[11.1071]])
[-0.5189084]
Mode: Train env_steps 200 total rewards -1742.7930958271027 total energy tensor([[17.7258]])
[-0.5732252]
Mode: Train env_steps 200 total rewards -1748.42533993721 total energy tensor([[11.9544]])
[-0.81220526]
Mode: Train env_steps 200 total rewards -1569.016594260931 total energy tensor([[22.0529]])


100%|██████████| 25/25 [01:01<00:00,  2.44s/it]


[0.42138013]
Mode: Test env_steps 200 total rewards -1634.5783041119576 total energy tensor([[23.3151]])
[0.24868236]
Mode: Test env_steps 200 total rewards -1815.5684034824371 total energy tensor([[10.5327]])
[0.32956746]
Mode: Test env_steps 200 total rewards -1647.444678068161 total energy tensor([[18.2961]])
[-0.20460892]
Mode: Test env_steps 200 total rewards -1713.6535514593124 total energy tensor([[15.0601]])
[-0.8836205]
Mode: Test env_steps 200 total rewards -1475.1668321490288 total energy tensor([[23.0162]])
[-0.03509248]
Mode: Test env_steps 200 total rewards -1644.9461407065392 total energy tensor([[18.6980]])
[-0.9097682]
Mode: Test env_steps 200 total rewards -1649.12285810709 total energy tensor([[18.7078]])
[-0.7100129]
Mode: Test env_steps 200 total rewards -1800.6703231334686 total energy tensor([[11.3531]])
[0.07539237]
Mode: Test env_steps 200 total rewards -1896.7294402122498 total energy tensor([[3.0910]])
[0.62226385]
Mode: Test env_steps 200 total rewards -1715

100%|██████████| 25/25 [00:57<00:00,  2.30s/it]


[-0.3133594]
Mode: Train env_steps 200 total rewards -1698.3422666788101 total energy tensor([[21.6862]])
[-0.064078]
Mode: Train env_steps 200 total rewards -1651.8325768709183 total energy tensor([[23.1542]])
[0.4338357]
Mode: Train env_steps 200 total rewards -1647.60889005661 total energy tensor([[18.5043]])
[-0.827236]
Mode: Train env_steps 200 total rewards -1596.520895600319 total energy tensor([[20.1730]])
[-0.4734519]
Mode: Train env_steps 200 total rewards -1756.1398998498917 total energy tensor([[17.2874]])


100%|██████████| 25/25 [01:14<00:00,  2.98s/it]


[0.8420167]
Mode: Train env_steps 200 total rewards -1728.2144443392754 total energy tensor([[20.6176]])
[-0.06739818]
Mode: Train env_steps 200 total rewards -1916.0685405731201 total energy tensor([[2.7834]])
[0.9665999]
Mode: Train env_steps 200 total rewards -1891.5223956108093 total energy tensor([[4.7018]])
[0.266398]
Mode: Train env_steps 200 total rewards -1814.6009550094604 total energy tensor([[6.9053]])
[-0.26638687]
Mode: Train env_steps 200 total rewards -1748.4988219738007 total energy tensor([[11.4028]])


100%|██████████| 25/25 [01:20<00:00,  3.22s/it]


[-0.7994443]
Mode: Train env_steps 200 total rewards -1907.8679866790771 total energy tensor([[3.9357]])
[0.3618125]
Mode: Train env_steps 200 total rewards -1638.8415734916925 total energy tensor([[24.7174]])
[-0.08971651]
Mode: Train env_steps 200 total rewards -1697.8917961716652 total energy tensor([[21.9379]])
[0.42836937]
Mode: Train env_steps 200 total rewards -1691.423469632864 total energy tensor([[23.6773]])
[0.5888775]
Mode: Train env_steps 200 total rewards -1824.5532875061035 total energy tensor([[6.4701]])


100%|██████████| 25/25 [01:22<00:00,  3.31s/it]


[0.35192233]
Mode: Train env_steps 200 total rewards -1788.156792640686 total energy tensor([[8.8831]])
[0.65618426]
Mode: Train env_steps 200 total rewards -1767.7417907714844 total energy tensor([[10.9214]])
[0.2870116]
Mode: Train env_steps 200 total rewards -1759.3531818389893 total energy tensor([[18.2868]])
[0.81198263]
Mode: Train env_steps 200 total rewards -1682.2223567962646 total energy tensor([[17.1349]])
[-0.3136218]
Mode: Train env_steps 200 total rewards -1648.3606014847755 total energy tensor([[24.3034]])


100%|██████████| 25/25 [01:27<00:00,  3.49s/it]


[0.01630884]
Mode: Test env_steps 200 total rewards -1728.0063247680664 total energy tensor([[14.7074]])
[0.714346]
Mode: Test env_steps 200 total rewards -1869.8560552597046 total energy tensor([[5.8410]])
[-0.61074173]
Mode: Test env_steps 200 total rewards -1590.863879173994 total energy tensor([[25.1340]])
[0.11786392]
Mode: Test env_steps 200 total rewards -1894.1665935516357 total energy tensor([[4.6353]])
[-0.9872365]
Mode: Test env_steps 200 total rewards -1772.0067151784897 total energy tensor([[16.4093]])
[-0.83762914]
Mode: Test env_steps 200 total rewards -1674.604913175106 total energy tensor([[22.9792]])
[0.76089066]
Mode: Test env_steps 200 total rewards -1689.0073791444302 total energy tensor([[23.0642]])
[0.4961847]
Mode: Test env_steps 200 total rewards -1078.8600202668458 total energy tensor([[25.7426]])
[-0.78153837]
Mode: Test env_steps 200 total rewards -1068.250623865053 total energy tensor([[15.0240]])
[-0.24437606]
Mode: Test env_steps 200 total rewards -1721.1

100%|██████████| 25/25 [01:24<00:00,  3.37s/it]


[-0.9658687]
Mode: Train env_steps 200 total rewards -1770.288536310196 total energy tensor([[10.4851]])
[0.10595401]
Mode: Train env_steps 200 total rewards -1899.2884311676025 total energy tensor([[4.4941]])
[-0.8950067]
Mode: Train env_steps 200 total rewards -1789.4617371559143 total energy tensor([[8.5539]])
[0.811244]
Mode: Train env_steps 200 total rewards -1852.5565347671509 total energy tensor([[7.1328]])
[0.97951186]
Mode: Train env_steps 200 total rewards -1751.2616112232208 total energy tensor([[18.8907]])


100%|██████████| 25/25 [01:08<00:00,  2.74s/it]


[-0.23450509]
Mode: Train env_steps 200 total rewards -1785.4768905639648 total energy tensor([[16.3654]])
[-0.06969831]
Mode: Train env_steps 200 total rewards -1787.161955833435 total energy tensor([[9.1735]])
[-0.4703267]
Mode: Train env_steps 200 total rewards -1752.6586272716522 total energy tensor([[13.1645]])
[0.675857]
Mode: Train env_steps 200 total rewards -1768.082941532135 total energy tensor([[17.6717]])
[0.57088923]
Mode: Train env_steps 200 total rewards -1833.9319953918457 total energy tensor([[6.1531]])


100%|██████████| 25/25 [01:35<00:00,  3.83s/it]


[-0.9406593]
Mode: Train env_steps 200 total rewards -1705.8267886638641 total energy tensor([[20.9618]])
[0.16843854]
Mode: Train env_steps 200 total rewards -1763.684918642044 total energy tensor([[11.4964]])
[-0.6534672]
Mode: Train env_steps 200 total rewards -1699.9360486268997 total energy tensor([[21.0698]])
[-0.16248064]
Mode: Train env_steps 200 total rewards -1780.2045214176178 total energy tensor([[15.5238]])
[0.5200351]
Mode: Train env_steps 200 total rewards -1700.7555345892906 total energy tensor([[22.3137]])


100%|██████████| 25/25 [01:25<00:00,  3.40s/it]


[0.48182556]
Mode: Train env_steps 200 total rewards -1829.6474394798279 total energy tensor([[9.2077]])
[-0.7505483]
Mode: Train env_steps 200 total rewards -1501.322518227622 total energy tensor([[25.9562]])
[-0.5746645]
Mode: Train env_steps 200 total rewards -1898.6433601379395 total energy tensor([[4.6230]])
[0.9079027]
Mode: Train env_steps 200 total rewards -1893.7273273468018 total energy tensor([[4.7993]])
[0.04577384]
Mode: Train env_steps 200 total rewards -1882.9579162597656 total energy tensor([[5.7643]])


100%|██████████| 25/25 [01:27<00:00,  3.50s/it]


[0.3684391]
Mode: Test env_steps 200 total rewards -1718.3622078895569 total energy tensor([[15.5243]])
[-0.50549036]
Mode: Test env_steps 200 total rewards -1756.4933975934982 total energy tensor([[17.8935]])
[-0.8859417]
Mode: Test env_steps 200 total rewards -1081.8314996326808 total energy tensor([[43.5625]])
[-0.4392322]
Mode: Test env_steps 200 total rewards -1796.6482424736023 total energy tensor([[14.6594]])
[0.79719615]
Mode: Test env_steps 200 total rewards -1814.8962988853455 total energy tensor([[10.6768]])
[-0.3452081]
Mode: Test env_steps 200 total rewards -1431.2141695693135 total energy tensor([[25.3150]])
[-0.37021962]
Mode: Test env_steps 200 total rewards -1585.9295133054256 total energy tensor([[25.7228]])
[0.25651783]
Mode: Test env_steps 200 total rewards -1885.1793332099915 total energy tensor([[5.8435]])
[-0.5278507]
Mode: Test env_steps 200 total rewards -1705.4498327374458 total energy tensor([[21.5511]])
[0.48406732]
Mode: Test env_steps 200 total rewards -11

100%|██████████| 25/25 [01:23<00:00,  3.34s/it]


[-0.01777115]
Mode: Train env_steps 200 total rewards -1882.808439731598 total energy tensor([[5.5857]])
[0.5934154]
Mode: Train env_steps 200 total rewards -1547.103693574667 total energy tensor([[22.2052]])
[-0.18652074]
Mode: Train env_steps 200 total rewards -1619.8164192289114 total energy tensor([[24.4869]])
[-0.2460178]
Mode: Train env_steps 200 total rewards -1478.2938439399004 total energy tensor([[23.3450]])
[-0.5459168]
Mode: Train env_steps 200 total rewards -1503.178875476122 total energy tensor([[23.3985]])


100%|██████████| 25/25 [01:21<00:00,  3.26s/it]


[0.07483624]
Mode: Train env_steps 200 total rewards -1894.4841966629028 total energy tensor([[5.2078]])
[-0.9107214]
Mode: Train env_steps 200 total rewards -1689.3099572658539 total energy tensor([[17.4299]])
[0.77721685]
Mode: Train env_steps 200 total rewards -1739.3083727359772 total energy tensor([[14.3348]])
[0.96681947]
Mode: Train env_steps 200 total rewards -1841.6399760246277 total energy tensor([[6.6058]])
[0.15856276]
Mode: Train env_steps 200 total rewards -1862.6544589996338 total energy tensor([[6.6036]])


100%|██████████| 25/25 [01:31<00:00,  3.68s/it]


[0.924144]
Mode: Train env_steps 200 total rewards -1709.4603998661041 total energy tensor([[16.9927]])
[0.07047912]
Mode: Train env_steps 200 total rewards -1888.7674379348755 total energy tensor([[5.8569]])
[0.43224794]
Mode: Train env_steps 200 total rewards -1868.7845206260681 total energy tensor([[6.8133]])
[-0.8931884]
Mode: Train env_steps 200 total rewards -1609.1117327213287 total energy tensor([[21.2681]])
[0.22342257]
Mode: Train env_steps 200 total rewards -1888.7458410263062 total energy tensor([[5.5843]])


100%|██████████| 25/25 [01:17<00:00,  3.09s/it]


[0.9129607]
Mode: Train env_steps 200 total rewards -1759.1311700344086 total energy tensor([[12.2058]])
[0.24659434]
Mode: Train env_steps 200 total rewards -1680.371215224266 total energy tensor([[18.4299]])
[-0.78240466]
Mode: Train env_steps 200 total rewards -1876.6994976997375 total energy tensor([[6.5436]])
[-0.55695796]
Mode: Train env_steps 200 total rewards -1535.3742465376854 total energy tensor([[22.0710]])
[-0.43081352]
Mode: Train env_steps 200 total rewards -1890.0721940994263 total energy tensor([[5.3844]])


100%|██████████| 25/25 [01:29<00:00,  3.56s/it]


[0.8268217]
Mode: Test env_steps 200 total rewards -1607.0474002957344 total energy tensor([[21.0182]])
[-0.4456242]
Mode: Test env_steps 200 total rewards -1719.5570878982544 total energy tensor([[21.6744]])
[0.82562166]
Mode: Test env_steps 200 total rewards -1824.2567782402039 total energy tensor([[9.6397]])
[-0.27271053]
Mode: Test env_steps 200 total rewards -1863.4001088142395 total energy tensor([[7.0729]])
[0.14387952]
Mode: Test env_steps 200 total rewards -1837.6430864334106 total energy tensor([[6.9459]])
[-0.0277853]
Mode: Test env_steps 200 total rewards -1751.2086639404297 total energy tensor([[13.0260]])
[-0.6834483]
Mode: Test env_steps 200 total rewards -1877.7356896400452 total energy tensor([[6.2967]])
[0.77873504]
Mode: Test env_steps 200 total rewards -1867.2845859527588 total energy tensor([[5.6448]])
[0.9550433]
Mode: Test env_steps 200 total rewards -1455.6705842381343 total energy tensor([[21.9147]])
[-0.46722084]
Mode: Test env_steps 200 total rewards -1806.82

100%|██████████| 25/25 [01:11<00:00,  2.86s/it]


[0.37320453]
Mode: Train env_steps 200 total rewards -1857.6903281211853 total energy tensor([[7.7772]])
[-0.7474673]
Mode: Train env_steps 200 total rewards -1883.5286779403687 total energy tensor([[5.2306]])
[0.64353734]
Mode: Train env_steps 200 total rewards -1891.1976299285889 total energy tensor([[5.1807]])
[-0.954044]
Mode: Train env_steps 200 total rewards -1888.1331100463867 total energy tensor([[5.1715]])
[0.78230184]
Mode: Train env_steps 200 total rewards -1510.7169608920813 total energy tensor([[22.2622]])


100%|██████████| 25/25 [01:06<00:00,  2.65s/it]


[0.9472982]
Mode: Train env_steps 200 total rewards -1847.8469505310059 total energy tensor([[7.8992]])
[0.68912]
Mode: Train env_steps 200 total rewards -1715.0657708644867 total energy tensor([[16.0058]])
[0.50833637]
Mode: Train env_steps 200 total rewards -1631.4957299232483 total energy tensor([[20.5205]])
[-0.08532714]
Mode: Train env_steps 200 total rewards -1874.9423441886902 total energy tensor([[6.0344]])
[-0.9362373]
Mode: Train env_steps 200 total rewards -1842.7410173416138 total energy tensor([[6.4292]])


100%|██████████| 25/25 [01:15<00:00,  3.03s/it]


[-0.29513547]
Mode: Train env_steps 200 total rewards -1633.791264295578 total energy tensor([[20.0520]])
[-0.18207656]
Mode: Train env_steps 200 total rewards -1859.0448513031006 total energy tensor([[7.5212]])
[0.75788873]
Mode: Train env_steps 200 total rewards -1854.695430278778 total energy tensor([[7.8118]])
[-0.37232405]
Mode: Train env_steps 200 total rewards -1679.564288109541 total energy tensor([[23.4345]])
[0.05506822]
Mode: Train env_steps 200 total rewards -1413.496458229376 total energy tensor([[21.6814]])


100%|██████████| 25/25 [00:58<00:00,  2.33s/it]


[-0.2108131]
Mode: Train env_steps 200 total rewards -1874.3747806549072 total energy tensor([[5.1773]])
[0.49056044]
Mode: Train env_steps 200 total rewards -1840.2085852622986 total energy tensor([[6.4143]])
[0.85392696]
Mode: Train env_steps 200 total rewards -1626.4442472159863 total energy tensor([[20.0872]])
[-0.4948401]
Mode: Train env_steps 200 total rewards -1865.585114955902 total energy tensor([[6.6041]])
[0.7162319]
Mode: Train env_steps 200 total rewards -1455.1228447244503 total energy tensor([[20.6505]])


100%|██████████| 25/25 [00:55<00:00,  2.22s/it]


[0.06211038]
Mode: Test env_steps 200 total rewards -1674.2429419755936 total energy tensor([[22.0888]])
[-0.5015586]
Mode: Test env_steps 200 total rewards -1888.6661710739136 total energy tensor([[5.7555]])
[-0.40954563]
Mode: Test env_steps 200 total rewards -1777.976316690445 total energy tensor([[16.6225]])
[-0.8162156]
Mode: Test env_steps 200 total rewards -1885.882703781128 total energy tensor([[5.9479]])
[0.57959235]
Mode: Test env_steps 200 total rewards -1640.794600263238 total energy tensor([[23.5107]])
[0.12218763]
Mode: Test env_steps 200 total rewards -1868.1904835700989 total energy tensor([[6.4045]])
[0.43719727]
Mode: Test env_steps 200 total rewards -1625.4639767855406 total energy tensor([[23.8896]])
[0.9824797]
Mode: Test env_steps 200 total rewards -1877.8864660263062 total energy tensor([[5.9212]])
[0.44532087]
Mode: Test env_steps 200 total rewards -1743.9127215147018 total energy tensor([[18.7627]])
[0.04631947]
Mode: Test env_steps 200 total rewards -1585.0781

100%|██████████| 25/25 [00:59<00:00,  2.37s/it]


[-0.7235859]
Mode: Train env_steps 200 total rewards -1817.38223361969 total energy tensor([[10.6294]])
[-0.65142477]
Mode: Train env_steps 200 total rewards -1870.8636837005615 total energy tensor([[6.8020]])
[0.12123479]
Mode: Train env_steps 200 total rewards -1839.8687510490417 total energy tensor([[6.5130]])
[-0.21346535]
Mode: Train env_steps 200 total rewards -1707.564540863037 total energy tensor([[21.4003]])
[-0.43562165]
Mode: Train env_steps 200 total rewards -1698.3652935028076 total energy tensor([[20.9759]])


100%|██████████| 25/25 [00:59<00:00,  2.37s/it]


[0.18918186]
Mode: Train env_steps 200 total rewards -1685.7435510754585 total energy tensor([[21.7995]])
[0.854785]
Mode: Train env_steps 200 total rewards -1813.446389913559 total energy tensor([[10.9203]])
[-0.64358103]
Mode: Train env_steps 200 total rewards -1618.898284368217 total energy tensor([[23.7165]])
[0.14829029]
Mode: Train env_steps 200 total rewards -1881.4479675292969 total energy tensor([[5.0057]])
[-0.57928896]
Mode: Train env_steps 200 total rewards -1869.0356726646423 total energy tensor([[6.7660]])


100%|██████████| 25/25 [00:56<00:00,  2.24s/it]


[-0.12600864]
Mode: Train env_steps 200 total rewards -1585.7684149630368 total energy tensor([[23.2440]])
[-0.05172486]
Mode: Train env_steps 200 total rewards -1665.9182832092047 total energy tensor([[22.5437]])
[0.4578488]
Mode: Train env_steps 200 total rewards -1726.3103192448616 total energy tensor([[19.6049]])
[0.02103662]
Mode: Train env_steps 200 total rewards -1810.1864655017853 total energy tensor([[10.3641]])
[-0.9548587]
Mode: Train env_steps 200 total rewards -1871.5265169143677 total energy tensor([[6.4505]])


100%|██████████| 25/25 [00:57<00:00,  2.29s/it]


[0.00555227]
Mode: Train env_steps 200 total rewards -1625.982881128788 total energy tensor([[19.3411]])
[0.2166729]
Mode: Train env_steps 200 total rewards -1895.0644779205322 total energy tensor([[5.2001]])
[-0.89542997]
Mode: Train env_steps 200 total rewards -1273.180535239284 total energy tensor([[19.5657]])
[0.7166959]
Mode: Train env_steps 200 total rewards -1654.420184865594 total energy tensor([[23.2655]])
[-0.68611664]
Mode: Train env_steps 200 total rewards -1827.015215396881 total energy tensor([[7.0278]])


100%|██████████| 25/25 [01:06<00:00,  2.65s/it]


[-0.07304686]
Mode: Test env_steps 200 total rewards -1709.90606752038 total energy tensor([[21.5468]])
[0.36865124]
Mode: Test env_steps 200 total rewards -1628.1127270162106 total energy tensor([[18.9809]])
[0.9273282]
Mode: Test env_steps 200 total rewards -1735.8824939727783 total energy tensor([[12.9845]])
[-0.9926403]
Mode: Test env_steps 200 total rewards -1599.5709739923477 total energy tensor([[19.5713]])
[0.3269941]
Mode: Test env_steps 200 total rewards -1831.3901863098145 total energy tensor([[9.2692]])
[0.3248072]
Mode: Test env_steps 200 total rewards -1872.246822834015 total energy tensor([[6.5007]])
[0.9669345]
Mode: Test env_steps 200 total rewards -1702.610684543848 total energy tensor([[21.4261]])
[-0.51329315]
Mode: Test env_steps 200 total rewards -1596.7991974186152 total energy tensor([[23.5029]])
[-0.24647936]
Mode: Test env_steps 200 total rewards -1737.7948627471924 total energy tensor([[13.4197]])
[-0.27235004]
Mode: Test env_steps 200 total rewards -1707.503

100%|██████████| 25/25 [01:00<00:00,  2.44s/it]


[-0.84402037]
Mode: Train env_steps 200 total rewards -1806.0136642456055 total energy tensor([[8.2605]])
[-0.6368398]
Mode: Train env_steps 200 total rewards -1803.9449417591095 total energy tensor([[12.9254]])
[0.00493748]
Mode: Train env_steps 200 total rewards -1693.1911097168922 total energy tensor([[21.5754]])
[0.8123643]
Mode: Train env_steps 200 total rewards -1841.6359395980835 total energy tensor([[6.9550]])
[-0.77822036]
Mode: Train env_steps 200 total rewards -1743.860538482666 total energy tensor([[12.8635]])


100%|██████████| 25/25 [01:02<00:00,  2.49s/it]


[-0.78727096]
Mode: Train env_steps 200 total rewards -1658.8317453116179 total energy tensor([[22.9203]])
[0.7445667]
Mode: Train env_steps 200 total rewards -1818.6616978645325 total energy tensor([[11.0668]])
[0.876291]
Mode: Train env_steps 200 total rewards -1866.743191242218 total energy tensor([[6.5725]])
[0.07413258]
Mode: Train env_steps 200 total rewards -1733.3551495075226 total energy tensor([[19.4209]])
[0.62304515]
Mode: Train env_steps 200 total rewards -1860.3973054885864 total energy tensor([[7.0482]])


100%|██████████| 25/25 [01:00<00:00,  2.43s/it]


[0.42767766]
Mode: Train env_steps 200 total rewards -1693.8138190507889 total energy tensor([[21.1214]])
[-0.01018314]
Mode: Train env_steps 200 total rewards -1687.3270101547241 total energy tensor([[16.6058]])
[-0.10516604]
Mode: Train env_steps 200 total rewards -1894.3571681976318 total energy tensor([[4.8341]])
[-0.07013538]
Mode: Train env_steps 200 total rewards -1887.9142799377441 total energy tensor([[5.7829]])
[-0.3391724]
Mode: Train env_steps 200 total rewards -1651.2111244797707 total energy tensor([[18.5272]])


100%|██████████| 25/25 [01:04<00:00,  2.60s/it]


[-0.34505045]
Mode: Train env_steps 200 total rewards -1864.3985342979431 total energy tensor([[5.8294]])
[-0.9426343]
Mode: Train env_steps 200 total rewards -1695.6102533340454 total energy tensor([[15.9131]])
[0.4471411]
Mode: Train env_steps 200 total rewards -1741.1736228466034 total energy tensor([[12.9883]])
[-0.68376267]
Mode: Train env_steps 200 total rewards -1713.969077706337 total energy tensor([[14.4458]])
[-0.3792988]
Mode: Train env_steps 200 total rewards -1895.9311037063599 total energy tensor([[5.0580]])


100%|██████████| 25/25 [01:04<00:00,  2.57s/it]


[0.35615048]
Mode: Test env_steps 200 total rewards -1838.5456671714783 total energy tensor([[8.5278]])
[-0.06528242]
Mode: Test env_steps 200 total rewards -1808.708116054535 total energy tensor([[12.6041]])
[-0.23765643]
Mode: Test env_steps 200 total rewards -1765.428864479065 total energy tensor([[16.1431]])
[-0.7390572]
Mode: Test env_steps 200 total rewards -1843.9335017204285 total energy tensor([[6.5740]])
[0.48510474]
Mode: Test env_steps 200 total rewards -1776.9436584711075 total energy tensor([[16.8383]])
[0.68000424]
Mode: Test env_steps 200 total rewards -1859.2275185585022 total energy tensor([[6.8379]])
[-0.63240695]
Mode: Test env_steps 200 total rewards -1705.0951740145683 total energy tensor([[21.7580]])
[-0.888294]
Mode: Test env_steps 200 total rewards -1703.445943057537 total energy tensor([[21.0036]])
[-0.8416811]
Mode: Test env_steps 200 total rewards -1885.7819991111755 total energy tensor([[5.3644]])
[-0.60519856]
Mode: Test env_steps 200 total rewards -1626.4

100%|██████████| 25/25 [01:00<00:00,  2.41s/it]


[0.6594014]
Mode: Train env_steps 200 total rewards -1773.7827732563019 total energy tensor([[17.2786]])
[-0.5600553]
Mode: Train env_steps 200 total rewards -1887.0171546936035 total energy tensor([[5.2940]])
[-0.13972579]
Mode: Train env_steps 200 total rewards -1718.3252382278442 total energy tensor([[20.5522]])
[-0.8907427]
Mode: Train env_steps 200 total rewards -1827.21910572052 total energy tensor([[9.6455]])
[0.42096418]
Mode: Train env_steps 200 total rewards -1801.3982481956482 total energy tensor([[13.8632]])


100%|██████████| 25/25 [01:01<00:00,  2.47s/it]


[-0.78358465]
Mode: Train env_steps 200 total rewards -1748.717424273491 total energy tensor([[18.0453]])
[0.47809154]
Mode: Train env_steps 200 total rewards -1735.0994296073914 total energy tensor([[14.9886]])
[0.9257999]
Mode: Train env_steps 200 total rewards -1800.11505818367 total energy tensor([[9.4852]])
[-0.4304514]
Mode: Train env_steps 200 total rewards -1672.4359190762043 total energy tensor([[17.7829]])
[0.35999745]
Mode: Train env_steps 200 total rewards -1840.703784942627 total energy tensor([[8.1511]])


100%|██████████| 25/25 [01:02<00:00,  2.51s/it]


[-0.3135557]
Mode: Train env_steps 200 total rewards -1854.0456309318542 total energy tensor([[7.1985]])
[0.14819355]
Mode: Train env_steps 200 total rewards -1767.7223320007324 total energy tensor([[16.8119]])
[0.13536538]
Mode: Train env_steps 200 total rewards -1697.4164431095123 total energy tensor([[17.5513]])
[-0.0339441]
Mode: Train env_steps 200 total rewards -1843.021665096283 total energy tensor([[8.5615]])
[-0.852765]
Mode: Train env_steps 200 total rewards -1543.1082634981722 total energy tensor([[19.3923]])


100%|██████████| 25/25 [01:20<00:00,  3.22s/it]


[-0.8173576]
Mode: Train env_steps 200 total rewards -1848.9218034744263 total energy tensor([[7.4231]])
[0.94750744]
Mode: Train env_steps 200 total rewards -1830.8810935020447 total energy tensor([[9.4044]])
[-0.88215077]
Mode: Train env_steps 200 total rewards -1594.619949825108 total energy tensor([[22.4550]])
[-0.50952935]
Mode: Train env_steps 200 total rewards -1718.668552994728 total energy tensor([[16.1897]])
[-0.6355062]
Mode: Train env_steps 200 total rewards -1676.6784285604954 total energy tensor([[18.0042]])


100%|██████████| 25/25 [01:21<00:00,  3.25s/it]


[0.23446889]
Mode: Test env_steps 200 total rewards -1706.7563310861588 total energy tensor([[22.2929]])
[-0.24592978]
Mode: Test env_steps 200 total rewards -1790.9555678367615 total energy tensor([[10.4631]])
[0.71214813]
Mode: Test env_steps 200 total rewards -1770.6086971759796 total energy tensor([[12.7825]])
[0.66145694]
Mode: Test env_steps 200 total rewards -1755.1671330928802 total energy tensor([[18.1660]])
[-0.5473258]
Mode: Test env_steps 200 total rewards -1773.2178152799606 total energy tensor([[16.6080]])
[-0.8434899]
Mode: Test env_steps 200 total rewards -1681.2480400949717 total energy tensor([[21.8707]])
[-0.1472114]
Mode: Test env_steps 200 total rewards -1698.3388919234276 total energy tensor([[22.3641]])
[-0.7055109]
Mode: Test env_steps 200 total rewards -1715.5717427432537 total energy tensor([[21.0140]])
[0.08159346]
Mode: Test env_steps 200 total rewards -1766.989174604416 total energy tensor([[13.4119]])
[-0.76357424]
Mode: Test env_steps 200 total rewards -1

100%|██████████| 25/25 [01:19<00:00,  3.18s/it]


[-0.9628665]
Mode: Train env_steps 200 total rewards -1853.375147819519 total energy tensor([[8.0447]])
[-0.3274088]
Mode: Train env_steps 200 total rewards -1677.6461918354034 total energy tensor([[18.3655]])
[0.20950559]
Mode: Train env_steps 200 total rewards -1793.415320634842 total energy tensor([[11.2110]])
[-0.42384756]
Mode: Train env_steps 200 total rewards -1504.3804801884107 total energy tensor([[19.5158]])
[0.08419411]
Mode: Train env_steps 200 total rewards -1707.6612555086613 total energy tensor([[21.1630]])


100%|██████████| 25/25 [01:16<00:00,  3.05s/it]


[-0.9867461]
Mode: Train env_steps 200 total rewards -1443.0670258390019 total energy tensor([[19.6722]])
[0.7147748]
Mode: Train env_steps 200 total rewards -1598.1803290769458 total energy tensor([[19.7722]])
[-0.7663517]
Mode: Train env_steps 200 total rewards -1828.167981147766 total energy tensor([[7.6853]])
[0.45027757]
Mode: Train env_steps 200 total rewards -1897.9874668121338 total energy tensor([[4.9732]])
[-0.78078043]
Mode: Train env_steps 200 total rewards -1721.6121042966843 total energy tensor([[16.8362]])


100%|██████████| 25/25 [01:06<00:00,  2.66s/it]


[-0.36992982]
Mode: Train env_steps 200 total rewards -1646.7269360795617 total energy tensor([[23.2848]])
[0.60867023]
Mode: Train env_steps 200 total rewards -1892.1898164749146 total energy tensor([[5.5631]])
[-0.09633541]
Mode: Train env_steps 200 total rewards -1706.1540986299515 total energy tensor([[18.4252]])
[0.69708776]
Mode: Train env_steps 200 total rewards -1807.896915435791 total energy tensor([[8.9913]])
[-0.3820118]
Mode: Train env_steps 200 total rewards -1830.5564813613892 total energy tensor([[7.6961]])


100%|██████████| 25/25 [01:16<00:00,  3.06s/it]


[-0.30696237]
Mode: Train env_steps 200 total rewards -1503.5457588669378 total energy tensor([[20.2857]])
[-0.04197837]
Mode: Train env_steps 200 total rewards -1573.6030056774616 total energy tensor([[20.4533]])
[-0.9494325]
Mode: Train env_steps 200 total rewards -1674.1722757816315 total energy tensor([[22.1644]])
[0.17024739]
Mode: Train env_steps 200 total rewards -1752.9832664728165 total energy tensor([[18.3063]])
[-0.5231903]
Mode: Train env_steps 200 total rewards -1846.5521125793457 total energy tensor([[6.8912]])


100%|██████████| 25/25 [01:25<00:00,  3.41s/it]


[0.33928522]
Mode: Test env_steps 200 total rewards -1870.2622933387756 total energy tensor([[7.3857]])
[0.21050774]
Mode: Test env_steps 200 total rewards -1801.5695102214813 total energy tensor([[13.7577]])
[0.38278642]
Mode: Test env_steps 200 total rewards -1837.1514568328857 total energy tensor([[9.8529]])
[0.00533392]
Mode: Test env_steps 200 total rewards -1692.6143415868282 total energy tensor([[19.4232]])
[-0.91372794]
Mode: Test env_steps 200 total rewards -1750.1953234672546 total energy tensor([[19.1666]])
[0.113194]
Mode: Test env_steps 200 total rewards -1783.4181990623474 total energy tensor([[15.3524]])
[0.666427]
Mode: Test env_steps 200 total rewards -1787.567854642868 total energy tensor([[10.7574]])
[-0.21509661]
Mode: Test env_steps 200 total rewards -1890.2389907836914 total energy tensor([[5.8442]])
[-0.66484404]
Mode: Test env_steps 200 total rewards -1478.444243728707 total energy tensor([[20.2833]])
[0.6756626]
Mode: Test env_steps 200 total rewards -1888.0840

100%|██████████| 25/25 [01:19<00:00,  3.18s/it]


[-0.20091054]
Mode: Train env_steps 200 total rewards -1887.946605682373 total energy tensor([[6.3150]])
[0.47487116]
Mode: Train env_steps 200 total rewards -1844.9503064155579 total energy tensor([[7.3207]])
[-0.03619559]
Mode: Train env_steps 200 total rewards -1836.3171129226685 total energy tensor([[7.3694]])
[0.90573007]
Mode: Train env_steps 200 total rewards -1779.3033945560455 total energy tensor([[15.0833]])
[0.64824486]
Mode: Train env_steps 200 total rewards -1700.4361208379269 total energy tensor([[22.1868]])


100%|██████████| 25/25 [01:31<00:00,  3.65s/it]


[0.45997775]
Mode: Train env_steps 200 total rewards -1841.0531311035156 total energy tensor([[9.3719]])
[-0.01028112]
Mode: Train env_steps 200 total rewards -1784.0215051174164 total energy tensor([[15.7810]])
[-0.44098032]
Mode: Train env_steps 200 total rewards -1814.889970779419 total energy tensor([[11.4618]])
[0.21360171]
Mode: Train env_steps 200 total rewards -1816.3333868980408 total energy tensor([[11.4164]])
[-0.3464415]
Mode: Train env_steps 200 total rewards -1869.5352816581726 total energy tensor([[5.6704]])


100%|██████████| 25/25 [01:18<00:00,  3.12s/it]


[-0.92728424]
Mode: Train env_steps 200 total rewards -1866.0207705497742 total energy tensor([[7.5547]])
[0.8080743]
Mode: Train env_steps 200 total rewards -1830.0370125770569 total energy tensor([[7.6463]])
[-0.50895125]
Mode: Train env_steps 200 total rewards -1715.937338232994 total energy tensor([[18.6356]])
[0.30430174]
Mode: Train env_steps 200 total rewards -1731.3540697097778 total energy tensor([[20.4304]])
[0.29450592]
Mode: Train env_steps 200 total rewards -1817.0088455677032 total energy tensor([[12.2668]])


100%|██████████| 25/25 [01:30<00:00,  3.61s/it]


[-0.3837602]
Mode: Train env_steps 200 total rewards -1679.3960940390825 total energy tensor([[23.6428]])
[-0.98358804]
Mode: Train env_steps 200 total rewards -1745.8661725521088 total energy tensor([[19.4997]])
[-0.2092879]
Mode: Train env_steps 200 total rewards -1685.2538266181946 total energy tensor([[19.7271]])
[-0.74188566]
Mode: Train env_steps 200 total rewards -1867.89235496521 total energy tensor([[7.2867]])
[-0.05028189]
Mode: Train env_steps 200 total rewards -1821.2463307380676 total energy tensor([[8.1696]])


100%|██████████| 25/25 [01:22<00:00,  3.29s/it]


[0.546985]
Mode: Test env_steps 200 total rewards -1778.287343263626 total energy tensor([[15.3999]])
[0.94923943]
Mode: Test env_steps 200 total rewards -1831.101037979126 total energy tensor([[7.5752]])
[0.19983184]
Mode: Test env_steps 200 total rewards -1859.425280570984 total energy tensor([[7.3967]])
[0.9870509]
Mode: Test env_steps 200 total rewards -1819.9743347167969 total energy tensor([[12.0127]])
[-0.65329236]
Mode: Test env_steps 200 total rewards -1873.9424457550049 total energy tensor([[5.7577]])
[0.876937]
Mode: Test env_steps 200 total rewards -1776.951346874237 total energy tensor([[13.1152]])
[-0.68259287]
Mode: Test env_steps 200 total rewards -1638.4402808062732 total energy tensor([[23.4099]])
[0.58076924]
Mode: Test env_steps 200 total rewards -1837.8255515098572 total energy tensor([[7.4788]])
[0.628061]
Mode: Test env_steps 200 total rewards -1800.8119013309479 total energy tensor([[12.5725]])
[-0.35442138]
Mode: Test env_steps 200 total rewards -1874.295369148

100%|██████████| 25/25 [01:29<00:00,  3.58s/it]


[0.86512524]
Mode: Train env_steps 200 total rewards -1789.3197169303894 total energy tensor([[13.8484]])
[-0.63109]
Mode: Train env_steps 200 total rewards -1519.6924812146463 total energy tensor([[20.9697]])
[-0.33353096]
Mode: Train env_steps 200 total rewards -1859.5119886398315 total energy tensor([[6.5010]])
[-0.0985707]
Mode: Train env_steps 200 total rewards -1731.6128414273262 total energy tensor([[19.7473]])
[-0.675949]
Mode: Train env_steps 200 total rewards -1737.2327086925507 total energy tensor([[17.7299]])


100%|██████████| 25/25 [01:25<00:00,  3.42s/it]


[0.9857002]
Mode: Train env_steps 200 total rewards -1899.2441987991333 total energy tensor([[4.8139]])
[-0.49577639]
Mode: Train env_steps 200 total rewards -1725.244641304016 total energy tensor([[20.3798]])
[0.399014]
Mode: Train env_steps 200 total rewards -1734.7259806394577 total energy tensor([[20.5025]])
[0.91409874]
Mode: Train env_steps 200 total rewards -1734.2683662772179 total energy tensor([[19.2321]])
[0.8428167]
Mode: Train env_steps 200 total rewards -1889.470911026001 total energy tensor([[4.9363]])


100%|██████████| 25/25 [01:22<00:00,  3.31s/it]


[-0.9689371]
Mode: Train env_steps 200 total rewards -1815.825828075409 total energy tensor([[11.2904]])
[0.11790281]
Mode: Train env_steps 200 total rewards -1804.0283613204956 total energy tensor([[9.8162]])
[0.516075]
Mode: Train env_steps 200 total rewards -1870.0786681175232 total energy tensor([[5.8449]])
[0.7338468]
Mode: Train env_steps 200 total rewards -1577.181947948411 total energy tensor([[22.8520]])
[-0.23374926]
Mode: Train env_steps 200 total rewards -1899.1533155441284 total energy tensor([[4.7694]])


100%|██████████| 25/25 [01:20<00:00,  3.20s/it]


[-0.88392764]
Mode: Train env_steps 200 total rewards -1705.887121528387 total energy tensor([[21.8451]])
[0.84711874]
Mode: Train env_steps 200 total rewards -1812.7859234809875 total energy tensor([[8.6394]])
[-0.3245]
Mode: Train env_steps 200 total rewards -1886.309509754181 total energy tensor([[5.6477]])
[0.23821297]
Mode: Train env_steps 200 total rewards -1722.3111329078674 total energy tensor([[21.0893]])
[-0.03957774]
Mode: Train env_steps 200 total rewards -1718.103648751974 total energy tensor([[21.3607]])


100%|██████████| 25/25 [01:34<00:00,  3.80s/it]


[0.44239423]
Mode: Test env_steps 200 total rewards -1903.939073562622 total energy tensor([[4.0020]])
[0.06805965]
Mode: Test env_steps 200 total rewards -1387.8616145834676 total energy tensor([[20.4449]])
[0.29755518]
Mode: Test env_steps 200 total rewards -1891.3293857574463 total energy tensor([[5.6607]])
[-0.08687387]
Mode: Test env_steps 200 total rewards -1808.00324344635 total energy tensor([[9.4601]])
[0.7796385]
Mode: Test env_steps 200 total rewards -1737.6727023124695 total energy tensor([[19.4286]])
[0.2572593]
Mode: Test env_steps 200 total rewards -1736.162268280983 total energy tensor([[20.4354]])
[-0.6891838]
Mode: Test env_steps 200 total rewards -1640.7262412682176 total energy tensor([[22.9375]])
[0.94457126]
Mode: Test env_steps 200 total rewards -1700.144299775362 total energy tensor([[21.2927]])
[0.55290264]
Mode: Test env_steps 200 total rewards -1858.2984099388123 total energy tensor([[5.8093]])
[0.69903004]
Mode: Test env_steps 200 total rewards -1819.5021247

100%|██████████| 25/25 [01:29<00:00,  3.56s/it]


[-0.6783426]
Mode: Train env_steps 200 total rewards -1782.638923048973 total energy tensor([[16.2449]])
[-0.13358653]
Mode: Train env_steps 200 total rewards -1798.5296258926392 total energy tensor([[11.0895]])
[0.92208856]
Mode: Train env_steps 200 total rewards -1917.99906539917 total energy tensor([[2.6951]])
[0.79094696]
Mode: Train env_steps 200 total rewards -1751.598008275032 total energy tensor([[17.7042]])
[-0.65157354]
Mode: Train env_steps 200 total rewards -1689.3832034021616 total energy tensor([[21.0657]])


 56%|█████▌    | 14/25 [00:56<00:43,  3.92s/it]

## Draw the learning curve

In [None]:
import matplotlib.pyplot as plt

# Dump the raw curve data for reference before plotting.
print(learning_curve)

# Convert each series once so the band arithmetic below is element-wise.
curve_x = np.array(learning_curve["x"])
curve_y = np.array(learning_curve["y"])
# presumably the per-evaluation spread (e.g. std) of the returns — TODO confirm
curve_z = np.array(learning_curve["z"])

plt.plot(curve_x, curve_y)
# Draw the y ± z band semi-transparently: an opaque fill picks the same
# default colour as the line above, which makes the curve itself invisible.
plt.fill_between(curve_x, curve_y - curve_z, curve_y + curve_z, alpha=0.3)
plt.xlabel("env steps")
plt.ylabel("return")
plt.show()

In [None]:
# Keep a handle on the current run's curve under a run-specific name.
# NOTE: plain assignment binds the same object as `learning_curve`; it is not a copy.
learning_curve_ncde_64_rk4 = learning_curve
# The original (misspelled) name is kept so any cell already referring to it keeps working.
leaning_curve_ncde_64_rk4 = learning_curve_ncde_64_rk4

In [None]:
# Bare expression: the notebook renders the current value of learning_curve as this cell's output.
learning_curve


In [None]:
# 65 evenly spaced points running from 0 to 64 inclusive (i.e. a spacing of 1).
timess = torch.linspace(start=0, end=64, steps=65)

In [None]:
# Persist the run configuration and the collected learning curve as plain text
# (their str() representations) in the current working directory.
# `with` guarantees each file is closed even if the write raises.
with open('config.txt', 'w') as file1:
    file1.write(str(conf))

with open('results.txt', 'w') as file2:
    file2.write(str(learning_curve))