In [1]:
import gym
import numpy as np
import torch
import torchkit.pytorch_utils as ptu
import torchsde
from torch.nn import functional as F
import random as rnd
import copy as cp
# import environments
import envs.pomdp
import pdb
# import recurrent model-free RL (separate architecture)
from policies.models.policy_rnn import ModelFreeOffPolicy_Separate_RNN as Policy_RNN
from policies.models.policy_rnn_shared import ModelFreeOffPolicy_Shared_RNN as Policy_Shared_RNN
from policies.models.policy_mlp import ModelFreeOffPolicy_MLP as Policy_MLP
from tqdm import tqdm
# import the replay buffer
from buffers.seq_replay_buffer_vanilla import SeqReplayBuffer
from buffers.simple_replay_buffer import SimpleReplayBuffer 
from utils import helpers as utl
from typing import Sequence
from read_ini import read_ini
conf =read_ini("C:/Users/alexander.vasilyev/pomdp-baselines-main/configfile.ini")

  logger.warn(
  from collections import OrderedDict, Set
  if not hasattr(tensorboard, "__version__") or LooseVersion(


## Build a POMDP environment: Pendulum-V (only observe the velocity)

In [2]:
cuda_id = 0  # -1 if using cpu
ptu.set_gpu_mode(torch.cuda.is_available() and cuda_id >= 0, cuda_id)

env = gym.make(conf["env_name"])
max_trajectory_len = env._max_episode_steps
act_dim = env.action_space.shape[0]
obs_dim = env.observation_space.shape[0]

shared = False
markov = False

if markov:
    agent = Policy_MLP(
        obs_dim=obs_dim,
        action_dim=act_dim,
        algo_name=conf["algo_name"],
        dqn_layers=[128, 128],
        policy_layers=[128, 128],
        lr=3e-4,
        gamma=0.99,
        tau=5e-3,
    ).to(ptu.device)
    encoder="Nan"
else:
    if shared:
        agent = Policy_Shared_RNN(
            obs_dim=obs_dim,
            action_dim=act_dim,
            encoder=conf["encoder"],
            algo_name=conf["algo_name"],
            action_embedding_size=int(conf["action_embedding_size"]),
            observ_embedding_size=int(conf["observ_embedding_size"]),
            reward_embedding_size=int(conf["reward_embedding_size"]),
            rnn_hidden_size=int(conf["hidden_size"]),
            dqn_layers=[128, 128],
            policy_layers=[128, 128],
            lr=float(conf["lr"]),
            gamma=0.9,
            tau=0.005,
            embed=True,
        ).to(ptu.device)
    else: 
        agent = Policy_RNN(
            obs_dim=obs_dim,
            action_dim=act_dim,
            encoder=conf["encoder"],
            algo_name=conf["algo_name"],
            action_embedding_size=int(conf["action_embedding_size"]),
            observ_embedding_size=int(conf["observ_embedding_size"]),
            reward_embedding_size=int(conf["reward_embedding_size"]),
            rnn_hidden_size=int(conf["hidden_size"]),
            dqn_layers=[128, 128],
            policy_layers=[128, 128],
            lr=float(conf["lr"]),
            gamma=0.9,
            tau=0.005,
            radii=40,
            embed=True,
            activation = conf["activation"],
        ).to(ptu.device)
    
print(agent)
lr=float(conf["lr"])
encoder=conf["encoder"]
num_updates_per_iter = int(conf["num_updates_per_iter"])  # training frequency
sampled_seq_len = int(conf["sampled_seq_len"])  # context length
buffer_size = int(float(conf["buffer_size"]))
batch_size = int(conf["batch_size"])
dropout_rate=float(conf["dropout_rate"])
num_iters = int(conf["num_iters"])
num_init_rollouts_pool = int(conf["num_init_rollouts_pool"])
num_rollouts_per_iter = int(conf["num_rollouts_per_iter"])
total_rollouts = num_init_rollouts_pool + num_iters * num_rollouts_per_iter
n_env_steps_total = max_trajectory_len * total_rollouts
_n_env_steps_total = 0
print("total env episodes", total_rollouts, "total env steps", n_env_steps_total)


ModelFreeOffPolicy_Separate_RNN(
  (critic): Critic_RNN(
    (observ_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=32, bias=True)
    )
    (action_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=8, bias=True)
    )
    (reward_embedder): FeatureExtractor(
      (fc): Linear(in_features=1, out_features=8, bias=True)
    )
    (rnn): NeuralCDE(
      (func): CDEFunc(
        (linear0): Linear(in_features=72, out_features=72, bias=True)
        (linear1): Linear(in_features=72, out_features=72, bias=True)
        (linear2): Linear(in_features=72, out_features=3528, bias=True)
      )
      (initial): Linear(in_features=49, out_features=72, bias=True)
      (readout): Linear(in_features=72, out_features=72, bias=True)
    )
    (current_shortcut_embedder): FeatureExtractor(
      (fc): Linear(in_features=2, out_features=48, bias=True)
    )
    (qf1): FlattenMlp(
      (fc0): Linear(in_features=120, out_features=128, bias=True)
   

## Build a recurent model-free RL agent: separate architecture, `lstm` encoder, `oar` policy input space, `td3` RL algorithm (context length set later)

## Define other training parameters such as context length and training frequency

## Define key functions: collect rollouts and policy update

In [3]:
def create_ncde_row(obs, next_obs, prev_action, action, prev_reward, reward, steps,init):
    
    if init:
        obs_row= obs
        rew_row = prev_reward
        act_row = prev_action
    else:
        obs_row=torch.cat((obs, next_obs),0)
        rew_row=torch.cat((prev_reward, reward),0)
        act_row=torch.cat((prev_action, action),0)
 
    if shared: 
        obs_row=agent.observ_embedder(obs_row)
        rew_row=agent.reward_embedder(rew_row)
        act_row=agent.action_embedder(act_row)
    else: 
        obs_row=agent.actor.observ_embedder(obs_row)
        rew_row=agent.actor.reward_embedder(rew_row)
        act_row=agent.actor.action_embedder(act_row)
    
    if init:
        time_tensor=torch.tensor([[steps]]).to(ptu.device)
    else:
        time_tensor=torch.tensor([[steps],[steps+1]]).to(ptu.device)

    ncde_row=torch.cat((time_tensor,act_row,obs_row,rew_row),1)
    ncde_row=ncde_row[None,:]
    
    return ncde_row



@torch.no_grad()
def collect_rollouts(
    num_rollouts, random_actions=False, deterministic=True, train_mode=True
):
    """collect num_rollouts of trajectories in task and save into policy buffer
    :param
        random_actions: whether to use policy to sample actions, or randomly sample action space
        deterministic: deterministic action selection?
        train_mode: whether to train (stored to buffer) or test
    """
    if not train_mode:
        assert random_actions == False and deterministic == True

    total_steps = 0
    total_rewards = 0.0
    trewards =[]
    for idx in range(num_rollouts):
        steps = 0
        rewards = 0.0
        energy = 0.0
        print(env.reset())
        obs = ptu.from_numpy(env.reset())
        obs = obs.reshape(1, obs.shape[-1])
        done_rollout = False
        init=True
        # get hidden state at timestep=0, None for mlp
        
        if not markov:
            action, reward, internal_state = agent.get_initial_info()

            if encoder == "ncde":
                internal_state= None
                ncde_row= create_ncde_row(obs, obs, action, action, reward, reward, steps,init)
                prev_action= action.clone()
                prev_reward= reward.clone()
                next_obs= obs.clone()
        
        
        if train_mode:
            # temporary storage
            obs_list, act_list, rew_list, next_obs_list, term_list = (
                [],
                [],
                [],
                [],
                [],
            )
                           

        while not done_rollout:
            if markov: 
                action = agent.act(obs=obs, deterministic=deterministic)[0]
            else:
                if encoder == "ncde":
                    (action,_,_,_), internal_state= agent.ncde_act(ncde_row=ncde_row, prev_internal_state=internal_state, obs=obs,  deterministic=deterministic)
                else:
                    (action, _, _, _), internal_state = agent.act(
                        prev_internal_state=internal_state,
                        prev_action=action,
                        reward=reward,
                        obs=obs,
                        deterministic=deterministic,
                    )
            # observe reward and next obs (B=1, dim)
            #pdb.set_trace()
        
            #print(torch.norm(internal_state))
            next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0.0 else True
            init=False
            
            if not markov:
                if encoder == "ncde":
   
                    ncde_row= create_ncde_row(obs, next_obs, prev_action, action, prev_reward, reward, steps,init)
            
            #switch on/off dropouts
            #drop_trigger=rnd.uniform(0,1)
            #if drop_trigger<dropout_rate:
            #    next_obs=cp.deepcopy(obs)
            # update statistics
           
            rewards += reward.item()
            energy += action*action
           
            # early stopping env: such as rmdp, pomdp, generalize tasks. term ignores timeout
            term = (
                False
                if "TimeLimit.truncated" in info or steps >= max_trajectory_len
                else done_rollout
            )

            if train_mode:
                # append tensors to temporary storage
                obs_list.append(obs)  # (1, dim)
                act_list.append(action)  # (1, dim)
                rew_list.append(reward)  # (1, dim)
                term_list.append(term)  # bool
                next_obs_list.append(next_obs)  # (1, dim)
            steps += 1
            # set: obs <- next_obs
            obs = next_obs.clone()
            prev_reward= reward.clone()
            prev_action= action.clone()
        if train_mode:
            # add collected sequence to buffer
            policy_storage.add_episode(
                observations=ptu.get_numpy(torch.cat(obs_list, dim=0)),  # (L, dim)
                actions=ptu.get_numpy(torch.cat(act_list, dim=0)),  # (L, dim)
                rewards=ptu.get_numpy(torch.cat(rew_list, dim=0)),  # (L, dim)
                terminals=np.array(term_list).reshape(-1, 1),  # (L, 1)
                next_observations=ptu.get_numpy(
                    torch.cat(next_obs_list, dim=0)
                ),  # (L, dim)
            )
        print(
            "Mode:",
            "Train" if train_mode else "Test",
            "env_steps",
            steps,
            "total rewards",
            rewards,
            "total energy",
            energy,
        )
        total_steps += steps
        total_rewards += rewards
        trewards.append(rewards)
    if train_mode:
        return total_steps
    else:
        return total_rewards / num_rollouts, np.std(trewards)


def update(num_updates, factor):
    rl_losses_agg = {}
    # print(num_updates)
    for update in tqdm(range(num_updates), leave=True):
        # sample random RL batch: in transitions
        batch = ptu.np_to_pytorch_batch(policy_storage.random_episodes(batch_size))
        # RL update
        
        rl_losses = agent.update(batch, factor)

        for k, v in rl_losses.items():
            if update == 0:  # first iterate - create list
                rl_losses_agg[k] = [v]
            else:  # append values
                rl_losses_agg[k].append(v)
    # statistics
    for k in rl_losses_agg:
        rl_losses_agg[k] = np.mean(rl_losses_agg[k])
    return rl_losses_agg

## Train and Evaluate the agent: only costs < 20 min

In [None]:
policy_storage = SeqReplayBuffer(
    max_replay_buffer_size=buffer_size,
    observation_dim=obs_dim,
    action_dim=act_dim,
    sampled_seq_len=sampled_seq_len,
    sample_weight_baseline=0.0,
)

env_steps = collect_rollouts(
    num_rollouts=num_init_rollouts_pool, random_actions=False, train_mode=True
)
_n_env_steps_total += env_steps

# evaluation parameters
last_eval_num_iters = 10
log_interval = 5
eval_num_rollouts = 10
learning_curve = {
    "x": [],
    "y": [],
    "z": [],
}
epoch=0
lambda_pat = 0.65

while _n_env_steps_total < n_env_steps_total:

    env_steps = collect_rollouts(num_rollouts=num_rollouts_per_iter, train_mode=True)
    _n_env_steps_total += env_steps

    #train_stats = update(int(num_updates_per_iter * env_steps))
    factor= lambda_pat **(epoch )
    #train_stats = update(int(num_updates_per_iter * env_steps))
    train_stats = update(25, lr)
    
    epoch += 1
    current_num_iters = _n_env_steps_total // (
        num_rollouts_per_iter * max_trajectory_len
    )
    if (
        current_num_iters != last_eval_num_iters
        and current_num_iters % log_interval == 0
    ):
        last_eval_num_iters = current_num_iters
        average_returns, std_returns = collect_rollouts(
            num_rollouts=eval_num_rollouts,
            train_mode=False,
            random_actions=False,
            deterministic=True,
        )
        learning_curve["x"].append(_n_env_steps_total)
        learning_curve["y"].append(average_returns)
        learning_curve["z"].append(std_returns)
        print(_n_env_steps_total, average_returns)

buffer RAM usage: 0.02 GB
[-0.18286392]




Mode: Train env_steps 200 total rewards -627.1723878339399 total energy tensor([[0.0009]])
[-0.86359847]
Mode: Train env_steps 200 total rewards -1465.099901676178 total energy tensor([[0.0006]])
[0.74585605]
Mode: Train env_steps 200 total rewards -968.4575101137161 total energy tensor([[0.0010]])
[-0.8119963]
Mode: Train env_steps 200 total rewards -1064.254163146019 total energy tensor([[0.0011]])
[-0.65336865]
Mode: Train env_steps 200 total rewards -1254.5659635066986 total energy tensor([[0.0009]])
[-0.84177446]
Mode: Train env_steps 200 total rewards -1584.4519271850586 total energy tensor([[0.0006]])
[0.05887923]
Mode: Train env_steps 200 total rewards -628.1147714760154 total energy tensor([[0.0009]])
[-0.29377258]
Mode: Train env_steps 200 total rewards -737.794536806643 total energy tensor([[0.0010]])
[-0.8305148]
Mode: Train env_steps 200 total rewards -1557.0814833641052 total energy tensor([[0.0006]])
[0.61149436]
Mode: Train env_steps 200 total rewards -1171.859337568283

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if v.dtype == np.bool:
100%|██████████| 25/25 [01:02<00:00,  2.49s/it]


[0.6731633]
Mode: Train env_steps 200 total rewards -1343.8932387828827 total energy tensor([[31.7528]])
[-0.27208433]
Mode: Train env_steps 200 total rewards -1266.3245940208435 total energy tensor([[32.4656]])
[0.5807534]
Mode: Train env_steps 200 total rewards -1699.2578616142273 total energy tensor([[19.7072]])
[-0.04527098]
Mode: Train env_steps 200 total rewards -1307.411400437355 total energy tensor([[31.1604]])
[-0.97157735]
Mode: Train env_steps 200 total rewards -1351.332904547453 total energy tensor([[32.0476]])


100%|██████████| 25/25 [01:07<00:00,  2.68s/it]


[-0.48463875]
Mode: Train env_steps 200 total rewards -1013.9208480650559 total energy tensor([[3.8427]])
[0.5292008]
Mode: Train env_steps 200 total rewards -905.1960357353091 total energy tensor([[2.5171]])
[-0.27751896]
Mode: Train env_steps 200 total rewards -1270.2195472717285 total energy tensor([[2.2214]])
[0.39900678]
Mode: Train env_steps 200 total rewards -1548.196283340454 total energy tensor([[1.6130]])
[0.55401415]
Mode: Train env_steps 200 total rewards -1056.3510165549815 total energy tensor([[4.3005]])


100%|██████████| 25/25 [01:14<00:00,  2.99s/it]


[0.8541971]
Mode: Train env_steps 200 total rewards -1039.1114730238914 total energy tensor([[6.0354]])
[0.41087466]
Mode: Train env_steps 200 total rewards -1600.6446633338928 total energy tensor([[2.6838]])
[0.76770645]
Mode: Train env_steps 200 total rewards -959.2560813948512 total energy tensor([[5.9694]])
[0.56193846]
Mode: Train env_steps 200 total rewards -1045.8042680621147 total energy tensor([[6.0185]])
[-0.4370459]
Mode: Train env_steps 200 total rewards -1092.2935677319765 total energy tensor([[6.8191]])


100%|██████████| 25/25 [01:10<00:00,  2.83s/it]


[-0.6180618]
Mode: Test env_steps 200 total rewards -1662.0854816436768 total energy tensor([[14.0714]])
[-0.8945591]
Mode: Test env_steps 200 total rewards -1202.432815656066 total energy tensor([[18.3976]])
[0.890028]
Mode: Test env_steps 200 total rewards -1941.72012424469 total energy tensor([[0.5264]])
[0.80951387]
Mode: Test env_steps 200 total rewards -1864.3609809875488 total energy tensor([[5.9019]])
[0.42710203]
Mode: Test env_steps 200 total rewards -1746.466650724411 total energy tensor([[13.6720]])
[0.74520326]
Mode: Test env_steps 200 total rewards -1943.074589729309 total energy tensor([[0.6629]])
[-0.25170442]
Mode: Test env_steps 200 total rewards -1814.7647194862366 total energy tensor([[6.0244]])
[0.27936238]
Mode: Test env_steps 200 total rewards -1629.9115783572197 total energy tensor([[21.4080]])
[-0.00369019]
Mode: Test env_steps 200 total rewards -1777.9158170223236 total energy tensor([[11.5687]])
[0.16060762]
Mode: Test env_steps 200 total rewards -1491.595801

100%|██████████| 25/25 [01:15<00:00,  3.03s/it]


[0.9493371]
Mode: Train env_steps 200 total rewards -1725.1589938402176 total energy tensor([[11.8734]])
[-0.6375915]
Mode: Train env_steps 200 total rewards -1766.9023275375366 total energy tensor([[9.1713]])
[0.1520319]
Mode: Train env_steps 200 total rewards -1849.0722332000732 total energy tensor([[3.8441]])
[-0.89367926]
Mode: Train env_steps 200 total rewards -1626.971121788025 total energy tensor([[16.1817]])
[0.26634613]
Mode: Train env_steps 200 total rewards -1375.0553357303143 total energy tensor([[42.9877]])


100%|██████████| 25/25 [01:09<00:00,  2.79s/it]


[0.77172476]
Mode: Train env_steps 200 total rewards -1404.228477358818 total energy tensor([[56.1248]])
[-0.14430985]
Mode: Train env_steps 200 total rewards -1414.0377424955368 total energy tensor([[56.4431]])
[-0.18019481]
Mode: Train env_steps 200 total rewards -1420.5737288594246 total energy tensor([[57.7075]])
[0.05015166]
Mode: Train env_steps 200 total rewards -1876.0046792030334 total energy tensor([[2.0594]])
[0.8314296]
Mode: Train env_steps 200 total rewards -1534.730565071106 total energy tensor([[3.8204]])


100%|██████████| 25/25 [01:12<00:00,  2.90s/it]


[0.3954042]
Mode: Train env_steps 200 total rewards -1383.6435743272305 total energy tensor([[94.4836]])
[0.20333406]
Mode: Train env_steps 200 total rewards -1355.230950385332 total energy tensor([[81.6843]])
[0.8841586]
Mode: Train env_steps 200 total rewards -1192.585041904822 total energy tensor([[68.4791]])
[0.83319604]
Mode: Train env_steps 200 total rewards -1731.3692646026611 total energy tensor([[5.9787]])
[0.0804089]
Mode: Train env_steps 200 total rewards -908.4952503293753 total energy tensor([[2.1642]])


100%|██████████| 25/25 [01:23<00:00,  3.35s/it]


[0.15499073]
Mode: Train env_steps 200 total rewards -1607.0602875351906 total energy tensor([[163.1991]])
[-0.19695659]
Mode: Train env_steps 200 total rewards -1518.6114336028695 total energy tensor([[137.2252]])
[0.4013462]
Mode: Train env_steps 200 total rewards -1890.2161655426025 total energy tensor([[2.0544]])
[-0.5135966]
Mode: Train env_steps 200 total rewards -1951.1738548278809 total energy tensor([[0.4269]])
[0.1571486]
Mode: Train env_steps 200 total rewards -1874.922622680664 total energy tensor([[1.8949]])


100%|██████████| 25/25 [01:32<00:00,  3.71s/it]


[0.9871652]
Mode: Test env_steps 200 total rewards -1388.7813239395618 total energy tensor([[108.5133]])
[0.48804292]
Mode: Test env_steps 200 total rewards -1611.185109257698 total energy tensor([[168.8188]])
[0.5315943]
Mode: Test env_steps 200 total rewards -1460.1157260090113 total energy tensor([[131.8175]])
[-0.8156318]
Mode: Test env_steps 200 total rewards -1585.284763097763 total energy tensor([[163.9564]])
[0.4804839]
Mode: Test env_steps 200 total rewards -1534.7785720825195 total energy tensor([[142.8596]])
[0.24370894]
Mode: Test env_steps 200 total rewards -1515.9913493543863 total energy tensor([[144.6905]])
[-0.45471397]
Mode: Test env_steps 200 total rewards -1577.2756012380123 total energy tensor([[166.0901]])
[0.6268348]
Mode: Test env_steps 200 total rewards -1913.2778205871582 total energy tensor([[1.8658]])
[0.9688026]
Mode: Test env_steps 200 total rewards -1491.8189459443092 total energy tensor([[130.5346]])
[-0.37145162]
Mode: Test env_steps 200 total rewards -

100%|██████████| 25/25 [01:48<00:00,  4.33s/it]


[-0.1599921]
Mode: Train env_steps 200 total rewards -1633.14479804039 total energy tensor([[180.2180]])
[-0.6888604]
Mode: Train env_steps 200 total rewards -1848.7131152153015 total energy tensor([[2.3004]])
[-0.61524165]
Mode: Train env_steps 200 total rewards -1926.1032905578613 total energy tensor([[1.4972]])
[-0.28260043]
Mode: Train env_steps 200 total rewards -1555.1142466068268 total energy tensor([[156.3283]])
[0.1762983]
Mode: Train env_steps 200 total rewards -1938.098424911499 total energy tensor([[0.9474]])


100%|██████████| 25/25 [01:16<00:00,  3.07s/it]


[0.32413808]
Mode: Train env_steps 200 total rewards -1633.885877341032 total energy tensor([[182.8791]])
[0.43044457]
Mode: Train env_steps 200 total rewards -1645.906468808651 total energy tensor([[183.3582]])
[0.16840471]
Mode: Train env_steps 200 total rewards -1919.4635977745056 total energy tensor([[1.5437]])
[0.99078465]
Mode: Train env_steps 200 total rewards -1890.980372428894 total energy tensor([[2.2630]])
[-0.6927419]
Mode: Train env_steps 200 total rewards -1838.686912536621 total energy tensor([[3.0092]])


100%|██████████| 25/25 [01:16<00:00,  3.06s/it]


[-0.8044674]
Mode: Train env_steps 200 total rewards -1782.8735976219177 total energy tensor([[2.9067]])
[-0.42788523]
Mode: Train env_steps 200 total rewards -1516.7631882429123 total energy tensor([[142.3286]])
[0.6187882]
Mode: Train env_steps 200 total rewards -1560.5180988311768 total energy tensor([[163.9623]])
[0.62228]
Mode: Train env_steps 200 total rewards -1621.3189475536346 total energy tensor([[184.8376]])
[0.4871931]
Mode: Train env_steps 200 total rewards -1531.0628660917282 total energy tensor([[149.9875]])


100%|██████████| 25/25 [01:21<00:00,  3.27s/it]


[0.6548461]
Mode: Train env_steps 200 total rewards -1401.1626613512635 total energy tensor([[140.8096]])
[-0.7380303]
Mode: Train env_steps 200 total rewards -1946.8271017074585 total energy tensor([[0.6389]])
[-0.13667013]
Mode: Train env_steps 200 total rewards -1580.3786436319351 total energy tensor([[171.2686]])
[0.4475782]
Mode: Train env_steps 200 total rewards -1579.7315204441547 total energy tensor([[181.3769]])
[-0.7010048]
Mode: Train env_steps 200 total rewards -1939.3147706985474 total energy tensor([[1.1237]])


100%|██████████| 25/25 [01:18<00:00,  3.15s/it]


[0.7755755]
Mode: Test env_steps 200 total rewards -1583.850512623787 total energy tensor([[170.8460]])
[0.4115116]
Mode: Test env_steps 200 total rewards -1221.2874257480726 total energy tensor([[105.6162]])
[-0.75288486]
Mode: Test env_steps 200 total rewards -1579.1257704626769 total energy tensor([[182.1161]])
[0.18034057]
Mode: Test env_steps 200 total rewards -1568.1845009326935 total energy tensor([[168.7669]])
[0.8122652]
Mode: Test env_steps 200 total rewards -1912.386471748352 total energy tensor([[1.9767]])
[-0.19943881]
Mode: Test env_steps 200 total rewards -1134.3799818726256 total energy tensor([[99.1288]])
[-0.28473204]
Mode: Test env_steps 200 total rewards -1524.179848909378 total energy tensor([[160.4480]])
[-0.8504278]
Mode: Test env_steps 200 total rewards -1924.3445177078247 total energy tensor([[1.8270]])
[-0.9131769]
Mode: Test env_steps 200 total rewards -1921.9585580825806 total energy tensor([[1.9523]])
[-0.27747452]
Mode: Test env_steps 200 total rewards -18

100%|██████████| 25/25 [01:21<00:00,  3.24s/it]


[-0.70642376]
Mode: Train env_steps 200 total rewards -1580.261923134327 total energy tensor([[180.5030]])
[-0.38362578]
Mode: Train env_steps 200 total rewards -1265.0422469377518 total energy tensor([[83.9454]])
[-0.65218663]
Mode: Train env_steps 200 total rewards -1563.4446083307266 total energy tensor([[167.5109]])
[-0.909472]
Mode: Train env_steps 200 total rewards -1813.333912372589 total energy tensor([[3.5617]])
[-0.62521166]
Mode: Train env_steps 200 total rewards -1602.8814917057753 total energy tensor([[187.6884]])


100%|██████████| 25/25 [01:13<00:00,  2.92s/it]


[-0.63980085]
Mode: Train env_steps 200 total rewards -1929.8248805999756 total energy tensor([[1.3611]])
[-0.41099325]
Mode: Train env_steps 200 total rewards -1420.258905224502 total energy tensor([[149.0931]])
[0.79847807]
Mode: Train env_steps 200 total rewards -1762.5496401786804 total energy tensor([[3.6167]])
[0.7797261]
Mode: Train env_steps 200 total rewards -1521.9018638134003 total energy tensor([[150.9355]])
[-0.0791804]
Mode: Train env_steps 200 total rewards -1488.4741319417953 total energy tensor([[129.9081]])


100%|██████████| 25/25 [01:23<00:00,  3.32s/it]


[0.13292982]
Mode: Train env_steps 200 total rewards -1475.902972459793 total energy tensor([[134.6350]])
[-0.6884185]
Mode: Train env_steps 200 total rewards -1633.0113563537598 total energy tensor([[189.8305]])
[-0.61551696]
Mode: Train env_steps 200 total rewards -1462.1281624436378 total energy tensor([[126.4449]])
[-0.1768803]
Mode: Train env_steps 200 total rewards -1543.240952834487 total energy tensor([[162.6967]])
[0.26363453]
Mode: Train env_steps 200 total rewards -1526.7065216004848 total energy tensor([[157.6806]])


100%|██████████| 25/25 [01:21<00:00,  3.25s/it]


[0.4762924]
Mode: Train env_steps 200 total rewards -1623.2258843183517 total energy tensor([[187.1408]])
[0.99629647]
Mode: Train env_steps 200 total rewards -1555.9145169258118 total energy tensor([[164.2038]])
[0.12906049]
Mode: Train env_steps 200 total rewards -1868.6943001747131 total energy tensor([[2.7933]])
[0.64094114]
Mode: Train env_steps 200 total rewards -1558.6559108784422 total energy tensor([[177.5862]])
[-0.6625711]
Mode: Train env_steps 200 total rewards -1693.5302453041077 total energy tensor([[4.9267]])


100%|██████████| 25/25 [01:23<00:00,  3.33s/it]


[0.6380241]
Mode: Test env_steps 200 total rewards -1956.2669343948364 total energy tensor([[0.5630]])
[-0.36989704]
Mode: Test env_steps 200 total rewards -1564.3688442111015 total energy tensor([[165.5622]])
[0.778222]
Mode: Test env_steps 200 total rewards -1294.252350345254 total energy tensor([[82.4476]])
[0.536951]
Mode: Test env_steps 200 total rewards -1560.7814950942993 total energy tensor([[165.5268]])
[0.85153985]
Mode: Test env_steps 200 total rewards -1584.728413619101 total energy tensor([[180.8173]])
[-0.4528105]
Mode: Test env_steps 200 total rewards -1791.3184142112732 total energy tensor([[4.3643]])
[0.02103655]
Mode: Test env_steps 200 total rewards -1622.4556182920933 total energy tensor([[187.3471]])
[0.82927877]
Mode: Test env_steps 200 total rewards -1657.6632115840912 total energy tensor([[189.1216]])
[-0.26068917]
Mode: Test env_steps 200 total rewards -1185.4368878901005 total energy tensor([[20.9354]])
[-0.7090724]
Mode: Test env_steps 200 total rewards -1432

100%|██████████| 25/25 [01:25<00:00,  3.42s/it]


[-0.22729349]
Mode: Train env_steps 200 total rewards -1836.8441700935364 total energy tensor([[4.8747]])
[0.45464116]
Mode: Train env_steps 200 total rewards -1450.4538716003299 total energy tensor([[154.2228]])
[-0.9383594]
Mode: Train env_steps 200 total rewards -1205.8366934508085 total energy tensor([[47.2379]])
[-0.23815139]
Mode: Train env_steps 200 total rewards -1338.94384470582 total energy tensor([[104.3900]])
[-0.76947427]
Mode: Train env_steps 200 total rewards -1949.2370471954346 total energy tensor([[0.5583]])


100%|██████████| 25/25 [01:23<00:00,  3.34s/it]


[-0.7039796]
Mode: Train env_steps 200 total rewards -1859.483868598938 total energy tensor([[3.2718]])
[-0.98048836]
Mode: Train env_steps 200 total rewards -1552.5771242678165 total energy tensor([[164.8958]])
[-0.6726894]
Mode: Train env_steps 200 total rewards -1918.0378684997559 total energy tensor([[2.1549]])
[-0.91917175]
Mode: Train env_steps 200 total rewards -1874.0962281227112 total energy tensor([[2.9727]])
[0.32118738]
Mode: Train env_steps 200 total rewards -1534.6853347420692 total energy tensor([[161.3980]])


100%|██████████| 25/25 [01:18<00:00,  3.12s/it]


[-0.9200177]
Mode: Train env_steps 200 total rewards -1931.9645223617554 total energy tensor([[1.3840]])
[-0.48450485]
Mode: Train env_steps 200 total rewards -1624.0453878641129 total energy tensor([[186.1614]])
[0.61567676]
Mode: Train env_steps 200 total rewards -1513.319035757333 total energy tensor([[161.4876]])
[-0.5664824]
Mode: Train env_steps 200 total rewards -1900.7502689361572 total energy tensor([[2.3817]])
[-0.6554988]
Mode: Train env_steps 200 total rewards -1925.666184425354 total energy tensor([[1.8183]])


100%|██████████| 25/25 [01:20<00:00,  3.23s/it]


[0.24706006]
Mode: Train env_steps 200 total rewards -1648.9352271556854 total energy tensor([[190.8652]])
[0.66996366]
Mode: Train env_steps 200 total rewards -1947.4450283050537 total energy tensor([[1.0803]])
[-0.27475578]
Mode: Train env_steps 200 total rewards -1652.222731411457 total energy tensor([[191.3376]])
[0.30260766]
Mode: Train env_steps 200 total rewards -1611.1025849878788 total energy tensor([[188.0383]])
[0.34630948]
Mode: Train env_steps 200 total rewards -1628.6147825717926 total energy tensor([[189.7363]])


100%|██████████| 25/25 [01:16<00:00,  3.06s/it]


[-0.16545616]
Mode: Test env_steps 200 total rewards -1062.615165702533 total energy tensor([[74.5741]])
[0.8755364]
Mode: Test env_steps 200 total rewards -1339.0814244151115 total energy tensor([[114.6412]])
[0.96283424]
Mode: Test env_steps 200 total rewards -1799.3880281448364 total energy tensor([[5.9255]])
[-0.5272495]
Mode: Test env_steps 200 total rewards -1870.2569093704224 total energy tensor([[4.3140]])
[-0.89746386]
Mode: Test env_steps 200 total rewards -1647.4229617714882 total energy tensor([[191.1246]])
[0.53423387]
Mode: Test env_steps 200 total rewards -1814.1247220039368 total energy tensor([[5.4035]])
[-0.36603174]
Mode: Test env_steps 200 total rewards -1855.7251420021057 total energy tensor([[5.0196]])
[-0.23064801]
Mode: Test env_steps 200 total rewards -1641.6919810771942 total energy tensor([[190.4475]])
[-0.1160769]
Mode: Test env_steps 200 total rewards -1835.3629174232483 total energy tensor([[5.9821]])
[-0.9464604]
Mode: Test env_steps 200 total rewards -18

100%|██████████| 25/25 [01:16<00:00,  3.05s/it]


[-0.8799375]
Mode: Train env_steps 200 total rewards -1637.9219407439232 total energy tensor([[190.1309]])
[0.57347476]
Mode: Train env_steps 200 total rewards -1659.9842394590378 total energy tensor([[191.2721]])
[0.5607218]
Mode: Train env_steps 200 total rewards -1135.4844479858875 total energy tensor([[51.0188]])
[-0.8271978]
Mode: Train env_steps 200 total rewards -1165.7514541344717 total energy tensor([[91.1422]])
[-0.5216405]
Mode: Train env_steps 200 total rewards -1938.1966609954834 total energy tensor([[1.3292]])


100%|██████████| 25/25 [01:22<00:00,  3.30s/it]


[-0.3053096]
Mode: Train env_steps 200 total rewards -1888.2909588813782 total energy tensor([[2.5855]])
[0.9674701]
Mode: Train env_steps 200 total rewards -1622.394239783287 total energy tensor([[187.5777]])
[-0.41176906]
Mode: Train env_steps 200 total rewards -1946.5116176605225 total energy tensor([[1.0062]])
[-0.4179811]
Mode: Train env_steps 200 total rewards -1633.4796619415283 total energy tensor([[190.4261]])
[-0.4677181]
Mode: Train env_steps 200 total rewards -1368.8728082180023 total energy tensor([[23.6058]])


100%|██████████| 25/25 [01:13<00:00,  2.95s/it]


[-0.3864167]
Mode: Train env_steps 200 total rewards -1946.841236114502 total energy tensor([[0.6581]])
[0.37941763]
Mode: Train env_steps 200 total rewards -1945.2846193313599 total energy tensor([[1.0908]])
[-0.7855552]
Mode: Train env_steps 200 total rewards -1625.236055135727 total energy tensor([[188.3865]])
[0.4721376]
Mode: Train env_steps 200 total rewards -1854.2765855789185 total energy tensor([[6.6304]])
[0.20459862]
Mode: Train env_steps 200 total rewards -1401.8805564939976 total energy tensor([[141.4057]])


100%|██████████| 25/25 [01:05<00:00,  2.63s/it]


[0.8268382]
Mode: Train env_steps 200 total rewards -1851.8244032859802 total energy tensor([[3.9174]])
[0.3122738]
Mode: Train env_steps 200 total rewards -1867.585491657257 total energy tensor([[3.1110]])
[-0.9366995]
Mode: Train env_steps 200 total rewards -1650.426581978798 total energy tensor([[191.6896]])
[-0.07240005]
Mode: Train env_steps 200 total rewards -1648.9161781668663 total energy tensor([[190.9430]])
[-0.22405425]
Mode: Train env_steps 200 total rewards -1606.114557981491 total energy tensor([[187.1293]])


100%|██████████| 25/25 [01:03<00:00,  2.53s/it]


[-0.09119865]
Mode: Test env_steps 200 total rewards -1862.539086818695 total energy tensor([[3.2006]])
[0.70383155]
Mode: Test env_steps 200 total rewards -1609.7961013615131 total energy tensor([[188.4602]])
[0.14944714]
Mode: Test env_steps 200 total rewards -1667.314734339714 total energy tensor([[192.5339]])
[0.16318478]
Mode: Test env_steps 200 total rewards -1731.7203001976013 total energy tensor([[12.0179]])
[0.38415682]
Mode: Test env_steps 200 total rewards -1590.2090647220612 total energy tensor([[181.1318]])
[-0.99951184]
Mode: Test env_steps 200 total rewards -1817.8987503051758 total energy tensor([[5.6888]])
[-0.11329564]
Mode: Test env_steps 200 total rewards -1656.6020753383636 total energy tensor([[20.7113]])
[-0.32362702]
Mode: Test env_steps 200 total rewards -1730.8598235845566 total energy tensor([[16.9991]])
[0.6062914]
Mode: Test env_steps 200 total rewards -1815.3972759246826 total energy tensor([[8.8554]])
[0.14368069]
Mode: Test env_steps 200 total rewards -1

  4%|▍         | 1/25 [00:03<01:12,  3.03s/it]

## Draw the learning curve

In [None]:
import matplotlib.pyplot as plt
print(learning_curve)
plt.plot(learning_curve["x"], learning_curve["y"])
plt.fill_between(np.array(learning_curve["x"]), np.array(learning_curve["y"])-np.array(learning_curve["z"]), np.array(learning_curve["y"])+np.array(learning_curve["z"]))
plt.xlabel("env steps")
plt.ylabel("return")
plt.show()

In [None]:
leaning_curve_ncde_64_rk4 = learning_curve

In [None]:
learning_curve


In [None]:
timess=torch.linspace(0, 65-1, 65)

In [None]:
file1 = open('config.txt', 'w')
file1.write(str(conf))

file1.close()
file2 = open('results.txt', 'w')
file2.write(str(learning_curve))
file2.close()