# Memory

In many real-world scenarios we do not have access to the ground-truth state of the environment. Instead, we only get observations that share some mutual information with the true state. The Markov property is then violated because we are dealing with a [POMDP](https://en.wikipedia.org/wiki/Partially_observable_Markov_decision_process), and hence information from past observations influences the estimate of the current state. Because the belief about the current state depends on previous observations, future behavior and action selection depend on them as well.

To respect this in our policy, it needs to be able to reason about sequences of observations. A natural way to implement this is to use recurrent neural networks (RNNs), which carry an internal state that can transport information through time.

In [1]:
#define RL_TOOLS_BACKEND_ENABLE_OPENBLAS
#include <rl_tools/operations/cpu_mux.h>
#include <rl_tools/nn/optimizers/adam/instance/operations_generic.h>
#include <rl_tools/nn/operations_cpu.h>
#include <rl_tools/nn/layers/gru/operations_generic.h>
#include <rl_tools/nn/layers/sample_and_squash/operations_generic.h>
#include <rl_tools/rl/environments/memory/operations_cpu.h>
#include <rl_tools/rl/environments/pendulum/operations_cpu.h>
#include <rl_tools/nn_models/mlp/operations_generic.h>
#include <rl_tools/nn_models/random_uniform/operations_generic.h>
#include <rl_tools/nn_models/sequential/operations_generic.h>
#include <rl_tools/nn/optimizers/adam/operations_generic.h>

#include <rl_tools/rl/algorithms/sac/loop/core/config.h>
#include <rl_tools/rl/loop/steps/evaluation/config.h>
#include <rl_tools/rl/loop/steps/timing/config.h>
#include <rl_tools/rl/algorithms/sac/loop/core/operations_generic.h>
#include <rl_tools/rl/loop/steps/extrack/operations_cpu.h>
#include <rl_tools/rl/loop/steps/evaluation/operations_generic.h>
#include <rl_tools/rl/loop/steps/checkpoint/operations_cpu.h>
#include <rl_tools/rl/loop/steps/save_trajectories/operations_cpu.h>
#include <rl_tools/rl/loop/steps/timing/operations_cpu.h>

// Shorthand for the library namespace; used by every cell below.
namespace rlt = rl_tools;

// Device, random engine, scalar type and index type shared by all following cells.
using DEVICE = rlt::devices::DEVICE_FACTORY<>;
using RNG = DEVICE::SPEC::RANDOM::ENGINE<>;
using T = float;
using TI = typename DEVICE::index_t;

In [2]:
#pragma cling load("openblas")

In [3]:
// Task switches: MEMORY selects the memory environment (the pendulum is used otherwise),
// MEMORY_LONG selects the larger variant of the memory settings.
constexpr bool MEMORY = true;
constexpr bool MEMORY_LONG = false;

// Sequence length: 10 without memory, otherwise 500 (long) or 50 (short).
constexpr TI SEQUENCE_LENGTH = !MEMORY ? 10 : (MEMORY_LONG ? 500 : 50);
// Alias of SEQUENCE_LENGTH under a different name, for scopes that declare their own SEQUENCE_LENGTH member.
constexpr TI SEQUENCE_LENGTH_PROXY = SEQUENCE_LENGTH;
// Batch size: 100 without memory, 4 with memory.
constexpr TI BATCH_SIZE = !MEMORY ? 100 : 4;
// Divides the step limit to obtain the checkpoint interval.
constexpr TI NUM_CHECKPOINTS = 100;

In [4]:
// Compile-time parameters handed to the memory environment specification.
struct ENVIRONMENT_PARAMETERS{
    // 100 steps in the long variant, 10 otherwise.
    static constexpr TI HORIZON = MEMORY_LONG ? 100 : 10;
    // 2/HORIZON for horizons above 4, capped at 0.5 for shorter ones.
    static constexpr T INPUT_PROBABILITY = HORIZON > 4 ? static_cast<T>(2)/HORIZON : static_cast<T>(0.5);
    static constexpr TI EPISODE_STEP_LIMIT = 2000;
    static constexpr rlt::rl::environments::memory::Mode MODE = rlt::rl::environments::memory::Mode::COUNT_INPUT;
};


In [5]:
// Memory environment, parameterized by ENVIRONMENT_PARAMETERS.
using MEMORY_ENVIRONMENT_SPEC = rlt::rl::environments::memory::Specification<T, TI, ENVIRONMENT_PARAMETERS>;
using MEMORY_ENVIRONMENT = rlt::rl::environments::Memory<MEMORY_ENVIRONMENT_SPEC>;
// Pendulum environment with the library's default parameters.
using PENDULUM_ENVIRONMENT_SPEC = rlt::rl::environments::pendulum::Specification<T, TI, rlt::rl::environments::pendulum::DefaultParameters<T>>;
using PENDULUM_ENVIRONMENT = rlt::rl::environments::Pendulum<PENDULUM_ENVIRONMENT_SPEC>;

// Environment used by the training loop: memory task when MEMORY is true, pendulum otherwise.
using ENVIRONMENT = rlt::utils::typing::conditional_t<MEMORY, MEMORY_ENVIRONMENT, PENDULUM_ENVIRONMENT>;


In [6]:
// Configuration of the SAC core loop. Derives from the library's default loop parameters
// and redeclares the members listed below; values depend on the MEMORY / MEMORY_LONG switches.
struct LOOP_CORE_PARAMETERS: rlt::rl::algorithms::sac::loop::core::DefaultParameters<T, TI, ENVIRONMENT>{
    // Algorithm-level SAC settings, layered on top of the library defaults.
    struct SAC_PARAMETERS: rlt::rl::algorithms::sac::DefaultParameters<T, TI, ENVIRONMENT::ACTION_DIM>{
        // 0.0 for the memory task, 0.99 otherwise.
        static constexpr T GAMMA = MEMORY ? 0.0 : 0.99;
        static constexpr TI ACTOR_BATCH_SIZE = BATCH_SIZE;
        static constexpr TI CRITIC_BATCH_SIZE = BATCH_SIZE;
        // The global is read through SEQUENCE_LENGTH_PROXY because the name SEQUENCE_LENGTH
        // refers to this member inside the struct.
        static constexpr TI SEQUENCE_LENGTH = SEQUENCE_LENGTH_PROXY;
        static constexpr TI CRITIC_TRAINING_INTERVAL = 1;
        static constexpr TI ACTOR_TRAINING_INTERVAL = 2;
        static constexpr bool ENTROPY_BONUS = true;
        static constexpr bool ENTROPY_BONUS_NEXT_STEP = false;

        // -4 for the memory task, -1 otherwise.
        static constexpr T TARGET_ENTROPY = MEMORY ? -4 : -1;
        static constexpr T ALPHA = 1;
        static constexpr bool ADAPTIVE_ALPHA = true;
    };
    static constexpr TI N_WARMUP_STEPS = 1000;
    static constexpr TI N_WARMUP_STEPS_CRITIC = 1000;
    // The actor warmup is ten times longer for the memory task.
    static constexpr TI N_WARMUP_STEPS_ACTOR = MEMORY ? 10000: 1000;
    static constexpr TI STEP_LIMIT = 200000;
    // Replay buffer capacity equals the step limit.
    static constexpr TI REPLAY_BUFFER_CAP = STEP_LIMIT;
    // Hidden size: 32 without memory, otherwise 64 (long) or 16 (short).
    static constexpr TI ACTOR_HIDDEN_DIM = MEMORY ? (MEMORY_LONG ? 64 : 16) : 32;
    static constexpr TI ACTOR_NUM_LAYERS = 4;
    static constexpr auto ACTOR_ACTIVATION_FUNCTION = rlt::nn::activation_functions::ActivationFunction::TANH;
    // The critic mirrors the actor's hidden size and activation function.
    static constexpr TI CRITIC_HIDDEN_DIM = ACTOR_HIDDEN_DIM;
    static constexpr TI CRITIC_NUM_LAYERS = 4;
    static constexpr auto CRITIC_ACTIVATION_FUNCTION = ACTOR_ACTIVATION_FUNCTION;
    static constexpr bool SHARED_BATCH = false;
    static constexpr TI N_ENVIRONMENTS = 1;

    // Flags controlling how sequences are sampled for training batches
    // (exact semantics are defined by the library, not visible in this notebook).
    struct BATCH_SAMPLING_PARAMETERS{
        static constexpr bool INCLUDE_FIRST_STEP_IN_TARGETS = true;
        static constexpr bool ALWAYS_SAMPLE_FROM_INITIAL_STATE = false;
        static constexpr bool RANDOM_SEQ_LENGTH = true;
        static constexpr bool ENABLE_NOMINAL_SEQUENCE_LENGTH_PROBABILITY = true;
        static constexpr T NOMINAL_SEQUENCE_LENGTH_PROBABILITY = 0.5;
    };

    // Adam settings for the three optimizers. ALPHA is 1e-4 for the actor and 1e-3 for the
    // critic and alpha optimizers; the bias learning-rate factor is disabled in all three.
    struct ACTOR_OPTIMIZER_PARAMETERS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
        static constexpr T ALPHA = 1e-4;
        static constexpr bool ENABLE_BIAS_LR_FACTOR = false;
        static constexpr T BIAS_LR_FACTOR = 1;
    };
    struct CRITIC_OPTIMIZER_PARAMETERS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
        static constexpr T ALPHA = 1e-3;
        static constexpr bool ENABLE_BIAS_LR_FACTOR = false;
        static constexpr T BIAS_LR_FACTOR = 1;
    };
    struct ALPHA_OPTIMIZER_PARAMETERS: rlt::nn::optimizers::adam::DEFAULT_PARAMETERS_TENSORFLOW<T>{
        static constexpr T ALPHA = 1e-3;
        static constexpr bool ENABLE_BIAS_LR_FACTOR = false;
        static constexpr T BIAS_LR_FACTOR = 1;
    };
};



In [7]:
// NOTE(review): RNG repeats the alias already declared in the first cell with the same type.
using RNG = DEVICE::SPEC::RANDOM::ENGINE<>;
// SAC core loop using the GRU approximator configuration and the parameters above.
using LOOP_CORE_CONFIG = rlt::rl::algorithms::sac::loop::core::Config<T, TI, RNG, ENVIRONMENT, LOOP_CORE_PARAMETERS, rlt::rl::algorithms::sac::loop::core::ConfigApproximatorsGRU>;
// Wraps the core loop with the extrack step.
using LOOP_EXTRACK_CONFIG = rlt::rl::loop::steps::extrack::Config<LOOP_CORE_CONFIG>;

In [8]:
// Evaluation step settings: 10 episodes every 1000 steps.
struct LOOP_EVAL_PARAMETERS: rlt::rl::loop::steps::evaluation::Parameters<T, TI, LOOP_EXTRACK_CONFIG>{
    static constexpr TI EVALUATION_INTERVAL = 1000;
    static constexpr TI NUM_EVALUATION_EPISODES = 10;
    // Step limit divided by the evaluation interval (integer division).
    static constexpr TI N_EVALUATIONS = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / EVALUATION_INTERVAL;
};

In [9]:
// Adds the evaluation step on top of the extrack-wrapped loop.
using LOOP_EVAL_CONFIG = rlt::rl::loop::steps::evaluation::Config<LOOP_EXTRACK_CONFIG, LOOP_EVAL_PARAMETERS>;

In [10]:
// Checkpoint step settings: the interval is chosen so that NUM_CHECKPOINTS intervals span the step limit.
struct LOOP_CHECKPOINT_PARAMETERS: rlt::rl::loop::steps::checkpoint::Parameters<T, TI>{
    // Raw quotient; integer division yields 0 when the step limit is smaller than NUM_CHECKPOINTS.
    static constexpr TI CHECKPOINT_INTERVAL_TEMP = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / NUM_CHECKPOINTS;
    // Never let the interval drop below 1.
    static constexpr TI CHECKPOINT_INTERVAL = CHECKPOINT_INTERVAL_TEMP != 0 ? CHECKPOINT_INTERVAL_TEMP : 1;
};

In [11]:
// Adds the checkpoint step on top of the evaluation-wrapped loop.
using LOOP_CHECKPOINT_CONFIG = rlt::rl::loop::steps::checkpoint::Config<LOOP_EVAL_CONFIG, LOOP_CHECKPOINT_PARAMETERS>;

In [12]:
// Trajectory-saving step settings: 10 episodes, at an interval of one tenth of the step limit.
struct LOOP_SAVE_TRAJECTORIES_PARAMETERS: rlt::rl::loop::steps::save_trajectories::Parameters<T, TI, LOOP_CHECKPOINT_CONFIG>{
    // Raw quotient; integer division yields 0 when the step limit is below 10.
    static constexpr TI INTERVAL_TEMP = LOOP_CORE_CONFIG::CORE_PARAMETERS::STEP_LIMIT / 10;
    // Never let the interval drop below 1.
    static constexpr TI INTERVAL = INTERVAL_TEMP != 0 ? INTERVAL_TEMP : 1;
    static constexpr TI NUM_EPISODES = 10;
};

In [13]:
// Adds the trajectory-saving and timing steps; LOOP_CONFIG names the fully composed loop.
using LOOP_SAVE_TRAJECTORIES_CONFIG = rlt::rl::loop::steps::save_trajectories::Config<LOOP_CHECKPOINT_CONFIG, LOOP_SAVE_TRAJECTORIES_PARAMETERS>;
using LOOP_TIMING_CONFIG = rlt::rl::loop::steps::timing::Config<LOOP_SAVE_TRAJECTORIES_CONFIG>;
using LOOP_CONFIG = LOOP_TIMING_CONFIG;

In [None]:
// Training driver: allocates and initializes the loop state, then calls rlt::step until it
// reports completion. Every 1000 steps the actor and the first critic are probed on N_EXAMPLES
// freshly sampled input sequences; per-case results are printed and both accuracies are logged.
using LOOP_STATE = LOOP_CONFIG::State<LOOP_CONFIG>;
TI seed = 1;
DEVICE device;
LOOP_STATE ts;
ts.extrack_config.name = "sequential";
ts.extrack_config.population_variates = "algorithm_environment";
ts.extrack_config.population_values = "sac_memory";
rlt::malloc(device);
rlt::init(device);
rlt::malloc(device, ts);
rlt::init(device, ts, seed);
// Random engine used by the probe below (input sampling and network evaluation calls).
DEVICE::SPEC::RANDOM::ENGINE<> myrng;
rlt::init(device, myrng, seed);
bool done = false;
while(!done){
    if(ts.step % 1000 == 0){
        constexpr TI TEST_SEQUENCE_LENGTH = SEQUENCE_LENGTH;
        // The probe treats action * ACTION_SCALE as the predicted count.
        constexpr T ACTION_SCALE = 10;
        // NOTE(review): the search only covers candidates 0..4 while the count can reach HORIZON,
        // so counts >= 5 can never be scored as correct for the critic. Kept as in the original; confirm intent.
        constexpr TI N_SEARCH_ACTIONS = 5;
        // Critic input per step and batch entry: index 0 holds the observation, index 1 the action.
        rlt::Tensor<rlt::tensor::Specification<T, TI, rlt::tensor::Shape<TI, TEST_SEQUENCE_LENGTH, 1, 2>>> test_critic_input;
        rlt::Tensor<rlt::tensor::Specification<T, TI, rlt::tensor::Shape<TI, TEST_SEQUENCE_LENGTH, 1, 1>>> test_critic_output;
        using EVALUATION_ACTOR = decltype(ts.actor_critic.actor)::CHANGE_BATCH_SIZE<TI, 1>;
        using EVALUATION_CRITIC = rlt::utils::typing::remove_reference_t<decltype(ts.actor_critic.critics[0])>::CHANGE_BATCH_SIZE<TI, 1>;
        EVALUATION_ACTOR::Buffer<1> actor_buffer;
        EVALUATION_CRITIC::Buffer<1> critic_buffer;
        rlt::malloc(device, test_critic_input);
        rlt::malloc(device, test_critic_output);
        rlt::malloc(device, actor_buffer);
        rlt::malloc(device, critic_buffer);
        // Both views alias test_critic_input: the actor reads the observation column and writes
        // its output straight into the action column.
        auto test_actor_input = rlt::view_range(device, test_critic_input, 0, rlt::tensor::ViewSpec<2, 1>{});
        auto test_actor_output = rlt::view_range(device, test_critic_input, 1, rlt::tensor::ViewSpec<2, 1>{});
        constexpr TI N_EXAMPLES = 10;
        TI critic_correct_examples = 0;
        TI actor_correct_examples = 0;
        for(TI example_i = 0; example_i < N_EXAMPLES; example_i++){
            rlt::Mode<rlt::mode::Evaluation<>> mode;
            // Sample binary observations for every step except the last one.
            std::vector<TI> values;
            if(TEST_SEQUENCE_LENGTH >= 2){
                for(TI seq_i = 0; seq_i < TEST_SEQUENCE_LENGTH-1; seq_i++){
                    TI value = rlt::random::uniform_real_distribution(device.random, (T)0, (T)1, myrng) < ENVIRONMENT_PARAMETERS::INPUT_PROBABILITY ? 1 : 0;
                    values.push_back(value);
                    rlt::set(device, test_critic_input, (T)value, seq_i, 0, 0);
                }
            }
            // Keep only the most recent HORIZON-1 observations; the last step's observation is added per case below.
            while(values.size() > ENVIRONMENT_PARAMETERS::HORIZON-1){
                values.erase(values.begin());
            }
            TI pre_count = std::accumulate(values.begin(), values.end(), 0);

            // Probe both possible observations (0 and 1) at the last step.
            for(TI input_i = 0; input_i < 2; input_i++){
                rlt::set(device, test_critic_input, (T)input_i, TEST_SEQUENCE_LENGTH-1, 0, 0);
                TI count = pre_count + input_i;

                // Evaluate the actor once per observation sequence. This fills the action column of
                // every step (including the last). It has to happen BEFORE a candidate action is written,
                // otherwise the actor output would overwrite the candidate and the critic would score
                // the same input for every candidate.
                rlt::evaluate(device, ts.actor_critic.actor, test_actor_input, test_actor_output, actor_buffer, myrng, mode);
                // Save the actor's last-step action now; the line search below overwrites that slot.
                // NOTE(review): assumes the actor is deterministic in evaluation mode, so one evaluation suffices.
                T actor_action = rlt::get(device, test_actor_output, TEST_SEQUENCE_LENGTH-1, 0, 0);

                // Line search over candidate last-step actions, keeping the one the critic values most.
                T max_value = 0;
                bool max_value_set = false;
                TI max_action = 0;
                for(TI action_i = 0; action_i < N_SEARCH_ACTIONS; action_i++){
                    T action = ((T)action_i)/ACTION_SCALE;
                    rlt::set(device, test_critic_input, action, TEST_SEQUENCE_LENGTH-1, 0, 1);
                    rlt::evaluate(device, ts.actor_critic.critics[0], test_critic_input, test_critic_output, critic_buffer, myrng, mode);
                    T value = rlt::get(device, test_critic_output, TEST_SEQUENCE_LENGTH-1, 0, 0);
                    if(!max_value_set || value > max_value){
                        max_value = value;
                        max_value_set = true;
                        max_action = action_i;
                    }
                }
                critic_correct_examples += max_action == count;

                bool actor_correct = round(actor_action * ACTION_SCALE) == count;
                std::cout << "Count " << count << " actor_action " << actor_action << (actor_correct ? " ✅" : " ❌") << std::endl;
                actor_correct_examples += actor_correct;
            }
        }
        // Two probed cases per example, hence the 2*N_EXAMPLES denominator.
        rlt::add_scalar(device, device.logger, "critic_evaluation_accuracy", critic_correct_examples / ((T)2*N_EXAMPLES));
        rlt::add_scalar(device, device.logger, "actor_evaluation_accuracy", actor_correct_examples / ((T)2*N_EXAMPLES));
        rlt::free(device, test_critic_input);
        rlt::free(device, test_critic_output);
        rlt::free(device, actor_buffer);
        rlt::free(device, critic_buffer);
    }
    done = rlt::step(device, ts);
}


Seed: 1
Extrack Experiment: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001"


Count 5 actor_action -0.175485 ❌
Count 6 actor_action -0.196689 ❌
Count 0 actor_action -0.129703 ❌
Count 1 actor_action -0.151617 ❌
Count 0 actor_action -0.129954 ❌
Count 1 actor_action -0.151858 ❌
Count 0 actor_action -0.129913 ❌
Count 1 actor_action -0.151818 ❌
Count 0 actor_action -0.130264 ❌
Count 1 actor_action -0.152157 ❌
Count 1 actor_action -0.133836 ❌
Count 2 actor_action -0.155762 ❌
Count 0 actor_action -0.129702 ❌
Count 1 actor_action -0.151616 ❌
Count 2 actor_action -0.142455 ❌
Count 3 actor_action -0.16451 ❌
Count 2 actor_action -0.148916 ❌
Count 3 actor_action -0.170926 ❌
Count 2 actor_action -0.135248 ❌
Count 3 actor_action -0.157141 ❌
Description written to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/description.txt"


Saving Trajectories to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000000000/trajectories.json"
Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000000000/checkpoint.h"


Step: 0/200000 Mean return: -6956.41 Mean episode length: 2000
Count 2 actor_action -0.137958 ❌
Count 3 actor_action -0.159942 ❌
Count 0 actor_action -0.131179 ❌
Count 1 actor_action -0.153038 ❌
Count 0 actor_action -0.12979 ❌
Count 1 actor_action -0.151701 ❌
Count 0 actor_action -0.129805 ❌
Count 1 actor_action -0.151715 ❌
Count 3 actor_action -0.14474 ❌
Count 4 actor_action -0.166892 ❌
Count 0 actor_action -0.130641 ❌
Count 1 actor_action -0.152521 ❌
Count 2 actor_action -0.137927 ❌
Count 3 actor_action -0.159912 ❌
Count 2 actor_action -0.155638 ❌
Count 3 actor_action -0.176984 ❌
Count 1 actor_action -0.138334 ❌
Count 2 actor_action -0.160392 ❌
Count 1 actor_action -0.131271 ❌
Count 2 actor_action -0.153134 ❌
Step: 1000/200000 Mean return: -6700.21 Mean episode length: 2000
Loop step: 1098, env step: 1098, SPS: 109.749 (elapsed: 10.004 s)
Loop step: 1237, env step: 1237, SPS: 13.8175 (elapsed: 20.064 s)
Loop step: 1379, env step: 1379, SPS: 14.1694 (elapsed: 30.085 s)
Loop step: 1522

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000002000/checkpoint.h"


Step: 2000/200000 Mean return: -6810.38 Mean episode length: 2000
Loop step: 2072, env step: 2072, SPS: 12.4968 (elapsed: 80.256 s)
Loop step: 2214, env step: 2214, SPS: 14.1397 (elapsed: 90.298 s)
Loop step: 2355, env step: 2355, SPS: 14.0492 (elapsed: 100.335 s)
Loop step: 2497, env step: 2497, SPS: 14.1603 (elapsed: 110.363 s)
Loop step: 2616, env step: 2616, SPS: 11.8923 (elapsed: 120.369 s)
Loop step: 2718, env step: 2718, SPS: 10.1474 (elapsed: 130.421 s)
Loop step: 2819, env step: 2819, SPS: 10.0836 (elapsed: 140.437 s)
Loop step: 2918, env step: 2918, SPS: 9.81799 (elapsed: 150.521 s)
Count 1 actor_action -0.131157 ❌
Count 2 actor_action -0.153027 ❌
Count 0 actor_action -0.129665 ❌
Count 1 actor_action -0.151582 ❌
Count 0 actor_action -0.129692 ❌
Count 1 actor_action -0.151607 ❌
Count 1 actor_action -0.132196 ❌
Count 2 actor_action -0.154105 ❌
Count 2 actor_action -0.134742 ❌
Count 3 actor_action -0.156655 ❌
Count 1 actor_action -0.14395 ❌
Count 2 actor_action -0.165897 ❌
Count

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000004000/checkpoint.h"


Step: 4000/200000 Mean return: -6827.07 Mean episode length: 2000
Loop step: 4078, env step: 4078, SPS: 11.1553 (elapsed: 240.885 s)
Loop step: 4177, env step: 4177, SPS: 9.88301 (elapsed: 250.902 s)
Loop step: 4276, env step: 4276, SPS: 9.83199 (elapsed: 260.971 s)
Loop step: 4394, env step: 4394, SPS: 11.7024 (elapsed: 271.055 s)
Loop step: 4526, env step: 4526, SPS: 13.1188 (elapsed: 281.116 s)
Loop step: 4652, env step: 4652, SPS: 12.4923 (elapsed: 291.203 s)
Loop step: 4765, env step: 4765, SPS: 11.2369 (elapsed: 301.259 s)
Loop step: 4894, env step: 4894, SPS: 12.8628 (elapsed: 311.288 s)
Count 0 actor_action -0.129677 ❌
Count 1 actor_action -0.151593 ❌
Count 2 actor_action -0.140129 ❌
Count 3 actor_action -0.162137 ❌
Count 3 actor_action -0.141313 ❌
Count 4 actor_action -0.163307 ❌
Count 3 actor_action -0.148731 ❌
Count 4 actor_action -0.170678 ❌
Count 1 actor_action -0.143261 ❌
Count 2 actor_action -0.165233 ❌
Count 2 actor_action -0.141187 ❌
Count 3 actor_action -0.163221 ❌
Co

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000006000/checkpoint.h"


Step: 6000/200000 Mean return: -6749.4 Mean episode length: 2000
Loop step: 6037, env step: 6037, SPS: 8.09868 (elapsed: 421.721 s)
Loop step: 6133, env step: 6133, SPS: 9.5689 (elapsed: 431.753 s)
Loop step: 6243, env step: 6243, SPS: 10.9556 (elapsed: 441.794 s)
Loop step: 6333, env step: 6333, SPS: 8.95862 (elapsed: 451.84 s)
Loop step: 6428, env step: 6428, SPS: 9.43536 (elapsed: 461.908 s)
Loop step: 6524, env step: 6524, SPS: 9.526 (elapsed: 471.986 s)
Loop step: 6637, env step: 6637, SPS: 11.2577 (elapsed: 482.024 s)
Loop step: 6771, env step: 6771, SPS: 13.3262 (elapsed: 492.079 s)
Loop step: 6906, env step: 6906, SPS: 13.4116 (elapsed: 502.145 s)
Count 1 actor_action -0.13067 ❌
Count 2 actor_action -0.152555 ❌
Count 2 actor_action -0.151329 ❌
Count 3 actor_action -0.172636 ❌
Count 3 actor_action -0.141268 ❌
Count 4 actor_action -0.163272 ❌
Count 1 actor_action -0.130925 ❌
Count 2 actor_action -0.152813 ❌
Count 4 actor_action -0.154583 ❌
Count 5 actor_action -0.175847 ❌
Count 1

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000008000/checkpoint.h"


Step: 8000/200000 Mean return: -6735.97 Mean episode length: 2000
Loop step: 8041, env step: 8041, SPS: 9.17485 (elapsed: 592.607 s)
Loop step: 8121, env step: 8121, SPS: 7.94475 (elapsed: 602.677 s)
Loop step: 8201, env step: 8201, SPS: 7.96396 (elapsed: 612.722 s)
Loop step: 8284, env step: 8284, SPS: 8.25582 (elapsed: 622.775 s)
Loop step: 8367, env step: 8367, SPS: 8.22073 (elapsed: 632.872 s)
Loop step: 8449, env step: 8449, SPS: 8.15949 (elapsed: 642.922 s)
Loop step: 8531, env step: 8531, SPS: 8.16411 (elapsed: 652.965 s)
Loop step: 8613, env step: 8613, SPS: 8.09482 (elapsed: 663.095 s)
Loop step: 8695, env step: 8695, SPS: 8.19795 (elapsed: 673.098 s)
Loop step: 8800, env step: 8800, SPS: 10.4955 (elapsed: 683.102 s)
Loop step: 8905, env step: 8905, SPS: 10.405 (elapsed: 693.194 s)
Count 3 actor_action -0.158835 ❌
Count 4 actor_action -0.180187 ❌
Count 0 actor_action -0.130786 ❌
Count 1 actor_action -0.152661 ❌
Count 4 actor_action -0.156047 ❌
Count 5 actor_action -0.177361 ❌


Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000010000/checkpoint.h"


Step: 10000/200000 Mean return: -6778.27 Mean episode length: 2000
Loop step: 10101, env step: 10101, SPS: 11.2963 (elapsed: 804.514 s)
Loop step: 10241, env step: 10241, SPS: 13.951 (elapsed: 814.55 s)
Loop step: 10377, env step: 10377, SPS: 13.5774 (elapsed: 824.566 s)
Loop step: 10511, env step: 10511, SPS: 13.3757 (elapsed: 834.584 s)
Loop step: 10624, env step: 10624, SPS: 11.2295 (elapsed: 844.647 s)
Loop step: 10740, env step: 10740, SPS: 11.3749 (elapsed: 854.845 s)
Loop step: 10798, env step: 10798, SPS: 5.76484 (elapsed: 864.906 s)
Loop step: 10888, env step: 10888, SPS: 8.9704 (elapsed: 874.939 s)
Count 2 actor_action 0.249966 ✅
Count 3 actor_action 0.242563 ❌
Count 1 actor_action 0.250579 ❌
Count 2 actor_action 0.243077 ✅
Count 2 actor_action 0.24807 ✅
Count 3 actor_action 0.240424 ❌
Count 0 actor_action 0.253297 ❌
Count 1 actor_action 0.245961 ❌
Count 4 actor_action 0.242859 ❌
Count 5 actor_action 0.234944 ❌
Count 0 actor_action 0.253617 ❌
Count 1 actor_action 0.246283 ❌
C

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000012000/checkpoint.h"


Step: 12000/200000 Mean return: -1977.31 Mean episode length: 2000
Loop step: 12026, env step: 12026, SPS: 11.2659 (elapsed: 965.435 s)
Loop step: 12182, env step: 12182, SPS: 15.5347 (elapsed: 975.477 s)
Loop step: 12338, env step: 12338, SPS: 15.548 (elapsed: 985.51 s)
Loop step: 12494, env step: 12494, SPS: 15.5295 (elapsed: 995.556 s)
Loop step: 12651, env step: 12651, SPS: 15.5901 (elapsed: 1005.63 s)
Loop step: 12776, env step: 12776, SPS: 12.4582 (elapsed: 1015.66 s)
Loop step: 12916, env step: 12916, SPS: 13.9623 (elapsed: 1025.69 s)
Count 1 actor_action 0.154896 ❌
Count 2 actor_action 0.194512 ✅
Count 0 actor_action 0.1444 ❌
Count 1 actor_action 0.184544 ❌
Count 3 actor_action 0.214911 ❌
Count 4 actor_action 0.248632 ❌
Count 3 actor_action 0.201466 ❌
Count 4 actor_action 0.23763 ❌
Count 2 actor_action 0.169357 ✅
Count 3 actor_action 0.208185 ❌
Count 2 actor_action 0.184416 ✅
Count 3 actor_action 0.222323 ❌
Count 5 actor_action 0.19366 ❌
Count 6 actor_action 0.231101 ❌
Count 0 

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000014000/checkpoint.h"


Step: 14000/200000 Mean return: -1148.19 Mean episode length: 2000
Loop step: 14001, env step: 14001, SPS: 14.2561 (elapsed: 1106.73 s)
Loop step: 14157, env step: 14157, SPS: 15.5544 (elapsed: 1116.76 s)
Loop step: 14313, env step: 14313, SPS: 15.5847 (elapsed: 1126.77 s)
Loop step: 14469, env step: 14469, SPS: 15.5903 (elapsed: 1136.77 s)
Loop step: 14625, env step: 14625, SPS: 15.5577 (elapsed: 1146.8 s)
Loop step: 14781, env step: 14781, SPS: 15.55 (elapsed: 1156.83 s)
Loop step: 14937, env step: 14937, SPS: 15.5002 (elapsed: 1166.9 s)
Count 1 actor_action 0.133864 ✅
Count 2 actor_action 0.223955 ✅
Count 1 actor_action 0.133949 ✅
Count 2 actor_action 0.21828 ✅
Count 2 actor_action 0.234278 ✅
Count 3 actor_action 0.312999 ✅
Count 0 actor_action 0.0380967 ✅
Count 1 actor_action 0.137344 ✅
Count 2 actor_action 0.190052 ✅
Count 3 actor_action 0.275985 ✅
Count 0 actor_action 0.030543 ✅
Count 1 actor_action 0.130302 ✅
Count 1 actor_action 0.0988951 ✅
Count 2 actor_action 0.193426 ✅
Count

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000016000/checkpoint.h"


Step: 16000/200000 Mean return: -857.017 Mean episode length: 2000
Loop step: 16004, env step: 16004, SPS: 14.2767 (elapsed: 1237.11 s)
Loop step: 16160, env step: 16160, SPS: 15.5197 (elapsed: 1247.16 s)
Loop step: 16316, env step: 16316, SPS: 15.5621 (elapsed: 1257.18 s)
Loop step: 16471, env step: 16471, SPS: 15.4628 (elapsed: 1267.21 s)
Loop step: 16624, env step: 16624, SPS: 15.2807 (elapsed: 1277.22 s)
Loop step: 16767, env step: 16767, SPS: 14.2681 (elapsed: 1287.24 s)
Loop step: 16898, env step: 16898, SPS: 13 (elapsed: 1297.32 s)
Count 0 actor_action 0.0829269 ❌
Count 1 actor_action 0.175913 ❌
Count 0 actor_action 0.0538324 ❌
Count 1 actor_action 0.148549 ✅
Count 1 actor_action 0.0797912 ✅
Count 2 actor_action 0.172659 ✅
Count 4 actor_action 0.379622 ✅
Count 5 actor_action 0.441732 ❌
Count 3 actor_action 0.262047 ✅
Count 4 actor_action 0.338224 ❌
Count 2 actor_action 0.265538 ❌
Count 3 actor_action 0.332771 ✅
Count 1 actor_action 0.0765809 ✅
Count 2 actor_action 0.169773 ✅
Cou

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000018000/checkpoint.h"


Step: 18000/200000 Mean return: -840.592 Mean episode length: 2000
Loop step: 18084, env step: 18084, SPS: 14.2647 (elapsed: 1377.53 s)
Loop step: 18235, env step: 18235, SPS: 14.9894 (elapsed: 1387.6 s)
Loop step: 18391, env step: 18391, SPS: 15.5321 (elapsed: 1397.64 s)
Loop step: 18547, env step: 18547, SPS: 15.5809 (elapsed: 1407.66 s)
Loop step: 18703, env step: 18703, SPS: 15.5909 (elapsed: 1417.66 s)
Loop step: 18860, env step: 18860, SPS: 15.6293 (elapsed: 1427.71 s)
Count 2 actor_action 0.20838 ✅
Count 3 actor_action 0.28933 ✅
Count 2 actor_action 0.233271 ✅
Count 3 actor_action 0.312107 ✅
Count 5 actor_action 0.330584 ❌
Count 6 actor_action 0.400502 ❌
Count 5 actor_action 0.449241 ❌
Count 6 actor_action 0.497863 ❌
Count 5 actor_action 0.416243 ❌
Count 6 actor_action 0.471294 ❌
Count 2 actor_action 0.147154 ❌
Count 3 actor_action 0.234604 ❌
Count 0 actor_action 0.0359034 ✅
Count 1 actor_action 0.130204 ✅
Count 2 actor_action 0.202786 ✅
Count 3 actor_action 0.285778 ✅
Count 3 a

Saving Trajectories to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000020000/trajectories.json"
Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000020000/checkpoint.h"


Step: 20000/200000 Mean return: -832.984 Mean episode length: 2000
Loop step: 20068, env step: 20068, SPS: 12.8409 (elapsed: 1507.95 s)
Loop step: 20224, env step: 20224, SPS: 15.5204 (elapsed: 1518 s)
Loop step: 20375, env step: 20375, SPS: 15.0945 (elapsed: 1528.01 s)
Loop step: 20517, env step: 20517, SPS: 14.1569 (elapsed: 1538.04 s)
Loop step: 20668, env step: 20668, SPS: 15.0578 (elapsed: 1548.07 s)
Loop step: 20790, env step: 20790, SPS: 12.1441 (elapsed: 1558.11 s)
Loop step: 20874, env step: 20874, SPS: 8.37813 (elapsed: 1568.14 s)
Loop step: 20979, env step: 20979, SPS: 10.4275 (elapsed: 1578.21 s)
Count 0 actor_action 0.0319692 ✅
Count 1 actor_action 0.128546 ✅
Count 1 actor_action 0.144074 ✅
Count 2 actor_action 0.233186 ✅
Count 0 actor_action -0.0217756 ✅
Count 1 actor_action 0.0765995 ✅
Count 2 actor_action 0.224506 ✅
Count 3 actor_action 0.305842 ✅
Count 4 actor_action 0.368106 ✅
Count 5 actor_action 0.431504 ❌
Count 2 actor_action 0.131177 ❌
Count 3 actor_action 0.22267

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000022000/checkpoint.h"


Step: 22000/200000 Mean return: -830.929 Mean episode length: 2000
Loop step: 22042, env step: 22042, SPS: 14.2381 (elapsed: 1648.36 s)
Loop step: 22198, env step: 22198, SPS: 15.5544 (elapsed: 1658.39 s)
Loop step: 22354, env step: 22354, SPS: 15.5387 (elapsed: 1668.43 s)
Loop step: 22510, env step: 22510, SPS: 15.5265 (elapsed: 1678.48 s)
Loop step: 22666, env step: 22666, SPS: 15.5239 (elapsed: 1688.53 s)
Loop step: 22822, env step: 22822, SPS: 15.5661 (elapsed: 1698.55 s)
Loop step: 22978, env step: 22978, SPS: 15.5966 (elapsed: 1708.55 s)
Count 4 actor_action 0.285306 ❌
Count 5 actor_action 0.360853 ❌
Count 2 actor_action 0.247212 ✅
Count 3 actor_action 0.324488 ✅
Count 1 actor_action 0.120368 ✅
Count 2 actor_action 0.213423 ✅
Count 2 actor_action 0.153367 ✅
Count 3 actor_action 0.244223 ❌
Count 2 actor_action 0.180588 ✅
Count 3 actor_action 0.270131 ✅
Count 1 actor_action 0.0994423 ✅
Count 2 actor_action 0.193542 ✅
Count 2 actor_action 0.210304 ✅
Count 3 actor_action 0.296806 ✅
C

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000024000/checkpoint.h"


Step: 24000/200000 Mean return: -817.696 Mean episode length: 2000
Loop step: 24118, env step: 24118, SPS: 12.1419 (elapsed: 1788.69 s)
Loop step: 24273, env step: 24273, SPS: 15.465 (elapsed: 1798.72 s)
Loop step: 24428, env step: 24428, SPS: 15.484 (elapsed: 1808.73 s)
Loop step: 24584, env step: 24584, SPS: 15.5857 (elapsed: 1818.74 s)
Loop step: 24740, env step: 24740, SPS: 15.5671 (elapsed: 1828.76 s)
Loop step: 24897, env step: 24897, SPS: 15.5868 (elapsed: 1838.83 s)
Count 1 actor_action 0.137355 ✅
Count 2 actor_action 0.233577 ✅
Count 1 actor_action 0.126225 ✅
Count 2 actor_action 0.223225 ✅
Count 2 actor_action 0.169976 ✅
Count 3 actor_action 0.263273 ✅
Count 0 actor_action 0.0479982 ✅
Count 1 actor_action 0.148393 ✅
Count 4 actor_action 0.380101 ✅
Count 5 actor_action 0.446969 ❌
Count 2 actor_action 0.22034 ✅
Count 3 actor_action 0.307404 ✅
Count 4 actor_action 0.402509 ✅
Count 5 actor_action 0.467641 ✅
Count 0 actor_action 0.0496908 ✅
Count 1 actor_action 0.150025 ❌
Count 0 

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000026000/checkpoint.h"


Step: 26000/200000 Mean return: -810.812 Mean episode length: 2000
Loop step: 26121, env step: 26121, SPS: 14.2459 (elapsed: 1919.01 s)
Loop step: 26277, env step: 26277, SPS: 15.5647 (elapsed: 1929.03 s)
Loop step: 26433, env step: 26433, SPS: 15.5792 (elapsed: 1939.04 s)
Loop step: 26581, env step: 26581, SPS: 14.6952 (elapsed: 1949.12 s)
Loop step: 26729, env step: 26729, SPS: 14.7141 (elapsed: 1959.17 s)
Loop step: 26882, env step: 26882, SPS: 15.2749 (elapsed: 1969.19 s)
Count 3 actor_action 0.327954 ✅
Count 4 actor_action 0.405421 ✅
Count 1 actor_action 0.0988064 ✅
Count 2 actor_action 0.195348 ✅
Count 0 actor_action 0.00830873 ✅
Count 1 actor_action 0.106931 ✅
Count 2 actor_action 0.188557 ✅
Count 3 actor_action 0.279815 ✅
Count 2 actor_action 0.133473 ❌
Count 3 actor_action 0.228044 ❌
Count 3 actor_action 0.294816 ✅
Count 4 actor_action 0.372342 ✅
Count 0 actor_action -0.0266353 ✅
Count 1 actor_action 0.0725639 ✅
Count 2 actor_action 0.152034 ✅
Count 3 actor_action 0.245639 ❌
C

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000028000/checkpoint.h"


Step: 28000/200000 Mean return: -811.263 Mean episode length: 2000
Loop step: 28049, env step: 28049, SPS: 14.2429 (elapsed: 2049.51 s)
Loop step: 28205, env step: 28205, SPS: 15.5445 (elapsed: 2059.54 s)
Loop step: 28361, env step: 28361, SPS: 15.5994 (elapsed: 2069.54 s)
Loop step: 28517, env step: 28517, SPS: 15.5664 (elapsed: 2079.57 s)
Loop step: 28673, env step: 28673, SPS: 15.5661 (elapsed: 2089.59 s)
Loop step: 28829, env step: 28829, SPS: 15.5659 (elapsed: 2099.61 s)
Loop step: 28982, env step: 28982, SPS: 15.2979 (elapsed: 2109.61 s)
Count 3 actor_action 0.27351 ✅
Count 4 actor_action 0.362007 ✅
Count 3 actor_action 0.317397 ✅
Count 4 actor_action 0.401828 ✅
Count 2 actor_action 0.147446 ❌
Count 3 actor_action 0.243712 ❌
Count 5 actor_action 0.472724 ✅
Count 6 actor_action 0.533578 ❌
Count 2 actor_action 0.175163 ✅
Count 3 actor_action 0.270162 ✅
Count 2 actor_action 0.183422 ✅
Count 3 actor_action 0.277895 ✅
Count 2 actor_action 0.196601 ✅
Count 3 actor_action 0.289126 ✅
Cou

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000030000/checkpoint.h"


Step: 30000/200000 Mean return: -792.598 Mean episode length: 2000
Loop step: 30048, env step: 30048, SPS: 14.2576 (elapsed: 2179.79 s)
Loop step: 30201, env step: 30201, SPS: 15.2873 (elapsed: 2189.8 s)
Loop step: 30347, env step: 30347, SPS: 14.5725 (elapsed: 2199.82 s)
Loop step: 30476, env step: 30476, SPS: 12.8618 (elapsed: 2209.85 s)
Loop step: 30619, env step: 30619, SPS: 14.202 (elapsed: 2219.92 s)
Loop step: 30766, env step: 30766, SPS: 14.6905 (elapsed: 2229.93 s)
Loop step: 30922, env step: 30922, SPS: 15.563 (elapsed: 2239.95 s)
Count 0 actor_action 0.083342 ❌
Count 1 actor_action 0.185624 ❌
Count 5 actor_action 0.489084 ✅
Count 6 actor_action 0.552785 ✅
Count 1 actor_action 0.098688 ✅
Count 2 actor_action 0.202923 ✅
Count 2 actor_action 0.205423 ✅
Count 3 actor_action 0.303644 ✅
Count 1 actor_action 0.0802865 ✅
Count 2 actor_action 0.183209 ✅
Count 1 actor_action 0.190873 ❌
Count 2 actor_action 0.290723 ❌
Count 2 actor_action 0.22293 ✅
Count 3 actor_action 0.319464 ✅
Count

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000032000/checkpoint.h"


Step: 32000/200000 Mean return: -791.199 Mean episode length: 2000
Loop step: 32001, env step: 32001, SPS: 14.3649 (elapsed: 2310.92 s)
Loop step: 32157, env step: 32157, SPS: 15.5918 (elapsed: 2320.93 s)
Loop step: 32314, env step: 32314, SPS: 15.6214 (elapsed: 2330.98 s)
Loop step: 32470, env step: 32470, SPS: 15.5833 (elapsed: 2340.99 s)
Loop step: 32626, env step: 32626, SPS: 15.5682 (elapsed: 2351.01 s)
Loop step: 32782, env step: 32782, SPS: 15.5966 (elapsed: 2361.01 s)
Loop step: 32939, env step: 32939, SPS: 15.6122 (elapsed: 2371.07 s)
Count 4 actor_action 0.400567 ✅
Count 5 actor_action 0.481747 ✅
Count 3 actor_action 0.273091 ✅
Count 4 actor_action 0.367265 ✅
Count 1 actor_action 0.0787967 ✅
Count 2 actor_action 0.181348 ✅
Count 0 actor_action -0.0314099 ✅
Count 1 actor_action 0.072411 ✅
Count 1 actor_action 0.100645 ✅
Count 2 actor_action 0.202924 ✅
Count 1 actor_action 0.0813773 ✅
Count 2 actor_action 0.183697 ✅
Count 2 actor_action 0.205378 ✅
Count 3 actor_action 0.305845 

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000034000/checkpoint.h"


Step: 34000/200000 Mean return: -757.112 Mean episode length: 2000
Loop step: 34104, env step: 34104, SPS: 13.5988 (elapsed: 2451.33 s)
Loop step: 34241, env step: 34241, SPS: 13.6398 (elapsed: 2461.38 s)
Loop step: 34389, env step: 34389, SPS: 14.7794 (elapsed: 2471.39 s)
Loop step: 34545, env step: 34545, SPS: 15.5288 (elapsed: 2481.43 s)
Loop step: 34702, env step: 34702, SPS: 15.6264 (elapsed: 2491.48 s)
Loop step: 34858, env step: 34858, SPS: 15.5809 (elapsed: 2501.49 s)
Count 3 actor_action 0.390663 ❌
Count 4 actor_action 0.470539 ❌
Count 0 actor_action 0.0877439 ❌
Count 1 actor_action 0.185179 ❌
Count 2 actor_action 0.213689 ✅
Count 3 actor_action 0.308044 ✅
Count 2 actor_action 0.263075 ❌
Count 3 actor_action 0.358039 ❌
Count 4 actor_action 0.38832 ✅
Count 5 actor_action 0.469807 ✅
Count 0 actor_action 0.0512245 ❌
Count 1 actor_action 0.149415 ✅
Count 2 actor_action 0.230304 ✅
Count 3 actor_action 0.322936 ✅
Count 1 actor_action 0.107124 ✅
Count 2 actor_action 0.204524 ✅
Count 

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000036000/checkpoint.h"


Step: 36000/200000 Mean return: -783.747 Mean episode length: 2000
Loop step: 36084, env step: 36084, SPS: 14.2942 (elapsed: 2581.75 s)
Loop step: 36240, env step: 36240, SPS: 15.5534 (elapsed: 2591.78 s)
Loop step: 36395, env step: 36395, SPS: 15.4939 (elapsed: 2601.78 s)
Loop step: 36552, env step: 36552, SPS: 15.6249 (elapsed: 2611.83 s)
Loop step: 36708, env step: 36708, SPS: 15.5759 (elapsed: 2621.85 s)
Loop step: 36863, env step: 36863, SPS: 15.4732 (elapsed: 2631.87 s)
Count 2 actor_action 0.143203 ❌
Count 3 actor_action 0.24913 ❌
Count 1 actor_action 0.024734 ❌
Count 2 actor_action 0.124399 ❌
Count 0 actor_action -0.0238218 ✅
Count 1 actor_action 0.0759388 ✅
Count 1 actor_action 0.0859752 ✅
Count 2 actor_action 0.185946 ✅
Count 3 actor_action 0.358531 ❌
Count 4 actor_action 0.445172 ✅
Count 3 actor_action 0.282171 ✅
Count 4 actor_action 0.374759 ✅
Count 0 actor_action 0.0269056 ✅
Count 1 actor_action 0.126393 ✅
Count 2 actor_action 0.228484 ✅
Count 3 actor_action 0.32449 ✅
Coun

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000038000/checkpoint.h"


Step: 38000/200000 Mean return: -749.532 Mean episode length: 2000
Loop step: 38060, env step: 38060, SPS: 14.2757 (elapsed: 2712.08 s)
Loop step: 38214, env step: 38214, SPS: 15.3777 (elapsed: 2722.09 s)
Loop step: 38370, env step: 38370, SPS: 15.5502 (elapsed: 2732.12 s)
Loop step: 38527, env step: 38527, SPS: 15.6034 (elapsed: 2742.18 s)
Loop step: 38683, env step: 38683, SPS: 15.5958 (elapsed: 2752.19 s)
Loop step: 38840, env step: 38840, SPS: 15.6417 (elapsed: 2762.22 s)
Loop step: 38996, env step: 38996, SPS: 15.5335 (elapsed: 2772.27 s)
Count 1 actor_action 0.123404 ✅
Count 2 actor_action 0.232779 ✅
Count 2 actor_action 0.279516 ❌
Count 3 actor_action 0.373903 ❌
Count 0 actor_action 0.0158399 ✅
Count 1 actor_action 0.116333 ✅
Count 4 actor_action 0.388032 ✅
Count 5 actor_action 0.474223 ✅
Count 2 actor_action 0.226904 ✅
Count 3 actor_action 0.326496 ✅
Count 5 actor_action 0.487313 ✅
Count 6 actor_action 0.562373 ✅
Count 3 actor_action 0.262443 ✅
Count 4 actor_action 0.357625 ✅
C

Saving Trajectories to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000040000/trajectories.json"
Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000040000/checkpoint.h"


Step: 40000/200000 Mean return: -745.091 Mean episode length: 2000
Loop step: 40055, env step: 40055, SPS: 13.2483 (elapsed: 2842.5 s)
Loop step: 40207, env step: 40207, SPS: 15.1807 (elapsed: 2852.51 s)
Loop step: 40336, env step: 40336, SPS: 12.8453 (elapsed: 2862.55 s)
Loop step: 40434, env step: 40434, SPS: 9.75416 (elapsed: 2872.6 s)
Loop step: 40587, env step: 40587, SPS: 15.1905 (elapsed: 2882.67 s)
Loop step: 40736, env step: 40736, SPS: 14.8361 (elapsed: 2892.71 s)
Loop step: 40868, env step: 40868, SPS: 13.144 (elapsed: 2902.76 s)
Count 1 actor_action 0.0983352 ✅
Count 2 actor_action 0.203382 ✅
Count 2 actor_action 0.250763 ❌
Count 3 actor_action 0.348292 ✅
Count 1 actor_action 0.149164 ✅
Count 2 actor_action 0.246921 ✅
Count 2 actor_action 0.198639 ✅
Count 3 actor_action 0.299306 ✅
Count 2 actor_action 0.109739 ❌
Count 3 actor_action 0.207689 ❌
Count 3 actor_action 0.292592 ✅
Count 4 actor_action 0.384765 ✅
Count 3 actor_action 0.305435 ✅
Count 4 actor_action 0.39899 ✅
Count

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000042000/checkpoint.h"


Step: 42000/200000 Mean return: -752.157 Mean episode length: 2000
Loop step: 42092, env step: 42092, SPS: 14.2826 (elapsed: 2982.99 s)
Loop step: 42248, env step: 42248, SPS: 15.5526 (elapsed: 2993.02 s)
Loop step: 42404, env step: 42404, SPS: 15.5897 (elapsed: 3003.03 s)
Loop step: 42560, env step: 42560, SPS: 15.5399 (elapsed: 3013.07 s)
Loop step: 42716, env step: 42716, SPS: 15.5376 (elapsed: 3023.11 s)
Loop step: 42872, env step: 42872, SPS: 15.5783 (elapsed: 3033.12 s)
Count 2 actor_action 0.17875 ✅
Count 3 actor_action 0.289645 ✅
Count 2 actor_action 0.185979 ✅
Count 3 actor_action 0.291618 ✅
Count 2 actor_action 0.236918 ✅
Count 3 actor_action 0.337607 ✅
Count 1 actor_action 0.07194 ✅
Count 2 actor_action 0.174978 ✅
Count 1 actor_action 0.120073 ✅
Count 2 actor_action 0.219896 ✅
Count 1 actor_action 0.0814195 ✅
Count 2 actor_action 0.196193 ✅
Count 3 actor_action 0.284411 ✅
Count 4 actor_action 0.379554 ✅
Count 1 actor_action 0.125254 ✅
Count 2 actor_action 0.238293 ✅
Count 2 

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000044000/checkpoint.h"


Step: 44000/200000 Mean return: -750.482 Mean episode length: 2000
Loop step: 44025, env step: 44025, SPS: 13.557 (elapsed: 3113.35 s)
Loop step: 44165, env step: 44165, SPS: 13.8453 (elapsed: 3123.46 s)
Loop step: 44304, env step: 44304, SPS: 13.8591 (elapsed: 3133.49 s)
Loop step: 44460, env step: 44460, SPS: 15.5921 (elapsed: 3143.5 s)
Loop step: 44617, env step: 44617, SPS: 15.5897 (elapsed: 3153.57 s)
Loop step: 44773, env step: 44773, SPS: 15.5926 (elapsed: 3163.57 s)
Loop step: 44923, env step: 44923, SPS: 14.9599 (elapsed: 3173.6 s)
Count 4 actor_action 0.390111 ✅
Count 5 actor_action 0.478816 ✅
Count 4 actor_action 0.37757 ✅
Count 5 actor_action 0.467389 ✅
Count 2 actor_action 0.200948 ✅
Count 3 actor_action 0.299938 ✅
Count 1 actor_action 0.0942643 ✅
Count 2 actor_action 0.192686 ✅
Count 1 actor_action 0.0229381 ❌
Count 2 actor_action 0.12149 ❌
Count 2 actor_action 0.124394 ❌
Count 3 actor_action 0.221689 ❌
Count 0 actor_action -0.00199628 ✅
Count 1 actor_action 0.0964904 ✅
C

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000046000/checkpoint.h"


Step: 46000/200000 Mean return: -713.268 Mean episode length: 2000
Loop step: 46001, env step: 46001, SPS: 14.2684 (elapsed: 3244.65 s)
Loop step: 46157, env step: 46157, SPS: 15.5526 (elapsed: 3254.68 s)
Loop step: 46313, env step: 46313, SPS: 15.5924 (elapsed: 3264.68 s)
Loop step: 46470, env step: 46470, SPS: 15.6309 (elapsed: 3274.72 s)
Loop step: 46626, env step: 46626, SPS: 15.5749 (elapsed: 3284.74 s)
Loop step: 46783, env step: 46783, SPS: 15.5882 (elapsed: 3294.81 s)
Loop step: 46933, env step: 46933, SPS: 14.9514 (elapsed: 3304.84 s)
Count 3 actor_action 0.311327 ✅
Count 4 actor_action 0.40526 ✅
Count 2 actor_action 0.190707 ✅
Count 3 actor_action 0.285988 ✅
Count 3 actor_action 0.275015 ✅
Count 4 actor_action 0.369866 ✅
Count 0 actor_action 0.00998483 ✅
Count 1 actor_action 0.109386 ✅
Count 3 actor_action 0.247157 ❌
Count 4 actor_action 0.351057 ✅
Count 1 actor_action 0.0899335 ✅
Count 2 actor_action 0.187727 ✅
Count 2 actor_action 0.299331 ❌
Count 3 actor_action 0.392901 ❌


Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000048000/checkpoint.h"


Step: 48000/200000 Mean return: -706.664 Mean episode length: 2000
Loop step: 48115, env step: 48115, SPS: 14.228 (elapsed: 3395.28 s)
Loop step: 48271, env step: 48271, SPS: 15.5468 (elapsed: 3405.32 s)
Loop step: 48428, env step: 48428, SPS: 15.6242 (elapsed: 3415.36 s)
Loop step: 48584, env step: 48584, SPS: 15.5965 (elapsed: 3425.37 s)
Loop step: 48741, env step: 48741, SPS: 15.6049 (elapsed: 3435.43 s)
Loop step: 48897, env step: 48897, SPS: 15.5742 (elapsed: 3445.44 s)
Count 1 actor_action 0.15337 ❌
Count 2 actor_action 0.24904 ✅
Count 2 actor_action 0.213468 ✅
Count 3 actor_action 0.31837 ✅
Count 1 actor_action 0.135519 ✅
Count 2 actor_action 0.232159 ✅
Count 2 actor_action 0.113027 ❌
Count 3 actor_action 0.207775 ❌
Count 1 actor_action 0.0997262 ✅
Count 2 actor_action 0.195686 ✅
Count 4 actor_action 0.38426 ✅
Count 5 actor_action 0.473586 ✅
Count 3 actor_action 0.317631 ✅
Count 4 actor_action 0.411586 ✅
Count 2 actor_action 0.135381 ❌
Count 3 actor_action 0.240421 ❌
Count 3 act

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000050000/checkpoint.h"


Step: 50000/200000 Mean return: -677.007 Mean episode length: 2000
Loop step: 50115, env step: 50115, SPS: 13.9946 (elapsed: 3525.8 s)
Loop step: 50267, env step: 50267, SPS: 15.1388 (elapsed: 3535.84 s)
Loop step: 50366, env step: 50366, SPS: 9.89613 (elapsed: 3545.85 s)
Loop step: 50513, env step: 50513, SPS: 14.6886 (elapsed: 3555.85 s)
Loop step: 50665, env step: 50665, SPS: 15.1335 (elapsed: 3565.9 s)
Loop step: 50806, env step: 50806, SPS: 14.0394 (elapsed: 3575.94 s)
Loop step: 50929, env step: 50929, SPS: 12.2151 (elapsed: 3586.01 s)
Count 1 actor_action 0.0570827 ✅
Count 2 actor_action 0.159048 ✅
Count 3 actor_action 0.256851 ✅
Count 4 actor_action 0.362717 ✅
Count 1 actor_action 0.113259 ✅
Count 2 actor_action 0.210745 ✅
Count 4 actor_action 0.441798 ✅
Count 5 actor_action 0.53124 ✅
Count 3 actor_action 0.247643 ❌
Count 4 actor_action 0.343288 ❌
Count 2 actor_action 0.243691 ✅
Count 3 actor_action 0.346806 ✅
Count 3 actor_action 0.306628 ✅
Count 4 actor_action 0.405152 ✅
Coun

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000052000/checkpoint.h"


Step: 52000/200000 Mean return: -652.202 Mean episode length: 2000
Loop step: 52001, env step: 52001, SPS: 14.325 (elapsed: 3656.56 s)
Loop step: 52157, env step: 52157, SPS: 15.5753 (elapsed: 3666.57 s)
Loop step: 52313, env step: 52313, SPS: 15.5989 (elapsed: 3676.58 s)
Loop step: 52469, env step: 52469, SPS: 15.5935 (elapsed: 3686.58 s)
Loop step: 52625, env step: 52625, SPS: 15.5957 (elapsed: 3696.58 s)
Loop step: 52782, env step: 52782, SPS: 15.6275 (elapsed: 3706.63 s)
Loop step: 52937, env step: 52937, SPS: 15.468 (elapsed: 3716.65 s)
Count 0 actor_action -0.00552002 ✅
Count 1 actor_action 0.0986735 ✅
Count 5 actor_action 0.414248 ❌
Count 6 actor_action 0.505869 ❌
Count 3 actor_action 0.335158 ✅
Count 4 actor_action 0.434864 ✅
Count 1 actor_action 0.145825 ✅
Count 2 actor_action 0.251129 ❌
Count 3 actor_action 0.270365 ✅
Count 4 actor_action 0.3731 ✅
Count 2 actor_action 0.214115 ✅
Count 3 actor_action 0.307813 ✅
Count 1 actor_action 0.0834279 ✅
Count 2 actor_action 0.18344 ✅
Co

Checkpointing to: "experiments/2025-10-11_06-22-15/no-hash_sequential_algorithm_environment/sac_memory/0001/steps/000000000054000/checkpoint.h"


Step: 54000/200000 Mean return: -657.761 Mean episode length: 2000
Loop step: 54105, env step: 54105, SPS: 13.9822 (elapsed: 3797.1 s)
Loop step: 54250, env step: 54250, SPS: 14.4789 (elapsed: 3807.11 s)
Loop step: 54395, env step: 54395, SPS: 14.4318 (elapsed: 3817.16 s)
Loop step: 54525, env step: 54525, SPS: 12.9547 (elapsed: 3827.2 s)
Loop step: 54679, env step: 54679, SPS: 15.2915 (elapsed: 3837.27 s)
