diff --git a/docs/api/mujoco.rst b/docs/api/mujoco.rst index d9aa57a8..9bf55508 100644 --- a/docs/api/mujoco.rst +++ b/docs/api/mujoco.rst @@ -33,3 +33,14 @@ HalfCheetah-v4 - Observation space: ``(17)``, first 8 elements for ``qpos[1:]``, next 9 elements for ``qvel``; - Action space: ``(6)``, with range ``[-1, 1]``. + + +Hopper-v4 +--------- + +`gym Hopper-v4 source code +`_ + +- Observation space: ``(11)``, first 5 elements for ``qpos[1:]``, next 6 + elements for ``qvel``; +- Action space: ``(3)``, with range ``[-1, 1]``. diff --git a/envpool/make_test.py b/envpool/make_test.py index 10089752..af1a7648 100644 --- a/envpool/make_test.py +++ b/envpool/make_test.py @@ -72,7 +72,7 @@ def test_make_classic_and_toytext(self) -> None: env_gym.reset() def test_make_mujoco(self) -> None: - mujoco = ["Ant-v4", "HalfCheetah-v4"] + mujoco = ["Ant-v4", "HalfCheetah-v4", "Hopper-v4"] for task_id in mujoco: envpool.make_spec(task_id) env_gym = envpool.make_gym(task_id) diff --git a/envpool/mujoco/BUILD b/envpool/mujoco/BUILD index 8c58eabf..bda8858f 100644 --- a/envpool/mujoco/BUILD +++ b/envpool/mujoco/BUILD @@ -46,12 +46,22 @@ cc_library( ], ) +cc_library( + name = "hopper", + hdrs = ["hopper.h"], + deps = [ + ":mujoco_env", + "//envpool/core:async_envpool", + ], +) + pybind_extension( name = "mujoco_envpool", srcs = ["mujoco_envpool.cc"], deps = [ ":ant", ":half_cheetah", + ":hopper", "//envpool/core:py_envpool", ], ) diff --git a/envpool/mujoco/__init__.py b/envpool/mujoco/__init__.py index 9cdf7214..87c187e2 100644 --- a/envpool/mujoco/__init__.py +++ b/envpool/mujoco/__init__.py @@ -20,13 +20,20 @@ _AntEnvSpec, _HalfCheetahEnvPool, _HalfCheetahEnvSpec, + _HopperEnvPool, + _HopperEnvSpec, ) AntEnvSpec, AntDMEnvPool, AntGymEnvPool = py_env(_AntEnvSpec, _AntEnvPool) + HalfCheetahEnvSpec, HalfCheetahDMEnvPool, HalfCheetahGymEnvPool = py_env( _HalfCheetahEnvSpec, _HalfCheetahEnvPool ) +HopperEnvSpec, HopperDMEnvPool, HopperGymEnvPool = py_env( + _HopperEnvSpec, _HopperEnvPool 
+) + __all__ = [ "AntEnvSpec", "AntDMEnvPool", @@ -34,4 +41,7 @@ "HalfCheetahEnvSpec", "HalfCheetahDMEnvPool", "HalfCheetahGymEnvPool", + "HopperEnvSpec", + "HopperDMEnvPool", + "HopperGymEnvPool", ] diff --git a/envpool/mujoco/half_cheetah.h b/envpool/mujoco/half_cheetah.h index f84ef4f7..fe4f342f 100644 --- a/envpool/mujoco/half_cheetah.h +++ b/envpool/mujoco/half_cheetah.h @@ -34,8 +34,7 @@ class HalfCheetahEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(4800.0), "frame_skip"_.bind(5), "post_constraint"_.bind(true), - "forward_reward_weight"_.bind(1.0), "ctrl_cost_weight"_.bind(0.1), - "reset_noise_scale"_.bind(0.1)); + "ctrl_cost_weight"_.bind(0.1), "reset_noise_scale"_.bind(0.1)); } template static decltype(auto) StateSpec(const Config& conf) { @@ -60,7 +59,7 @@ typedef class EnvSpec HalfCheetahEnvSpec; class HalfCheetahEnv : public Env, public MujocoEnv { protected: int max_episode_steps_, elapsed_step_; - mjtNum forward_reward_weight_, ctrl_cost_weight_; + mjtNum ctrl_cost_weight_; std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_qpos_; std::normal_distribution<> dist_qvel_; @@ -73,7 +72,6 @@ class HalfCheetahEnv : public Env, public MujocoEnv { spec.config["frame_skip"_], spec.config["post_constraint"_]), max_episode_steps_(spec.config["max_episode_steps"_]), elapsed_step_(max_episode_steps_ + 1), - forward_reward_weight_(spec.config["forward_reward_weight"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), qpos0_(new mjtNum[model_->nq]), qvel0_(new mjtNum[model_->nv]), @@ -116,7 +114,7 @@ class HalfCheetahEnv : public Env, public MujocoEnv { mjtNum dt = frame_skip_ * model_->opt.timestep; mjtNum xv = (x_after - x_before) / dt; // reward and done - float reward = forward_reward_weight_ * xv - ctrl_cost; + float reward = xv - ctrl_cost; done_ = (++elapsed_step_ >= max_episode_steps_); WriteObs(reward, xv, ctrl_cost, x_after); } @@ -135,7 +133,7 @@ class HalfCheetahEnv : public 
Env, public MujocoEnv { *(obs++) = data_->qvel[i]; } // info - state["info:reward_run"_] = forward_reward_weight_ * xv; + state["info:reward_run"_] = xv; state["info:reward_ctrl"_] = -ctrl_cost; state["info:x_position"_] = x_after; state["info:x_velocity"_] = xv; diff --git a/envpool/mujoco/hopper.h b/envpool/mujoco/hopper.h new file mode 100644 index 00000000..37598db6 --- /dev/null +++ b/envpool/mujoco/hopper.h @@ -0,0 +1,182 @@ +/* + * Copyright 2022 Garena Online Private Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef ENVPOOL_MUJOCO_HOPPER_H_
+#define ENVPOOL_MUJOCO_HOPPER_H_
+
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "envpool/core/async_envpool.h"
+#include "envpool/core/env.h"
+#include "envpool/mujoco/mujoco_env.h"
+
+namespace mujoco {
+
+class HopperEnvFns {
+ public:
+  static decltype(auto) DefaultConfig() {
+    return MakeDict(
+        "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(3800.0),
+        "frame_skip"_.bind(4), "post_constraint"_.bind(true),
+        "ctrl_cost_weight"_.bind(1e-3), "healthy_reward"_.bind(1.0),
+        "velocity_min"_.bind(-10.0), "velocity_max"_.bind(10.0),
+        "healthy_state_min"_.bind(-100.0), "healthy_state_max"_.bind(100.0),
+        "healthy_angle_min"_.bind(-0.2), "healthy_angle_max"_.bind(0.2),
+        "healthy_z_min"_.bind(0.7), "reset_noise_scale"_.bind(5e-3));
+  }
+  template <typename Config>
+  static decltype(auto) StateSpec(const Config& conf) {
+    mjtNum inf = std::numeric_limits<mjtNum>::infinity();
+    return MakeDict("obs"_.bind(Spec<mjtNum>({11}, {-inf, inf})),
+                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
+                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
+                    // TODO(jiayi): remove these two lines for speed
+                    "info:qpos0"_.bind(Spec<mjtNum>({6})),
+                    "info:qvel0"_.bind(Spec<mjtNum>({6})));
+  }
+  template <typename Config>
+  static decltype(auto) ActionSpec(const Config& conf) {
+    return MakeDict("action"_.bind(Spec<mjtNum>({-1, 3}, {-1.0f, 1.0f})));
+  }
+};
+
+typedef class EnvSpec<HopperEnvFns> HopperEnvSpec;
+
+class HopperEnv : public Env<HopperEnvSpec>, public MujocoEnv {
+ protected:
+  int max_episode_steps_, elapsed_step_;
+  mjtNum ctrl_cost_weight_, healthy_reward_, healthy_z_min_;
+  mjtNum velocity_min_, velocity_max_;
+  mjtNum healthy_state_min_, healthy_state_max_;
+  mjtNum healthy_angle_min_, healthy_angle_max_;
+  std::unique_ptr<mjtNum[]> qpos0_, qvel0_;  // reset state, for align check
+  std::uniform_real_distribution<> dist_;
+  bool done_;
+
+ public:
+  HopperEnv(const Spec& spec, int env_id)
+      : Env<HopperEnvSpec>(spec, env_id),
+        MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/hopper.xml",
+                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
+        max_episode_steps_(spec.config["max_episode_steps"_]),
+        elapsed_step_(max_episode_steps_ + 1),
+        ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
+        healthy_reward_(spec.config["healthy_reward"_]),
+        healthy_z_min_(spec.config["healthy_z_min"_]),
+        velocity_min_(spec.config["velocity_min"_]),
+        velocity_max_(spec.config["velocity_max"_]),
+        healthy_state_min_(spec.config["healthy_state_min"_]),
+        healthy_state_max_(spec.config["healthy_state_max"_]),
+        healthy_angle_min_(spec.config["healthy_angle_min"_]),
+        healthy_angle_max_(spec.config["healthy_angle_max"_]),
+        qpos0_(new mjtNum[model_->nq]),
+        qvel0_(new mjtNum[model_->nv]),
+        dist_(-spec.config["reset_noise_scale"_],
+              spec.config["reset_noise_scale"_]),
+        done_(true) {}
+
+  void MujocoResetModel() {
+    for (int i = 0; i < model_->nq; ++i) {
+      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_);
+    }
+    for (int i = 0; i < model_->nv; ++i) {
+      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_);
+    }
+  }
+
+  bool IsDone() override { return done_; }
+
+  void Reset() override {
+    done_ = false;
+    elapsed_step_ = 0;
+    MujocoReset();
+    WriteObs(0.0f, 0, 0);  // zero reward, x velocity and x position
+  }
+
+  void Step(const Action& action) override {
+    // step the simulation, sampling x (qpos[0]) before and after
+    mjtNum* act = static_cast<mjtNum*>(action["action"_].data());
+    mjtNum x_before = data_->qpos[0];
+    MujocoStep(act);
+    mjtNum x_after = data_->qpos[0];
+
+    // ctrl_cost: weighted sum of squared action components
+    mjtNum ctrl_cost = 0.0;
+    for (int i = 0; i < model_->nu; ++i) {
+      ctrl_cost += ctrl_cost_weight_ * act[i] * act[i];
+    }
+    // xv: x displacement divided by the simulated time of this step
+    mjtNum dt = frame_skip_ * model_->opt.timestep;
+    mjtNum xv = (x_after - x_before) / dt;
+    // reward and done: an unhealthy state or the step limit ends the episode
+    float reward = xv + healthy_reward_ - ctrl_cost;
+    ++elapsed_step_;
+    done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_);
+    WriteObs(reward, xv, x_after);
+  }
+
+ private:
+  bool IsHealthy() {
+    mjtNum z = data_->qpos[1], angle = data_->qpos[2];
+    if (angle <= healthy_angle_min_ || angle >= healthy_angle_max_ ||
+        z <= healthy_z_min_) {
+      return false;
+    }
+    for (int i = 2; i < model_->nq; ++i) {  // qpos[2:] within state bounds
+      if (data_->qpos[i] <= healthy_state_min_ ||
+          data_->qpos[i] >= healthy_state_max_) {
+        return false;
+      }
+    }
+    for (int i = 0; i < model_->nv; ++i) {  // all of qvel within state bounds
+      if (data_->qvel[i] <= healthy_state_min_ ||
+          data_->qvel[i] >= healthy_state_max_) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  void WriteObs(float reward, mjtNum xv, mjtNum x_after) {  // NOLINT
+    State state = Allocate();
+    state["reward"_] = reward;
+    // obs: qpos[1:] then qvel clipped to [velocity_min_, velocity_max_]
+    mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
+    for (int i = 1; i < model_->nq; ++i) {
+      *(obs++) = data_->qpos[i];
+    }
+    for (int i = 0; i < model_->nv; ++i) {
+      mjtNum x = data_->qvel[i];
+      x = std::min(velocity_max_, x);
+      x = std::max(velocity_min_, x);
+      *(obs++) = x;
+    }
+    // info
+    state["info:x_position"_] = x_after;
+    state["info:x_velocity"_] = xv;
+    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
+    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+  }
+};
+
+typedef AsyncEnvPool<HopperEnv> HopperEnvPool;
+
+}  // namespace mujoco
+
+#endif  // ENVPOOL_MUJOCO_HOPPER_H_
diff --git a/envpool/mujoco/mujoco_envpool.cc b/envpool/mujoco/mujoco_envpool.cc
index 7bff230a..b0a66a99 100644
--- a/envpool/mujoco/mujoco_envpool.cc
+++ b/envpool/mujoco/mujoco_envpool.cc
@@ -15,6 +15,7 @@
 #include "envpool/core/py_envpool.h"
 #include "envpool/mujoco/ant.h"
 #include "envpool/mujoco/half_cheetah.h"
+#include "envpool/mujoco/hopper.h"
 
 typedef PyEnvSpec<mujoco::AntEnvSpec> AntEnvSpec;
 typedef PyEnvPool<mujoco::AntEnvPool> AntEnvPool;
@@ -22,7 +23,11 @@ typedef PyEnvPool<mujoco::AntEnvPool> AntEnvPool;
 typedef PyEnvSpec<mujoco::HalfCheetahEnvSpec> HalfCheetahEnvSpec;
 typedef PyEnvPool<mujoco::HalfCheetahEnvPool> HalfCheetahEnvPool;
 
+typedef PyEnvSpec<mujoco::HopperEnvSpec> HopperEnvSpec;
+typedef PyEnvPool<mujoco::HopperEnvPool> HopperEnvPool;
+
 PYBIND11_MODULE(mujoco_envpool, m) {
   REGISTER(m, AntEnvSpec, AntEnvPool)
   REGISTER(m, HalfCheetahEnvSpec, HalfCheetahEnvPool)
+  REGISTER(m, HopperEnvSpec, HopperEnvPool)
 }
diff --git a/envpool/mujoco/mujoco_test.py b/envpool/mujoco/mujoco_test.py
index bc1f39bf..5643533f 100644
--- a/envpool/mujoco/mujoco_test.py
+++ b/envpool/mujoco/mujoco_test.py
@@ -26,6 +26,8 @@ AntGymEnvPool, HalfCheetahEnvSpec, HalfCheetahGymEnvPool, + HopperEnvSpec, + HopperGymEnvPool, ) @@ -121,6 +123,13 @@ def test_half_cheetah(self) -> None: self.run_align_check(env0, env1, no_time_limit=True) self.run_deterministic_check(HalfCheetahEnvSpec, HalfCheetahGymEnvPool) + def test_hopper(self) -> None: + env0 = mjc_mwe.HopperEnv() + env1 = HopperGymEnvPool(HopperEnvSpec(HopperEnvSpec.gen_config())) + self.run_space_check(env0, env1) + self.run_align_check(env0, env1) + self.run_deterministic_check(HopperEnvSpec, HopperGymEnvPool) + if __name__ == "__main__": absltest.main() diff --git a/envpool/mujoco/registration.py b/envpool/mujoco/registration.py index 70cc537b..cab7583d 100644 --- a/envpool/mujoco/registration.py +++ b/envpool/mujoco/registration.py @@ -40,3 +40,14 @@ reward_threshold=4800.0, base_path=base_path, ) + +register( + task_id="Hopper-v4", + import_path="envpool.mujoco", + spec_cls="HopperEnvSpec", + dm_cls="HopperDMEnvPool", + gym_cls="HopperGymEnvPool", + max_episode_steps=1000, + reward_threshold=3800.0, + base_path=base_path, +)