In [1]:
%%writefile mjkey.txt
MuJoCo Pro Individual license activation key, number 7777, type 6.

Issued to Everyone.

Expires October 18, 2031.

Do not modify this file. Its entire content, including the
plain text section, is used by the activation manager.

9aaedeefb37011a8a52361c736643665c7f60e796ff8ff70bb3f7a1d78e9a605
0453a3c853e4aa416e712d7e80cf799c6314ee5480ec6bd0f1ab51d1bb3c768f
8c06e7e572f411ecb25c3d6ef82cc20b00f672db88e6001b3dfdd3ab79e6c480
185d681811cfdaff640fb63295e391b05374edba90dd54cc1e162a9d99b82a8b
ea3e87f2c67d08006c53daac2e563269cdb286838b168a2071c48c29fedfbea2
5effe96fe3cb05e85fb8af2d3851f385618ef8cdac42876831f095e052bd18c9
5dce57ff9c83670aad77e5a1f41444bec45e30e4e827f7bf9799b29f2c934e23
dcf6d3c3ee9c8dd2ed057317100cd21b4abbbf652d02bf72c3d322e0c55dcc24

Writing mjkey.txt


In [2]:
%%bash
# Download the MuJoCo 1.50, 2.0 and 2.1 binaries into ~/.mujoco and install
# the license key written by the previous cell (mjkey.txt in the working dir).
#
# Every step is safe to repeat, so the cell survives "Restart & Run All":
#   * mkdir -p  : the target directory exists before tar -C needs it
#   * wget -N   : reuse/refresh the archive instead of saving "*.zip.1" copies
#   * unzip -o  : overwrite without the interactive prompt (no TTY in %%bash)
#   * ln -sfn   : replace the symlink instead of failing with "File exists"
mkdir -p ~/.mujoco
wget -q -N https://www.roboti.us/download/mjpro150_linux.zip
unzip -q -o -d ~/.mujoco mjpro150_linux.zip
wget -q -N https://www.roboti.us/download/mujoco200_linux.zip
unzip -q -o -d ~/.mujoco mujoco200_linux.zip
# The 2.0 archive unpacks as mujoco200_linux; expose it under the shorter name too.
ln -sfn ~/.mujoco/mujoco200_linux ~/.mujoco/mujoco200
wget -q -N https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz
tar -xf mujoco210-linux-x86_64.tar.gz -C ~/.mujoco/
cp mjkey.txt ~/.mujoco/

In [3]:
import os

# Directory holding the MuJoCo 2.1 shared libraries unpacked by the download
# cell (which installs under ~/.mujoco).
MUJOCO_BIN_DIR = os.path.expanduser("~/.mujoco/mujoco210/bin")

# Append the directory to LD_LIBRARY_PATH for processes started from this
# kernel. `.get` avoids a KeyError when the variable is not set at all, and the
# membership test keeps the value from growing each time the cell is re-run.
_ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
if MUJOCO_BIN_DIR not in _ld_library_path.split(":"):
    os.environ["LD_LIBRARY_PATH"] = (
        f"{_ld_library_path}:{MUJOCO_BIN_DIR}" if _ld_library_path else MUJOCO_BIN_DIR
    )

In [4]:
# Fetch the CleanRL sources; the next cell overwrites
# cleanrl/cleanrl/td3_continuous_action.py inside this checkout.
!git clone https://github.com/vwxyzjn/cleanrl.git

Cloning into 'cleanrl'...
remote: Enumerating objects: 9833, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 9833 (delta 22), reused 18 (delta 18), pack-reused 9793 (from 2)[K
Receiving objects: 100% (9833/9833), 137.20 MiB | 44.51 MiB/s, done.
Resolving deltas: 100% (6961/6961), done.


In [5]:
%%writefile cleanrl/cleanrl/td3_continuous_action.py

# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/td3/#td3_continuous_actionpy
import os
import random
import time
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tyro
from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter


@dataclass
class Args:
    """Command-line configuration for the TD3 training script (parsed with `tyro.cli`)."""

    exp_name: str = os.path.basename(__file__)[: -len(".py")]
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """if toggled, `torch.backends.cudnn.deterministic=False`"""
    cuda: bool = True
    """if toggled, cuda will be enabled by default"""
    track: bool = False
    """if toggled, this experiment will be tracked with Weights and Biases"""
    wandb_project_name: str = "cleanRL"
    """the wandb's project name"""
    wandb_entity: str = None
    """the entity (team) of wandb's project"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""
    save_model: bool = False
    """whether to save model into the `runs/{run_name}` folder"""
    upload_model: bool = False
    """whether to upload the saved model to huggingface"""
    hf_entity: str = ""
    """the user or org name of the model repository from the Hugging Face Hub"""

    # Algorithm specific arguments
    env_id: str = "Hopper-v4"
    """the id of the environment"""
    total_timesteps: int = 1000000
    """total timesteps of the experiments"""
    learning_rate: float = 3e-4
    """the learning rate of the optimizer"""
    num_envs: int = 1
    """the number of parallel game environments"""
    buffer_size: int = int(1e6)
    """the replay memory buffer size"""
    gamma: float = 0.99
    """the discount factor gamma"""
    tau: float = 0.005
    """target smoothing coefficient (default: 0.005)"""
    batch_size: int = 256
    """the batch size of sample from the replay memory"""
    policy_noise: float = 0.2
    """the scale of policy noise"""
    exploration_noise: float = 0.1
    """the scale of exploration noise"""
    # Cast like `buffer_size` above: the bare literal 25e3 is a float and would
    # contradict the `int` annotation of this field.
    learning_starts: int = int(25e3)
    """timestep to start learning"""
    policy_frequency: int = 2
    """the frequency of training policy (delayed)"""
    noise_clip: float = 0.5
    """noise clip parameter of the Target Policy Smoothing Regularization"""


def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: environment id handed to `gym.make`.
        seed: seed applied to the environment's action space.
        idx: position of this environment among its siblings; only index 0
            is ever video-recorded.
        capture_video: whether video recording was requested.
        run_name: run identifier; recordings are written to `videos/{run_name}`.

    Returns:
        A callable that creates the environment when invoked.
    """
    record_video = capture_video and idx == 0

    def thunk():
        if not record_video:
            env = gym.make(env_id)
        else:
            # Frames are only rendered for the environment that is being recorded.
            env = gym.wrappers.RecordVideo(
                gym.make(env_id, render_mode="rgb_array"),
                f"videos/{run_name}",
            )
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.action_space.seed(seed)
        return env

    return thunk


# ALGO LOGIC: initialize agent here:
class QNetwork(nn.Module):
    """Critic: maps an (observation, action) pair to a single value.

    The input width is the flattened observation size plus the flattened
    action size, both read from the vectorised env's `single_*_space` shapes.
    Two 256-unit hidden layers with ReLU feed a one-unit linear output.
    """

    def __init__(self, env):
        super().__init__()
        obs_dim = np.array(env.single_observation_space.shape).prod()
        act_dim = np.prod(env.single_action_space.shape)
        self.fc1 = nn.Linear(obs_dim + act_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x, a):
        """Concatenate `x` and `a` along dim 1 and return the network output."""
        hidden = F.relu(self.fc1(torch.cat([x, a], 1)))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


class Actor(nn.Module):
    """Deterministic policy: maps an observation to an action inside the action bounds.

    Two 256-unit hidden layers with ReLU feed a tanh-squashed output layer.
    The squashed output is rescaled with two registered buffers derived from
    the action space: `action_scale` (half the high-low range) and
    `action_bias` (the midpoint of high and low).
    """

    def __init__(self, env):
        super().__init__()
        action_space = env.single_action_space
        obs_dim = np.array(env.single_observation_space.shape).prod()
        self.fc1 = nn.Linear(obs_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc_mu = nn.Linear(256, np.prod(action_space.shape))
        # action rescaling
        half_range = (action_space.high - action_space.low) / 2.0
        midpoint = (action_space.high + action_space.low) / 2.0
        self.register_buffer("action_scale", torch.tensor(half_range, dtype=torch.float32))
        self.register_buffer("action_bias", torch.tensor(midpoint, dtype=torch.float32))

    def forward(self, x):
        """Return the rescaled, tanh-squashed action for observation batch `x`."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        squashed = torch.tanh(self.fc_mu(hidden))
        return squashed * self.action_scale + self.action_bias


if __name__ == "__main__":
    # Script entry point: parse `Args`, build the environments, networks and
    # replay buffer, run the TD3 interaction/training loop, then optionally
    # save, evaluate and upload the trained model.
    import stable_baselines3 as sb3

    # Compare the major version numerically: a plain string comparison is
    # lexicographic and would wrongly treat e.g. "10.0" as older than "2.0".
    if int(sb3.__version__.split(".")[0]) < 2:
        raise ValueError(
            """Ongoing migration: run the following command to install the new dependencies:
poetry run pip install "stable_baselines3==2.0.0a1"
"""
        )

    args = tyro.cli(Args)
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # env setup
    envs = gym.vector.SyncVectorEnv(
        [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
    )
    assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

    actor = Actor(envs).to(device)
    qf1 = QNetwork(envs).to(device)
    qf2 = QNetwork(envs).to(device)
    qf1_target = QNetwork(envs).to(device)
    qf2_target = QNetwork(envs).to(device)
    target_actor = Actor(envs).to(device)
    # Target networks start as exact copies of their online counterparts.
    target_actor.load_state_dict(actor.state_dict())
    qf1_target.load_state_dict(qf1.state_dict())
    qf2_target.load_state_dict(qf2.state_dict())
    q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.learning_rate)
    actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.learning_rate)

    # Per-dimension action bounds as tensors, used to clip the smoothed target
    # action. Clamping with low[0]/high[0] would apply the first dimension's
    # limits to every dimension, which is wrong for non-uniform bounds.
    action_low = torch.tensor(envs.single_action_space.low, dtype=torch.float32, device=device)
    action_high = torch.tensor(envs.single_action_space.high, dtype=torch.float32, device=device)

    envs.single_observation_space.dtype = np.float32
    rb = ReplayBuffer(
        args.buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device,
        n_envs=args.num_envs,
        handle_timeout_termination=False,
    )
    start_time = time.time()

    # TRY NOT TO MODIFY: start the game
    obs, _ = envs.reset(seed=args.seed)
    for global_step in range(args.total_timesteps):
        # ALGO LOGIC: put action logic here
        if global_step < args.learning_starts:
            # Warm-up: uniformly sampled actions until `learning_starts`.
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        else:
            with torch.no_grad():
                actions = actor(torch.Tensor(obs).to(device))
                # NOTE(review): the noise tensor has the shape of `action_scale`
                # (one action), so with num_envs > 1 the same noise vector is
                # broadcast to every environment - confirm this is intended.
                actions += torch.normal(0, actor.action_scale * args.exploration_noise)
                actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards, terminations, truncations, infos = envs.step(actions)

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        if "final_info" in infos:
            for info in infos["final_info"]:
                if info is not None:
                    print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                    writer.add_scalar("eval_episodic_return", info["episode"]["r"], global_step)
                    writer.add_scalar("eval_episodic_length", info["episode"]["l"], global_step)
                    # Only the first finished environment of this step is reported.
                    break

        # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation`
        real_next_obs = next_obs.copy()
        for idx, trunc in enumerate(truncations):
            if trunc:
                # On truncation, store the episode's final observation rather
                # than the value returned in `next_obs` for that environment.
                real_next_obs[idx] = infos["final_observation"][idx]
        rb.add(obs, real_next_obs, actions, rewards, terminations, infos)

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs

        # ALGO LOGIC: training.
        if global_step > args.learning_starts:
            data = rb.sample(args.batch_size)
            with torch.no_grad():
                # Target policy smoothing: clipped Gaussian noise scaled to the action range.
                clipped_noise = (torch.randn_like(data.actions, device=device) * args.policy_noise).clamp(
                    -args.noise_clip, args.noise_clip
                ) * target_actor.action_scale

                # Clip the noisy target action to the per-dimension action bounds.
                next_state_actions = torch.max(
                    torch.min(target_actor(data.next_observations) + clipped_noise, action_high),
                    action_low,
                )
                qf1_next_target = qf1_target(data.next_observations, next_state_actions)
                qf2_next_target = qf2_target(data.next_observations, next_state_actions)
                # Clipped double-Q: bootstrap from the smaller of the two target critics.
                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
                next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1)

            qf1_a_values = qf1(data.observations, data.actions).view(-1)
            qf2_a_values = qf2(data.observations, data.actions).view(-1)
            qf1_loss = F.mse_loss(qf1_a_values, next_q_value)
            qf2_loss = F.mse_loss(qf2_a_values, next_q_value)
            qf_loss = qf1_loss + qf2_loss

            # optimize the model
            q_optimizer.zero_grad()
            qf_loss.backward()
            q_optimizer.step()

            # Delayed policy update: the actor and all targets move only every
            # `policy_frequency` steps.
            if global_step % args.policy_frequency == 0:
                actor_loss = -qf1(data.observations, actor(data.observations)).mean()
                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                # update the target network
                for param, target_param in zip(actor.parameters(), target_actor.parameters()):
                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
                for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
                for param, target_param in zip(qf2.parameters(), qf2_target.parameters()):
                    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)

                # Logged here because `actor_loss` only exists on update steps.
                # Logging it unconditionally raised NameError on the first
                # training step whenever that step skipped the actor update
                # (e.g. the default policy_frequency=2).
                writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)

            writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
            writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step)
            writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
            writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step)
            writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step)
            sps = int(global_step / (time.time() - start_time))
            writer.add_scalar("steps_per_second", sps, global_step)
            # Printing on every gradient step floods the output; report
            # throughput on stdout every 100 steps instead.
            if global_step % 100 == 0:
                print("SPS:", sps)

    if args.save_model:
        model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
        torch.save((actor.state_dict(), qf1.state_dict(), qf2.state_dict()), model_path)
        print(f"model saved to {model_path}")
        from cleanrl_utils.evals.td3_eval import evaluate

        episodic_returns = evaluate(
            model_path,
            make_env,
            args.env_id,
            eval_episodes=10,
            run_name=f"{run_name}-eval",
            Model=(Actor, QNetwork),
            device=device,
            exploration_noise=args.exploration_noise,
        )
        for idx, episodic_return in enumerate(episodic_returns):
            writer.add_scalar("eval/episodic_return", episodic_return, idx)

        if args.upload_model:
            from cleanrl_utils.huggingface import push_to_hub

            repo_name = f"{args.env_id}-{args.exp_name}-seed{args.seed}"
            repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
            push_to_hub(
                args,
                episodic_returns,
                repo_id,
                "TD3",
                f"runs/{run_name}",
                f"videos/{run_name}-eval",
            )

    envs.close()
    writer.close()

Overwriting cleanrl/cleanrl/td3_continuous_action.py


In [6]:
# Poetry is used by the later cells to install CleanRL's locked dependencies
# and to launch the training script (`poetry install`, `poetry run`).
!pip -q install poetry

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.8/277.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.3/332.3 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m453.8/453.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.3/53.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
# Install CleanRL's base dependencies from its lock file into a
# Poetry-managed virtualenv.
!cd cleanrl && poetry install

Creating virtualenv [36mcleanrl-TaQOIog5-py3.10[39m in /root/.cache/pypoetry/virtualenvs
[34mInstalling dependencies from lock file[39m

[39;1mPackage operations[39;22m: [34m81[39m installs, [34m0[39m updates, [34m0[39m removals

  [34;1m-[39;22m [39mInstalling [39m[36mnumpy[39m[39m ([39m[39;1m1.24.4[39;22m[39m)[39m: [34mPending...[39m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mnumpy[39m[39m ([39m[39;1m1.24.4[39;22m[39m)[39m: [34mDownloading...[39m [39;1m0%[39;22m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mnumpy[39m[39m ([39m[39;1m1.24.4[39;22m[39m)[39m: [34mDownloading...[39m [39;1m60%[39;22m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mnumpy[39m[39m ([39m[39;1m1.24.4[39;22m[39m)[39m: [34mDownloading...[39m [39;1m100%[39;22m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mnumpy[39m[39m ([39m[39;1m1.24.4[39;22m[39m)[39m: [34mInstalling...[39m
[1A[0J  [32;1m-[39;22m [39mIn

In [8]:
# System packages for the MuJoCo setup.
# NOTE(review): libosmesa6-dev and patchelf are presumably build prerequisites
# of the legacy mujoco-py bindings - confirm they are still required for the
# -v4 environments trained below.
!sudo apt-get -qq -y install wget unzip software-properties-common \
    libosmesa6-dev patchelf

debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 8.)
debconf: falling back to frontend: Readline
Selecting previously unselected package libpciaccess-dev:amd64.
(Reading database ... 127400 files and directories currently installed.)
Preparing to unpack .../0-libpciaccess-dev_0.16-3_amd64.deb ...
Unpacking libpciaccess-dev:amd64 (0.16-3) ...
Selecting previously unselected package libdrm-dev:amd64.
Preparing to unpack .../1-libdrm-dev_2.4.113-2~ubuntu0.22.04.1_amd64.deb ...
Unpacking libdrm-dev:amd64 (2.4.113-2~ubuntu0.22.04.1) ...
Selecting previously unselected package libglx-dev:amd64.
Preparing to unpack .../2-libglx-dev_1.4.0-1_amd64.deb ...
Unpacking libglx-dev:amd64 (1.4.0-1) ...
Selecting previously unselected package libgl-dev:amd64.
Preparing to unpack .../3-libgl-dev_1.4.0-1_amd64.deb ...
Unpacking lib

In [9]:
# Install the pinned MuJoCo requirement set with python3.10's own pip.
# NOTE(review): the training cell runs through `poetry run`, i.e. inside the
# Poetry virtualenv - confirm this interpreter-level install is still needed.
!cd cleanrl && python3.10 -m pip install -r requirements/requirements-mujoco.txt

Ignoring importlib-metadata: markers 'python_version >= "3.8" and python_version < "3.10"' don't match your environment
Ignoring zipp: markers 'python_version >= "3.8" and python_version < "3.10"' don't match your environment
Collecting appdirs==1.4.4 (from -r requirements/requirements-mujoco.txt (line 2))
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting cachetools==5.3.0 (from -r requirements/requirements-mujoco.txt (line 3))
  Downloading cachetools-5.3.0-py3-none-any.whl.metadata (5.2 kB)
Collecting certifi==2023.5.7 (from -r requirements/requirements-mujoco.txt (line 4))
  Downloading certifi-2023.5.7-py3-none-any.whl.metadata (2.2 kB)
Collecting charset-normalizer==3.1.0 (from -r requirements/requirements-mujoco.txt (line 5))
  Downloading charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (30 kB)
Collecting click==8.1.3 (from -r requirements/requirements-mujoco.txt (line 6))
  Downloading click-8.1.

In [10]:
# Add the optional `mujoco` extra to the Poetry environment created earlier.
!cd cleanrl && poetry install -E mujoco

[34mInstalling dependencies from lock file[39m

[39;1mPackage operations[39;22m: [34m3[39m installs, [34m0[39m updates, [34m0[39m removals

  [34;1m-[39;22m [39mInstalling [39m[36mglfw[39m[39m ([39m[39;1m1.12.0[39;22m[39m)[39m: [34mPending...[39m
  [34;1m-[39;22m [39mInstalling [39m[36mpyopengl[39m[39m ([39m[39;1m3.1.6[39;22m[39m)[39m: [34mPending...[39m
[2A[0J  [34;1m-[39;22m [39mInstalling [39m[36mpyopengl[39m[39m ([39m[39;1m3.1.6[39;22m[39m)[39m: [34mPending...[39m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mglfw[39m[39m ([39m[39;1m1.12.0[39;22m[39m)[39m: [34mDownloading...[39m [39;1m0%[39;22m
  [34;1m-[39;22m [39mInstalling [39m[36mpyopengl[39m[39m ([39m[39;1m3.1.6[39;22m[39m)[39m: [34mPending...[39m
[1A[0J  [34;1m-[39;22m [39mInstalling [39m[36mpyopengl[39m[39m ([39m[39;1m3.1.6[39;22m[39m)[39m: [34mDownloading...[39m [39;1m0%[39;22m
[2A[0J  [34;1m-[39;22m [39mIn

In [None]:
# Authenticate with Weights & Biases; needed because the training cell below
# passes `--track`, which makes the script call `wandb.init`.
!wandb login

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
# Train TD3 on HalfCheetah-v4 for 300k steps inside the Poetry environment,
# using the script written by the %%writefile cell above.
#   --policy-frequency 1 : update the actor on every training step
#                          (the script's default is 2)
#   --track              : log the run to Weights & Biases under the project
#                          given by --wandb-project-name
!cd cleanrl && poetry run python3.10 cleanrl/td3_continuous_action.py \
    --seed 1 \
    --policy-frequency 1 \
    --env-id HalfCheetah-v4 \
    --total-timesteps 300000 \
    --track \
    --wandb-project-name alm_HalfCheetah-v2

Error in sitecustomize; set PYTHONVERBOSE for traceback:
ModuleNotFoundError: No module named 'log'
  """Reads a number token from the source file, either a float
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Error in sitecustomize; set PYTHONVERBOSE for traceback:
ModuleNotFoundError: No module named 'log'
[34m[1mwandb[0m: Currently logged in as: [33mpolinasha960[0m ([33mpolinasha-team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.19.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip i