### Dependencies 🔽

In [None]:
# Install the swig and cmake system packages via apt (shell escape from the notebook),
# ahead of the Python packages installed in the following cells.
!apt install swig cmake

In [None]:
# Install the Python requirements file hosted in the huggingface/deep-rl-class
# repository (notebooks/unit1/requirements-unit1.txt).
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit1/requirements-unit1.txt

In [None]:
# Refresh the apt package index, then install the system packages for OpenGL,
# video encoding (ffmpeg) and a virtual X framebuffer (xvfb).
!sudo apt-get update
!sudo apt-get install -y python3-opengl
!apt install ffmpeg
!apt install xvfb
# Python package that provides the `Display` class used in the "Virtual display" cell.
!pip3 install pyvirtualdisplay

In [None]:
# Python packages imported by the "Imports" cell further down:
# gymnasium (with the box2d extra), wandb, huggingface_hub,
# stable-baselines3 (with extras), sb3-contrib and huggingface_sb3.
!pip install gymnasium[box2d]
!pip install wandb
# NOTE(review): imageio-ffmpeg is not imported directly in this notebook;
# presumably needed for video export when packaging the model — confirm.
!pip install imageio-ffmpeg
!pip install huggingface_hub
!pip install stable-baselines3[extra]
!pip install sb3-contrib
!pip install huggingface_sb3

### Virtual display 🔽

In [None]:
import os
# Send signal number 9 to this very process, i.e. kill the running notebook kernel.
# NOTE(review): presumably done so the runtime restarts and picks up the packages
# installed in the cells above — confirm. Every cell after this one must be
# re-run in the fresh kernel.
os.kill(os.getpid(), 9)

In [None]:
from pyvirtualdisplay import Display

# Create a non-visible (visible=0) virtual display of 1400x900 and start it.
# NOTE(review): presumably required so environments can render frames on a
# machine without a physical screen — confirm.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

### Huggingface login 🔽

In [None]:
from huggingface_hub import notebook_login
# Authenticate this notebook session against the Hugging Face Hub; the training
# cell later uploads the trained model to the Hub.
notebook_login()
# Configure git globally to use the "store" credential helper.
!git config --global credential.helper store

### Imports 🔽

In [None]:
import argparse
import os
import random
import time
from distutils.util import strtobool

import wandb
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from sb3_contrib import ARS, TQC
from stable_baselines3 import PPO, TD3, SAC
from huggingface_sb3 import package_to_hub, load_from_hub
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common.monitor import Monitor

### Arguments 🔽

In [None]:
def parse_args(algorithm: str, environment: str) -> argparse.Namespace:
    """Build the experiment configuration from the command line.

    The `algorithm` and `environment` names are only used to derive defaults
    (experiment name, wandb project name, environment id and Hub repo id).
    Unknown command-line arguments are ignored (`parse_known_args`), so the
    function also works when extra arguments are present in `sys.argv`.

    Args:
        algorithm: short algorithm name, e.g. ``"sac"``.
        environment: environment id, e.g. ``"BipedalWalker-v3"``.

    Returns:
        The parsed `argparse.Namespace`.
    """

    def _str_to_bool(value: str) -> bool:
        # Local replacement for `distutils.util.strtobool` (distutils is
        # deprecated by PEP 632 and removed from Python 3.12). Accepts the same
        # spellings and raises ValueError otherwise, which argparse reports
        # as a regular usage error.
        normalized = value.lower()
        if normalized in ("y", "yes", "t", "true", "on", "1"):
            return True
        if normalized in ("n", "no", "f", "false", "off", "0"):
            return False
        raise ValueError(f"invalid truth value {value!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument("--exp-name", type=str, default=f"{algorithm}-{environment}",
        help="the name of this experiment")
    parser.add_argument("--seed", type=int, default=1,
        help="seed of the experiment")
    # Boolean flags: a bare `--flag` means True (const), `--flag false` disables it.
    parser.add_argument("--torch-deterministic", type=_str_to_bool, default=True, nargs="?", const=True,
        help="value assigned to `torch.backends.cudnn.deterministic` (pass `false` to disable)")
    parser.add_argument("--cuda", type=_str_to_bool, default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--track", type=_str_to_bool, default=True, nargs="?", const=True,
        help="if toggled, this experiment will be tracked with Weights and Biases")
    parser.add_argument("--wandb-project-name", type=str, default=f"{algorithm}_{environment}",
        help="the wandb's project name")
    parser.add_argument("--wandb-entity", type=str, default=None,
        help="the entity (team) of wandb's project")
    parser.add_argument("--env-id", type=str, default=environment,
        help="the id of the environment")
    parser.add_argument("--repo-id", type=str, default=f"MadFritz/{algorithm}-{environment}",
        help="id of the model repository from the Hugging Face Hub {username/repo_name}")
    parser.add_argument("--total-timesteps", type=int, default=5000 * 1000,
        help="total timesteps of the experiments (== steps_per_iteration * no_iterations)")

    # Arguments this parser does not know about are deliberately discarded.
    args, _ = parser.parse_known_args()
    return args

### Training 🔽

-   `algorithm`: one of `ars`, `ppo`, `sac`, `tqc`, `td3`
-   `load_model`: whether to resume training an already trained agent (`True`) or to train from scratch (`False`)
-   `environment`: `BipedalWalker-v3` or `BipedalWalkerHardcore-v3`
-   `hardcore`: whether the hardcore variant (`BipedalWalkerHardcore-v3`) has been chosen
-   do not forget about normalization: wrap the environment in `VecNormalize` when building it, and save the normalizer statistics together with the model

In [None]:
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Experiment selection
    # ------------------------------------------------------------------
    algorithm = 'sac'                 # one of: ars, ppo, sac, tqc, td3
    load_model = False                # True: resume training from the checkpoint stored on the Hub
    environment = 'BipedalWalker-v3'  # or 'BipedalWalkerHardcore-v3'
    # Derived from `environment` so the flag can never disagree with the chosen id.
    hardcore = environment == 'BipedalWalkerHardcore-v3'

    args = parse_args(algorithm, environment)
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"

    # seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Every environment (training and evaluation) is built from the base id plus
    # the `hardcore` keyword, so all of them describe the same task.
    args.env_id = 'BipedalWalker-v3'
    env_kwargs = dict(hardcore=hardcore)
    vec_normalize_path = "vec_normalize.pkl"
    tensorboard_dir = f'runs/{run_name}'

    # NOTE: each branch sets its own training budget, overriding --total-timesteps.
    # NOTE(review): when resuming (load_model=True) with a VecNormalize-wrapped
    # environment, the saved normalizer statistics are not restored here.
    if algorithm == 'ars':
        envs = make_vec_env(args.env_id, n_envs=1, env_kwargs=env_kwargs)
        envs = VecNormalize(envs, norm_obs=True, norm_reward=False, clip_obs=np.inf, clip_reward=np.inf)

        if load_model:
            agent = ARS.load(load_from_hub(args.repo_id, f"{args.exp_name}.zip"),
                             env=envs,
                             print_system_info=True,
                             device=device)
        else:
            agent = ARS(policy="LinearPolicy",
                        env=envs,
                        learning_rate=0.02,
                        delta_std=0.0075,
                        n_delta=64,
                        n_top=32,
                        alive_bonus_offset=-0.1,
                        tensorboard_log=tensorboard_dir,
                        verbose=5,
                        device=device)

        args.total_timesteps = 5_000_000
    elif algorithm == 'ppo':
        envs = make_vec_env(args.env_id, n_envs=16, env_kwargs=env_kwargs)
        envs = VecNormalize(envs, norm_obs=True, norm_reward=False, clip_obs=np.inf, clip_reward=np.inf)

        if load_model:
            agent = PPO.load(load_from_hub(args.repo_id, f"{args.exp_name}.zip"),
                             env=envs,
                             print_system_info=True,
                             device=device)
        else:
            agent = PPO(policy='MlpPolicy',
                        env=envs,
                        n_steps=2048,
                        batch_size=64,
                        gae_lambda=0.95,
                        gamma=0.999,
                        n_epochs=10,
                        ent_coef=0.0,
                        learning_rate=3e-4,
                        clip_range=0.18,
                        tensorboard_log=tensorboard_dir,
                        verbose=5,
                        device=device)

        args.total_timesteps = 5_000_000
    elif algorithm == 'sac':
        envs = make_vec_env(args.env_id, n_envs=16, env_kwargs=env_kwargs)

        if load_model:
            agent = SAC.load(load_from_hub(args.repo_id, f"{args.exp_name}.zip"),
                             env=envs,
                             print_system_info=True,
                             device=device)
        else:
            agent = SAC(policy='MlpPolicy',
                        env=envs,
                        buffer_size=300_000,      # must be an int: it sizes the replay-buffer arrays
                        batch_size=256,
                        tau=0.02,
                        gamma=0.98,
                        train_freq=64,
                        ent_coef='auto',
                        learning_rate=3e-4,
                        gradient_steps=64,
                        learning_starts=10_000,
                        use_sde=True,
                        tensorboard_log=tensorboard_dir,
                        verbose=5,
                        device=device)

        args.total_timesteps = 500_000
    elif algorithm == 'tqc':
        # The environment has to exist before it is handed to the agent.
        envs = make_vec_env(args.env_id, n_envs=16, env_kwargs=env_kwargs)

        if load_model:
            agent = TQC.load(load_from_hub(args.repo_id, f"{args.exp_name}.zip"),
                             env=envs,
                             print_system_info=True,
                             device=device)
        else:
            agent = TQC(policy='MlpPolicy',
                        env=envs,
                        learning_rate=7.3e-4,
                        buffer_size=300000,
                        batch_size=256,
                        ent_coef='auto',
                        gamma=0.98,
                        tau=0.02,
                        train_freq=64,
                        gradient_steps=64,
                        learning_starts=10000,
                        use_sde=True,
                        policy_kwargs=dict(log_std_init=-3, net_arch=[400, 300]),
                        tensorboard_log=tensorboard_dir,
                        verbose=5,
                        device=device)

        args.total_timesteps = 500_000
    elif algorithm == 'td3':
        # Local import: only this branch needs the action-noise helper.
        from stable_baselines3.common.noise import NormalActionNoise

        envs = make_vec_env(args.env_id, n_envs=1, env_kwargs=env_kwargs)

        if load_model:
            agent = TD3.load(load_from_hub(args.repo_id, f"{args.exp_name}.zip"),
                             env=envs,
                             print_system_info=True,
                             device=device)
        else:
            # Gaussian exploration noise with std 0.1 on every action dimension.
            n_actions = envs.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            agent = TD3(policy='MlpPolicy',
                        env=envs,
                        gamma=0.98,
                        buffer_size=200_000,      # must be an int: it sizes the replay-buffer arrays
                        learning_starts=10000,
                        action_noise=action_noise,
                        gradient_steps=-1,        # as many gradient steps as environment steps collected
                        train_freq=(1, 'episode'),
                        learning_rate=1e-3,
                        verbose=5,
                        tensorboard_log=tensorboard_dir,
                        device=device)

        args.total_timesteps = 1_000_000
    else:
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected one of: ars, ppo, sac, tqc, td3")

    ##############################################################
    print("Experiment args: ")
    print('\n'.join(f'{k}={v}' for k, v in vars(args).items()))

    if args.track:
        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )
    ##############################################################

    agent.learn(total_timesteps=args.total_timesteps, progress_bar=True)
    agent.save(args.exp_name)

    # Persist the observation-normalization statistics next to the model;
    # without them a policy trained on normalized observations cannot be evaluated.
    uses_normalization = isinstance(envs, VecNormalize)
    if uses_normalization:
        envs.save(vec_normalize_path)

    envs.close()
    writer.close()
    if args.track:
        wandb.finish()

    # Create the evaluation environment: same task as training, with rendering
    # requested at construction time so frames can be produced for the video.
    eval_env = make_vec_env(args.env_id, n_envs=1,
                            env_kwargs=dict(env_kwargs, render_mode='rgb_array'))
    if uses_normalization:
        eval_env = VecNormalize.load(vec_normalize_path, eval_env)
        # Freeze the running statistics and report raw rewards during evaluation.
        eval_env.training = False
        eval_env.norm_reward = False

    mean_reward, std_reward = evaluate_policy(agent, eval_env, n_eval_episodes=10, deterministic=True)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

    # Publish under the user-facing environment name chosen at the top.
    args.env_id = environment

    package_to_hub(model=agent,
                   model_name=args.exp_name,
                   model_architecture=algorithm,
                   env_id=args.env_id,
                   eval_env=eval_env,
                   repo_id=args.repo_id,
                   commit_message=f"Upload {algorithm} {args.env_id} trained agent",
                   logs=f"runs/{run_name}"
                   )

    eval_env.close()