In [None]:
# Pin setuptools before any other package is installed.
# NOTE(review): presumably required for the old gym release installed below to build cleanly — confirm.
!pip install setuptools==65.5.0

In [None]:
%%capture
# The output of this cell is captured (silenced) by the %%capture magic above.
# System packages: OpenGL bindings, ffmpeg, a virtual framebuffer (xvfb) and build tools (swig, cmake).
# NOTE(review): swig/cmake are presumably needed to build the Box2D bindings — confirm.
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!apt install swig cmake
# Python-side helpers, with pyglet pinned to 1.5.
!pip install pyglet==1.5
!pip3 install pyvirtualdisplay

In [None]:
# gym is pinned to 0.22, both as the plain package and with the box2d extra
# (BipedalWalker-v3, used as ENV_ID in this notebook, is the environment being tuned).
!pip install gym==0.22
!pip install imageio-ffmpeg
!pip install huggingface_hub
!pip install gym[box2d]==0.22
!pip install wandb
# optuna drives the hyperparameter search in this notebook.
!pip install optuna

In [None]:
# Stable-Baselines3 (callbacks, env utilities) and sb3-contrib (provides ARS).
# NOTE(review): both are unpinned while gym is pinned to 0.22 — consider pinning
# versions known to work with that gym release so the notebook stays reproducible.
!pip install stable-baselines3[extra]
!pip install sb3-contrib

In [None]:
import gym
import numpy as np
import optuna
import torch
from sb3_contrib import ARS
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances




In [None]:
# --- Study-level settings (forwarded to the sampler/pruner and study.optimize) ---
N_TRIALS = 100  # Maximum number of trials
N_JOBS = -1  # Number of jobs to run in parallel
N_STARTUP_TRIALS = 5  # Stop random sampling after N_STARTUP_TRIALS
TIMEOUT = None  # Passed as `timeout` to study.optimize

# --- Per-trial training / evaluation budget ---
N_TIMESTEPS = 200_000  # Training budget
N_EVALUATIONS = 2  # Number of evaluations during the training
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # Callback calls between two evaluations
N_EVAL_ENVS = 5  # Number of environments in the evaluation vec env
N_EVAL_EPISODES = 10  # Episodes per evaluation

ENV_ID = "BipedalWalker-v3"

# Arguments shared by every trial; sampled hyperparameters are merged on top.
DEFAULT_HYPERPARAMS = dict(policy="LinearPolicy", env=ENV_ID)

# Uncomment for verbose Optuna logging:
# optuna.logging.set_verbosity(optuna.logging.DEBUG)

In [None]:
def sample_ars_params(trial):
    """
    Sample one set of ARS hyperparameters from an Optuna trial.

    The upper bound of ``n_top`` is the ``n_delta`` value sampled just before,
    so ``n_top`` never exceeds ``n_delta``.

    :param trial: Optuna trial object used to draw the values
    :return: Dict with keys ``learning_rate``, ``n_delta``, ``n_top``
        and ``delta_std``
    """
    sampled = {}
    # Log-uniform over [1e-5, 1]
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    sampled["n_delta"] = trial.suggest_int("n_delta", 4, 64)
    # Range depends on the n_delta drawn above
    sampled["n_top"] = trial.suggest_int("n_top", 2, sampled["n_delta"])
    # Log-uniform over [1e-6, 1]
    sampled["delta_std"] = trial.suggest_float("delta_std", 1e-6, 1, log=True)
    return sampled

In [None]:
class TrialEvalCallback(EvalCallback):
    """
    Callback used for evaluating and reporting a trial.

    On every ``eval_freq``-th call the parent ``EvalCallback`` evaluation is
    run, ``last_mean_reward`` is reported to the Optuna trial, and training is
    stopped if Optuna asks for the trial to be pruned.

    :param eval_env: Evaluation environment
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq: Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose: Verbosity level forwarded to ``EvalCallback``
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int,
        eval_freq: int,
        deterministic: bool,
        verbose: int,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        # Number of evaluations reported so far; used as the Optuna step index
        self.eval_idx = 0
        # Set to True once Optuna requested pruning, so the caller can tell
        # a pruned run apart from one that ended normally
        self.is_pruned = False

    def _on_step(self) -> bool:
        """
        Run an evaluation when due and report it to Optuna.

        :return: ``False`` to stop training (trial pruned, or the parent
            callback requested a stop), ``True`` otherwise
        """
        continue_training = True
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Evaluate policy (done in the parent class) and keep its verdict:
            # previously the return value was dropped, so a stop request
            # coming from the parent callback was silently ignored.
            continue_training = super()._on_step()
            self.eval_idx += 1
            # Send report to Optuna
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return continue_training

In [None]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function used by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it samples hyperparameters, trains an ARS model
    with them and returns the mean episodic reward of the last evaluation.

    :param trial: Optuna trial object
    :return: Mean episodic reward after training, or NaN when training
        hit an ``AssertionError``
    :raises optuna.exceptions.TrialPruned: if the evaluation callback
        stopped training because Optuna pruned the trial
    """
    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(sample_ars_params(trial))

    model = ARS(verbose=0, **kwargs)

    eval_env = None
    nan_encountered = False

    # Everything after the model exists runs under try/finally so that the
    # training env is closed even if building the eval env or the callback
    # fails (previously that path leaked model.env).
    try:
        # Evaluation env is seeded with the trial number; observations are
        # left unnormalized (no VecNormalize wrapper).
        eval_env = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS, seed=trial.number)

        eval_callback = TrialEvalCallback(
            eval_env,
            trial=trial,
            n_eval_episodes=N_EVAL_EPISODES,
            eval_freq=EVAL_FREQ,
            deterministic=True,
            verbose=0,
        )

        try:
            model.learn(N_TIMESTEPS, callback=eval_callback)
        except AssertionError as e:
            # Sometimes, random hyperparams can generate NaN
            print(e)
            nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        if eval_env is not None:
            eval_env.close()

    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [None]:
# Left disabled: torch.set_num_threads(1)

# The sampler and the pruner share the same number of startup trials.
startup_kwargs = {"n_startup_trials": N_STARTUP_TRIALS}

sampler = TPESampler(**startup_kwargs)
# The warm-up is a third of the evaluations per trial, rounded down.
pruner = MedianPruner(n_warmup_steps=N_EVALUATIONS // 3, **startup_kwargs)

In [None]:
# Run the hyperparameter search, then report and plot the outcome.
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    # A manual interrupt ends the search early; report on the trials gathered so far.
    pass

print("Number of finished trials: ", len(study.trials))

# Write report first, so the raw results are on disk even if the summary
# or the plots below cannot be produced.
study.trials_dataframe().to_csv("study_results_ars_bipedalwalker.csv")

# study.best_trial and the plots need completed trials; after an early
# interrupt (or if every trial was pruned or failed) there may be none.
completed_trials = [
    t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
]

if completed_trials:
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print(f"    {key}: {value}")

    fig1 = plot_optimization_history(study)
    fig1.show()
else:
    print("No completed trial: nothing to report or plot.")

# Parameter importances cannot be evaluated from a single completed trial.
if len(completed_trials) >= 2:
    fig2 = plot_param_importances(study)
    fig2.show()

[I 2023-12-31 16:32:18,736] A new study created in memory with name: no-name-52ae5aa8-0a0c-4344-89d6-3faf69e0b30e
[I 2023-12-31 16:39:24,299] Trial 1 finished with value: -91.95766819999999 and parameters: {'learning_rate': 1.5193652817249861e-05, 'n_delta': 50, 'n_top': 23, 'delta_std': 0.0004113221908716642}. Best is trial 1 with value: -91.95766819999999.
[I 2023-12-31 16:44:36,193] Trial 0 finished with value: -8.9572568 and parameters: {'learning_rate': 0.04083223884727603, 'n_delta': 61, 'n_top': 38, 'delta_std': 4.693492323427873e-06}. Best is trial 0 with value: -8.9572568.
[I 2023-12-31 16:46:19,702] Trial 2 finished with value: -16.0622056 and parameters: {'learning_rate': 0.018637233948271076, 'n_delta': 15, 'n_top': 11, 'delta_std': 0.08644413556949784}. Best is trial 0 with value: -8.9572568.
[I 2023-12-31 16:51:28,336] Trial 3 finished with value: -91.89280769999999 and parameters: {'learning_rate': 0.0007858935447865484, 'n_delta': 58, 'n_top': 45, 'delta_std': 4.1478597

Number of finished trials:  100
Best trial:
  Value: 3.6002798999999994
  Params: 
    learning_rate: 0.004770748837092764
    n_delta: 19
    n_top: 3
    delta_std: 0.006952138014079718
  User attrs:


___

___