In [None]:
!pip install gymnasium



In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class RiverSwimEnv(gym.Env):
    """Tabular RiverSwim chain environment with ``nS`` states in a row.

    Every episode starts in state 0. Two actions are available:

    * ``0`` (LEFT): deterministically moves one state to the left (state 0
      stays in place). Taking LEFT in state 0 pays a reward of 0.005.
    * ``1`` (RIGHT): stochastic.
        - From state 0: stay with probability 0.3, advance with 0.7.
        - From an interior state: fall back with probability 0.1, stay with
          0.6, advance with 0.3.
        - From the last state: stay with probability 0.7 and collect a reward
          of 1, or fall back one state with probability 0.3.

    The MDP has no terminal state. An episode is cut off after ``max_steps``
    steps, which is reported through the ``truncated`` flag of ``step``.

    Args:
        nS: Number of states in the chain. Must be at least 2.

    Raises:
        ValueError: If ``nS`` is smaller than 2.
    """

    def __init__(self, nS=6):
        super().__init__()
        # With fewer than two states the RIGHT transitions would reference
        # states that do not exist (state 1 or state -1).
        if nS < 2:
            raise ValueError(f"nS must be at least 2, got {nS}")
        self.nS = nS
        self.nA = 2  # LEFT = 0, RIGHT = 1
        self.state = 0
        self.steps_taken = 0
        self.max_steps = 20

        self.action_space = spaces.Discrete(self.nA)
        self.observation_space = spaces.Discrete(self.nS)

        # Define transition probabilities and rewards
        self.P = self._init_dynamics()

    def _init_dynamics(self):
        """Build the transition table.

        Returns:
            A nested dict ``P[state][action]`` mapping to a list of
            ``(probability, next_state, reward, done)`` tuples. ``done`` is
            always ``False`` because the chain has no terminal state.
        """
        last = self.nS - 1
        P = {s: {a: [] for a in range(self.nA)} for s in range(self.nS)}

        # LEFT transitions: deterministic; only state 0 pays the small reward.
        for s in range(self.nS):
            P[s][0] = [(1.0, max(0, s - 1), 5 / 1000 if s == 0 else 0.0, False)]

        # RIGHT transitions
        P[0][1] = [(0.3, 0, 0.0, False), (0.7, 1, 0.0, False)]
        for s in range(1, last):
            P[s][1] = [
                (0.1, s - 1, 0.0, False),
                (0.6, s, 0.0, False),
                (0.3, s + 1, 0.0, False),
            ]
        P[last][1] = [(0.7, last, 1.0, False), (0.3, last - 1, 0.0, False)]

        return P

    def reset(self, seed=None, options=None):
        """Reset to state 0 and clear the step counter.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
                ``self.np_random``.
            options: Unused; accepted for API compatibility.

        Returns:
            ``(observation, info)`` where the observation is the start state 0
            and ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.state = 0
        self.steps_taken = 0
        return self.state, {}

    def step(self, action):
        """Sample one transition for ``action`` from the current state.

        Args:
            action: 0 (LEFT) or 1 (RIGHT).

        Returns:
            ``(next_state, reward, terminated, truncated, info)``.
            ``terminated`` comes from the transition table (always ``False``),
            ``truncated`` becomes ``True`` once ``max_steps`` steps have been
            taken since the last reset, and ``info`` is an empty dict.

        Raises:
            KeyError: If ``action`` is not a valid action index.
        """
        # Policies often hand over NumPy integer types; normalise so the dict
        # lookup always sees a plain int.
        action = int(action)
        outcomes = self.P[self.state][action]
        idx = self.np_random.choice(len(outcomes), p=[o[0] for o in outcomes])
        _, next_state, reward, terminated = outcomes[idx]
        self.state = next_state
        self.steps_taken += 1

        # Running out of steps is a time limit, not an MDP terminal state, so
        # it is signalled as truncation rather than termination.
        truncated = self.steps_taken >= self.max_steps

        return next_state, float(reward), terminated, truncated, {}

    def render(self):
        """Print the current state to stdout."""
        print(f"Current state: {self.state}")

# Register the environment.
# The guard keeps this cell idempotent: re-running it would otherwise register
# the same id again and trigger Gymnasium's "Overriding environment ... already
# in registry" warning. The entry point is a string, so a redefined
# RiverSwimEnv class is still picked up by gym.make below.
# NOTE: if the registration arguments themselves are edited, restart the kernel
# (or remove the id from gym.registry) so the new spec takes effect.
if 'RiverSwim-v0' not in gym.registry:
    gym.register(
        id='RiverSwim-v0',
        entry_point='__main__:RiverSwimEnv',
        max_episode_steps=20,
    )

# Create the environment
env = gym.make('RiverSwim-v0')

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
!pip install optuna plotly kaleido



In [None]:
!pip install stable_baselines3



In [None]:
import optuna
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

def optimize_a2c(trial):
    """Optuna objective: train A2C on RiverSwim-v0 and score the result.

    Args:
        trial: Optuna trial used to sample the learning rate.

    Returns:
        Mean episode reward over 10 evaluation episodes; the study maximizes
        this value.
    """
    # n_steps is held fixed for now; swap in the suggest_int line to tune it.
    #n_steps = trial.suggest_int("n_steps", 32, 2048)
    n_steps = 1024
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)

    # Create the vectorized environment
    env = make_vec_env('RiverSwim-v0', n_envs=1)
    try:
        # Create and train the model
        model = A2C(
            "MlpPolicy",
            env,
            n_steps=n_steps,
            learning_rate=learning_rate,
            verbose=0
        )
        model.learn(total_timesteps=200000)

        # Evaluate the model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
    finally:
        # Release the environment even if training or evaluation fails, so
        # environments do not pile up over the many trials of a study.
        env.close()

    return mean_reward

def main(n_trials=100):
    """Run the A2C learning-rate search, then retrain, save and evaluate.

    Steps: run an Optuna study with ``optimize_a2c`` as objective, print the
    best trial, train a fresh A2C model on RiverSwim-v0 with the best
    hyperparameters, save it, and report its mean reward over 100 evaluation
    episodes.

    Args:
        n_trials: Number of Optuna trials to run. Each trial trains a model
            for 200000 timesteps, so lower this for a quick smoke run.
    """
    study = optuna.create_study(direction="maximize")
    study.optimize(optimize_a2c, n_trials=n_trials)

    print("Best trial:")
    trial = study.best_trial

    print("Value: ", trial.value)
    print("Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # optimize_a2c trains with a fixed n_steps=1024 that is not a sampled
    # parameter, so it is absent from trial.params. Supply it here so the final
    # model matches the configuration that was actually scored; a sampled
    # n_steps (if tuning is re-enabled) takes precedence.
    model_kwargs = {"n_steps": 1024, **trial.params}

    # Train a final model with the best hyperparameters
    env = make_vec_env('RiverSwim-v0', n_envs=1)
    # Evaluate on the task the model was trained on. make_vec_env also wraps
    # the env in a Monitor, which evaluate_policy expects for episode stats.
    final_env = make_vec_env('RiverSwim-v0', n_envs=1)
    try:
        best_model = A2C("MlpPolicy", env, verbose=1, **model_kwargs)
        best_model.learn(total_timesteps=200000)

        # Save the best model
        # NOTE(review): the file name says "deepsea" although the model is
        # trained on RiverSwim; kept unchanged in case something loads it by
        # this name.
        best_model.save("a2c_deepsea_best")

        # Final evaluation
        mean_reward, std_reward = evaluate_policy(best_model, final_env, n_eval_episodes=100)
        print(f"Final mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
    finally:
        env.close()
        final_env.close()

# Start the hyperparameter search only when this code is executed directly
# (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

[I 2024-10-05 22:09:31,403] A new study created in memory with name: no-name-e7dbfd18-fab5-4e69-a937-2aaa833c51a0
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  logger.warn(
[I 2024-10-05 22:13:07,078] Trial 0 finished with value: 2.4 and parameters: {'learning_rate': 0.0032172072865183283}. Best is trial 0 with value: 2.4.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  logger.warn(
[I 2024-10-05 22:16:37,550] Trial 1 finished with value: 0.1 and parameters: {'learning_rate': 1.3614164615577724e-05}. Best is trial 0 with value: 2.4.
[I 2024-10-05 22:20:06,691] Trial 2 finished with value: 0.1 and parameters: {'learning_rate': 9.856430524779494e-05}. Best is trial 0 with value: 2.4.
[I 2024-10-05 22:23:37,244] Trial 3 finished with value: 0.1 and parameters: {'learning_rate': 1.0939795909201206e-05}. Best is trial 0 with value: 2.4.
[I 2024-10-05 22:27:08,761] Trial 4 finished with value: 0.1 and parameters: {'learning_rate': 4.25030