# Installation

In [1]:
!pip install gymnasium



In [2]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cublas_cu12-12.4.5.8-py

# DQN

In [3]:
import gymnasium as gym
from stable_baselines3 import DQN

In [4]:
# Create the CartPole-v1 environment, requesting the 'rgb_array' render mode.
env = gym.make('CartPole-v1', render_mode='rgb_array')

In [5]:
# DQN API reference: https://stable-baselines3.readthedocs.io/en/master/modules/dqn.html
# NOTE(review): with train_freq=(5, 'episode') and gradient_steps left at its
# default, the training log reports only 2 updates after 20 episodes. If more
# learning per rollout is wanted, consider gradient_steps=-1 -- confirm intent.
model = DQN(policy='MlpPolicy',
            env=env,
            learning_rate=0.001,
            buffer_size=1000,               # replay buffer capacity, in transitions
            batch_size=16,
            train_freq=(5, 'episode'),      # run a training phase every 5 episodes
            target_update_interval=10,      # target-network sync period, in env steps
            exploration_initial_eps=0.9,    # epsilon starts at 0.9 ...
            exploration_final_eps=0.1,      # ... is annealed down to 0.1 ...
            exploration_fraction=0.05,      # ... over the first 5% of total_timesteps
            seed=0,                         # fixed seed so repeated runs are comparable
            device='cpu',
            verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
# Echo the model object (displays its repr) to confirm it was constructed.
model

<stable_baselines3.dqn.dqn.DQN at 0x79ede2bb9990>

In [7]:
# Train for 10,000 environment steps, printing a log table every 10 episodes.
# learn() returns the model itself; the trailing semicolon stops the cell from
# echoing that repr underneath the training log.
model.learn(total_timesteps=10_000, log_interval=10);

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 15.8     |
|    ep_rew_mean      | 15.8     |
|    exploration_rate | 0.647    |
| time/               |          |
|    episodes         | 10       |
|    fps              | 1168     |
|    time_elapsed     | 0        |
|    total_timesteps  | 158      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 14.8     |
|    ep_rew_mean      | 14.8     |
|    exploration_rate | 0.428    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 920      |
|    time_elapsed     | 0        |
|    total_timesteps  | 295      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.535    |
|    n_updates        | 2        |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean    

<stable_baselines3.dqn.dqn.DQN at 0x79ede2bb9990>

In [8]:
# Reset the environment to begin a new episode. Keep the initial observation
# and discard the second return value (the info dict in Gymnasium's API).
state, _ = env.reset()

In [9]:
# Show the initial observation returned by env.reset().
state

array([-0.03924289, -0.03409402,  0.03738723,  0.01763499], dtype=float32)

In [11]:
# Query the trained model for this observation and display the raw result.
# predict() returns a pair; here it is shown as (action, None).
model.predict(state, deterministic=True)

(array(1), None)

In [12]:
# Unpack the action from predict() and discard the second element of the pair.
action, _ = model.predict(state, deterministic=True)

In [13]:
# Show the action chosen by the model.
action

array(1)

In [14]:
# Gymnasium's step() returns (observation, reward, terminated, truncated, info).
# Keep both end-of-episode flags instead of discarding `truncated`, so `done`
# is also set when the episode ends by truncation (e.g. a time limit), not only
# when the environment reaches a terminal state.
next_state, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated

In [15]:
# Show the observation returned by env.step().
next_state

array([-0.03992477,  0.16047236,  0.03773993, -0.26302135], dtype=float32)

In [16]:
# Show the reward returned by env.step().
reward

1.0

# DDPG

In [17]:
import numpy as np
import gymnasium as gym
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise

In [18]:
# Create the Pendulum-v1 environment, requesting the 'rgb_array' render mode.
# Note: this rebinds `env`, replacing the CartPole environment created for the
# DQN section; cells from that section will see this environment if re-run.
env = gym.make('Pendulum-v1', render_mode='rgb_array')

In [19]:
# Size of the action vector, taken from the last axis of the action space shape.
n_actions = env.action_space.shape[-1]

# Ornstein-Uhlenbeck exploration noise: zero mean and a sigma of 0.1 for every
# action dimension.
ou_mean = np.zeros(n_actions)
ou_sigma = np.ones(n_actions) * 0.1
noise = OrnsteinUhlenbeckActionNoise(mean=ou_mean, sigma=ou_sigma)

In [20]:
# DDPG API reference: https://stable-baselines3.readthedocs.io/en/master/modules/ddpg.html
# NOTE(review): with train_freq=(5, 'episode') and gradient_steps left at its
# default, the training log reports n_updates = 1 after 10 episodes. If more
# learning per rollout is wanted, consider gradient_steps=-1 -- confirm intent.
model = DDPG(policy='MlpPolicy',
             env=env,
             learning_rate=0.001,
             buffer_size=1000,              # replay buffer capacity, in transitions
             batch_size=16,
             train_freq=(5, 'episode'),     # run a training phase every 5 episodes
             action_noise=noise,            # exploration noise added to actions while training
             seed=0,                        # fixed seed so repeated runs are comparable
             device='cpu',
             verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [21]:
# Train for 10,000 environment steps, printing a log table every 10 episodes.
# learn() returns the model itself; the trailing semicolon stops the cell from
# echoing that repr underneath the training log.
model.learn(total_timesteps=10_000, log_interval=10);

----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.49e+03 |
| time/              |           |
|    episodes        | 10        |
|    fps             | 1496      |
|    time_elapsed    | 1         |
|    total_timesteps | 2000      |
| train/             |           |
|    actor_loss      | 0.756     |
|    critic_loss     | 83.1      |
|    learning_rate   | 0.001     |
|    n_updates       | 1         |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.49e+03 |
| time/              |           |
|    episodes        | 20        |
|    fps             | 1376      |
|    time_elapsed    | 2         |
|    total_timesteps | 4000      |
| train/             |           |
|    actor_loss      | 4.23      |
|    critic_loss     | 34.4      |
|    learning_rate   | 0.001     |
|    n_updates      

<stable_baselines3.ddpg.ddpg.DDPG at 0x79edd52de990>

In [22]:
# Reset the environment to begin a new episode. Keep the initial observation
# and discard the second return value (the info dict in Gymnasium's API).
state, _ = env.reset()

In [23]:
# Show the initial observation returned by env.reset().
state

array([ 0.9704954 ,  0.24111953, -0.72964835], dtype=float32)

In [24]:
# Ask the trained model for an action and discard the second element of the
# returned pair. deterministic=False is passed here, whereas the DQN example
# earlier in this notebook passed deterministic=True.
# NOTE(review): whether this flag changes the DDPG prediction depends on the
# policy implementation -- confirm against the library docs.
action, _ = model.predict(state, deterministic=False)

In [25]:
# Show the action chosen by the model.
action

array([-1.0837885], dtype=float32)

In [26]:
# Gymnasium's step() returns (observation, reward, terminated, truncated, info).
# Keep both end-of-episode flags instead of discarding `truncated`, so `done`
# is also set when the episode ends by truncation (e.g. a time limit), not only
# when the environment reaches a terminal state.
next_state, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated

In [27]:
# Show the observation returned by env.step().
next_state

array([ 0.97845614,  0.20645489, -0.71137697], dtype=float32)

In [28]:
# Show the reward returned by env.step().
reward

np.float64(-0.11371489297256246)