In [1]:
from gymnasium.wrappers import RecordVideo
import gymnasium as gym
import os
from stable_baselines3 import PPO
from sb3_contrib import RecurrentPPO  # Recurrent PPO는 SB3-Contrib에서 제공

# 비디오 저장 경로 설정
video_folder = "./videos/"
os.makedirs(video_folder, exist_ok=True)

# 환경 생성 및 비디오 기록 래퍼 추가
env = gym.make("CartPole-v1", render_mode="rgb_array")  # render_mode 설정
env = RecordVideo(env, video_folder=video_folder, episode_trigger=lambda episode_id: True)

# PPO 모델 생성 및 학습
ppo_model = PPO("MlpPolicy", env, verbose=1, device="cuda")  # CUDA 설정
ppo_model.learn(total_timesteps=50000)  # 학습

# 학습된 PPO 모델 실행
print("PPO 모델 실행 중...")
obs, info = env.reset()
for _ in range(1000):
    action, _state = ppo_model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs, info = env.reset()

print(f"PPO 비디오가 {video_folder}에 저장되었습니다.")

# Recurrent PPO 모델 생성 및 학습
r_ppo_model = PPO(
    RecurrentActorCriticPolicy,  # 리커런트 정책 사용
    env,
    verbose=1,
    device="cuda",
    n_steps=128,  # RNN 상태 업데이트를 위해 적절히 설정
    batch_size=64,  # 배치 크기
)
r_ppo_model.learn(total_timesteps=50000)  # 학습

# 학습된 Recurrent PPO 모델 실행
print("Recurrent PPO 모델 실행 중...")
obs, info = env.reset()
rnn_states = None  # RNN 초기 상태
for _ in range(1000):
    action, rnn_states = r_ppo_model.predict(obs, state=rnn_states, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs, info = env.reset()
        rnn_states = None  # 에피소드가 끝날 때 RNN 상태 초기화

env.close()
print(f"Recurrent PPO 비디오가 {video_folder}에 저장되었습니다.")


  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.9     |
|    ep_rew_mean     | 22.9     |
| time/              |          |
|    fps             | 97       |
|    iterations      | 1        |
|    time_elapsed    | 21       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29.4        |
|    ep_rew_mean          | 29.4        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 2           |
|    time_elapsed         | 38          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008917784 |
|    clip_fraction        | 0.0974      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00526    |
|    learning_rate        | 0.

ValueError: too many values to unpack (expected 4)

In [None]:
'''
import gymnasium as gym

from stable_baselines3 import A2C

env = gym.make("CartPole-v1", render_mode="rgb_array")

model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)

vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render("human")
    # VecEnv resets automatically
    # if done:
    #   obs = vec_env.reset()
'''