In [1]:
# System Update and Swig Installation
# Refreshes the apt package index (`apt-get update` only re-reads the package lists;
# it does not upgrade installed packages) and then installs swig, a tool needed for
# building certain Python libraries.
!apt-get update -qq
!apt-get install -y swig

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
swig is already the newest version (4.0.2-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 151 not upgraded.


In [2]:
# 2. Install Key Python Packages
# Installs specific versions of gym, box2d-py, and stable-baselines3. 
# These packages are required for running the LunarLander environment and the PPO reinforcement learning algorithm.
!pip install "gym==0.26.2" "box2d-py==2.3.5" "stable-baselines3==2.0.0"



In [3]:
# 3. Basic Environment Test (Step 1)
# Creates the LunarLander-v2 environment (with render_mode="rgb_array"),
# resets it, and prints the shape of the initial observation (should be (8,)).

import gym

env = gym.make("LunarLander-v2", render_mode="rgb_array")
try:
    obs, info = env.reset()
    print("Observation shape:", obs.shape)
finally:
    # Release the environment even if reset() or the print above raises,
    # so a failed run does not leave an open environment in the kernel.
    env.close()

Observation shape: (8,)


In [4]:
# 4. Basic Environment Test (Step 2)
# After resetting, sample one random action and step the environment forward once.
# Then print the next observation's shape, the immediate reward, and whether the
# episode has terminated or been truncated.

import gym

env = gym.make("LunarLander-v2", render_mode="rgb_array")
try:
    obs, info = env.reset()

    print("Initial observation shape:", obs.shape)  # OK if this prints (8,)

    # One random action is enough to confirm that step() returns the 5-tuple
    # (observation, reward, terminated, truncated, info).
    action = env.action_space.sample()
    next_obs, reward, done, truncated, _info = env.step(action)
    print("Next observation shape:", next_obs.shape)
    print("Reward:", reward)
    print("Done:", done, "Truncated:", truncated)
finally:
    # Release the environment even if reset()/step() raises.
    env.close()

Initial observation shape: (8,)
Next observation shape: (8,)
Reward: -0.8033619055678674
Done: False Truncated: False


  if not isinstance(terminated, (bool, np.bool8)):


In [5]:
# 5. Training a PPO Agent and Evaluating Performance
# PPO: A popular on-policy algorithm for continuous or discrete action spaces.
# Hyperparameters: Adjust batch_size, learning_rate, and n_steps to potentially improve learning.
# Training: I run 200,000 timesteps this time.
# Evaluation: I test the model on a number of episodes (here 5), compute the average return, and print the result.

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Create the environment and wrap it in a DummyVecEnv.
# DummyVecEnv receives a list of callables; the lambda here returns the single
# `env` object created on the line above, so `vec_env` wraps that same instance
# (which is also the object passed to the evaluation step later in this cell).
env = gym.make("LunarLander-v2", render_mode="rgb_array")
vec_env = DummyVecEnv([lambda: env])

# Initialize the PPO model with some chosen hyperparameters.
# verbose=1 makes training print the progress tables shown in the cell output.
model = PPO(
    "MlpPolicy",
    vec_env,
    verbose=1,
    batch_size=64,        # minibatch size used for each gradient update
    learning_rate=3e-4,   # default
    n_steps=2048          # default
)

# Train the model for 200,000 timesteps
model.learn(total_timesteps=200000)
print("Training complete")

import numpy as np

# Define an evaluation function
def evaluate_model(model, env, n_eval_episodes=5):
    """Roll out the policy for several episodes and return the mean episode return.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that returns
            an ``(action, state)`` pair.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated, info)``.
        n_eval_episodes: Number of episodes to run.

    Returns:
        ``np.mean`` of the per-episode summed rewards.
    """
    episode_returns = []
    for _episode in range(n_eval_episodes):
        state, _reset_info = env.reset()
        episode_return = 0
        finished = False
        # An episode stops as soon as the environment reports either
        # termination or truncation.
        while not finished:
            chosen_action, _policy_state = model.predict(state, deterministic=True)
            state, step_reward, terminated, cut_short, _step_info = env.step(chosen_action)
            episode_return += step_reward
            finished = terminated or cut_short
        episode_returns.append(episode_return)
    return np.mean(episode_returns)

# Evaluate the trained model.
# The episode count is defined once and used for both the evaluation call and
# the printed report, so the two can never disagree.
N_EVAL_EPISODES = 5
try:
    mean_reward = evaluate_model(model, env, n_eval_episodes=N_EVAL_EPISODES)
    print("Number of evaluation episodes:", N_EVAL_EPISODES, "Mean reward::", mean_reward)
finally:
    # Release the environment even if evaluation raises.
    env.close()

  File "/usr/local/lib/python3.11/dist-packages/gymnasium/envs/registration.py", line 602, in load_plugin_envs
    fn()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 304, in register_gymnasium_envs
    _register_atari_envs()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 205, in _register_atari_envs
    import ale_py
  File "/usr/local/lib/python3.11/dist-packages/ale_py/__init__.py", line 68, in <module>
    register_v0_v4_envs()
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 178, in register_v0_v4_envs
    _register_rom_configs(legacy_games, obs_types, versions)
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 63, in _register_rom_configs
    gymnasium.register(
    ^^^^^^^^^^^^^^^^^^
AttributeError: partially initialized module 'gymnasium' has no attribute 'register' (most likely due to a circular import)
[0m
  logger.warn(f"plugin: {plugin.value} raised {trace

Using cpu device


  if not isinstance(terminated, (bool, np.bool8)):


-----------------------------
| time/              |      |
|    fps             | 1058 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 793         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009390529 |
|    clip_fraction        | 0.0517      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -0.00188    |
|    learning_rate        | 0.0003      |
|    loss                 | 792         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00966    |
|    value_loss           | 1.89e+03    |
-----------------------------------------
----------------------------------