In [None]:
!pip3 install gymnasium stable_baselines3[extra] box2d ipywidgets ffmpeg imageio --break-system-packages

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Build the LunarLander environment with off-screen ("rgb_array") rendering
env = gym.make("LunarLander-v3", render_mode="rgb_array")

# Set up the PPO agent with an MLP policy.
##### Tune the hyperparameters below (learning_rate, batch_size, ...)
model = PPO(
    policy="MlpPolicy",
    env=env,
    learning_rate=1e-3,
    batch_size=32,
    verbose=1,
)

# Run training for 100k timesteps, showing the built-in progress bar
model.learn(total_timesteps=100_000, progress_bar=True)

# Write the trained agent to disk under the name "ppo_lunar"
model.save("ppo_lunar")

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback
from tqdm.notebook import tqdm

class CustomProgressBarCallback(BaseCallback):
    """
    Display a tqdm notebook progress bar that tracks environment steps during training.

    The bar is created when training starts, advanced on every callback step by the
    number of environments stepped in parallel, and closed when training ends. The
    callback itself does not print any training metrics.

    :param total_timesteps: Value used as the total of the progress bar.
    """
    def __init__(self, total_timesteps):
        super().__init__()
        self.pbar = None  # created lazily in _on_training_start
        self.total_timesteps = total_timesteps

    def _on_training_start(self):
        """Create the progress bar sized to the requested number of timesteps."""
        self.pbar = tqdm(total=self.total_timesteps)

    def _on_step(self):
        """
        Advance the bar by the number of timesteps taken since the previous call.

        :return: Always ``True`` so that training continues.
        """
        # Do NOT use self.locals['n_steps'] here: in SB3's on-policy rollout loop that
        # local is a running counter of steps collected so far in the current rollout
        # (0, 1, 2, ...), so feeding it to update() makes the bar grow quadratically.
        # One callback step corresponds to one step of every parallel environment.
        self.pbar.update(self.training_env.num_envs)
        return True

    def _on_training_end(self):
        """Close the progress bar, if one was created."""
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None

# Number of timesteps to train for (also used as the progress bar total)
total_timesteps = 10000

# Build the LunarLander environment with off-screen ("rgb_array") rendering
env = gym.make("LunarLander-v3", render_mode="rgb_array")

# Set up the PPO agent; verbose=1 keeps the training metrics visible and
# tensorboard_log points the TensorBoard logs at a local directory.
model = PPO(
    policy='MlpPolicy',
    env=env,
    learning_rate=0.001,
    batch_size=32,
    tensorboard_log="./lunar_lander_tensorboard/",
    verbose=1,
)

# Progress reporting is delegated to the custom callback, so the built-in
# progress bar is switched off.
callback = CustomProgressBarCallback(total_timesteps)
model.learn(total_timesteps=total_timesteps, callback=callback, progress_bar=False)

# Write the trained agent to disk under the name "ppo_lunar"
model.save("ppo_lunar")

# Score the trained agent over 10 episodes on the model's own environment
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(
    '\nFinal Evaluation:',
    f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}',
    sep='\n',
)

In [None]:
import imageio
import numpy as np
import os

# Make sure the output directory for the GIFs exists (no error if it already does)
os.makedirs("images", exist_ok=True)

# Load the trained agent and attach it to a fresh environment
env = gym.make("LunarLander-v3", render_mode="rgb_array")
model = PPO.load("ppo_lunar", env=env)

try:
    # Evaluate the agent
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print('Mean reward:', mean_reward, 'Std. reward:', std_reward)

    # Roll out the trained agent and save one GIF per episode
    images = []           # frames of the episode currently being recorded
    episodes = 0          # number of completed episodes
    episode_reward = 0.0  # sum of the step rewards of the current episode
    obs, _ = env.reset()

    while episodes < 5:  # Limit to 5 episodes for reasonable file sizes
        # Record the frame the agent is about to act on
        images.append(env.render())

        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        # Accumulate the return: `reward` alone is only the last step's reward
        episode_reward += reward

        if terminated or truncated:
            episodes += 1
            # Frames are rendered before each step, so capture the terminal frame
            # explicitly or the end of the episode would be missing from the GIF
            images.append(env.render())
            print(f'Episode {episodes} finished with reward {episode_reward:.2f}')

            # Save episode as GIF
            # NOTE(review): newer imageio releases may expect `duration` instead of
            # `fps` for GIF output; confirm against the installed version.
            print(f'Saving episode {episodes} animation...')
            imageio.mimsave(
                f'images/lunar_lander_episode_{episodes}.gif',
                images,
                fps=30
            )

            # Reset for next episode
            images = []
            episode_reward = 0.0
            obs, _ = env.reset()
finally:
    # Release the environment even if evaluation or recording fails
    env.close()

print("Done! Check the 'images' directory for the animation files.")