In [1]:
import gymnasium as gym
from sb3_contrib import TRPO, RecurrentPPO
from stable_baselines3.common.env_util import make_vec_env
import logging
import os

# Import your custom environment
from lv_letter_env import LoveLetterEnv  # Replace with your actual environment

In [3]:
def train_trpo(n_envs=4, total_timesteps=600000):
    """Train a TRPO agent on ``LoveLetterEnv`` and save it to disk.

    Args:
        n_envs: Number of environment copies handed to ``make_vec_env``.
        total_timesteps: Total number of environment steps to train for.

    Returns:
        None. The trained model is written to
        ``trpo_love_letter_{n_envs}env_{total_timesteps}_seed_RL``.

    The save name is derived from the actual arguments so that the file
    name and the completion message always describe the run that produced
    them (previously the file said ``1env`` while training used 4 envs,
    and the printed name did not match the saved one).
    """
    # Create the vectorized environment
    env = make_vec_env(lambda: LoveLetterEnv(), n_envs=n_envs)

    try:
        # Initialize the TRPO model
        model = TRPO(
            "MultiInputPolicy",  # SB3 policy for dict (multi-input) observations
            env,
            verbose=1,    # Enable detailed training output
            learning_rate=0.001,  # Adjust learning rate as needed
            gamma=0.99,    # Discount factor for long-term rewards
            use_sde=False,
        )

        # Train the model; adjust total_timesteps to the available compute
        model.learn(total_timesteps=total_timesteps)

        # Single source of truth for the output name, used for both the
        # save call and the message below.
        save_name = f"trpo_love_letter_{n_envs}env_{total_timesteps}_seed_RL"
        model.save(save_name)
    finally:
        # Release the environments even if training fails or is interrupted
        env.close()

    print(f"Training complete. Model saved as '{save_name}'.")


In [4]:
# Run TRPO training as configured in train_trpo(); with verbose=1 the
# library prints a progress table per iteration (shown below).
train_trpo()

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.19     |
|    ep_rew_mean     | -115     |
| time/              |          |
|    fps             | 416      |
|    iterations      | 1        |
|    time_elapsed    | 19       |
|    total_timesteps | 8192     |
---------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 1.16     |
|    ep_rew_mean            | -113     |
| time/                     |          |
|    fps                    | 402      |
|    iterations             | 2        |
|    time_elapsed           | 40       |
|    total_timesteps        | 16384    |
| train/                    |          |
|    explained_variance     | -0.00167 |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00797  |
|    learning_rate          | 0.001    |
|    n_updates              | 1        |
|    policy_objective      

In [2]:
def train_ppo(n_envs=4, total_timesteps=1000000, save_path=None):
    """Train a RecurrentPPO (LSTM) agent on ``LoveLetterEnv`` and save it.

    Args:
        n_envs: Number of environment copies handed to ``make_vec_env``.
        total_timesteps: Total number of environment steps to train for.
        save_path: Where to save the model. When ``None`` it defaults to
            ``recurrent_ppo_love_letter_{total_timesteps}``.

    Returns:
        None. The trained model is written to ``save_path``.

    The completion message reports the path that was actually used for
    saving (previously it printed a name that did not match the file).
    """
    if save_path is None:
        save_path = f"recurrent_ppo_love_letter_{total_timesteps}"

    # Create the vectorized environment
    env = make_vec_env(lambda: LoveLetterEnv(), n_envs=n_envs)

    try:
        # Initialize the RecurrentPPO model
        model = RecurrentPPO(
            "MultiInputLstmPolicy",
            env,
            verbose=1,    # Enable detailed training output
            learning_rate=0.0003,
            gamma=0.99,    # Discount factor for long-term rewards
            n_steps=2048,  # Steps collected per environment before each update
            batch_size=64,  # Batch size for training
            ent_coef=0.01,  # Encourage exploration with entropy coefficient
        )

        # Train the model
        model.learn(total_timesteps=total_timesteps)

        # Save the model
        model.save(save_path)
    finally:
        # Release the environments even if training fails or is interrupted
        env.close()

    print(f"Training complete. Model saved as '{save_path}'.")

In [3]:
# Run RecurrentPPO training as configured in the train_ppo() defined in the
# previous cell; with verbose=1 a progress table is printed per iteration.
train_ppo()

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.13     |
|    ep_rew_mean     | -96.7    |
| time/              |          |
|    fps             | 735      |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.23        |
|    ep_rew_mean          | -94.3       |
| time/                   |             |
|    fps                  | 154         |
|    iterations           | 2           |
|    time_elapsed         | 105         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.024403214 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.48       |
|    explained_variance   | -0.000481   |
|    learnin

In [4]:
def train_ppo(n_envs=1, total_timesteps=1000000, save_path=None):
    """Train a RecurrentPPO (LSTM) agent on ``LoveLetterEnv`` and save it.

    Args:
        n_envs: Number of environment copies handed to ``make_vec_env``.
        total_timesteps: Total number of environment steps to train for.
        save_path: Where to save the model. When ``None`` it defaults to
            ``recurrent_ppo_love_letter_{total_timesteps}_env{n_envs}``.

    Returns:
        None. The trained model is written to ``save_path``.

    The completion message reports the path that was actually used for
    saving (previously it printed a name that did not match the file).
    """
    if save_path is None:
        save_path = f"recurrent_ppo_love_letter_{total_timesteps}_env{n_envs}"

    # Create the vectorized environment
    env = make_vec_env(lambda: LoveLetterEnv(), n_envs=n_envs)

    try:
        # Initialize the RecurrentPPO model
        model = RecurrentPPO(
            "MultiInputLstmPolicy",
            env,
            verbose=1,    # Enable detailed training output
            learning_rate=0.0003,
            gamma=0.99,    # Discount factor for long-term rewards
            n_steps=2048,  # Steps collected per environment before each update
            batch_size=64,  # Batch size for training
            ent_coef=0.01,  # Encourage exploration with entropy coefficient
        )

        # Train the model
        model.learn(total_timesteps=total_timesteps)

        # Save the model
        model.save(save_path)
    finally:
        # Release the environments even if training fails or is interrupted
        env.close()

    print(f"Training complete. Model saved as '{save_path}'.")

In [5]:
# Run the single-environment RecurrentPPO training defined in the previous
# cell; with verbose=1 a progress table is printed per iteration.
train_ppo()

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.22     |
|    ep_rew_mean     | -99.5    |
| time/              |          |
|    fps             | 169      |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.23        |
|    ep_rew_mean          | -96.6       |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 2           |
|    time_elapsed         | 42          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.023398655 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.49       |
|    explained_variance   | -6.94e-05   |
|    learnin

In [2]:
def train_ppo(n_envs=1, total_timesteps=1000000, save_path=None):
    """Train a RecurrentPPO (LSTM) agent on ``LoveLetterEnv`` and save it.

    Args:
        n_envs: Number of environment copies handed to ``make_vec_env``.
        total_timesteps: Total number of environment steps to train for.
        save_path: Where to save the model. When ``None`` it defaults to
            ``recurrent_ppo_love_letter_{total_timesteps}``; pass an
            explicit path to avoid overwriting a model saved by another
            run under that same default name.

    Returns:
        None. The trained model is written to ``save_path``.

    The completion message reports the path that was actually used for
    saving (previously it printed a name that did not match the file).
    """
    if save_path is None:
        save_path = f"recurrent_ppo_love_letter_{total_timesteps}"

    # Create the vectorized environment
    env = make_vec_env(lambda: LoveLetterEnv(), n_envs=n_envs)

    try:
        # Initialize the RecurrentPPO model
        model = RecurrentPPO(
            "MultiInputLstmPolicy",
            env,
            verbose=1,    # Enable detailed training output
            learning_rate=0.0003,
            gamma=0.99,    # Discount factor for long-term rewards
            n_steps=2048,  # Steps collected per environment before each update
            batch_size=64,  # Batch size for training
            ent_coef=0.01,  # Encourage exploration with entropy coefficient
        )

        # Train the model
        model.learn(total_timesteps=total_timesteps)

        # Save the model
        model.save(save_path)
    finally:
        # Release the environments even if training fails or is interrupted
        env.close()

    print(f"Training complete. Model saved as '{save_path}'.")

In [3]:
# Run the RecurrentPPO training defined in the previous cell.
# NOTE: the recorded run below ended with a KeyboardInterrupt; the model is
# only saved after learn() returns, so an interrupted run writes no file.
train_ppo()

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.1      |
|    ep_rew_mean     | -104     |
| time/              |          |
|    fps             | 147      |
|    iterations      | 1        |
|    time_elapsed    | 13       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.11        |
|    ep_rew_mean          | -103        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 2           |
|    time_elapsed         | 47          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009667741 |
|    clip_fraction        | 0.0455      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.5        |
|    explained_variance   | -0.000182   |
|    learnin

KeyboardInterrupt: 