In [None]:
%%writefile LunarLander-PG-a.py

import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import torch
import argparse
from itertools import count



# Command-line interface for the random-agent baseline.
# Arguments are parsed at module level because plot_rewards() reads the
# resulting `args` object as a global when building the figure title.
parser = argparse.ArgumentParser(description="Policy Gradient for LunarLander-v3")
parser.add_argument("--environment", type=str, required=True,
                    help="Gym environment name (e.g., LunarLander-v3)")
parser.add_argument("--iterations", type=int, default=50,
                    help="Number of iterations (each iteration = batch of episodes)")
parser.add_argument("--batch", type=int, default=32,
                    help="Batch size (number of episodes per iteration)")
args = parser.parse_args()



def plot_rewards(rewards, filename):
    """Save a line chart of the per-iteration average reward to ``filename``.

    Args:
        rewards: Sequence of average total rewards, one entry per iteration.
        filename: Path handed to ``savefig``.

    The title uses the module-level ``args.environment``.
    """
    fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
    ax.plot(rewards, label='Average Reward per Iteration')
    ax.set_title(f"Random Agent Performance on {args.environment}")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Average Total Reward")
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    fig.savefig(filename)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def get_state(obs):
    """Turn an observation into a float32 tensor with a leading axis of size 1."""
    obs_array = np.array(obs)
    obs_tensor = torch.from_numpy(obs_array).float()
    return obs_tensor.unsqueeze(0)


def select_random_action(env):
    """Return one action drawn by ``env.action_space.sample()``."""
    action_space = env.action_space
    return action_space.sample()


def run_random_agent(env, iterations, batch_size):
    """Roll out a uniformly random policy and record the mean return per iteration.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``close()`` and
            an ``action_space`` with ``sample()``. Both the 5-tuple
            (``terminated``/``truncated``) and the older 4-tuple (``done``)
            ``step`` return formats are accepted.
        iterations: Number of iterations to run.
        batch_size: Number of episodes played per iteration.

    Returns:
        List with one entry per iteration: the mean total episode reward of
        that iteration's batch.

    The environment is closed before returning, including when a rollout
    raises, so the caller must not reuse ``env`` afterwards.
    """
    all_rewards = []

    try:
        for it in range(iterations):
            batch_rewards = []

            for _ in range(batch_size):
                # The random policy never looks at the observation, so the
                # value returned by reset()/step() is intentionally discarded.
                env.reset()

                total_reward = 0
                done = False
                while not done:
                    action = select_random_action(env)
                    step_output = env.step(action)

                    if len(step_output) == 5:
                        _, reward, terminated, truncated, _ = step_output
                        done = terminated or truncated
                    else:
                        _, reward, done, _ = step_output

                    total_reward += reward

                batch_rewards.append(total_reward)

            avg_reward = np.mean(batch_rewards)
            print(f"Iteration {it+1}/{iterations}: Average Reward = {avg_reward:.2f}")
            all_rewards.append(avg_reward)
    finally:
        # Always release the environment, even if a rollout fails part-way.
        env.close()

    return all_rewards


if __name__ == "__main__":
    # The output filename encodes the run configuration; it depends only on the CLI args.
    file_name = f"{args.environment}_{args.iterations}_{args.batch}_random_agent.png"

    # Create the requested environment and report its spaces.
    env = gym.make(args.environment)
    print(f"\nLoaded Environment: {args.environment}")
    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}\n")

    # Roll out the random agent, then plot its per-iteration averages.
    iteration_rewards = run_random_agent(env, args.iterations, args.batch)
    plot_rewards(iteration_rewards, file_name)
    print(f"Plot saved as {file_name}")


Overwriting LunarLander-PG-a.py


In [None]:
# !python3 LunarLander-PG-a.py --environment LunarLander-v3 --iterations 50 --batch 16
# Run the command above from a terminal (CLI).



Loaded Environment: LunarLander-v3
Observation space: Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Action space: Discrete(4)

Iteration 1/50: Average Reward = -171.23
Iteration 2/50: Average Reward = -182.92
Iteration 3/50: Average Reward = -213.44
Iteration 4/50: Average Reward = -170.92
Iteration 5/50: Average Reward = -174.31
Iteration 6/50: Average Reward = -151.22
Iteration 7/50: Average Reward = -206.65
Iteration 8/50: Average Reward = -160.01
Iteration 9/50: Average Reward = -182.23
Iteration 10/50: Average Reward = -174.77
Iteration 11/50: Average Reward = -218.77
Iteration 12/50: Average Reward = -154.10
Iteration 13/50: Average Reward = -181.44
Iteration 14/50: Average Reward = -141.88
Iteration 15/50: Average Reward = -176.98
Iteration 16/50: Average Reward = -152.25
Iteration 17/50: Average Reward = -204.13
Iteration 18/50:

In [None]:
%%writefile LunarLander-PG-b.py
import gymnasium as gym
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import argparse


class Policy(nn.Module):
    """MLP with two tanh hidden layers mapping a state vector to one logit per action.

    Args:
        state_dim: Size of the input feature vector.
        action_dim: Number of output logits.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.network = nn.Sequential(*layers)

        # Every linear layer gets orthogonal weights (gain sqrt(2)) and zero
        # biases, initialised in layer order.
        init_gain = np.sqrt(2)
        linear_layers = [module for module in self.network if isinstance(module, nn.Linear)]
        for linear in linear_layers:
            nn.init.orthogonal_(linear.weight, gain=init_gain)
            nn.init.constant_(linear.bias, 0.0)

    def forward(self, x):
        """Return the network output (logits) for input ``x``."""
        return self.network(x)

def compute_returns(rewards, gamma, reward_to_go=False):
    """Compute discounted returns for one episode.

    Args:
        rewards: Sequence of per-step rewards, in time order.
        gamma: Discount factor.
        reward_to_go: If True, entry ``t`` is the discounted sum of rewards
            from step ``t`` onwards. If False, every entry is the discounted
            return of the whole episode.

    Returns:
        List with ``len(rewards)`` entries (empty for an empty episode).
    """
    if reward_to_go:
        returns = []
        future_return = 0
        # Accumulate backwards with O(1) appends, then flip once; inserting at
        # the front on every step would make this quadratic in episode length.
        for r in reversed(rewards):
            future_return = r + gamma * future_return
            returns.append(future_return)
        returns.reverse()
        return returns

    total_return = sum(gamma**i * r for i, r in enumerate(rewards))
    return [total_return] * len(rewards)

def plot_rewards(all_mean_rewards, fname, window=100):
    """Plot per-iteration mean returns and their running average, saving ``fname + ".png"``.

    Args:
        all_mean_rewards: Sequence of mean returns, one per iteration.
        fname: Output path without extension; also used as the figure title.
        window: Length of the running-average window (default 100).

    The running-average curve is drawn only when at least ``window`` points
    are available. With fewer points, ``np.convolve(..., mode='valid')``
    returns an array whose length no longer matches the x-axis, which would
    make the plot call fail, so the curve is skipped instead.
    """
    rewards = np.asarray(all_mean_rewards, dtype=float)
    n_points = rewards.size

    plt.figure(figsize=(8, 4))
    plt.plot(rewards, label="All Returns", alpha=0.5)

    if window >= 1 and n_points >= window:
        running_avg = np.convolve(rewards, np.ones(window) / window, mode='valid')
        # The first full window ends at index window-1, so the curve starts there.
        plt.plot(np.arange(window - 1, n_points), running_avg,
                 label=f"Running Average (k={window})", color='r', linewidth=2)

    plt.title(f"{fname}")
    plt.xlabel("Iteration")
    plt.ylabel("Average Return")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig(fname + ".png")
    print(f"Plot saved as {fname}.png\n")
    plt.close()

def _run_episode(env, policy, device):
    """Play one episode with the current policy and return (states, actions, rewards) lists."""
    states, actions, rewards = [], [], []
    obs, _ = env.reset()

    while True:
        # Action sampling needs no gradients: log-probabilities are recomputed
        # in a single batched forward pass during the update step.
        with torch.no_grad():
            s_t = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            logits = policy(s_t)
        dist = torch.distributions.Categorical(logits=logits)
        action = int(dist.sample().item())

        next_obs, reward, terminated, truncated, _ = env.step(action)

        states.append(obs)
        actions.append(action)
        rewards.append(reward)
        obs = next_obs

        if terminated or truncated:
            break

    return states, actions, rewards


def pg_training(env_name="CartPole-v1", iterations=100, batch_size=2000, lr=1e-2, gamma=0.99, reward_to_go=True, advantage_norm=True, hidden_dim=128):
    """Train a categorical policy with the vanilla policy-gradient loss.

    Args:
        env_name: Environment id passed to ``gym.make``.
        iterations: Number of policy updates.
        batch_size: Minimum number of environment steps collected per update;
            whole episodes are gathered until this many steps are reached.
        lr: Adam learning rate.
        gamma: Discount factor forwarded to ``compute_returns``.
        reward_to_go: Forwarded to ``compute_returns`` to select per-step
            reward-to-go weights instead of whole-episode returns.
        advantage_norm: If True, standardise the weights across the batch
            before computing the loss.
        hidden_dim: Hidden width of the ``Policy`` network.

    Returns:
        List with one entry per iteration: the mean undiscounted episode
        reward of that iteration's batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (no data would be
            collected for the update).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"PG training on {env_name} | reward_to_go={reward_to_go} advantage_norm={advantage_norm}  | device = {device}")
    env = gym.make(env_name)

    try:
        state_dim = env.observation_space.shape[0]
        n_actions = env.action_space.n

        policy = Policy(state_dim, n_actions, hidden_dim).to(device)
        optimizer = optim.Adam(policy.parameters(), lr=lr)

        all_mean_rewards = []

        for it in range(iterations):
            batch_states, batch_actions, batch_weights, batch_episode_rewards = [], [], [], []
            steps = 0

            # Collect whole episodes until the step budget for this update is met.
            while steps < batch_size:
                states, actions, rewards = _run_episode(env, policy, device)

                steps += len(states)
                batch_states += states
                batch_actions += actions
                batch_episode_rewards.append(sum(rewards))
                batch_weights += list(compute_returns(rewards, gamma, reward_to_go))

            # Prepare tensors for the batched update.
            batch_states_t = torch.tensor(np.array(batch_states), dtype=torch.float32, device=device)
            batch_actions_t = torch.tensor(batch_actions, dtype=torch.int64, device=device)
            batch_weights_t = torch.tensor(batch_weights, dtype=torch.float32, device=device)

            # Advantage normalization.
            if advantage_norm:
                centered = batch_weights_t - batch_weights_t.mean()
                # std() of a single element is NaN, which would poison the
                # parameters; with one sample only centering is applied.
                if batch_weights_t.numel() > 1:
                    centered = centered / (batch_weights_t.std() + 1e-8)
                batch_weights_t = centered

            logits = policy(batch_states_t)
            dists = torch.distributions.Categorical(logits=logits)
            logp = dists.log_prob(batch_actions_t)
            loss = -(logp * batch_weights_t).mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            mean_reward = np.mean(batch_episode_rewards)
            all_mean_rewards.append(mean_reward)
            print(f"Iteration {it+1}/{iterations} | Mean Reward: {mean_reward:.2f} | Episodes: {len(batch_episode_rewards)}")
    finally:
        # Release the environment whether training finished or failed.
        env.close()

    return all_mean_rewards


if __name__ == "__main__":
    # Command-line options, declared as (flag, add_argument keyword arguments)
    # and registered in this order.
    # NOTE(review): --reward_scale, --reward_clip_min and --reward_clip_max are
    # parsed but never forwarded to pg_training() below.
    option_specs = [
        ("--environment", dict(type=str, default="LunarLander-v3")),
        ("--iterations", dict(type=int, default=300)),
        ("--batch_size", dict(type=int, default=8000)),
        ("--lr", dict(type=float, default=1e-3)),
        ("--gamma", dict(type=float, default=0.99)),
        ("--reward_to_go", dict(action="store_true")),
        ("--advantage_norm", dict(action="store_true")),
        ("--reward_scale", dict(type=float, default=1.0)),
        ("--reward_clip_min", dict(type=float, default=None)),
        ("--reward_clip_max", dict(type=float, default=None)),
        ("--hidden_dim", dict(type=int, default=128)),
    ]
    parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    training_kwargs = {
        "env_name": args.environment,
        "iterations": args.iterations,
        "batch_size": args.batch_size,
        "lr": args.lr,
        "gamma": args.gamma,
        "reward_to_go": args.reward_to_go,
        "advantage_norm": args.advantage_norm,
        "hidden_dim": args.hidden_dim,
    }
    all_mean_rewards = pg_training(**training_kwargs)

    # Output name encodes the run configuration.
    # NOTE(review): the "_g_lr_" segment carries no gamma / lr values — confirm
    # whether they were meant to be interpolated.
    rtg_tag = 'rtg' if args.reward_to_go else 'tot'
    adv_tag = 'advnorm' if args.advantage_norm else 'noadv'
    fname = f"{args.environment}_PG_iters{args.iterations}_bs{args.batch_size}_g_lr_{rtg_tag}_{adv_tag}_"
    plot_rewards(all_mean_rewards, fname)


Writing LunarLander-PG-b.py


In [None]:
# !python3 LunarLander-PG-b.py --environment LunarLander-v3 --iterations 5 --batch_size 40 --gamma 0.99 --lr 5e-4 --reward_to_go --advantage_norm --reward_scale 1.0
# Run the command above from a terminal (CLI).


Environment: LunarLander-v3
Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Discrete(4)
Device: cpu
Policy Params: 18180
Iter 1/5 | AvgReturn -205.115 | Loss 1.150
Iter 2/5 | AvgReturn -174.408 | Loss -5.880
Iter 3/5 | AvgReturn -177.800 | Loss -3.110
Iter 4/5 | AvgReturn -176.663 | Loss 12.527
Iter 5/5 | AvgReturn -189.604 | Loss -10.015
Training finished. Plot saved to LunarLander-v3_PG_iters5_batch40_g_lr_rtg_advnorm_.png
Saved plot: LunarLander-v3_PG_iters5_batch40_g_lr_rtg_advnorm_.png


In [None]:
%%writefile LunarLander-PG-c.py

import subprocess
import os
import sys

# Fixed settings shared by every run of the sweep
environment = "LunarLander-v3"
iterations = 1200
learning_rate = 1e-3
gamma = 0.99

# Batch sizes to compare
batch_sizes = [80, 800, 8000]

# Store results: batch size -> list of per-iteration mean rewards, parsed from
# the "Mean Reward:" lines the training script prints to stdout
batch_rewards = {}

# Resolve the script directory once. abspath() guards against __file__ being a
# bare relative filename, for which dirname() returns "" (not a valid cwd).
script_dir = os.path.dirname(os.path.abspath(__file__))
script_path = os.path.join(script_dir, "LunarLander-PG-b.py")

for b in batch_sizes:
    command = [
        sys.executable,
        script_path,
        "--environment", environment,
        "--iterations", str(iterations),
        "--batch_size", str(b),
        "--lr", str(learning_rate),
        "--gamma", str(gamma),
        "--reward_to_go",
        "--advantage_norm",
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
            cwd=script_dir,
        )
    except subprocess.CalledProcessError as e:
        # A failed run is reported and skipped so the remaining batch sizes still execute.
        print("Subprocess failed with returncode", e.returncode)
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
        continue

    # show subprocess output for debugging / parsing
    if result.stdout:
        print(result.stdout)
    if result.stderr:
        print("Subprocess stderr:", result.stderr)

    # Extract the value between "Mean Reward:" and the next "|" on each
    # progress line; lines that do not match are ignored.
    run_rewards = []
    for line in (result.stdout or "").splitlines():
        if "Mean Reward:" not in line:
            continue
        value_text = line.split("Mean Reward:", 1)[1].split("|", 1)[0]
        try:
            run_rewards.append(float(value_text))
        except ValueError:
            continue
    batch_rewards[b] = run_rewards


In [None]:
# Run LunarLander-PG-c.py from a terminal (CLI).
