In [None]:
%%writefile LunarLander-PG-a.py

import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import torch
import random
import argparse
from itertools import count



# Command-line interface. Arguments are parsed at module level because
# plot_rewards() reads the module-level `args` to build the figure title.
parser = argparse.ArgumentParser(description="Policy Gradient for LunarLander-v3")
parser.add_argument("--environment", type=str, required=True,
                    help="Gym environment name (e.g., LunarLander-v3)")
parser.add_argument("--iterations", type=int, default=50,
                    help="Number of iterations (each iteration = batch of episodes)")
parser.add_argument("--batch", type=int, default=32,
                    help="Batch size (number of episodes per iteration)")
args = parser.parse_args()



def plot_rewards(rewards, filename):
    """Save a line chart of the per-iteration average rewards to ``filename``.

    The chart title embeds the environment name read from the module-level
    ``args``; the figure is closed after it has been written to disk.
    """
    fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
    ax.plot(rewards, label='Average Reward per Iteration')
    ax.set_title(f"Random Agent Performance on {args.environment}")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Average Total Reward")
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


def get_state(obs):
    """Return ``obs`` as a float32 tensor with an extra leading axis of size 1."""
    as_array = np.array(obs)
    as_float = torch.from_numpy(as_array).float()
    return as_float.unsqueeze(0)


def select_random_action(env):
    """Return whatever the environment's action-space sampler produces."""
    action_space = env.action_space
    return action_space.sample()


def run_random_agent(env, iterations, batch_size):
    """Play random-action episodes and return the mean episode reward per iteration.

    Each iteration plays ``batch_size`` full episodes, prints the mean of their
    total rewards, and appends it to the returned list. The environment is
    closed before returning.
    """
    mean_rewards = []

    for iteration in range(1, iterations + 1):
        episode_totals = []

        for _ in range(batch_size):
            reset_result = env.reset()
            # Gym v0.26+ returns (obs, info); older versions return obs alone.
            obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            episode_total = 0
            finished = False
            while not finished:
                outcome = env.step(select_random_action(env))

                if len(outcome) == 5:
                    # New-style step API: separate terminated / truncated flags.
                    obs, reward, terminated, truncated, _ = outcome
                    finished = terminated or truncated
                else:
                    # Old-style step API: a single done flag.
                    obs, reward, finished, _ = outcome

                episode_total += reward

            episode_totals.append(episode_total)

        mean_reward = np.mean(episode_totals)
        print(f"Iteration {iteration}/{iterations}: Average Reward = {mean_reward:.2f}")
        mean_rewards.append(mean_reward)

    env.close()
    return mean_rewards


if __name__ == "__main__":
    # Build the environment named on the command line and describe its spaces.
    env = gym.make(args.environment)
    print(f"\nLoaded Environment: {args.environment}")
    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}\n")

    # Collect the random-agent baseline (run_random_agent closes the env).
    avg_rewards = run_random_agent(env, iterations=args.iterations, batch_size=args.batch)

    # Write the reward curve next to the script, named after the run settings.
    plot_path = f"{args.environment}_{args.iterations}_{args.batch}_random_agent.png"
    plot_rewards(avg_rewards, plot_path)
    print(f"Plot saved as {plot_path}")


Overwriting LunarLander-PG-a.py


In [None]:
# !python3 LunarLander-PG-a.py --environment LunarLander-v3 --iterations 50 --batch 16



Loaded Environment: LunarLander-v3
Observation space: Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Action space: Discrete(4)

Iteration 1/50: Average Reward = -171.23
Iteration 2/50: Average Reward = -182.92
Iteration 3/50: Average Reward = -213.44
Iteration 4/50: Average Reward = -170.92
Iteration 5/50: Average Reward = -174.31
Iteration 6/50: Average Reward = -151.22
Iteration 7/50: Average Reward = -206.65
Iteration 8/50: Average Reward = -160.01
Iteration 9/50: Average Reward = -182.23
Iteration 10/50: Average Reward = -174.77
Iteration 11/50: Average Reward = -218.77
Iteration 12/50: Average Reward = -154.10
Iteration 13/50: Average Reward = -181.44
Iteration 14/50: Average Reward = -141.88
Iteration 15/50: Average Reward = -176.98
Iteration 16/50: Average Reward = -152.25
Iteration 17/50: Average Reward = -204.13
Iteration 18/50:

In [None]:
%%writefile LunarLander-PG-b.py
import argparse, os, time, random
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt


class Policy(nn.Module):
    """Fully connected network mapping an observation to action probabilities.

    Each entry of ``hidden_sizes`` adds a Linear layer followed by ReLU; the
    final Linear layer has ``act_dim`` outputs and is followed by a softmax
    over the last dimension.
    """

    def __init__(self, obs_dim, act_dim, hidden_sizes=(128, 128)):
        super().__init__()
        widths = [obs_dim, *hidden_sizes]
        modules = []
        # Hidden stack: consecutive width pairs become Linear + ReLU.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
        # Output head: one probability per action.
        modules.append(nn.Linear(widths[-1], act_dim))
        modules.append(nn.Softmax(dim=-1))
        self.net = nn.Sequential(*modules)

    def forward(self, obs):
        """Return the action probabilities the network assigns to ``obs``."""
        return self.net(obs)


def discount_cumsum(rewards, gamma):
    """Return the discounted reward-to-go for every step as a list of float32 values.

    Entry ``t`` equals ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.
    """
    to_go = np.zeros_like(rewards, dtype=np.float32)
    tail = 0
    # Walk backwards so each step reuses the already-discounted tail.
    for idx in range(len(rewards) - 1, -1, -1):
        tail = rewards[idx] + gamma * tail
        to_go[idx] = tail
    return list(to_go)


def get_obs(obs):
    """Return the observation as a NumPy array, unwrapping an ``(obs, info)`` pair.

    Only a two-element tuple/list whose second element is a dict is treated
    as an ``(obs, info)`` pair. Any other value, including an observation that
    is itself a list or tuple of numbers, is converted as-is rather than
    being truncated to its first element.
    """
    is_obs_info_pair = (
        isinstance(obs, (tuple, list))
        and len(obs) == 2
        and isinstance(obs[1], dict)
    )
    return np.array(obs[0] if is_obs_info_pair else obs)


def step_env(env, action):
    """Step ``env`` and return ``(obs, reward, done, info)`` for either step API.

    A five-element result is collapsed by taking ``terminated or truncated``
    as the done flag; a four-element result is passed through unchanged.
    """
    result = env.step(action)
    if len(result) != 5:
        obs, reward, done, info = result
        return obs, reward, done, info
    obs, reward, terminated, truncated, info = result
    return obs, reward, terminated or truncated, info


def _greedy_episode_return(env, policy, device):
    """Play one episode with arg-max actions (no sampling) and return the summed reward."""
    obs = get_obs(env.reset())
    total_reward = 0.0
    done = False
    with torch.no_grad():
        while not done:
            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            action = torch.argmax(policy(obs_tensor)).item()
            next_obs, reward, done, _ = step_env(env, action)
            total_reward += reward
            obs = get_obs(next_obs)
    return total_reward


def _clone_policy_state(policy):
    """Return an independent copy of the policy weights.

    ``state_dict()`` (and a shallow ``.copy()`` of it) only holds references to
    the live parameter tensors, so every tensor is cloned explicitly.
    """
    return {name: tensor.detach().clone() for name, tensor in policy.state_dict().items()}


def _save_returns_plot(all_returns, fname, window=100):
    """Save the per-iteration returns (plus a running mean, if long enough) to ``fname + '.png'``."""
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(all_returns, label="All Returns", alpha=0.5)
    # A 'valid' running mean needs at least `window` points; with fewer, the
    # convolution output would not line up with the x-range, so skip the curve.
    if len(all_returns) >= window:
        running_avg = np.convolve(all_returns, np.ones(window) / window, mode='valid')
        ax.plot(np.arange(window - 1, len(all_returns)), running_avg,
                label=f"Running Average (k={window})", color='r', linewidth=2)
    ax.set_title(f"{fname}")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Average Return")
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    fig.savefig(fname + ".png")
    plt.close(fig)


def run_training(env_name, iterations, batch_size, lr, gamma,
                 reward_to_go, advantage_norm, reward_scale,
                 reward_clip, hidden_dim, device, fname):
    """Train a softmax policy with a policy-gradient update and plot the returns.

    Args:
        env_name: Environment id passed to ``gym.make``.
        iterations: Number of policy updates to perform.
        batch_size: Minimum number of environment steps collected per update;
            whole episodes are gathered until this many steps are reached.
        lr: Adam learning rate.
        gamma: Discount factor.
        reward_to_go: If true, weight each step by its discounted reward-to-go;
            otherwise weight every step by the episode's discounted return.
        advantage_norm: If true, standardise the weights across the batch.
        reward_scale: Multiplier applied to every reward before clipping.
        reward_clip: ``None`` or a ``(low, high)`` pair used to clip rewards.
        hidden_dim: Width of both hidden layers of the policy network.
        device: Torch device for the policy and tensors.
        fname: Plot title and output path stem (``.png`` is appended).

    Returns:
        List with one entry per iteration: the mean, over the episodes of that
        batch, of the summed (scaled/clipped) episode rewards.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive number of environment steps")

    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    policy = Policy(obs_dim, act_dim, hidden_sizes=(hidden_dim, hidden_dim)).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=lr)

    print(f"\nTraining {env_name} | gamma={gamma} | lr={lr} | batch={batch_size} | device={device}")

    all_returns = []
    best_return = float('-inf')
    best_policy_state = None

    try:
        for it in range(iterations):
            # Greedy evaluation of the policy carried into this iteration.
            eval_return = _greedy_episode_return(env, policy, device)
            print(f"Iter {it+1:4d}/{iterations} | Best Policy Return: {eval_return:8.2f}")

            # Collect training data with exploration (actions are sampled).
            batch_weights = []
            log_probs = []
            episode_returns = []  # one summed reward per episode, for reporting

            steps_collected = 0
            while steps_collected < batch_size:
                obs = get_obs(env.reset())
                ep_rewards, ep_logps = [], []

                done = False
                while not done:
                    obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
                    probs = policy(obs_tensor)
                    dist = Categorical(probs)
                    action = dist.sample()

                    next_obs, reward, done, _ = step_env(env, action.item())
                    if reward_scale != 1.0:
                        reward *= reward_scale
                    if reward_clip:
                        reward = np.clip(reward, reward_clip[0], reward_clip[1])

                    ep_logps.append(dist.log_prob(action))
                    ep_rewards.append(reward)
                    obs = get_obs(next_obs)

                # Per-step weights for the gradient estimate.
                if reward_to_go:
                    ep_weights = discount_cumsum(ep_rewards, gamma)
                else:
                    G = sum(gamma**t * r for t, r in enumerate(ep_rewards))
                    ep_weights = [G] * len(ep_rewards)

                batch_weights += list(ep_weights)
                log_probs += ep_logps
                # Kept separate from the per-step weights so the reported
                # average really is the mean episode return of the batch.
                episode_returns.append(sum(ep_rewards))

                steps_collected += len(ep_rewards)

            # Flatten to matching 1-D tensors.
            log_probs_tensor = torch.stack(log_probs).to(device).reshape(-1)
            advantages = torch.tensor(batch_weights, dtype=torch.float32, device=device).reshape(-1)

            # Baseline + normalization (std of a single element is NaN, so skip it).
            if advantage_norm and advantages.numel() > 1:
                advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

            loss = -(log_probs_tensor * advantages).mean()

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), 1.0)
            optimizer.step()

            avg_return = np.mean(episode_returns)
            all_returns.append(avg_return)

            print(f"Iter {it+1:4d}/{iterations} | AvgReturn: {avg_return:8.2f} | Loss: {loss.item():.4f}")

            # Evaluate the updated policy and keep it only if it beats the best so far.
            # NOTE(review): the decision rests on a single greedy episode, which is
            # a noisy signal, and the Adam optimizer state is not rolled back.
            current_eval_return = _greedy_episode_return(env, policy, device)
            if best_policy_state is None or current_eval_return > best_return:
                best_return = current_eval_return
                best_policy_state = _clone_policy_state(policy)
                print(f"New best policy found. Return: {best_return:8.2f}")
            else:
                # Load best policy for next iteration
                policy.load_state_dict(best_policy_state)
                print(f"Reverting to best policy (Return: {best_return:8.2f})")
    finally:
        # Release the environment even if collection or the update raised.
        env.close()

    _save_returns_plot(all_returns, fname, window=100)
    print(f"Plot saved as {fname}.png\n")
    return all_returns

if __name__ == "__main__":
    # Declare the CLI in the same order as before so --help output is unchanged.
    cli = argparse.ArgumentParser()
    for flag, kind, default in (
        ("--environment", str, "LunarLander-v3"),
        ("--iterations", int, 300),
        ("--batch_size", int, 8000),
        ("--lr", float, 1e-3),
        ("--gamma", float, 0.99),
    ):
        cli.add_argument(flag, type=kind, default=default)
    for flag in ("--reward_to_go", "--advantage_norm"):
        cli.add_argument(flag, action="store_true")
    for flag, kind, default in (
        ("--reward_scale", float, 1.0),
        ("--reward_clip_min", float, None),
        ("--reward_clip_max", float, None),
        ("--hidden_dim", int, 128),
    ):
        cli.add_argument(flag, type=kind, default=default)
    args = cli.parse_args()

    # Clipping is enabled when either bound is given; a missing bound is open-ended.
    clip_min, clip_max = args.reward_clip_min, args.reward_clip_max
    if clip_min is None and clip_max is None:
        clip_tuple = None
    else:
        clip_tuple = (
            -np.inf if clip_min is None else clip_min,
            np.inf if clip_max is None else clip_max,
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Output stem; the "_g_lr_" segment is literal text (no values are inserted).
    returns_tag = 'rtg' if args.reward_to_go else 'tot'
    adv_tag = 'advnorm' if args.advantage_norm else 'noadv'
    fname = f"{args.environment}_PG_iters{args.iterations}_bs{args.batch_size}_g_lr_{returns_tag}_{adv_tag}_"

    run_training(
        env_name=args.environment,
        iterations=args.iterations,
        batch_size=args.batch_size,
        lr=args.lr,
        gamma=args.gamma,
        reward_to_go=args.reward_to_go,
        advantage_norm=args.advantage_norm,
        reward_scale=args.reward_scale,
        reward_clip=clip_tuple,
        hidden_dim=args.hidden_dim,
        device=device,
        fname=fname,
    )


Writing LunarLander-PG-b.py


In [None]:
# !python3 LunarLander-PG-b.py --environment LunarLander-v3 --iterations 5 --batch_size 40 --gamma 0.99 --lr 5e-4 --reward_to_go --advantage_norm --reward_scale 1.0


Environment: LunarLander-v3
Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Discrete(4)
Device: cpu
Policy Params: 18180
Iter 1/5 | AvgReturn -205.115 | Loss 1.150
Iter 2/5 | AvgReturn -174.408 | Loss -5.880
Iter 3/5 | AvgReturn -177.800 | Loss -3.110
Iter 4/5 | AvgReturn -176.663 | Loss 12.527
Iter 5/5 | AvgReturn -189.604 | Loss -10.015
Training finished. Plot saved to LunarLander-v3_PG_iters5_batch40_g_lr_rtg_advnorm_.png
Saved plot: LunarLander-v3_PG_iters5_batch40_g_lr_rtg_advnorm_.png


In [None]:
%%writefile LunarLander-PG-c.py

import subprocess
import matplotlib.pyplot as plt
import argparse, os, time, random
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
import sys

# Fixed settings shared by every run of the sweep.
# NOTE(review): the environment is CartPole-v1 although the scripts are named
# after LunarLander — confirm this is intentional.
environment = "CartPole-v1"
iterations = 2000
learning_rate = 1e-3
gamma = 0.99

# Batch sizes to compare (forwarded to the training script as --batch_size)
batch_sizes = [40, 400, 4000]

# Store results
# NOTE(review): nothing in the visible part of the loop below writes to this
# dict — presumably it is filled further down; verify.
batch_rewards = {}

# Launch the training script once per batch size, as a child process running
# under the same Python interpreter as this script.
for b in batch_sizes:
    # The training script is expected to sit in the same directory as this file.
    script_path = os.path.join(os.path.dirname(__file__), "LunarLander-PG-b.py")
    try:
        result = subprocess.run(
            [
                sys.executable,
                script_path,
                "--environment", environment,
                "--iterations", str(iterations),
                "--batch_size", str(b),
                "--lr", str(learning_rate),
                "--gamma", str(gamma),
                "--reward_to_go",
                "--advantage_norm"
            ],
            capture_output=True,   # collect stdout/stderr instead of streaming them
            text=True,             # decode the captured output to str
            check=True,            # non-zero exit status raises CalledProcessError
            cwd=os.path.dirname(__file__)
        )
        # Echo the child's captured output once it has finished
        if result.stdout:
            print(result.stdout)
        if result.stderr:
            print("Subprocess stderr:", result.stderr)
    except subprocess.CalledProcessError as e:
        # A failed run is reported and skipped so the remaining batch sizes still run.
        print("Subprocess failed with returncode", e.returncode)
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
        continue

