In [None]:
%%writefile CartPole-PG-a.py

import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import torch
import argparse
from itertools import count


# Command-line interface for the random-agent baseline script.
# NOTE: arguments are parsed at module import time, and the resulting `args`
# namespace is read as a global further down (e.g. by plot_rewards).
parser = argparse.ArgumentParser(description="Policy Gradient for CartPole-v0/v1")
parser.add_argument("--environment", type=str, required=True,
                    help="Gym environment name (e.g., CartPole-v1)")
parser.add_argument("--iterations", type=int, default=50,
                    help="Number of iterations (each iteration = batch of episodes)")
parser.add_argument("--batch", type=int, default=32,
                    help="Batch size (number of episodes per iteration)")
args = parser.parse_args()



def plot_rewards(rewards, filename):
    """Draw the per-iteration average reward curve and write it to *filename*.

    The title embeds the environment name taken from the module-level
    ``args`` namespace. The figure is closed after saving.
    """
    fig = plt.figure(figsize=(10, 5), dpi=100)
    axes = fig.add_subplot(1, 1, 1)

    axes.plot(rewards, label='Average Reward per Iteration')
    axes.set_title(f"Random Agent Performance on {args.environment}")
    axes.set_xlabel("Iteration")
    axes.set_ylabel("Average Total Reward")
    axes.grid(True)
    axes.legend()

    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


def get_state(obs):
    """Return *obs* as a float32 PyTorch tensor with a leading axis of size 1."""
    as_array = np.array(obs)
    as_float = torch.from_numpy(as_array).to(torch.float32)
    return torch.unsqueeze(as_float, dim=0)


def select_random_action(env):
    """Return one action obtained by sampling the environment's action space."""
    action_space = env.action_space
    return action_space.sample()



def run_random_agent(env, iterations, batch_size):
    """Roll out randomly chosen actions and report the mean episode reward per iteration.

    For each of ``iterations`` rounds, ``batch_size`` episodes are played to
    completion; the mean of their total rewards is printed and recorded.
    Both the 5-tuple and the 4-tuple ``env.step`` return conventions are
    accepted. The environment is closed before returning.

    Returns:
        A list holding one mean total reward per iteration.
    """
    mean_per_iteration = []

    for iteration_idx in range(iterations):
        episode_totals = []

        for _ in range(batch_size):
            # The actions do not depend on observations, so the reset result
            # (observation, or (observation, info) on Gym v0.26+) is discarded.
            env.reset()

            episode_total = 0
            finished = False
            while not finished:
                outcome = env.step(select_random_action(env))

                if len(outcome) == 5:
                    # Newer API: episode end is split into terminated/truncated.
                    _, reward, terminated, truncated, _ = outcome
                    finished = terminated or truncated
                else:
                    # Older API: a single done flag.
                    _, reward, finished, _ = outcome

                episode_total += reward

            episode_totals.append(episode_total)

        batch_mean = np.mean(episode_totals)
        print(f"Iteration {iteration_idx + 1}/{iterations}: Average Reward = {batch_mean:.2f}")
        mean_per_iteration.append(batch_mean)

    env.close()
    return mean_per_iteration



if __name__ == "__main__":
    # Create the environment named on the command line and describe its spaces.
    environment = gym.make(args.environment)
    print(f"\nLoaded Environment: {args.environment}")
    print(f"Observation space: {environment.observation_space}")
    print(f"Action space: {environment.action_space}\n")

    # Collect the random-agent baseline curve.
    average_rewards = run_random_agent(environment, args.iterations, args.batch)

    # The output name encodes environment, iteration count and batch size.
    plot_path = "_".join(
        [args.environment, str(args.iterations), str(args.batch), "random_agent.png"]
    )
    plot_rewards(average_rewards, plot_path)
    print(f"Plot saved as {plot_path}")


Writing CartPole-PG-a.py


In [None]:
# !python3 CartPole-PG-a.py --environment CartPole-v1 --iterations 500 --batch 16
# run the python CMD via CLI



Loaded Environment: CartPole-v1
Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Action space: Discrete(2)

Iteration 1/500: Average Reward = 20.12
Iteration 2/500: Average Reward = 25.00
Iteration 3/500: Average Reward = 20.62
Iteration 4/500: Average Reward = 21.19
Iteration 5/500: Average Reward = 19.56
Iteration 6/500: Average Reward = 20.62
Iteration 7/500: Average Reward = 26.44
Iteration 8/500: Average Reward = 23.38
Iteration 9/500: Average Reward = 26.44
Iteration 10/500: Average Reward = 29.94
Iteration 11/500: Average Reward = 18.44
Iteration 12/500: Average Reward = 20.75
Iteration 13/500: Average Reward = 23.38
Iteration 14/500: Average Reward = 18.94
Iteration 15/500: Average Reward = 20.19
Iteration 16/500: Average Reward = 20.38
Iteration 17/500: Average Reward = 27.50
Iteration 18/500: Average Reward = 30.38
Iteration 19/500: Average Reward = 25.62
Iteration 20/500: Average Reward =

In [None]:
%%writefile CartPole-PG-b.py
import gymnasium as gym
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import argparse


class Policy(nn.Module):
    """Two-hidden-layer tanh MLP mapping a state vector to one raw score per action.

    The layers live in ``self.network`` (an ``nn.Sequential``). Every linear
    layer gets orthogonal weights scaled by sqrt(2) and an all-zero bias.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()

        # All layers are created before any re-initialisation happens.
        stack = [
            nn.Linear(state_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.network = nn.Sequential(*stack)

        linear_layers = [m for m in self.network if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            nn.init.orthogonal_(linear.weight, gain=np.sqrt(2))
            nn.init.constant_(linear.bias, 0.0)

    def forward(self, x):
        """Apply the network to ``x`` and return its output unchanged."""
        scores = self.network(x)
        return scores

def compute_returns(rewards, gamma, reward_to_go=False):
    """Compute one discounted-return weight per time step of an episode.

    Args:
        rewards: Sequence of per-step rewards for a single episode.
        gamma: Discount factor applied once per step into the future.
        reward_to_go: If true, entry ``t`` is the discounted sum of rewards
            from step ``t`` to the end of the episode. If false, every entry
            is the discounted return of the whole episode measured from
            step 0.

    Returns:
        A list with the same length as ``rewards`` (empty for an empty episode).
    """
    if reward_to_go:
        # Accumulate from the last step backwards, appending as we go, and
        # flip once at the end; this keeps the pass linear in episode length.
        returns = []
        future_return = 0
        for r in reversed(rewards):
            future_return = r + gamma * future_return
            returns.append(future_return)
        returns.reverse()
        return returns

    total_return = sum(gamma**i * r for i, r in enumerate(rewards))
    return [total_return] * len(rewards)

def plot_rewards(all_mean_rewards, fname, k=50):
    """Plot per-iteration mean returns plus a running average and save as ``<fname>.png``.

    Args:
        all_mean_rewards: Sequence with one mean return per training iteration.
        fname: Output path without extension; also used as the plot title.
        k: Window length of the running average. The running-average curve is
            drawn only when at least ``k`` points are available.
    """
    n_points = len(all_mean_rewards)

    plt.figure(figsize=(8, 4))
    plt.plot(all_mean_rewards, label="All Returns", alpha=0.5)

    # np.convolve(..., mode='valid') swaps its operands when the kernel is
    # longer than the data and then returns k - n + 1 values, which no longer
    # line up with the x-axis below (and it raises on empty input). Skip the
    # smoothed curve in that case instead of failing after training is done.
    if 1 <= k <= n_points:
        running_avg = np.convolve(all_mean_rewards, np.ones(k) / k, mode='valid')
        plt.plot(np.arange(k - 1, n_points), running_avg,
                 label=f"Running Average (k={k})", color='r', linewidth=2)

    plt.title(f"{fname}")
    plt.xlabel("Iteration")
    plt.ylabel("Average Return")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig(fname + ".png")
    print(f"Plot saved as {fname}.png\n")
    plt.close()

def run_training(env_name="CartPole-v1", iterations=100, batch_size=2000, lr=1e-2, gamma=0.99,
                 reward_to_go=True, advantage_norm=True, hidden_dim=128):
    """Train a categorical policy with a policy-gradient update and return the reward curve.

    Every iteration plays whole episodes until at least ``batch_size``
    environment steps have been collected, then performs a single Adam step
    on ``-(log_prob(action) * weight).mean()``.

    Args:
        env_name: Environment id passed to ``gym.make``. The code reads
            ``observation_space.shape[0]`` and ``action_space.n``.
        iterations: Number of collect-then-update rounds.
        batch_size: Minimum number of environment steps gathered per round.
        lr: Adam learning rate.
        gamma: Discount factor forwarded to ``compute_returns``.
        reward_to_go: Forwarded to ``compute_returns`` to choose the weighting.
        advantage_norm: If true, standardise the weights over the batch.
        hidden_dim: Hidden width forwarded to ``Policy``.

    Returns:
        A list with the mean undiscounted episode reward of each iteration.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"PG training on {env_name} | reward_to_go={reward_to_go} advantage_norm={advantage_norm}  | device = {device}")
    env = gym.make(env_name)
    all_mean_rewards = []

    try:
        state_dim = env.observation_space.shape[0]
        n_actions = env.action_space.n

        policy = Policy(state_dim, n_actions, hidden_dim).to(device)
        optimizer = optim.Adam(policy.parameters(), lr=lr)

        for it in range(iterations):
            batch_states, batch_actions, batch_weights, batch_episode_rewards = [], [], [], []
            steps = 0

            # collect trajectories until we reach desired batch size
            while steps < batch_size:
                obs, _ = env.reset()
                states, actions, rewards = [], [], []
                while True:
                    s_t = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
                    # Log-probabilities are recomputed on the whole batch below,
                    # so no autograd graph is needed while acting.
                    with torch.no_grad():
                        logits = policy(s_t)
                    dist = torch.distributions.Categorical(logits=logits)
                    action = int(dist.sample().item())

                    next_obs, reward, terminated, truncated, _ = env.step(action)

                    # Store the observation the action was chosen from.
                    states.append(obs)
                    actions.append(action)
                    rewards.append(reward)
                    obs = next_obs

                    if terminated or truncated:
                        break

                steps += len(states)
                batch_states += states
                batch_actions += actions
                batch_episode_rewards.append(sum(rewards))
                batch_weights += compute_returns(rewards, gamma, reward_to_go)

            # prepare tensors
            batch_states_t = torch.tensor(np.array(batch_states), dtype=torch.float32, device=device)
            batch_actions_t = torch.tensor(batch_actions, dtype=torch.int64, device=device)
            batch_weights_t = torch.tensor(batch_weights, dtype=torch.float32, device=device)

            # advantage normalization; with a single sample std() is NaN and
            # would poison the update, so the weight is left untouched then.
            if advantage_norm and batch_weights_t.numel() > 1:
                mean = batch_weights_t.mean()
                std = batch_weights_t.std() + 1e-8
                batch_weights_t = (batch_weights_t - mean) / std

            logits = policy(batch_states_t)
            dists = torch.distributions.Categorical(logits=logits)
            logp = dists.log_prob(batch_actions_t)
            loss = -(logp * batch_weights_t).mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            mean_reward = np.mean(batch_episode_rewards)
            all_mean_rewards.append(mean_reward)
            print(f"Iteration {it+1}/{iterations} | Mean Reward: {mean_reward:.2f} | Episodes: {len(batch_episode_rewards)}")
    finally:
        # Release the environment even if collection or the update raised.
        env.close()

    return all_mean_rewards


if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    cli.add_argument("--environment", type=str, default="CartPole-v1")
    cli.add_argument("--iterations", type=int, default=300)
    cli.add_argument("--batch_size", type=int, default=8000)
    cli.add_argument("--lr", type=float, default=1e-3)
    cli.add_argument("--gamma", type=float, default=0.99)
    cli.add_argument("--reward_to_go", action="store_true")
    cli.add_argument("--advantage_norm", action="store_true")
    # The three reward-shaping options below are accepted on the command line
    # but are not forwarded to run_training.
    cli.add_argument("--reward_scale", type=float, default=1.0)
    cli.add_argument("--reward_clip_min", type=float, default=None)
    cli.add_argument("--reward_clip_max", type=float, default=None)
    cli.add_argument("--hidden_dim", type=int, default=128)
    opts = cli.parse_args()

    training_kwargs = {
        "env_name": opts.environment,
        "iterations": opts.iterations,
        "batch_size": opts.batch_size,
        "lr": opts.lr,
        "gamma": opts.gamma,
        "reward_to_go": opts.reward_to_go,
        "advantage_norm": opts.advantage_norm,
        "hidden_dim": opts.hidden_dim,
    }
    reward_curve = run_training(**training_kwargs)

    # The output name records the run configuration (the literal "g_lr" part
    # carries no values).
    rtg_tag = "rtg" if opts.reward_to_go else "tot"
    norm_tag = "advnorm" if opts.advantage_norm else "noadv"
    out_name = f"{opts.environment}_PG_iters{opts.iterations}_bs{opts.batch_size}_g_lr_{rtg_tag}_{norm_tag}_"
    plot_rewards(reward_curve, out_name)


Overwriting CartPole-PG-b.py


In [None]:
# !python3 CartPole-PG-b.py --environment CartPole-v1 --iterations 50 --batch_size 32 --gamma 0.99 --lr 1e-3 --reward_to_go --advantage_norm
# run the python CMD via CLI


Environment: CartPole-v1
Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Discrete(2)
Device: cpu
Policy Params: 17410
Iter 1/50 | AvgReturn 22.000 | Loss -3.502
Iter 2/50 | AvgReturn 22.031 | Loss -4.476
Iter 3/50 | AvgReturn 27.250 | Loss -9.461
Iter 4/50 | AvgReturn 26.906 | Loss -7.115
Iter 5/50 | AvgReturn 26.844 | Loss -14.143
Iter 10/50 | AvgReturn 40.250 | Loss -19.295
Iter 15/50 | AvgReturn 45.156 | Loss -28.071
Iter 20/50 | AvgReturn 71.375 | Loss -19.352
Iter 25/50 | AvgReturn 126.750 | Loss -52.173
Iter 30/50 | AvgReturn 179.188 | Loss -111.890
Iter 35/50 | AvgReturn 254.062 | Loss -22.117
Iter 40/50 | AvgReturn 324.875 | Loss -81.378
Iter 45/50 | AvgReturn 376.438 | Loss -12.135
Iter 50/50 | AvgReturn 421.000 | Loss -33.217
Training finished. Plot saved to CartPole-v1_PG_iters50_batch32_g0.99_lr0.001_rtg_advnorm_20251028-183942.png
Saved plot: CartPole-v1_PG_iters50_batch32_g0.99_lr0.001_rtg_advnorm_202510

In [None]:
%%writefile CartPole-PG-c.py

import subprocess
import os
import sys

# Fixed settings
environment = "CartPole-v1"
iterations = 500
learning_rate = 1e-3
gamma = 0.99

# Batch sizes to compare
batch_sizes = [50, 500, 5000]

# Store results
# NOTE: nothing below fills this dict; the training script's output is only
# echoed to the console.
batch_rewards = {}

# Resolve the training script once, as an absolute path. If __file__ is a bare
# relative name, os.path.dirname(__file__) is "" and cannot be used as a
# working directory for the child process.
script_dir = os.path.dirname(os.path.abspath(__file__))
script_path = os.path.join(script_dir, "CartPole-PG-b.py")

for b in batch_sizes:
    command = [
        sys.executable,
        script_path,
        "--environment", environment,
        "--iterations", str(iterations),
        "--batch_size", str(b),
        "--lr", str(learning_rate),
        "--gamma", str(gamma),
        "--reward_to_go",
        "--advantage_norm",
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
            cwd=script_dir,
        )
    except subprocess.CalledProcessError as e:
        # A failed run is reported and skipped so the remaining batch sizes
        # still execute.
        print("Subprocess failed with returncode", e.returncode)
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
        continue

    # show subprocess output for debugging / parsing
    if result.stdout:
        print(result.stdout)
    if result.stderr:
        print("Subprocess stderr:", result.stderr)


In [None]:
# run the python CMD via CLI


# Part(a)

A random agent in the CartPole-v1 Gym environment receives consistently low and volatile rewards, usually between 15 and 28 per episode across many iterations. Because the agent never learns from or exploits the environment, it frequently fails to balance the pole. Occasional higher rewards occur only by chance.

---

![Random Agent Performance](part(a)/CartPole-v1_500_16_random_agent.png)
<center>Figure-1: Average reward per iteration for a random agent in the CartPole-v1 environment over 500 iterations (16 episodes per iteration)
</center>

---

---

# Part(b)

- Advantage Normalization speeds up learning and makes it more stable, helping the model hit near-optimal performance pretty quickly.

- Reward-to-Go boosts performance too, but without advantage normalization, things get shaky, with some drops due to high variance.

- No Reward-to-Go leads to a bumpy ride, with performance dropping a lot, showing that it's less efficient and struggles with variance.

- Combining Both gives the best results, with faster learning, less fluctuation, and overall smoother training.

- Advantage Normalization on its own is the best for stabilizing things, especially in noisy environments like CartPole-v1.

- Without Either, you get a lot of instability, with performance going up and down, and it's hard to maintain high returns.

---
![rTaT](part(b)/CartPole-v1_PG_iters500_bs5000_g_lr_rtg_advnorm_.png)
![rTaF](part(b)/CartPole-v1_PG_iters500_bs5000_g_lr_rtg_noadv_.png)
![rFaT](part(b)/CartPole-v1_PG_iters500_bs5000_g_lr_tot_advnorm_.png)
![rFaF](part(b)/CartPole-v1_PG_iters500_bs5000_g_lr_tot_noadv_.png)
<center>Figure-2,3,4,5: Average Return over 500 iterations with a batch size of 5000 for different combinations of Reward-to-Go and Advantage Normalization
</center>

---

---

# Part(c)

- Small Batch (bs=50) is super noisy — lots of ups and downs, takes longer to stabilize, and even after a lot of training, the agent can still mess up.

- Medium Batch (bs=500) smooths things out a bit — faster and more reliable learning, but still some random dips later on.

- Large Batch (bs=5000) is the smoothest — consistent, stable, and hits optimal performance without big drops.

- Smaller batches update faster but with higher variance, making training more unstable in the long run.

- Larger batches cut down on gradient noise, making the learning process steadier and more reliable.

- Big trade-off: small batches are faster but unstable, while large batches are slower but way more consistent.

---

![bs50](part(c)/CartPole-v1_PG_iters500_bs50_g_lr_rtg_advnorm_.png)
![bs500](part(c)/CartPole-v1_PG_iters500_bs500_g_lr_rtg_advnorm_.png)
![bs5000](part(c)/CartPole-v1_PG_iters500_bs5000_g_lr_rtg_advnorm_.png)
<center>Figure-6,7,8: Average Return over 500 iterations for different batch sizes with Reward-to-Go and Advantage Normalization enabled
</center>

---

---