TODO: CartPole is yet to be implemented.


In [None]:
%%writefile CartPole-PG-a.py

import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import torch
import random
import argparse
from itertools import count


# Command-line interface of the random-agent baseline script.
parser = argparse.ArgumentParser(description="Policy Gradient for CartPole-v0/v1")
parser.add_argument("--environment", type=str, required=True,
                    help="Gym environment name (e.g., CartPole-v1)")
parser.add_argument("--iterations", type=int, default=50,
                    help="Number of iterations (each iteration = batch of episodes)")
parser.add_argument("--batch", type=int, default=32,
                    help="Batch size (number of episodes per iteration)")
# Parsed at module level because plot_rewards() reads args.environment for its title.
args = parser.parse_args()



def plot_rewards(rewards, filename):
    """Draw the per-iteration average total reward as a line chart and save it.

    The chart title includes the module-level ``args.environment``; the figure
    is written to ``filename`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 5), dpi=100)

    ax.plot(rewards, label='Average Reward per Iteration')
    ax.set_title(f"Random Agent Performance on {args.environment}")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Average Total Reward")
    ax.grid(True)
    ax.legend()

    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


def get_state(obs):
    """Turn an observation into a float32 tensor with a leading axis of size 1."""
    as_array = np.array(obs)
    as_float = torch.from_numpy(as_array).float()
    return as_float.unsqueeze(0)


def select_random_action(env):
    """Return one action drawn by sampling the environment's action space."""
    action_space = env.action_space
    return action_space.sample()



def run_random_agent(env, iterations, batch_size):
    """Play random-action episodes and record the mean total reward per iteration.

    Every iteration plays ``batch_size`` episodes until each one ends, prints
    the mean of their total rewards and stores it. The environment is closed
    before the function returns.

    Returns:
        A list holding one mean total reward per iteration.
    """
    mean_rewards = []

    for iteration in range(iterations):
        episode_totals = []

        for _ in range(batch_size):
            reset_result = env.reset()
            # Gym v0.26+ returns a tuple from reset(); its first item is the observation.
            obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            episode_total = 0
            finished = False
            while not finished:
                outcome = env.step(select_random_action(env))

                # A 5-item result carries separate terminated/truncated flags;
                # otherwise a single done flag is expected.
                if len(outcome) == 5:
                    obs, reward, terminated, truncated, _ = outcome
                    finished = terminated or truncated
                else:
                    obs, reward, finished, _ = outcome

                episode_total += reward

            episode_totals.append(episode_total)

        mean_reward = np.mean(episode_totals)
        print(f"Iteration {iteration+1}/{iterations}: Average Reward = {mean_reward:.2f}")
        mean_rewards.append(mean_reward)

    env.close()
    return mean_rewards



if __name__ == "__main__":
    # Create the environment named on the command line and describe its spaces.
    env = gym.make(args.environment)
    print(
        f"\nLoaded Environment: {args.environment}",
        f"Observation space: {env.observation_space}",
        f"Action space: {env.action_space}\n",
        sep="\n",
    )

    # Collect the random-agent baseline (this also closes the environment).
    baseline_rewards = run_random_agent(env, args.iterations, args.batch)

    # Write the reward curve next to the script, named after the run settings.
    plot_path = f"{args.environment}_{args.iterations}_{args.batch}_random_agent.png"
    plot_rewards(baseline_rewards, plot_path)
    print(f"Plot saved as {plot_path}")


Writing CartPole-PG-a.py


In [None]:
# !python3 CartPole-PG-a.py --environment CartPole-v1 --iterations 500 --batch 16



Loaded Environment: CartPole-v1
Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Action space: Discrete(2)

Iteration 1/500: Average Reward = 20.12
Iteration 2/500: Average Reward = 25.00
Iteration 3/500: Average Reward = 20.62
Iteration 4/500: Average Reward = 21.19
Iteration 5/500: Average Reward = 19.56
Iteration 6/500: Average Reward = 20.62
Iteration 7/500: Average Reward = 26.44
Iteration 8/500: Average Reward = 23.38
Iteration 9/500: Average Reward = 26.44
Iteration 10/500: Average Reward = 29.94
Iteration 11/500: Average Reward = 18.44
Iteration 12/500: Average Reward = 20.75
Iteration 13/500: Average Reward = 23.38
Iteration 14/500: Average Reward = 18.94
Iteration 15/500: Average Reward = 20.19
Iteration 16/500: Average Reward = 20.38
Iteration 17/500: Average Reward = 27.50
Iteration 18/500: Average Reward = 30.38
Iteration 19/500: Average Reward = 25.62
Iteration 20/500: Average Reward =

In [None]:
%%writefile CartPole-PG-b.py
import argparse, os, time, random
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt


class Policy(nn.Module):
    """Fully connected network that maps observations to action probabilities.

    The network is a stack of ``Linear`` + ``ReLU`` pairs, one per entry of
    ``hidden_sizes``, followed by a final ``Linear`` layer and a softmax over
    the last dimension.
    """

    def __init__(self, obs_dim, act_dim, hidden_sizes=(128, 128)):
        super().__init__()
        widths = [obs_dim, *hidden_sizes]

        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
        modules.append(nn.Linear(widths[-1], act_dim))
        modules.append(nn.Softmax(dim=-1))

        self.net = nn.Sequential(*modules)

    def forward(self, obs):
        """Return the softmax output of the network for ``obs``."""
        return self.net(obs)


def discount_cumsum(rewards, gamma):
    """Return, for every step, the discounted sum of rewards from that step on.

    The running sum is accumulated back to front; results are stored as
    float32 values and handed back as a plain list.
    """
    horizon = len(rewards)
    out = np.zeros_like(rewards, dtype=np.float32)
    tail = 0
    for idx in range(horizon - 1, -1, -1):
        tail = rewards[idx] + gamma * tail
        out[idx] = tail
    return list(out)


def get_obs(obs):
    """Return the observation as a NumPy array.

    When ``obs`` is a tuple or list, only its first item is converted;
    anything else is converted as a whole.
    """
    if isinstance(obs, (tuple, list)):
        obs = obs[0]
    return np.array(obs)


def step_env(env, action):
    """Step the environment and return ``(obs, reward, done, info)``.

    A 5-item step result has its terminated and truncated flags folded into a
    single ``done`` value; any other result is unpacked as a 4-item tuple.
    """
    result = env.step(action)
    if len(result) != 5:
        obs, reward, done, info = result
        return obs, reward, done, info

    obs, reward, terminated, truncated, info = result
    return obs, reward, terminated or truncated, info


def run_training(env_name, iterations, batch_size, lr, gamma,
                 reward_to_go, advantage_norm, reward_scale,
                 reward_clip, hidden_dim, device, fname):
    """Train a softmax policy with the policy-gradient method and plot progress.

    Args:
        env_name: Environment id handed to ``gym.make``.
        iterations: Number of policy updates to perform.
        batch_size: Minimum number of environment steps gathered per update;
            whole episodes are collected until this many steps are reached.
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor.
        reward_to_go: If true, each step is weighted by the discounted return
            from that step onward; otherwise every step of an episode is
            weighted by the episode's full discounted return.
        advantage_norm: If true, the per-step weights are standardized
            (zero mean, unit standard deviation) over the batch.
        reward_scale: Factor every reward is multiplied by.
        reward_clip: Optional ``(low, high)`` pair; when given, rewards are
            clipped to this range after scaling.
        hidden_dim: Width of each of the two hidden layers of the policy.
        device: Torch device the policy and the loss tensors live on.
        fname: Path the learning-curve plot is saved to.

    Returns:
        A list with, for every iteration, the mean over that iteration's
        episodes of the summed (scaled/clipped, undiscounted) rewards.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, because no step would
            be collected and no loss could be formed.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    env = gym.make(env_name)
    try:
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.n

        policy = Policy(obs_dim, act_dim, hidden_sizes=(hidden_dim, hidden_dim)).to(device)
        optimizer = optim.Adam(policy.parameters(), lr=lr)

        print(f"\nTraining {env_name} | γ={gamma} | lr={lr} | batch={batch_size} | device={device}")

        all_returns = []

        for it in range(iterations):
            batch_weights = []     # one return-based weight per collected step
            batch_log_probs = []   # log-probability of each sampled action
            episode_returns = []   # one summed reward per finished episode

            steps_collected = 0
            while steps_collected < batch_size:
                obs = get_obs(env.reset())
                ep_rewards = []

                done = False
                while not done:
                    obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
                    dist = Categorical(policy(obs_tensor))
                    action = dist.sample()

                    next_obs, reward, done, _ = step_env(env, action.item())
                    if reward_scale != 1.0:
                        reward *= reward_scale
                    if reward_clip:
                        reward = np.clip(reward, reward_clip[0], reward_clip[1])

                    batch_log_probs.append(dist.log_prob(action))
                    ep_rewards.append(reward)
                    obs = get_obs(next_obs)

                # Per-step weights for this episode.
                if reward_to_go:
                    step_weights = discount_cumsum(ep_rewards, gamma)
                else:
                    full_return = sum(gamma**t * r for t, r in enumerate(ep_rewards))
                    step_weights = [full_return] * len(ep_rewards)

                batch_weights += list(step_weights)
                # Kept separate from the per-step weights so the reported
                # average really is an average over the episodes of the batch.
                episode_returns.append(sum(ep_rewards))
                steps_collected += len(ep_rewards)

            # Every log-prob has shape (1,) because the observation carries a
            # leading axis of size 1. Concatenating yields shape (N,), matching
            # the advantages; stacking would yield (N, 1) and broadcast the
            # product below into an (N, N) matrix.
            log_probs_tensor = torch.cat(batch_log_probs).to(device)
            advantages = torch.tensor(batch_weights, dtype=torch.float32, device=device)

            # Baseline + normalization
            if advantage_norm:
                advantages -= advantages.mean()
                advantages /= (advantages.std(unbiased=False) + 1e-8)

            # Policy gradient loss
            loss = (-(log_probs_tensor * advantages)).mean()

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), 1.0)
            optimizer.step()

            avg_return = np.mean(episode_returns)
            all_returns.append(avg_return)

            print(f"Iter {it+1:4d}/{iterations} | AvgReturn: {avg_return:8.2f} | Loss: {loss.item():.4f}")
    finally:
        # Release the environment even when training is interrupted by an error.
        env.close()

    plt.figure(figsize=(8, 4))
    plt.plot(all_returns)
    plt.title(f"{env_name} - Policy Gradient (Stable Fast Version)")
    plt.xlabel("Iteration")
    plt.ylabel("Average Return")
    plt.grid(True)
    plt.tight_layout()

    plt.savefig(fname)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
    print(f"Plot saved as {fname}\n")
    return all_returns

if __name__ == "__main__":
    # Command-line interface of the policy-gradient training script.
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default="CartPole-v1")
    parser.add_argument("--iterations", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8000)
    # --learning_rate is accepted as an alias so both spellings reach args.lr.
    parser.add_argument("--lr", "--learning_rate", dest="lr", type=float, default=1e-3)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--reward_to_go", action="store_true")
    parser.add_argument("--advantage_norm", action="store_true")
    parser.add_argument("--reward_scale", type=float, default=1.0)
    parser.add_argument("--reward_clip_min", type=float, default=None)
    parser.add_argument("--reward_clip_max", type=float, default=None)
    parser.add_argument("--hidden_dim", type=int, default=128)
    args = parser.parse_args()

    # Build the clip range only when at least one bound was given; a missing
    # bound is left open by using an infinite limit on that side.
    clip_tuple = None
    if args.reward_clip_min is not None or args.reward_clip_max is not None:
        lo = -np.inf if args.reward_clip_min is None else args.reward_clip_min
        hi = np.inf if args.reward_clip_max is None else args.reward_clip_max
        clip_tuple = (lo, hi)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # gamma and lr are part of the file name so that runs differing only in
    # those hyper-parameters do not overwrite each other's plot.
    fname = f"{args.environment}_PG_iters{args.iterations}_bs{args.batch_size}_g{args.gamma}_lr{args.lr}_{'rtg' if args.reward_to_go else 'tot'}_{'advnorm' if args.advantage_norm else 'noadv'}_.png"
    run_training(
        env_name=args.environment,
        iterations=args.iterations,
        batch_size=args.batch_size,
        lr=args.lr,
        gamma=args.gamma,
        reward_to_go=args.reward_to_go,
        advantage_norm=args.advantage_norm,
        reward_scale=args.reward_scale,
        reward_clip=clip_tuple,
        hidden_dim=args.hidden_dim,
        device=device,
        fname=fname
    )


Overwriting CartPole-PG-b.py


In [None]:
# !python3 CartPole-PG-b.py --environment CartPole-v1 --iterations 50 --batch_size 32 --gamma 0.99 --lr 1e-3 --reward_to_go --advantage_norm


Environment: CartPole-v1
Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Discrete(2)
Device: cpu
Policy Params: 17410
Iter 1/50 | AvgReturn 22.000 | Loss -3.502
Iter 2/50 | AvgReturn 22.031 | Loss -4.476
Iter 3/50 | AvgReturn 27.250 | Loss -9.461
Iter 4/50 | AvgReturn 26.906 | Loss -7.115
Iter 5/50 | AvgReturn 26.844 | Loss -14.143
Iter 10/50 | AvgReturn 40.250 | Loss -19.295
Iter 15/50 | AvgReturn 45.156 | Loss -28.071
Iter 20/50 | AvgReturn 71.375 | Loss -19.352
Iter 25/50 | AvgReturn 126.750 | Loss -52.173
Iter 30/50 | AvgReturn 179.188 | Loss -111.890
Iter 35/50 | AvgReturn 254.062 | Loss -22.117
Iter 40/50 | AvgReturn 324.875 | Loss -81.378
Iter 45/50 | AvgReturn 376.438 | Loss -12.135
Iter 50/50 | AvgReturn 421.000 | Loss -33.217
Training finished. Plot saved to CartPole-v1_PG_iters50_batch32_g0.99_lr0.001_rtg_advnorm_20251028-183942.png
Saved plot: CartPole-v1_PG_iters50_batch32_g0.99_lr0.001_rtg_advnorm_202510

In [None]:
%%writefile CartPole-PG-c.py

import subprocess
import matplotlib.pyplot as plt
import argparse, os, time, random
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
import sys

# Fixed settings
environment = "CartPole-v1"
iterations = 100
learning_rate = 1e-3
gamma = 0.99

# Batch sizes to compare
batch_sizes = [32, 320, 800, 1000, 10000]

# Store results
# NOTE(review): nothing below fills this dict yet; the training output is only printed.
batch_rewards = {}

# Resolved once, outside the loop. abspath guards against a relative __file__,
# for which dirname() would be '' and could not be used as a working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
script_path = os.path.join(script_dir, "CartPole-PG-b.py")

# Only the first two batch sizes are run.
for b in batch_sizes[0:2]:
    try:
        result = subprocess.run(
            [
                sys.executable,
                script_path,
                "--environment", environment,
                "--iterations", str(iterations),
                "--batch_size", str(b),
                "--lr", str(learning_rate),
                "--gamma", str(gamma),
                "--reward_to_go",
                "--advantage_norm"
            ],
            capture_output=True,
            text=True,
            check=True,
            cwd=script_dir
        )
    except subprocess.CalledProcessError as e:
        # A failed run is reported and the sweep moves on to the next batch size.
        print("Subprocess failed with returncode", e.returncode)
        if e.stdout:
            print(e.stdout)
        if e.stderr:
            print(e.stderr)
    else:
        # show subprocess output for debugging / parsing
        if result.stdout:
            print(result.stdout)
        if result.stderr:
            print("Subprocess stderr:", result.stderr)



Running with batch size = 8


  return datetime.utcnow().replace(tzinfo=utc)


SyntaxError: invalid syntax (<string>, line 1)