<a href="https://colab.research.google.com/github/sasya05/Reinforcement-learning/blob/main/lab9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torch gymnasium[classic-control] numpy




In [2]:
"""
PPO (continuous actions) — PyTorch implementation
Works with gymnasium (Pendulum-v1 by default).
"""

import time
import math
import numpy as np
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_   # numpy >= 2.0 compat

import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

# --------------------------
# Actor-Critic network
# --------------------------
class ActorCritic(nn.Module):
    """Gaussian policy and state-value estimator on top of one shared MLP trunk.

    The trunk is two ``Linear -> Tanh`` stages of width ``hidden_size``. Two
    linear heads read the trunk output: ``mu_head`` produces the action mean
    and ``value_head`` produces a scalar value. The action standard deviation
    does not depend on the input; it is ``exp(log_std)`` for a learnable
    ``log_std`` vector with one entry per action dimension, initialised to 0.
    """

    def __init__(self, state_dim, action_dim, hidden_size=256):
        super().__init__()
        trunk_layers = [
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
        ]
        self.shared = nn.Sequential(*trunk_layers)
        self.mu_head = nn.Linear(hidden_size, action_dim)
        # State-independent log standard deviation, one entry per action dim.
        self.log_std = nn.Parameter(torch.zeros(action_dim))
        self.value_head = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return ``(mean, std, value)`` for the observations ``x``.

        ``mean`` comes from ``mu_head``, ``std`` is ``exp(log_std)`` (not
        broadcast to the batch here), and ``value`` is the ``value_head``
        output with its trailing size-1 dimension squeezed away.
        """
        features = self.shared(x)
        action_mean = self.mu_head(features)
        state_value = self.value_head(features).squeeze(-1)
        action_std = self.log_std.exp()
        return action_mean, action_std, state_value

# --------------------------
# Helper functions
# --------------------------
def discount_cumsum(x, discount):
    """Return the discounted suffix sums of ``x`` as a NumPy array.

    Element ``t`` of the result equals
    ``x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...`` up to the end
    of ``x``. An empty ``x`` gives an empty array.

    The sums are built in one backward pass using
    ``out[t] = x[t] + discount * out[t+1]``, so the cost is linear in
    ``len(x)`` instead of re-summing every suffix from scratch.
    """
    suffix_sums = []
    running = 0
    for value in reversed(x):
        running = value + discount * running
        suffix_sums.append(running)
    # Values were produced from the last element backwards; restore order.
    suffix_sums.reverse()
    return np.array(suffix_sums)

# --------------------------
# PPO agent / training
# --------------------------
class PPO:
    """Clipped-surrogate PPO agent for continuous-action gymnasium environments.

    One training update consists of:

    1. ``collect_rollout`` - run the current policy for ``rollout_steps``
       environment steps,
    2. ``compute_gae`` - turn rewards and value estimates into advantages
       and value targets,
    3. ``ppo_update`` - several epochs of minibatch gradient steps on the
       combined actor / critic / entropy loss.

    Args:
        env_name: id passed to ``gym.make``.
        hidden_size: width of the ``ActorCritic`` hidden layers.
        lr: Adam learning rate.
        gamma: discount factor.
        lam: GAE lambda.
        clip_eps: clipping range for the probability ratio.
        epochs: stored on the instance but not read by any method of this
            class; ``policy_epochs`` controls the number of passes. Kept so
            existing call sites keep working.
        minibatch_size: default minibatch size for ``ppo_update``.
        rollout_steps: environment steps collected per update.
        policy_epochs: passes over each rollout in ``ppo_update``.
        value_coef: weight of the critic loss.
        ent_coef: weight of the entropy bonus.
        max_grad_norm: gradient-norm clipping threshold.
        device: torch device; defaults to CUDA when available, else CPU.
    """

    def __init__(self,
                 env_name="Pendulum-v1",
                 hidden_size=256,
                 lr=3e-4,
                 gamma=0.99,
                 lam=0.95,
                 clip_eps=0.2,
                 epochs=10,
                 minibatch_size=64,
                 rollout_steps=2048,
                 policy_epochs=10,
                 value_coef=0.5,
                 ent_coef=0.0,
                 max_grad_norm=0.5,
                 device=None):
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.device = device or (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

        self.obs_dim = self.env.observation_space.shape[0]
        self.act_dim = self.env.action_space.shape[0]
        self.act_low = self.env.action_space.low
        self.act_high = self.env.action_space.high

        self.model = ActorCritic(self.obs_dim, self.act_dim, hidden_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.lam = lam
        self.clip_eps = clip_eps
        self.epochs = epochs
        self.minibatch_size = minibatch_size
        self.rollout_steps = rollout_steps
        self.policy_epochs = policy_epochs
        self.value_coef = value_coef
        self.ent_coef = ent_coef
        self.max_grad_norm = max_grad_norm

    def close(self):
        """Close the training environment created in ``__init__``."""
        self.env.close()

    def scale_action(self, raw_action):
        """Clip ``raw_action`` element-wise into ``[act_low, act_high]``.

        The policy samples from an unbounded Normal distribution; no tanh
        squashing or rescaling is applied, out-of-range samples are clipped.
        """
        return np.clip(raw_action, self.act_low, self.act_high)

    def _obs_to_tensor(self, obs):
        """Convert one observation into a float32 batch of size 1 on ``self.device``."""
        return torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)

    def _state_value(self, obs):
        """Return the critic's value estimate for one observation as a float."""
        with torch.no_grad():
            _, _, value = self.model(self._obs_to_tensor(obs))
        return value.cpu().item()

    def collect_rollout(self):
        """Run the current policy for ``rollout_steps`` steps and return the data.

        The environment is reset at the start of the rollout and again after
        every finished episode.

        Returns:
            dict of NumPy arrays, one entry per step unless noted:

            * ``obs`` - observation the action was chosen from
            * ``acts`` - raw (unclipped) sampled action; this is the action
              whose log-probability is stored, the env receives the clipped one
            * ``logp`` - log-probability of ``acts`` under the sampling policy
            * ``rews`` - raw environment reward
            * ``vals`` - critic value of ``obs``
            * ``dones`` - 1.0 if the episode ended (terminated or truncated)
            * ``timeout_vals`` - critic value of the successor state on steps
              that were truncated without terminating, 0.0 elsewhere
            * ``last_val`` - (float) critic value of the observation after
              the final step, used to bootstrap the tail of the rollout
        """
        obs_buf = []
        act_buf = []
        logp_buf = []
        rew_buf = []
        val_buf = []
        done_buf = []
        timeout_val_buf = []

        obs, _ = self.env.reset()
        for _ in range(self.rollout_steps):
            with torch.no_grad():
                mu, std, value = self.model(self._obs_to_tensor(obs))
                dist = Normal(mu, std)
                action = dist.sample()
                logp = dist.log_prob(action).sum(dim=-1)
            action_np = action.cpu().numpy().squeeze(0)
            next_obs, reward, terminated, truncated, _ = self.env.step(self.scale_action(action_np))
            done = float(terminated or truncated)

            # A truncated episode was cut short rather than reaching a
            # terminal state, so its successor state still has value.
            # Record it so the caller can bootstrap instead of assuming 0.
            if truncated and not terminated:
                timeout_val = self._state_value(next_obs)
            else:
                timeout_val = 0.0

            obs_buf.append(obs)
            act_buf.append(action_np)
            logp_buf.append(logp.cpu().item())
            rew_buf.append(float(reward))
            val_buf.append(value.cpu().item())
            done_buf.append(done)
            timeout_val_buf.append(timeout_val)

            obs = next_obs
            if done:
                obs, _ = self.env.reset()

        # Value of the state following the last collected step; it is masked
        # out in compute_gae when that step ended an episode.
        last_val = self._state_value(obs)

        return {
            'obs': np.array(obs_buf, dtype=np.float32),
            'acts': np.array(act_buf, dtype=np.float32),
            'logp': np.array(logp_buf, dtype=np.float32),
            'rews': np.array(rew_buf, dtype=np.float32),
            'vals': np.array(val_buf, dtype=np.float32),
            'dones': np.array(done_buf, dtype=np.float32),
            'timeout_vals': np.array(timeout_val_buf, dtype=np.float32),
            'last_val': last_val
        }

    def compute_gae(self, rews, vals, dones, last_val):
        """Compute Generalized Advantage Estimation over one rollout.

        Args:
            rews: per-step rewards.
            vals: per-step value estimates of the state acted from.
            dones: per-step flags; ``dones[t] == 1`` means the episode ended
                with step ``t``, so nothing is propagated across that step.
            last_val: value estimate of the state after the final step.

        Returns:
            ``(adv, returns)`` where ``adv`` is a float32 array of advantages
            and ``returns = adv + vals`` are the critic regression targets.
        """
        T = len(rews)
        adv = np.zeros(T, dtype=np.float32)
        last_gae = 0.0
        for t in reversed(range(T)):
            next_value = last_val if t == T - 1 else vals[t + 1]
            next_non_terminal = 1.0 - dones[t]
            delta = rews[t] + self.gamma * next_value * next_non_terminal - vals[t]
            last_gae = delta + self.gamma * self.lam * next_non_terminal * last_gae
            adv[t] = last_gae
        returns = adv + vals
        return adv, returns

    def ppo_update(self, batch, batch_size=None):
        """Run ``policy_epochs`` passes of clipped-PPO minibatch updates.

        Args:
            batch: dict with ``obs``, ``acts``, ``logp``, ``adv`` and ``ret``
                arrays of equal length.
            batch_size: minibatch size; ``None`` falls back to
                ``self.minibatch_size``.

        Raises:
            ValueError: if the resolved minibatch size is not positive.
        """
        mb_size = self.minibatch_size if batch_size is None else int(batch_size)
        if mb_size <= 0:
            raise ValueError(f"batch_size must be positive, got {mb_size}")

        obs = torch.tensor(batch['obs'], dtype=torch.float32, device=self.device)
        acts = torch.tensor(batch['acts'], dtype=torch.float32, device=self.device)
        old_logp = torch.tensor(batch['logp'], dtype=torch.float32, device=self.device)
        adv = torch.tensor(batch['adv'], dtype=torch.float32, device=self.device)
        ret = torch.tensor(batch['ret'], dtype=torch.float32, device=self.device)

        # Normalize advantages over the whole rollout. The sample standard
        # deviation of a single element is NaN, so skip that degenerate case.
        if adv.numel() > 1:
            adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        N = obs.shape[0]
        idxs = np.arange(N)
        for _ in range(self.policy_epochs):
            np.random.shuffle(idxs)
            for start in range(0, N, mb_size):
                mb_idx = idxs[start:start + mb_size]
                mb_obs = obs[mb_idx]
                mb_acts = acts[mb_idx]
                mb_old_logp = old_logp[mb_idx]
                mb_adv = adv[mb_idx]
                mb_ret = ret[mb_idx]

                mu, std, value = self.model(mb_obs)
                dist = Normal(mu, std)
                mb_logp = dist.log_prob(mb_acts).sum(dim=-1)
                entropy = dist.entropy().sum(dim=-1).mean()

                # Clipped surrogate: take the pessimistic of the unclipped
                # and clipped ratio-weighted advantages.
                ratio = torch.exp(mb_logp - mb_old_logp)
                surr1 = ratio * mb_adv
                surr2 = torch.clamp(ratio, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * mb_adv
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = nn.functional.mse_loss(value, mb_ret)
                loss = actor_loss + self.value_coef * critic_loss - self.ent_coef * entropy

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
                self.optimizer.step()

    def train(self, total_updates=200):
        """Alternate rollout collection and PPO updates, then save the weights.

        Progress is printed on the first and every fifth update. After the
        last update the model ``state_dict`` is written to
        ``ppo_continuous.pth`` in the current working directory.
        """
        print(f"Training on {self.env_name} with device={self.device}")
        start_time = time.time()
        for update in range(1, total_updates + 1):
            batch = self.collect_rollout()
            # Fold the value of truncated successor states into the reward so
            # a time-limit cut-off is not scored as a zero-value terminal
            # state; batch['rews'] itself stays raw for logging.
            gae_rews = batch['rews'] + self.gamma * batch['timeout_vals']
            adv, ret = self.compute_gae(gae_rews, batch['vals'], batch['dones'], batch['last_val'])
            batch['adv'] = adv
            batch['ret'] = ret

            self.ppo_update(batch, self.minibatch_size)

            # simple logging: average return estimate over rollout
            avg_ret = np.mean(ret)
            avg_reward = np.mean(batch['rews'])
            if update % 5 == 0 or update == 1:
                print(f"Update {update}/{total_updates} | avg_rollout_reward {avg_reward:.3f} | avg_ret {avg_ret:.3f}")

        total_time = time.time() - start_time
        print(f"Training finished in {total_time:.1f}s. Saving model to ppo_continuous.pth")
        torch.save(self.model.state_dict(), "ppo_continuous.pth")

    def evaluate(self, episodes=5, render=False):
        """Run the deterministic (mean-action) policy and print episode results.

        A fresh environment is created for evaluation and is closed even if
        an episode raises.

        Args:
            episodes: number of episodes to run.
            render: when true, the environment is created with
                ``render_mode="human"``.

        Returns:
            list of the undiscounted reward sum of each episode.
        """
        env = gym.make(self.env_name, render_mode="human" if render else None)
        episode_returns = []
        try:
            for ep in range(episodes):
                obs, _ = env.reset()
                total = 0.0
                steps = 0
                while True:
                    with torch.no_grad():
                        mu, _, _ = self.model(self._obs_to_tensor(obs))
                    action = self.scale_action(mu.cpu().numpy().squeeze(0))
                    obs, reward, terminated, truncated, _ = env.step(action)
                    total += float(reward)
                    steps += 1
                    if terminated or truncated:
                        break
                print(f"Eval ep {ep+1}: reward={total:.3f} steps={steps}")
                episode_returns.append(total)
        finally:
            env.close()
        return episode_returns

# --------------------------
# Run
# --------------------------
if __name__ == "__main__":
    # All tweakable hyperparameters live in one mapping and are forwarded
    # to the agent as keyword arguments.
    hyperparams = {
        "env_name": "Pendulum-v1",
        "hidden_size": 256,
        "lr": 3e-4,
        "gamma": 0.99,
        "lam": 0.95,
        "clip_eps": 0.2,
        "epochs": 10,
        "minibatch_size": 64,
        "rollout_steps": 2048,
        "policy_epochs": 10,
        "value_coef": 0.5,
        "ent_coef": 0.0,
    }
    ppo = PPO(**hyperparams)

    # Every update gathers `rollout_steps` environment steps before learning.
    ppo.train(total_updates=50)

    # Report how the trained policy performs, without rendering.
    ppo.evaluate(episodes=3, render=False)


Training on Pendulum-v1 with device=cpu
Update 1/50 | avg_rollout_reward -5.478 | avg_ret -84.348
Update 5/50 | avg_rollout_reward -6.887 | avg_ret -188.508
Update 10/50 | avg_rollout_reward -6.264 | avg_ret -270.695
Update 15/50 | avg_rollout_reward -6.689 | avg_ret -352.228
Update 20/50 | avg_rollout_reward -6.621 | avg_ret -380.336
Update 25/50 | avg_rollout_reward -6.341 | avg_ret -398.225
Update 30/50 | avg_rollout_reward -6.304 | avg_ret -405.918
Update 35/50 | avg_rollout_reward -6.565 | avg_ret -414.604
Update 40/50 | avg_rollout_reward -6.162 | avg_ret -412.687
Update 45/50 | avg_rollout_reward -6.045 | avg_ret -406.215
Update 50/50 | avg_rollout_reward -6.048 | avg_ret -392.367
Training finished in 104.6s. Saving model to ppo_continuous.pth
Eval ep 1: reward=-1051.506 steps=200
Eval ep 2: reward=-1103.852 steps=200
Eval ep 3: reward=-1734.714 steps=200
