<a href="https://colab.research.google.com/github/sasya05/Reinforcement-learning/blob/main/lab8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install gymnasium[classic-control] torch numpy




In [3]:
# =========================================================
# ✅ Advantage Actor–Critic (A2C) — Continuous Action Example
# Works with Python 3.12, NumPy ≥ 2.0, Gymnasium, and PyTorch
# =========================================================

import numpy as np
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_   # Compatibility for NumPy ≥ 2.0

import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import time

# ------------------------------
# Actor–Critic Network
# ------------------------------
class ActorCritic(nn.Module):
    """Actor-critic network with a shared trunk and two linear heads.

    The trunk is a single ``Linear -> ReLU`` layer. On top of it sit a
    policy head producing the mean of a Gaussian and a value head
    producing one scalar per input row. The Gaussian's standard deviation
    does not depend on the input: it is ``exp(log_std)`` where ``log_std``
    is a learnable parameter initialised to zeros.

    Args:
        state_dim: Number of input features per state.
        action_dim: Number of action components (size of the mean).
        hidden_dim: Width of the shared hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Feature extractor shared by the actor and the critic.
        self.shared = nn.Sequential(nn.Linear(state_dim, hidden_dim), nn.ReLU())
        # Actor head: mean of the action distribution.
        self.mu = nn.Linear(hidden_dim, action_dim)
        # State-independent log standard deviation, one entry per action dim.
        self.log_std = nn.Parameter(torch.zeros(action_dim))
        # Critic head: scalar state-value estimate.
        self.value = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Return ``(mean, std, value)`` for ``state``.

        ``mean`` and ``value`` come from the two heads applied to the shared
        features; ``std`` is ``exp(log_std)`` and has shape ``(action_dim,)``.
        """
        features = self.shared(state)
        mean = self.mu(features)
        state_value = self.value(features)
        return mean, self.log_std.exp(), state_value

# ------------------------------
# A2C Agent
# ------------------------------
class A2CAgent:
    """Advantage actor-critic agent with a Gaussian policy.

    Owns an ``ActorCritic`` model and an Adam optimizer over all of its
    parameters. One call to :meth:`update` performs a single gradient step
    on ``actor_loss + 0.5 * critic_loss`` for one collected rollout.

    Args:
        state_dim: Number of features in an observation.
        action_dim: Number of action components.
        lr: Adam learning rate.
        gamma: Discount factor used when computing returns.
    """

    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99):
        self.gamma = gamma
        self.model = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state):
        """Sample an action for a single observation.

        Args:
            state: One observation; it is converted to a float32 tensor and
                given a leading batch dimension of size 1.

        Returns:
            A ``(action, log_prob)`` pair. ``action`` is a NumPy array with
            the batch dimension removed. ``log_prob`` is a tensor holding the
            log-probability summed over the last (action) dimension; it stays
            attached to the autograd graph so it can be used in the policy
            loss.
        """
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        mu, std, _ = self.model(state)
        dist = Normal(mu, std)
        action = dist.sample()
        log_prob = dist.log_prob(action).sum(dim=-1)
        return action.squeeze(0).detach().numpy(), log_prob

    def compute_returns(self, rewards, dones, values, next_value):
        """Compute discounted returns, bootstrapping from ``next_value``.

        Walks the rollout backwards applying
        ``R_t = r_t + gamma * R_{t+1} * (1 - done_t)``, starting from
        ``R = next_value``.

        Args:
            rewards: Sequence of per-step rewards.
            dones: Sequence of per-step masks; a value of 1 cuts the
                bootstrap at that step.
            values: Unused; kept so existing callers keep working.
            next_value: Value estimate for the state after the last step.

        Returns:
            A list of returns in the same (forward) order as ``rewards``.
        """
        returns = []
        running = next_value
        for reward, done in zip(reversed(rewards), reversed(dones)):
            running = reward + self.gamma * running * (1 - done)
            returns.append(running)
        # Appending and reversing once is O(n); insert(0, ...) was O(n^2).
        returns.reverse()
        return returns

    def update(self, log_probs, values, rewards, dones, next_value):
        """Run one optimisation step on a collected rollout.

        Args:
            log_probs: List of per-step log-probability tensors.
            values: List of per-step critic outputs (with gradients).
            rewards: List of per-step reward tensors.
            dones: List of per-step bootstrap masks (tensors).
            next_value: Critic estimate for the state after the last step.
        """
        returns = self.compute_returns(rewards, dones, values, next_value)

        # Flatten everything to 1-D before combining. Without this, a (T,)
        # returns tensor against (T, 1) values would silently broadcast to
        # (T, T) and corrupt both losses.
        returns = torch.cat([r.reshape(-1) for r in returns]).detach()
        values = torch.cat([v.reshape(-1) for v in values])
        log_probs = torch.cat([lp.reshape(-1) for lp in log_probs])

        advantage = returns - values
        # The advantage is detached in the actor term so the policy gradient
        # does not flow into the critic head.
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

# ------------------------------
# Training Loop
# ------------------------------
def train(env_name="Pendulum-v1", num_episodes=300, max_steps=200):
    """Train an A2C agent, performing one update per episode.

    Each episode collects up to ``max_steps`` transitions, then calls
    ``agent.update`` once on the whole rollout. Every 10 episodes the mean
    of the most recent (up to) 10 episode rewards is printed. After training
    the model weights are saved to ``a2c_policy.pth``.

    Args:
        env_name: Gymnasium environment id passed to ``gym.make``.
        num_episodes: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode.

    Returns:
        ``(agent, rewards_all)`` where ``rewards_all`` holds the total reward
        of every episode in order.
    """
    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        agent = A2CAgent(state_dim, action_dim)
        rewards_all = []

        for episode in range(num_episodes):
            state, _ = env.reset()
            log_probs, values, rewards, dones = [], [], [], []
            total_reward = 0

            for step in range(max_steps):
                action, log_prob = agent.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)

                # Separate forward pass so the stored value keeps its graph
                # for the critic loss.
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                _, _, value = agent.model(state_tensor)

                log_probs.append(log_prob.unsqueeze(0))
                values.append(value)
                rewards.append(torch.tensor([reward], dtype=torch.float32))
                # Only a true termination may cut the bootstrap. A time-limit
                # truncation is not a terminal state, so masking on it would
                # zero the bootstrap value and bias the returns.
                dones.append(torch.tensor([terminated], dtype=torch.float32))

                total_reward += reward
                state = next_state

                # Either signal still ends the episode.
                if terminated or truncated:
                    break

            # Bootstrap value of the last reached state; it is used only as
            # a target, so no graph is needed.
            with torch.no_grad():
                next_state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                _, _, next_value = agent.model(next_state_tensor)

            agent.update(log_probs, values, rewards, dones, next_value)
            rewards_all.append(total_reward)

            if episode % 10 == 0:
                avg_reward = np.mean(rewards_all[-10:])
                print(f"Episode {episode}, Average Reward: {avg_reward:.2f}")
    finally:
        # Release the environment even if training raises.
        env.close()

    torch.save(agent.model.state_dict(), "a2c_policy.pth")
    print("✅ Training complete! Model saved as a2c_policy.pth")
    return agent, rewards_all

# ------------------------------
# Visualization / Playback
# ------------------------------
def visualize(agent, env_name="Pendulum-v1", episodes=3, max_steps=200):
    """Play back the policy in a rendered environment using its mean action.

    Actions are the policy mean (no sampling), so playback is deterministic
    given the environment state. The total reward of each episode is printed.

    Args:
        agent: Object exposing ``model``, a callable returning
            ``(mean, std, value)`` for a batched state tensor.
        env_name: Gymnasium environment id passed to ``gym.make``.
        episodes: Number of episodes to play.
        max_steps: Maximum number of environment steps per episode.
    """
    env = gym.make(env_name, render_mode="human")
    try:
        for ep in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            for _ in range(max_steps):
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                # Inference only: skip autograd bookkeeping.
                with torch.no_grad():
                    mu, _std, _value = agent.model(state_tensor)
                # Drop the batch dimension to get one action vector.
                action = mu.squeeze(0).numpy()
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                if terminated or truncated:
                    break
            print(f"🎮 Episode {ep+1}: Reward = {total_reward:.2f}")
    finally:
        # Close the render window even if playback is interrupted.
        env.close()

# ------------------------------
# Main Entry
# ------------------------------
if __name__ == "__main__":
    # Time the training run, report the duration, then replay the policy.
    start = time.time()
    agent, rewards = train(num_episodes=300)
    elapsed = round(time.time() - start, 2)
    print(f"Training done in {elapsed} sec")
    visualize(agent)


Episode 0, Average Reward: -1687.95
Episode 10, Average Reward: -1436.97
Episode 20, Average Reward: -1392.66
Episode 30, Average Reward: -1475.79
Episode 40, Average Reward: -1255.14
Episode 50, Average Reward: -1481.63
Episode 60, Average Reward: -1406.42
Episode 70, Average Reward: -1424.84
Episode 80, Average Reward: -1498.36
Episode 90, Average Reward: -1341.39
Episode 100, Average Reward: -1336.14
Episode 110, Average Reward: -1338.94
Episode 120, Average Reward: -1337.62
Episode 130, Average Reward: -1354.38
Episode 140, Average Reward: -1402.93
Episode 150, Average Reward: -1376.94
Episode 160, Average Reward: -1385.53
Episode 170, Average Reward: -1365.98
Episode 180, Average Reward: -1234.99
Episode 190, Average Reward: -1363.01
Episode 200, Average Reward: -1352.72
Episode 210, Average Reward: -1395.63
Episode 220, Average Reward: -1332.62
Episode 230, Average Reward: -1484.52
Episode 240, Average Reward: -1395.51
Episode 250, Average Reward: -1249.93
Episode 260, Average Re