# Week 5 Implementation - CPT-DDPG with Visualizations

This notebook updates the original Week 3 implementation to use a CPT-modified DDPG agent. The CPT update applies a risk-sensitive transformation to the sampled rewards before the critic target is computed. All visualizations and metrics logging from the original code are retained.

You can adjust hyperparameters and network dimensions as needed.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

# --- CPT Transformation Function ---
# Applies a CPT-like transformation to rewards: power-law for gains and extra penalty for losses.
def cpt_transform_tensor(rewards, alpha=0.88, beta=0.88, lambda_=2.25):
    """Return the element-wise CPT-style value of a reward tensor.

    A non-negative reward ``r`` maps to ``r ** alpha``; a negative reward maps
    to ``-lambda_ * (-r) ** beta``.

    Args:
        rewards: Tensor of raw rewards; it is cast with ``.float()`` first.
        alpha: Exponent applied to gains.
        beta: Exponent applied to the magnitude of losses.
        lambda_: Multiplier applied to transformed losses.

    Returns:
        A float tensor with the same shape as ``rewards``.
    """
    values = rewards.float()
    # Each magnitude is zero wherever the other branch applies, so the two
    # transformed parts can simply be combined afterwards.
    gain_magnitude = values.clamp(min=0)
    loss_magnitude = -values.clamp(max=0)
    gains = gain_magnitude.pow(alpha)
    losses = loss_magnitude.pow(beta)
    return gains - lambda_ * losses

# --- Actor Network ---
class Actor(nn.Module):
    """Deterministic policy network mapping a state to an action.

    Two ReLU hidden layers of width ``hidden_dim`` are followed by a linear
    output layer and a ``Tanh``, so every action component lies in [-1, 1].
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Layers are created in network order so that parameter
        # initialisation draws from the RNG in the same sequence.
        stack = []
        in_features = state_dim
        for _ in range(2):
            stack.append(nn.Linear(in_features, hidden_dim))
            stack.append(nn.ReLU())
            in_features = hidden_dim
        stack.append(nn.Linear(hidden_dim, action_dim))
        stack.append(nn.Tanh())  # Assuming actions scaled between -1 and 1
        self.net = nn.Sequential(*stack)

    def forward(self, state):
        """Return the action produced by the policy for ``state``."""
        action = self.net(state)
        return action

# --- Critic Network ---
class Critic(nn.Module):
    """Q-network scoring a (state, action) pair with a single scalar.

    The state and action are concatenated along the last dimension and passed
    through two ReLU hidden layers of width ``hidden_dim`` and a linear head
    with one output.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        joint_dim = state_dim + action_dim
        layers = [
            nn.Linear(joint_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state, action):
        """Return the Q-value estimate for ``state`` and ``action``."""
        joint_input = torch.cat((state, action), dim=-1)
        return self.net(joint_input)

# --- CPT-DDPG Agent ---
class CPTDDPG:
    """DDPG agent whose critic is trained on CPT-transformed rewards.

    The agent keeps an actor and a critic together with slowly tracking target
    copies of each. Rewards sampled from the replay buffer are passed through
    ``cpt_transform_tensor`` before the TD target is formed.

    Args:
        state_dim: Size of the state vector.
        action_dim: Size of the action vector.
        actor_lr: Adam learning rate for the actor.
        critic_lr: Adam learning rate for the critic.
        gamma: Discount factor used in the TD target.
        tau: Interpolation factor for the soft target-network update.
        cpt_alpha: Exponent applied to gains by the CPT transform.
        cpt_beta: Exponent applied to loss magnitudes by the CPT transform.
        cpt_lambda: Multiplier applied to transformed losses.
    """

    def __init__(self, state_dim, action_dim, actor_lr=1e-3, critic_lr=1e-3,
                 gamma=0.99, tau=0.005,
                 cpt_alpha=0.88, cpt_beta=0.88, cpt_lambda=2.25):
        self.gamma = gamma
        self.tau = tau
        self.cpt_alpha = cpt_alpha
        self.cpt_beta = cpt_beta
        self.cpt_lambda = cpt_lambda

        # Actor network and target (target starts as an exact copy)
        self.actor = Actor(state_dim, action_dim)
        self.actor_target = Actor(state_dim, action_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())

        # Critic network and target (target starts as an exact copy)
        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

    def select_action(self, state):
        """Return the actor's action for a single state as a NumPy array.

        Args:
            state: One unbatched state (e.g. a 1-D NumPy array).

        Returns:
            The action for that state, with the batch dimension removed.
        """
        state = torch.FloatTensor(state).unsqueeze(0)
        # Inference only: skip building an autograd graph.
        with torch.no_grad():
            action = self.actor(state)
        return action.cpu().numpy()[0]

    def update(self, replay_buffer, batch_size=64):
        """Run one critic step, one actor step and a soft target update.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns tensors
                ``(state, action, reward, next_state, done)``.
            batch_size: Number of transitions to sample.

        Returns:
            Tuple ``(critic_loss, actor_loss)`` as Python floats.
        """
        state, action, reward, next_state, done = replay_buffer.sample(batch_size)

        # Apply the CPT transformation, then force reward and done to
        # (batch, 1) so they line up with the critic output. With (batch,)
        # inputs the target below would broadcast to (batch, batch) and the
        # MSE would compare every Q-value with every sample's target.
        reward = cpt_transform_tensor(
            reward,
            alpha=self.cpt_alpha,
            beta=self.cpt_beta,
            lambda_=self.cpt_lambda,
        ).reshape(-1, 1)
        done = done.float().reshape(-1, 1)

        with torch.no_grad():
            next_action = self.actor_target(next_state)
            target_Q = self.critic_target(next_state, next_action)
            # Terminal transitions (done == 1) do not bootstrap.
            target_Q = reward + (1 - done) * self.gamma * target_Q

        current_Q = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # The actor ascends the critic's estimate of its own actions.
        actor_loss = -self.critic(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update target networks
        self._soft_update(self.critic, self.critic_target)
        self._soft_update(self.actor, self.actor_target)

        return critic_loss.item(), actor_loss.item()

    def _soft_update(self, source, target):
        """Move ``target`` parameters a fraction ``tau`` toward ``source``."""
        with torch.no_grad():
            for param, target_param in zip(source.parameters(), target.parameters()):
                target_param.copy_(self.tau * param + (1 - self.tau) * target_param)

# --- Minimal Replay Buffer ---
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions with uniform sampling.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples of tensors. Once ``capacity`` entries have been stored, each new
    entry overwrites the oldest one.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # Index of the slot the next add() writes to

    def add(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest entry when full."""
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of stacked
            tensors. ``reward`` and ``done`` have shape ``(batch_size,)`` when
            each stored value holds a single element.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("Cannot sample from an empty replay buffer")
        indices = np.random.randint(0, len(self.buffer), size=batch_size)
        batch = [self.buffer[i] for i in indices]
        state, action, reward, next_state, done = zip(*batch)
        state = torch.stack(state)
        action = torch.stack(action)
        next_state = torch.stack(next_state)
        # Flatten only the per-transition dimensions. A bare squeeze() would
        # also drop the batch axis when batch_size == 1.
        reward = torch.stack(reward).reshape(len(batch), -1).squeeze(-1)
        done = torch.stack(done).reshape(len(batch), -1).squeeze(-1)
        return state, action, reward, next_state, done

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


## Training Loop and Metrics Collection

The following cell runs the training loop against a dummy environment (random states and rewards; replace with your actual environment if needed) and logs, for each episode, the total untransformed reward and the mean actor and critic losses.

In [None]:
# Problem dimensions and training schedule
state_dim = 10
action_dim = 2
num_episodes = 50
steps_per_episode = 100

# Agent under training and its experience store
agent = CPTDDPG(state_dim, action_dim)
replay_buffer = ReplayBuffer(capacity=100000)

# Per-episode metric histories
episode_rewards = []
episode_actor_losses = []
episode_critic_losses = []

for episode in range(num_episodes):
    # A random start state stands in for env.reset()
    state = torch.randn(state_dim)
    total_reward = 0
    actor_losses = []
    critic_losses = []

    for t in range(steps_per_episode):
        # Query the CPT-DDPG policy for the current state
        action = agent.select_action(state.numpy())

        # Dummy dynamics stand in for env.step(): random next state and reward
        next_state = torch.randn(state_dim)
        reward = torch.tensor([torch.randn(1).item()])
        # Only the last step of an episode is flagged as terminal
        is_final_step = t >= steps_per_episode - 1
        done = torch.tensor([1.0]) if is_final_step else torch.tensor([0.0])

        # The logged return uses the raw reward
        total_reward += reward.item()

        # Record the transition, then advance
        replay_buffer.add(state, torch.tensor(action), reward, next_state, done)
        state = next_state

        # Skip learning until one full batch has been collected
        if replay_buffer.size() < 64:
            continue
        critic_loss, actor_loss = agent.update(replay_buffer, batch_size=64)
        critic_losses.append(critic_loss)
        actor_losses.append(actor_loss)

    # Episode summary: return plus mean losses (0 if no update ran)
    episode_rewards.append(total_reward)
    episode_actor_losses.append(np.mean(actor_losses) if actor_losses else 0)
    episode_critic_losses.append(np.mean(critic_losses) if critic_losses else 0)

    print(f"Episode {episode}: Total Reward = {total_reward:.2f}")


## Visualizations

The following cell plots the total reward per episode as well as the actor and critic losses over training.

In [None]:
# One panel per metric: (series, legend label, y-axis label, title, line style)
panels = [
    (episode_rewards, 'Total Reward', 'Reward', 'Episode Rewards', {}),
    (episode_actor_losses, 'Actor Loss', 'Loss', 'Actor Loss', {'color': 'orange'}),
    (episode_critic_losses, 'Critic Loss', 'Loss', 'Critic Loss', {'color': 'green'}),
]

plt.figure(figsize=(12, 4))

# Draw the panels left to right in a single row
for panel_index, (series, label, y_label, title, line_style) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_index)
    plt.plot(series, label=label, **line_style)
    plt.xlabel('Episode')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()

plt.tight_layout()
plt.show()
