In [8]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt

# Actor Network
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(8, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 4)

    def forward(self, x):
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype=torch.float32)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        logits = self.fc4(x)  # Raw logits, no ReLU
        return F.softmax(logits, dim=-1)

# Critic Network
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(8, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype=torch.float32)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

# Hyperparameters
actor_lr = 3e-4  # Lowered slightly for stability
critic_lr = 1e-3  # Reduced to balance with actor
gamma = 0.99
num_epochs = 1500  # Increased training time
entropy_beta = 0.01  # Entropy regularization for exploration
clip_grad = 0.5  # Gradient clipping

# Initialize environment and models
env = gym.make('LunarLander-v3', continuous=False)
actor = Actor()
critic = Critic()
actor_optim = optim.Adam(actor.parameters(), lr=actor_lr)
critic_optim = optim.Adam(critic.parameters(), lr=critic_lr)

# Action selection
def get_action(state):
    state = torch.tensor(state, dtype=torch.float32)
    action_probs = actor(state)
    dist = Categorical(action_probs)
    action = dist.sample()
    return action.item(), dist.log_prob(action), dist.entropy()

# Test function
def test(num_tests):
    tot_reward = 0
    for _ in range(num_tests):
        state, _ = env.reset()
        done = False
        while not done:
            action, _, _ = get_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            tot_reward += reward
            done = terminated or truncated
    return tot_reward / num_tests

# Training loop
rewards = []
max_reward = -300
for epoch in range(1, num_epochs + 1):
    state, _ = env.reset()
    done = False
    episode_log_probs = []
    episode_values = []
    episode_rewards = []
    episode_entropy = []
    
    # Collect episode data
    while not done:
        action, log_prob, entropy = get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        value = critic(state)
        
        episode_log_probs.append(log_prob)
        episode_values.append(value)
        episode_rewards.append(reward)
        episode_entropy.append(entropy)
        
        state = next_state
        done = terminated or truncated
    
    # Compute returns and advantages
    returns = []
    R = 0
    for r in episode_rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns, dtype=torch.float32)
    
    # Normalize returns
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    
    # Compute losses
    values = torch.cat(episode_values).squeeze()
    log_probs = torch.cat(episode_log_probs)
    entropy = torch.cat(episode_entropy).mean()
    
    advantages = returns - values.detach()
    actor_loss = -(log_probs * advantages.detach()).mean() - entropy_beta * entropy  # Add entropy
    critic_loss = advantages.pow(2).mean()
    
    # Update critic
    critic_optim.zero_grad()
    critic_loss.backward()
    nn.utils.clip_grad_norm_(critic.parameters(), clip_grad)
    critic_optim.step()
    
    # Update actor
    actor_optim.zero_grad()
    actor_loss.backward()
    nn.utils.clip_grad_norm_(actor.parameters(), clip_grad)
    actor_optim.step()
    
    # Track rewards
    tot_reward = sum(episode_rewards)
    rewards.append(tot_reward)
    
    # Test and save
    if epoch % 50 == 0:
        avg = test(10)
        if avg > max_reward:
            max_reward = avg
            torch.save(actor.state_dict(), "LunarLanderAgent.pt")
        print(f"{epoch} / {num_epochs} : {avg:.1f}")

# Plot results
window = 20
avg_rewards = [np.mean(rewards[max(0, i - window):(i + 1)]) for i in range(len(rewards))]
plt.plot(avg_rewards)
plt.xlabel("Episode")
plt.ylabel("Average Reward")
plt.title("Actor-Critic Lunar Lander")
plt.grid()
plt.show()

RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated