In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

In [3]:
# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

# Environment setup
env = gym.make('Pendulum-v1', render_mode="rgb_array")
# The environment keeps its own random generator, which the three seed calls
# above do not touch. Seed it once here so the initial states produced by the
# later unseeded env.reset() calls are repeatable between runs.
env.reset(seed=SEED)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Symmetric action limit, taken from the upper bound of the first action dimension.
action_bound = float(env.action_space.high[0])

# Preprocessing (No heavy preprocessing for this lightweight setup)
def preprocess_state(state):
    """Copy an observation into a new float32 torch tensor.

    Args:
        state: Array-like observation (anything ``torch.tensor`` accepts).

    Returns:
        A ``torch.float32`` tensor holding the same values.
    """
    as_float32 = torch.tensor(state, dtype=torch.float32)
    return as_float32

# Train-Test Split: number of episodes allotted to each phase.
train_episodes, val_episodes, test_episodes = 80, 10, 10

# Actor Network
class Actor(nn.Module):
    """Deterministic policy: maps a state to an action in [-action_bound, action_bound].

    Two hidden layers of 64 ReLU units feed a tanh output layer whose result
    is multiplied by ``action_bound``.
    """

    def __init__(self, state_dim, action_dim, action_bound):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(state_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.out = nn.Linear(width, action_dim)
        self.action_bound = action_bound

    def forward(self, state):
        """Return the bounded action for ``state``."""
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        # tanh limits the raw output to [-1, 1] before scaling to the action range.
        squashed = self.out(hidden).tanh()
        return squashed * self.action_bound

# Critic Network
class Critic(nn.Module):
    """Q-network: scores a (state, action) pair with a single scalar per row.

    State and action are concatenated along dimension 1 and passed through
    two hidden layers of 64 ReLU units and a linear output layer.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(state_dim + action_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.out = nn.Linear(width, 1)

    def forward(self, state, action):
        """Return the Q-value estimate for each row of ``state``/``action``."""
        joint = torch.cat((state, action), dim=1)
        hidden = self.fc1(joint).relu()
        hidden = self.fc2(hidden).relu()
        q_value = self.out(hidden)
        return q_value

# Replay Buffer
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Once ``capacity`` transitions are held, each new one evicts the oldest
    (behaviour of ``deque(maxlen=...)``).
    """

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition; ``state``, ``action`` and ``next_state`` must be tensors."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size=64):
        """Draw ``batch_size`` distinct transitions and return them as batched tensors.

        Returns:
            ``(states, actions, rewards, next_states, dones)``. Rewards and
            dones are float32 column tensors of shape ``(batch_size, 1)``;
            the others are stacked along a new leading dimension.
        """
        transitions = random.sample(self.buffer, batch_size)
        state_col, action_col, reward_col, next_state_col, done_col = zip(*transitions)

        reward_batch = torch.tensor(reward_col, dtype=torch.float32).unsqueeze(1)
        done_batch = torch.tensor(done_col, dtype=torch.float32).unsqueeze(1)

        return (
            torch.stack(state_col),
            torch.stack(action_col),
            reward_batch,
            torch.stack(next_state_col),
            done_batch,
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# Initialize networks and optimizers
# The online networks and their target twins are constructed in a fixed order
# (actor, critic, actor_target, critic_target) so the random weight
# initialisation draws happen in the same sequence on every run.
actor = Actor(state_dim=state_dim, action_dim=action_dim, action_bound=action_bound)
critic = Critic(state_dim=state_dim, action_dim=action_dim)
actor_target = Actor(state_dim=state_dim, action_dim=action_dim, action_bound=action_bound)
critic_target = Critic(state_dim=state_dim, action_dim=action_dim)

# Targets begin as exact copies of the online networks.
critic_target.load_state_dict(critic.state_dict())
actor_target.load_state_dict(actor.state_dict())

# The critic uses twice the actor's learning rate.
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-3)
critic_optimizer = optim.Adam(critic.parameters(), lr=2e-3)

# Shared experience store, left at its default capacity.
replay_buffer = ReplayBuffer()

def soft_update(target, source, tau=0.005):
    """Move ``target``'s parameters a fraction ``tau`` towards ``source``'s (Polyak averaging).

    Each target parameter becomes ``tau * source + (1 - tau) * target``.
    Parameters are paired positionally, so both modules must yield their
    parameters in the same order.

    Args:
        target: Module whose parameters are overwritten in place.
        source: Module whose parameters are blended in; left unchanged.
        tau: Interpolation weight. ``0`` leaves the target untouched and
            ``1`` copies the source.
    """
    keep = 1.0 - tau
    # torch.no_grad() replaces the legacy `.data` access: the in-place write
    # is still excluded from autograd, but is no longer hidden from it.
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.copy_(tau * source_param + keep * target_param)

def get_action(state, noise_scale=0.1):
    """Pick an exploratory action for ``state`` using the module-level ``actor``.

    Gaussian noise with standard deviation ``noise_scale`` is added to the
    actor's output, and the sum is clipped to
    ``[-action_bound, action_bound]``.

    Args:
        state: A single unbatched observation.
        noise_scale: Standard deviation of the exploration noise.

    Returns:
        NumPy array with ``action_dim`` entries.
    """
    # The actor expects a batch dimension; add one and strip it afterwards.
    batched_state = preprocess_state(state).unsqueeze(0)
    policy_action = actor(batched_state).detach()[0].numpy()
    perturbed = policy_action + noise_scale * np.random.randn(action_dim)
    return np.clip(perturbed, -action_bound, action_bound)

# Train Base Model (DDPG)
def train_ddpg(episodes, max_steps=200, gamma=0.99, batch_size=64, min_buffer_size=500):
    """Train the module-level actor/critic with DDPG and report per-episode reward.

    Every environment step is stored in the module-level ``replay_buffer``.
    Once the buffer is warm, each step also performs one critic update, one
    actor update and a soft update of both target networks.

    Args:
        episodes: Number of episodes to run; values <= 0 run nothing.
        max_steps: Upper bound on steps per episode.
        gamma: Discount factor used in the bootstrapped target.
        batch_size: Number of transitions sampled per update.
        min_buffer_size: Transitions required before updates begin.

    Returns:
        List with the undiscounted reward sum of each episode, in order.
    """
    loss_fn = nn.MSELoss()  # stateless, so build it once rather than on every step
    # Never sample more transitions than are stored, even if the caller
    # passes a batch_size above the warm-up threshold.
    warmup = max(min_buffer_size, batch_size)
    episode_rewards = []

    for episode in range(episodes):
        state = env.reset()[0]
        episode_reward = 0
        for _ in range(max_steps):
            action = get_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Only `terminated` is stored as the done flag: a time-limit
            # truncation is not a terminal state, so the target below should
            # still bootstrap from next_state in that case.
            replay_buffer.push(
                preprocess_state(state),
                torch.tensor(action, dtype=torch.float32),
                reward,
                preprocess_state(next_state),
                terminated,
            )

            state = next_state
            episode_reward += reward

            if len(replay_buffer) >= warmup:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

                # Critic loss: regress Q(s, a) onto r + gamma * Q'(s', mu'(s')).
                with torch.no_grad():
                    target_actions = actor_target(next_states)
                    target_q = critic_target(next_states, target_actions)
                    y = rewards + gamma * (1 - dones) * target_q
                critic_loss = loss_fn(critic(states, actions), y)

                critic_optimizer.zero_grad()
                critic_loss.backward()
                critic_optimizer.step()

                # Actor loss: ascend the critic's estimate of the actor's own actions.
                actor_loss = -critic(states, actor(states)).mean()

                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                # Soft update
                soft_update(actor_target, actor)
                soft_update(critic_target, critic)

            # Stop once the environment reports the episode is over instead of
            # stepping a finished episode. The check comes after the update so
            # the final transition still triggers a learning step.
            if terminated or truncated:
                break

        episode_rewards.append(episode_reward)
        print(f"Episode {episode+1}/{episodes}, Reward: {episode_reward:.2f}")

    return episode_rewards

# Fine-Tune Model
def fine_tune(episodes):
    """Announce a fine-tuning phase and run ``train_ddpg`` for ``episodes`` more episodes."""
    banner = "\nFine-tuning the model..."
    print(banner)
    train_ddpg(episodes)

# Evaluate
def evaluate(episodes):
    """Roll out the module-level ``actor`` without exploration noise and report rewards.

    Args:
        episodes: Number of evaluation episodes to run.

    Returns:
        The mean undiscounted episode reward, or ``None`` when ``episodes``
        is not positive (no average can be computed).
    """
    if episodes <= 0:
        # Previously this fell through to a division by zero.
        print("\nNo test episodes requested; skipping evaluation.")
        return None

    total_reward = 0
    for episode in range(episodes):
        state = env.reset()[0]
        episode_reward = 0
        for _ in range(200):
            with torch.no_grad():
                action = actor(preprocess_state(state).unsqueeze(0)).numpy()[0]
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            # Do not keep stepping an environment that reported the episode is over.
            if terminated or truncated:
                break
        total_reward += episode_reward
        print(f"Test Episode {episode+1}/{episodes}, Reward: {episode_reward:.2f}")
    avg_reward = total_reward / episodes
    print(f"\nAverage Test Reward: {avg_reward:.2f}")
    return avg_reward

# Deploy Policy (Simulate Deployment)
def deploy_policy():
    """Run one noise-free episode with the module-level ``actor``, then close ``env``.

    The rollout lasts at most 200 steps and stops early when the environment
    reports termination or truncation. ``env`` is closed even if the rollout
    raises, so it cannot be reused after this call.
    """
    print("\nDeploying trained policy...")
    try:
        state = env.reset()[0]
        for _ in range(200):
            # NOTE(review): env is created with render_mode="rgb_array" and the
            # value returned here is discarded, so this presumably shows
            # nothing on screen; confirm whether frames should be collected.
            env.render()
            with torch.no_grad():
                action = actor(preprocess_state(state).unsqueeze(0)).numpy()[0]
            state, _, terminated, truncated, _ = env.step(action)
            # The original checked only the third return value (terminated),
            # so a time-limit truncation never ended the loop.
            if terminated or truncated:
                break
    finally:
        env.close()

# Main Workflow
# Phase 1: base training for the configured number of episodes.
print("Training Base Model (DDPG)...")
train_ddpg(episodes=train_episodes)

# Phase 2: a short extra training run on top of the base model.
fine_tune(episodes=5)

# Phase 3: noise-free evaluation of the learned policy.
evaluate(episodes=test_episodes)

# Phase 4: one demonstration rollout; this also closes the environment.
deploy_policy()

Training Base Model (DDPG)...
Episode 1/80, Reward: -1193.70
Episode 2/80, Reward: -1571.87
Episode 3/80, Reward: -1360.30
Episode 4/80, Reward: -1100.12
Episode 5/80, Reward: -1489.19
Episode 6/80, Reward: -1806.54
Episode 7/80, Reward: -1476.07
Episode 8/80, Reward: -1416.23
Episode 9/80, Reward: -1550.52
Episode 10/80, Reward: -1285.29
Episode 11/80, Reward: -1537.18
Episode 12/80, Reward: -1464.88
Episode 13/80, Reward: -1502.67
Episode 14/80, Reward: -1087.02
Episode 15/80, Reward: -1398.54
Episode 16/80, Reward: -1036.15
Episode 17/80, Reward: -1052.90
Episode 18/80, Reward: -905.35
Episode 19/80, Reward: -753.82
Episode 20/80, Reward: -758.40
Episode 21/80, Reward: -1058.36
Episode 22/80, Reward: -1049.74
Episode 23/80, Reward: -885.84
Episode 24/80, Reward: -712.39
Episode 25/80, Reward: -633.58
Episode 26/80, Reward: -555.69
Episode 27/80, Reward: -631.05
Episode 28/80, Reward: -513.95
Episode 29/80, Reward: -247.84
Episode 30/80, Reward: -261.23
Episode 31/80, Reward: -134.97