In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

In [7]:
# ----------------------- Environment Setup ----------------------- #
class DummyEnv:
    """Toy environment that produces pure noise.

    Observations and rewards are standard-normal draws from NumPy's global
    RNG, and the action passed to ``step`` is ignored. It exists only to
    exercise the training pipeline end to end.
    """

    def __init__(self):
        self.state_dim, self.action_dim = 4, 2

    def _random_observation(self):
        """Draw a fresh float32 observation of length ``state_dim``."""
        noise = np.random.randn(self.state_dim)
        return noise.astype(np.float32)

    def reset(self):
        """Start a new episode and return its first observation."""
        return self._random_observation()

    def step(self, action):
        """Advance one step; returns ``(next_state, reward, done)``.

        ``action`` is accepted for interface compatibility but not used.
        """
        observation = self._random_observation()
        reward = np.random.randn()
        # An episode ends whenever a uniform draw lands above 0.95.
        terminated = np.random.rand() > 0.95
        return observation, reward, terminated

# ----------------------- Neural Networks ----------------------- #
class Actor(nn.Module):
    """Policy network mapping a state to an action.

    One hidden layer of 32 ReLU units feeds a Tanh output layer, so every
    action component lies in [-1, 1].
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 32
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
            nn.Tanh(),
        ]
        # Positional Sequential keeps the parameter names net.0.* / net.2.*.
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return the action the policy chooses for ``state``."""
        action = self.net(state)
        return action

class Critic(nn.Module):
    """Value network scoring a (state, action) pair with one scalar.

    State and action are concatenated along the last dimension and passed
    through a single 32-unit ReLU hidden layer.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        input_width = state_dim + action_dim
        layers = [
            nn.Linear(input_width, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state, action):
        """Return the value estimate for taking ``action`` in ``state``."""
        joint = torch.cat((state, action), dim=-1)
        return self.net(joint)

class DynamicsModel(nn.Module):
    """Learned transition model predicting the next state.

    Takes the concatenated (state, action) vector through a 32-unit ReLU
    hidden layer and outputs a vector of length ``state_dim``.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        encoder = nn.Linear(state_dim + action_dim, 32)
        head = nn.Linear(32, state_dim)
        self.net = nn.Sequential(encoder, nn.ReLU(), head)

    def forward(self, state, action):
        """Return the predicted next state for ``(state, action)``."""
        state_action = torch.cat((state, action), dim=-1)
        prediction = self.net(state_action)
        return prediction

# ----------------------- Preprocessing ----------------------- #
def preprocess(state):
    """Convert a raw state into a float32 tensor with a leading batch axis."""
    as_float = torch.tensor(state, dtype=torch.float32)
    batched = as_float.unsqueeze(dim=0)
    return batched

# ----------------------- Train-Test Split ----------------------- #
def train_test_split(data, train_ratio=0.8):
    """Cut ``data`` into a leading train part and a trailing test part.

    The cut index is ``len(data) * train_ratio`` truncated to an int; order
    is preserved and nothing is shuffled.
    """
    cut = int(train_ratio * len(data))
    head, tail = data[:cut], data[cut:]
    return head, tail

# ----------------------- Base Model Training ----------------------- #
def train_base_model(env, actor, critic, dynamics_model, episodes=10, gamma=0.99):
    """Train actor, critic and dynamics model online, one env step at a time.

    After every environment step three independent updates are applied:
    a one-step TD update of the critic, a policy-gradient-style update of
    the actor (maximising the critic's score of its own action), and a
    supervised next-state regression for the dynamics model.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``.
        actor: Module mapping a batched state tensor to an action tensor.
        critic: Module mapping ``(state, action)`` to a value tensor.
        dynamics_model: Module mapping ``(state, action)`` to a predicted
            next state.
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """
    actor_optim = optim.Adam(actor.parameters(), lr=0.001)
    critic_optim = optim.Adam(critic.parameters(), lr=0.001)
    dynamics_optim = optim.Adam(dynamics_model.parameters(), lr=0.001)
    mse = nn.MSELoss()

    for _ in range(episodes):
        state = preprocess(env.reset())
        done = False

        while not done:
            # The behaviour action is data for the critic and dynamics
            # updates, so it is computed without building an autograd graph.
            with torch.no_grad():
                action = actor(state)
            next_state, reward, done = env.step(action.numpy().flatten())
            next_state_tensor = preprocess(next_state)
            reward_tensor = torch.tensor([[reward]], dtype=torch.float32)

            # Update Critic
            # A terminal transition has no future return, so the bootstrap
            # term is masked out when the episode has just ended.
            not_done = 0.0 if done else 1.0
            with torch.no_grad():
                next_value = critic(next_state_tensor, actor(next_state_tensor))
                target_value = reward_tensor + gamma * not_done * next_value
            critic_loss = mse(critic(state, action), target_value)
            critic_optim.zero_grad()
            critic_loss.backward()
            critic_optim.step()

            # Update Actor
            # A fresh forward pass builds its own graph, so nothing from the
            # critic update has to be retained. Gradients this leaves on the
            # critic are cleared by the next critic_optim.zero_grad().
            actor_loss = -critic(state, actor(state)).mean()
            actor_optim.zero_grad()
            actor_loss.backward()
            actor_optim.step()

            # Update Dynamics Model
            pred_next_state = dynamics_model(state, action)
            dynamics_loss = mse(pred_next_state, next_state_tensor)
            dynamics_optim.zero_grad()
            dynamics_loss.backward()
            dynamics_optim.step()

            state = next_state_tensor

# ----------------------- Planning (Simulated Experience) ----------------------- #
def planning_with_dynamics(dynamics_model, buffer, planning_steps=5):
    """Append model-simulated (state, action) pairs to ``buffer`` in place.

    Each planning step samples a stored ``(state, action)`` pair uniformly,
    asks the dynamics model for the resulting next state, and appends that
    predicted state together with a freshly drawn standard-normal action.

    Args:
        dynamics_model: Callable mapping batched ``(state, action)`` tensors
            to a predicted next-state tensor.
        buffer: Indexable, appendable collection of ``(state, action)``
            pairs. Nothing happens if it is empty.
        planning_steps: Number of simulated pairs to append.
    """
    if len(buffer) == 0:
        return

    for _ in range(planning_steps):
        state, action = buffer[np.random.randint(len(buffer))]
        state_tensor = preprocess(state)
        action_tensor = torch.tensor(action, dtype=torch.float32).unsqueeze(0)

        # Pure inference: no autograd graph is needed for the rollout.
        with torch.no_grad():
            predicted = dynamics_model(state_tensor, action_tensor)
        simulated_next_state = predicted.numpy().flatten()

        # The random action matches the sampled action's size, so simulated
        # entries stay shape-compatible with the rest of the buffer.
        action_dim = action_tensor.shape[-1]
        buffer.append((simulated_next_state, np.random.randn(action_dim)))

# ----------------------- Fine-Tune Actor-Critic ----------------------- #
def fine_tune_actor_critic(env, actor, critic, episodes=5, gamma=0.99):
    """Continue actor-critic training on ``env`` with a smaller step size.

    Runs the same per-step critic and actor updates as base training, using
    fresh Adam optimizers at learning rate 0.0005 and no dynamics model.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``.
        actor: Module mapping a batched state tensor to an action tensor.
        critic: Module mapping ``(state, action)`` to a value tensor.
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """
    actor_optim = optim.Adam(actor.parameters(), lr=0.0005)
    critic_optim = optim.Adam(critic.parameters(), lr=0.0005)
    mse = nn.MSELoss()

    for _ in range(episodes):
        state = preprocess(env.reset())
        done = False

        while not done:
            # The behaviour action is only data for the critic update, so
            # no autograd graph is built for it.
            with torch.no_grad():
                action = actor(state)
            next_state, reward, done = env.step(action.numpy().flatten())
            next_state_tensor = preprocess(next_state)
            reward_tensor = torch.tensor([[reward]], dtype=torch.float32)

            # Critic update
            # Do not bootstrap past the end of an episode: a terminal
            # transition's target is just the immediate reward.
            not_done = 0.0 if done else 1.0
            with torch.no_grad():
                next_value = critic(next_state_tensor, actor(next_state_tensor))
                target_value = reward_tensor + gamma * not_done * next_value
            critic_loss = mse(critic(state, action), target_value)
            critic_optim.zero_grad()
            critic_loss.backward()
            critic_optim.step()

            # Actor update
            # Uses a fresh forward pass, so it is independent of the
            # critic-update graph above.
            actor_loss = -critic(state, actor(state)).mean()
            actor_optim.zero_grad()
            actor_loss.backward()
            actor_optim.step()

            state = next_state_tensor

# ----------------------- Evaluation ----------------------- #
def evaluate(env, actor, episodes=3):
    """Roll out ``actor`` on ``env`` and print each episode's summed reward.

    Args:
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``.
        actor: Callable mapping a batched state tensor to an action tensor.
        episodes: Number of episodes to play.
    """
    for episode_number in range(1, episodes + 1):
        observation = preprocess(env.reset())
        episode_return = 0
        finished = False

        while not finished:
            # The environment expects a flat NumPy action, not a tensor.
            chosen = actor(observation).detach().numpy().flatten()
            raw_next, reward, finished = env.step(chosen)
            episode_return += reward
            observation = preprocess(raw_next)

        print(f"Episode {episode_number}: Total Reward = {episode_return:.2f}")

# ----------------------- Deploy Policy ----------------------- #
def deploy_policy(actor, path="deployed_policy.pth"):
    """Save the actor's ``state_dict`` to disk and print a confirmation.

    Args:
        actor: Module whose parameters are saved.
        path: Destination file. Defaults to ``"deployed_policy.pth"`` in the
            current working directory.
    """
    torch.save(actor.state_dict(), path)
    print("Policy deployed and saved!")

In [8]:
# ----------------------- Main Flow ----------------------- #
if __name__ == "__main__":
    # The whole pipeline is stochastic (random environment, random weight
    # initialisation, random planning samples). Seeding every RNG the
    # notebook imports makes a Restart & Run All reproduce the same numbers.
    SEED = 0
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = DummyEnv()
    actor = Actor(env.state_dim, env.action_dim)
    critic = Critic(env.state_dim, env.action_dim)
    dynamics_model = DynamicsModel(env.state_dim, env.action_dim)

    # Train-Test Split (Random Data for Demo)
    # Only train_data is used below; test_data is held out but not consumed
    # in this cell.
    data = [(np.random.randn(env.state_dim), np.random.randn(env.action_dim)) for _ in range(100)]
    train_data, test_data = train_test_split(data)

    # Train Base Model
    train_base_model(env, actor, critic, dynamics_model)

    # Planning Phase
    # With maxlen=50 the deque keeps only the newest 50 entries, so both the
    # extend and the simulated appends evict the oldest pairs.
    buffer = deque(maxlen=50)
    buffer.extend(train_data)
    planning_with_dynamics(dynamics_model, buffer)

    # Fine-Tune Actor-Critic
    fine_tune_actor_critic(env, actor, critic)

    # Evaluation
    evaluate(env, actor)

    # Deploy Policy
    deploy_policy(actor)

Episode 1: Total Reward = -8.11
Episode 2: Total Reward = -4.49
Episode 3: Total Reward = 1.13
Policy deployed and saved!
