In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

In [12]:
class DynamicsModel(nn.Module):
    """Two-layer MLP over the concatenation of a state and an action.

    The network takes ``state_dim + action_dim`` input features, passes them
    through one hidden ReLU layer of width ``hidden_dim`` and emits a
    ``state_dim``-wide vector.

    Args:
        state_dim: Width of the state input and of the network output.
        action_dim: Width of the action input.
        hidden_dim: Width of the hidden layer (default 64).
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        joint_width = state_dim + action_dim
        # Layers are created in forward order so that parameter
        # initialisation consumes the RNG in the same sequence as before.
        stack = [
            nn.Linear(joint_width, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, state_dim),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, state, action):
        """Concatenate ``state`` and ``action`` on the last axis and run the MLP."""
        joint_input = torch.cat((state, action), dim=-1)
        return self.model(joint_input)

class PolicyNetwork(nn.Module):
    """Two-layer MLP mapping a state to a tanh-squashed action vector.

    Because the final layer is ``nn.Tanh``, every output component lies in
    the interval [-1, 1].

    Args:
        state_dim: Width of the state input.
        action_dim: Width of the action output.
        hidden_dim: Width of the hidden layer (default 64).
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        # Layers are created in forward order so that parameter
        # initialisation consumes the RNG in the same sequence as before.
        stack = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Tanh(),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, state):
        """Return the action produced by the MLP for ``state``."""
        action = self.model(state)
        return action

def generate_random_data(num_samples, state_dim, action_dim):
    """Sample synthetic (state, action, next_state) transitions.

    States and actions are drawn from a standard normal distribution. The
    next state is ``state + 0.1 * action_effect + noise`` where
    ``action_effect`` is the action truncated to, or zero-padded up to,
    ``state_dim`` columns and ``noise`` is standard normal scaled by 0.01.

    Args:
        num_samples: Number of transitions to generate.
        state_dim: Number of state features.
        action_dim: Number of action features.

    Returns:
        Tuple ``(states, actions, next_states)`` with shapes
        ``(num_samples, state_dim)``, ``(num_samples, action_dim)`` and
        ``(num_samples, state_dim)``. ``actions`` is returned unpadded.
    """
    # RNG draw order matters for reproducibility: states, actions, then noise.
    states = torch.randn(num_samples, state_dim)
    actions = torch.randn(num_samples, action_dim)

    # Copy the overlapping columns into a zero buffer; this covers both the
    # "truncate" (action_dim >= state_dim) and "zero-pad" cases at once.
    shared_cols = min(action_dim, state_dim)
    action_effect = torch.zeros(num_samples, state_dim)
    action_effect[:, :shared_cols] = actions[:, :shared_cols]

    drift = 0.1 * action_effect
    noise = torch.randn(num_samples, state_dim) * 0.01
    next_states = states + drift + noise
    return states, actions, next_states

def train_dynamics_model(model, states, actions, next_states, epochs=5):
    """Fit ``model`` to predict ``next_states`` from ``(states, actions)``.

    Runs ``epochs`` full-batch Adam steps (learning rate 0.01) minimising
    the mean squared error between ``model(states, actions)`` and
    ``next_states``. The model is updated in place; nothing is returned.

    Args:
        model: Module called as ``model(states, actions)``.
        states: Batch of input states.
        actions: Batch of input actions.
        next_states: Regression targets for the model output.
        epochs: Number of gradient steps (default 5).
    """
    adam = optim.Adam(model.parameters(), lr=0.01)
    epochs_left = epochs
    while epochs_left > 0:
        adam.zero_grad()
        predicted_next = model(states, actions)
        # Functional MSE with its default mean reduction matches nn.MSELoss().
        mse = nn.functional.mse_loss(predicted_next, next_states)
        mse.backward()
        adam.step()
        epochs_left -= 1

def simulate_experience(dynamics_model, policy_network, num_simulations=200, state_dim=4, action_dim=2):
    """Roll the policy and the dynamics model forward by one step.

    Draws ``num_simulations`` standard-normal states, queries
    ``policy_network`` for actions (keeping the first ``action_dim``
    columns) and feeds both to ``dynamics_model``. Both networks are called
    under ``torch.no_grad()``, so the results carry no autograd history.

    Args:
        dynamics_model: Module called as ``dynamics_model(states, actions)``.
        policy_network: Module called as ``policy_network(states)``.
        num_simulations: Number of states to sample (default 200).
        state_dim: Number of features per sampled state (default 4).
        action_dim: Number of action columns to keep (default 2).

    Returns:
        Tuple ``(states, actions, next_states)``.
    """
    sampled_states = torch.randn(num_simulations, state_dim)
    with torch.no_grad():
        policy_output = policy_network(sampled_states)
        chosen_actions = policy_output[:, :action_dim]
        predicted_next = dynamics_model(sampled_states, chosen_actions)
    return sampled_states, chosen_actions, predicted_next

def fine_tune_policy(policy_network, states, actions, epochs=5):
    """Regress the policy output onto the given target ``actions``.

    Runs ``epochs`` full-batch Adam steps (learning rate 0.005) minimising
    the mean squared error between the first ``actions.size(1)`` columns of
    ``policy_network(states)`` and ``actions``. Inputs and targets are
    detached first, so no gradient flows back into whatever produced them.
    The network is updated in place; nothing is returned.

    Args:
        policy_network: Module called as ``policy_network(states)``.
        states: Batch of input states.
        actions: Batch of target actions (two-dimensional).
        epochs: Number of gradient steps (default 5).
    """
    inputs = states.detach()
    targets = actions.detach()
    adam = optim.Adam(policy_network.parameters(), lr=0.005)

    for _epoch in range(epochs):
        adam.zero_grad()
        # Only the leading columns that have a target take part in the loss.
        policy_output = policy_network(inputs)
        matched_output = policy_output[:, :targets.size(1)]
        # Functional MSE with its default mean reduction matches nn.MSELoss().
        mse = nn.functional.mse_loss(matched_output, targets)
        mse.backward()
        adam.step()

def evaluate_policy(policy_network, num_eval=100, state_dim=4):
    """Print the mean policy output over freshly sampled random states.

    Draws ``num_eval`` standard-normal states of width ``state_dim``, runs
    ``policy_network`` on them without tracking gradients and prints the
    mean over all output elements with four decimals. Returns nothing.

    Args:
        policy_network: Module called as ``policy_network(states)``.
        num_eval: Number of states to sample (default 100).
        state_dim: Number of features per sampled state (default 4).
    """
    probe_states = torch.randn(num_eval, state_dim)
    with torch.no_grad():
        mean_action = policy_network(probe_states).mean().item()
    print(f"Evaluation - Mean Action: {mean_action:.4f}")

def deploy_policy(policy_network):
    """Print a deployment confirmation message.

    ``policy_network`` is accepted for interface purposes but is not
    inspected or modified here; the function only writes to stdout.
    """
    confirmation = "Policy deployed successfully."
    print(confirmation)

# --- Experiment configuration ------------------------------------------------
SEED = 0
state_dim = 4
action_dim = 2
num_samples = 1000

# Seed the global torch RNG so data sampling, weight initialisation and the
# evaluation printout are reproducible on Restart & Run All.
torch.manual_seed(SEED)

# --- Synthetic transitions and an 80/20 train/test split ---------------------
states, actions, next_states = generate_random_data(num_samples, state_dim, action_dim)
train_size = int(0.8 * num_samples)
train_states, test_states = states[:train_size], states[train_size:]
train_actions, test_actions = actions[:train_size], actions[train_size:]
train_next_states, test_next_states = next_states[:train_size], next_states[train_size:]
# NOTE(review): the test_* tensors are not used anywhere in this cell, so the
# dynamics model is never scored on held-out data here.

# --- Models -------------------------------------------------------------------
dynamics_model = DynamicsModel(state_dim, action_dim)
policy_network = PolicyNetwork(state_dim, action_dim)

# --- Pipeline: fit dynamics -> simulate -> fine-tune -> evaluate -> deploy ----
train_dynamics_model(dynamics_model, train_states, train_actions, train_next_states)
sim_states, sim_actions, sim_next_states = simulate_experience(dynamics_model, policy_network, state_dim=state_dim, action_dim=action_dim)
# NOTE(review): sim_actions were produced by policy_network itself from
# sim_states, so the regression target below equals the policy's current
# output and the loss starts at zero; this step looks like a no-op until a
# different target (e.g. one derived from sim_next_states) is used.
fine_tune_policy(policy_network, sim_states, sim_actions)
# Forward state_dim explicitly: the default inside evaluate_policy is 4 and
# would cause a shape mismatch as soon as state_dim above is changed.
evaluate_policy(policy_network, state_dim=state_dim)
deploy_policy(policy_network)

Evaluation - Mean Action: -0.0283
Policy deployed successfully.
