In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import gymnasium as gym
env = gym.make('InvertedPendulum-v4')

# ignore:

In [17]:

# Actor-Critic Network for Continuous Action Spaces
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )
        self.log_std = nn.Parameter(torch.zeros(action_dim))  # Learnable log standard deviation
        
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        mean = self.actor(x)
        std = self.log_std.exp().expand_as(mean)  # Standard deviation
        dist = torch.distributions.Normal(mean, std)
        return dist, self.critic(x)

# PPO Agent with Clipping Method
class PPOAgent:
    def __init__(self, state_dim, action_dim, lr=3e-4, clip_epsilon=0.2):
        self.model = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = 0.99
        self.clip_epsilon = clip_epsilon

    def select_action(self, state):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        dist, _ = self.model(state_tensor)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob.item()

    def train(self, states, actions, rewards, next_states, dones, old_log_probs):
        states_tensor = torch.FloatTensor(states)
        actions_tensor = torch.FloatTensor(actions).unsqueeze(1)
        next_states_tensor = torch.FloatTensor(next_states)
        rewards_tensor = torch.FloatTensor(rewards)
        dones_tensor = torch.FloatTensor(dones)
        old_log_probs_tensor = torch.FloatTensor(old_log_probs)

        _, values = self.model(states_tensor)
        dists, next_values = self.model(next_states_tensor)

        new_log_probs = dists.log_prob(actions_tensor).sum(axis=1, keepdim=True)

        advantages = rewards_tensor + self.gamma * next_values.squeeze() * (1 - dones_tensor) - values.squeeze()

        ratios = (new_log_probs - old_log_probs_tensor).exp()
        clipped_ratios = torch.clamp(ratios, 1 - self.clip_epsilon, 1 + self.clip_epsilon)
        surrogate_loss = -torch.min(ratios * advantages, clipped_ratios * advantages).mean()

        critic_loss = advantages.pow(2).mean()
        loss = surrogate_loss + critic_loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

# Training Function
def train_agent(agent, env, episodes=1000):
    for episode in range(episodes):
        state = env.reset()[0]
        total_reward = 0
        done = False
        states, actions, rewards, next_states, dones, old_log_probs = [], [], [], [], [], []

        while not done:
            action, log_prob = agent.select_action(state)
            next_state, reward, done, _,_ = env.step([action])  # Action needs to be a list
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)
            old_log_probs.append(log_prob)
            state = next_state
            total_reward += reward

            

        agent.train(states, actions, rewards, next_states, dones, old_log_probs)
        print(f'Episode {episode + 1}: Total Reward: {total_reward}')

# env = gym.make('InvertedPendulum-v4')
# agent = PPOAgent(state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0])
# train_agent(agent, env, episodes=1000)


In [18]:
def evaluate_policy(agent, env, episodes=10):
    total_rewards = []
    for episode in range(episodes):
        state = env.reset()[0]
        total_reward = 0
        done = False
        while not done:
            action, _ = agent.select_action(state)  # Use the trained policy to select actions
            state, reward, done, _ , _= env.step([action])  # Action should be in the correct format
            total_reward += reward
        total_rewards.append(total_reward)
        print(f'Evaluation Episode {episode + 1}: Total Reward: {total_reward}')
    
    average_reward = sum(total_rewards) / len(total_rewards)
    print(f'Average Reward over {episodes} episodes: {average_reward}')
    return average_reward

# env = gym.make('InvertedPendulum-v4', render_mode='human')
# evaluate_policy(agent, env, episodes=100)


In [19]:
# env.close()

In [20]:
# env = gym.make('InvertedPendulum-v4', render_mode='human')

In [21]:
# env.close()

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )
        self.log_std = nn.Parameter(torch.zeros(action_dim))
        
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        mean = self.actor(x)
        std = self.log_std.exp().expand_as(mean)
        dist = torch.distributions.Normal(mean, std)
        return dist, self.critic(x)

class PPOAgent:
    def __init__(self, state_dim, action_dim, horizon=2048, lr=3e-4, clip_epsilon=0.2, gamma=0.99, gae_lambda=0.95, epochs=10, minibatch_size=64):
        self.model = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_epsilon = clip_epsilon
        self.horizon = horizon
        self.epochs = epochs
        self.minibatch_size = minibatch_size

    def select_action(self, state):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        dist, _ = self.model(state_tensor)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob.item()

    def train(self, states, actions, rewards, next_states, dones, old_log_probs):
        states_tensor = torch.FloatTensor(states)
        actions_tensor = torch.FloatTensor(actions).unsqueeze(1)
        next_states_tensor = torch.FloatTensor(next_states)
        rewards_tensor = torch.FloatTensor(rewards)
        dones_tensor = torch.FloatTensor(dones)
        old_log_probs_tensor = torch.FloatTensor(old_log_probs)

        _, values = self.model(states_tensor)
        dists, next_values = self.model(next_states_tensor)

        new_log_probs = dists.log_prob(actions_tensor).sum(axis=1, keepdim=True)

        advantages = rewards_tensor + self.gamma * next_values.squeeze() * (1 - dones_tensor) - values.squeeze()

        ratios = (new_log_probs - old_log_probs_tensor).exp()
        clipped_ratios = torch.clamp(ratios, 1 - self.clip_epsilon, 1 + self.clip_epsilon)
        surrogate_loss = -torch.min(ratios * advantages, clipped_ratios * advantages).mean()

        critic_loss = advantages.pow(2).mean()
        loss = surrogate_loss + critic_loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

def train_agent(agent, env, episodes=1000):
    for episode in range(episodes):
        state = env.reset()[0]
        total_reward = 0
        done = False
        states, actions, rewards, next_states, dones, old_log_probs = [], [], [], [], [], []

        while not done:
            action, log_prob = agent.select_action(state)
            next_state, reward, done, _, _ = env.step([action])
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)
            old_log_probs.append(log_prob)
            state = next_state
            total_reward += reward
        
        agent.train(states, actions, rewards, next_states, dones, old_log_probs)
        print(f'Episode {episode + 1}: Total Reward: {total_reward}')

# env = gym.make('InvertedPendulum-v4')
# agent = PPOAgent(state_dim=env.observation_space.shape[0], action_dim=env.action_space.shape[0])
# train_agent(agent, env, episodes=1000)


In [23]:
def evaluate_policy(agent, env, episodes=10):
    total_rewards = []
    for episode in range(episodes):
        state = env.reset()[0]
        total_reward = 0
        done = False
        while not done:
            action, _ = agent.select_action(state)  # Use the trained policy to select actions
            state, reward, done, _ , _= env.step([action])  # Action should be in the correct format
            total_reward += reward
        total_rewards.append(total_reward)
        print(f'Evaluation Episode {episode + 1}: Total Reward: {total_reward}')
    
    average_reward = sum(total_rewards) / len(total_rewards)
    print(f'Average Reward over {episodes} episodes: {average_reward}')
    return average_reward

# env = gym.make('InvertedPendulum-v4', render_mode='human')
# evaluate_policy(agent, env, episodes=100)


In [24]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.distributions import Normal
# import numpy as np
# import os
# # from torch.utils.tensorboard import SummaryWriter
# from functools import reduce
# from operator import mul

# glob_i = 0
# glob_error = 0

# class PPOAgent():
#     def __init__(self, TRAIN, env=None, traj=3, net_size=64, net_std=1,
#                 lr=1e-4, bs=100, y=0.99, ep=10, W=None,
#                 c1=0.5, c2=0.01):
#         self.TRAIN  = TRAIN
#         self.LR     = lr
#         self.GAMMA  = y
#         self.BATCHSIZE  = bs
#         self.EPOCHS = ep
#         self.NTRAJ = traj
#         self.env = env
#         self.C1 = c1
#         self.C2 = c2

#         self.WRITER = W

#         self.input_size = reduce(mul, env.observation_space.shape, 1)
#         self.out_size   = env.action_space.shape[0]

#         self.model = AgentNet(self.input_size, self.out_size, h=net_size, std=net_std)
#         self.opt = optim.Adam(self.model.parameters(), lr=self.LR)

#         self.value_criterion = nn.MSELoss()

#         self.trajectories = []
#         self.states   = []
#         self.rewards  = []
#         self.actions  = []
#         self.values   = []
#         self.logAprob = []

#     def __call__(self, state):
#         # Each time an action is required, we save the
#         # state value for computing advantages later
#         state = torch.FloatTensor(state).reshape(1, self.input_size)
#         normal, _ = self.model(state) 
#         action = normal.sample().reshape(1, self.out_size)

#         self.values.append(normal.mean)
#         self.actions.append(action)
#         self.logAprob.append(normal.log_prob(action).squeeze(0))

#         return action.numpy().reshape(self.out_size)

#     def observe(self, s, r, s1, done, NEPISODE):
#         if not self.TRAIN: return

#         s  = torch.FloatTensor(s).reshape(1, self.input_size)
#         s1 = torch.FloatTensor(s1).reshape(1, self.input_size)

#         self.states.append(s)

#         # For rewards we only maintain the value.
#         self.rewards.append(r)

#         if done:
#             # Compute advantages using critic network
#             with torch.no_grad():
#                 s1_tensor = s1
#                 _, next_value = self.model(s1_tensor)
#             next_value = next_value.detach().squeeze().numpy()
#             advantages = self._compute_advantages(next_value)

#             # Update trajectories with advantages
#             for traj, adv in zip(self.trajectories, advantages):
#                 traj['Adv'] = torch.FloatTensor(adv)

#             # Update Condition
#             if NEPISODE != 0 and (NEPISODE % self.NTRAJ) == 0: 
#                 self.update() 

#             self.states   = []
#             self.actions  = []
#             self.logAprob = []
#             self.values   = []
#             self.rewards  = []

#     def _compute_advantages(self, next_value):
#         advantages = []
#         for traj in self.trajectories:
#             rewards = traj['r']
#             values = traj['V'].numpy()
#             deltas = [r + self.GAMMA * next_v - v for r, next_v, v in zip(rewards, [next_value] + values[:-1], values)]
#             advantages.extend(self._discount_cumsum(deltas, self.GAMMA * self.C2))
#         return advantages

#     def _discount_cumsum(self, x, discount):
#         """
#         Compute discounted cumulative sums of vectors.

#         Parameters:
#             x (list): Input list of numbers.
#             discount (float): Discount factor.

#         Returns:
#             list: Discounted cumulative sums.
#         """
#         discounted = [x[-1]]
#         for v in reversed(x[:-1]):
#             discounted.append(v + discount * discounted[-1])
#         return list(reversed(discounted))

#     def update(self):
#         EPS = 0.2

#         # Compute advantages
#         S = torch.cat([x['S'] for x in self.trajectories], 0)
#         A = torch.cat([x['A'] for x in self.trajectories], 0)
#         Adv = torch.cat([x['Adv'] for x in self.trajectories], 0)
#         Log_old = torch.cat([x['LogP'] for x in self.trajectories], 0)
#         G = torch.cat([x['G'] for x in self.trajectories], 0)
#         V = torch.cat([x['V'] for x in self.trajectories], 0)

#         bufsize = S.size(0)

#         for ep in range(self.EPOCHS*(bufsize//self.BATCHSIZE+1)):
#             ids = np.random.randint(0, bufsize,
#                     min(self.BATCHSIZE, bufsize))

#             bS, bA, bAdv = S[ids,:], A[ids,:], Adv[ids,:]

#             normal, Vnew = self.model(bS)
#             logAprob_old = Log_old[ids,:]
#             logAprob = normal.log_prob(bA)

#             # L_CLIP
#             ratio = (logAprob - logAprob_old).exp().squeeze(0)
#             m1 = ratio * bAdv
#             m2 = torch.clamp(ratio, 1.0 - EPS, 1.0 + EPS) * bAdv
#             L_CLIP = torch.min(m1, m2).mean()

#             # L_VF
#             L_VF = (G[ids,:] - Vnew).pow(2).mean()

#             # Entropy
#             E = normal.entropy().mean()

#             # Total Loss
#             L = -L_CLIP + self.C1 * L_VF - self.C2 * E

#             # Apply Gradients
#             self.opt.zero_grad()
#             L.backward()
#             self.opt.step()

#         # Update Graphs
#         # self.WRITER.add_scalar("L_VF", L_VF, glob_i)
#         # global glob_i
#         # glob_i += 1
#         self.trajectories = []

#     def load(self, path):
#         try:
#             self.model.load_state_dict(torch.load(path))
#             self.model.eval()
#         except FileNotFoundError:
#             print(f'Error: {path} not found.')
#             exit()

#     def save(self, path):
#         torch.save(self.model.state_dict(), path)

# class AgentNet(nn.Module):
#     def __init__(self, inp, out, h=32, std=0):
#         super(AgentNet, self).__init__()

#         self.actor = nn.Sequential(
#             nn.Linear(inp, h),
#             nn.Tanh(),
#             nn.Linear(h, h//2),
#             nn.Tanh(),
#             nn.Linear(h//2, out),
#         )
#         self.log_std = nn.Parameter(torch.ones(1, out) * std)

#         self.critic = nn.Sequential(
#             nn.Linear(inp, h),
#             nn.Tanh(),
#             nn.Linear(h, h//2),
#             nn.Tanh(),
#             nn.Linear(h//2, 1)
#         )

#     def forward(self, x):
#         actor_output = self.actor(x)
#         critic_output = self.critic(x)
#         std = self.log_std.exp().expand_as(actor_output)
#         dist = Normal(actor_output, std)
#         return dist, critic_output

# # main.py


In [25]:

# # import mujoco_py
# import gym
# import torch
# import numpy as np
# # from torch.utils.tensorboard import SummaryWriter
# # from ppo_agent import PPOAgent
# import sys, os, json
# from glob import glob
# import argparse

# # Environemnt Params
# MODELS_PATH        = 'models'
# DEFAULT_EPISODES   = 2000
# DEFAULT_MAX_STEPS  = 2000
# DEFAULT_CHECKPOINT = 5

# ## These may be updated by arguments
# EnvName = 'InvertedPendulum-v4'
# TRAIN = False
# RENDER = True

# # writer = SummaryWriter(max_queue=2)

# def main():
#     global TRAIN, RENDER, EnvName

#     # Training Parameters
#     params_path = os.path.join(MODELS_PATH, f'{EnvName}.json')
#     with open(params_path, 'r') as f:
#         params = json.load(f)

#     EPOCHS        = params["EPOCHS"] if "EPOCHS" in params else 200
#     LR            = params["LR"] if "LR" in params else 1e-3
#     C2            = params["C2"] if "C2" in params else 0
#     GAMMA         = params["GAMMA"] if "GAMMA" in params else 0.99
#     STD           = params["STD"] if "STD" in params else 1
#     NETSIZE       = params["NETSIZE"] if "NETSIZE" in params else 64
#     BATCHSIZE     = params["BATCHSIZE"] if "BATCHSIZE" in params else 500
#     TRAJECTORIES  = params["TRAJECTORIES"] if "TRAJECTORIES" in params else 10
#     MAX_STEPS     = params["MAX_STEPS"] if "MAX_STEPS" in params else 2000
#     CHECKPOINT    = params["CHECKPOINT"] if "CHECKPOINT" in params else 5
#     EPISODES      = params["EPISODES"] if "EPISODES" in params else 2000

#     print("Environment:  ", EnvName)
#     print("Train:        ", TRAIN)
#     print("EPOCHS:       ", EPOCHS)
#     print("LR:           ", LR)
#     print("C2:           ", C2)
#     print("GAMMA:        ", GAMMA)
#     print("STD:          ", STD)
#     print("NETSIZE:      ", NETSIZE)
#     print("BATCHSIZE:    ", BATCHSIZE)
#     print("TRAJECTORIES: ", TRAJECTORIES)
#     print("MAX_STEPS:    ", MAX_STEPS)
#     print("CHECKPOINT:   ", CHECKPOINT)
#     print("EPISODES:     ", EPISODES)

#     # save model after collecting N trajectories 
#     # (which corresponds to when the update is calculated)
#     SAVE_STEP = CHECKPOINT * TRAJECTORIES
#     save_model_name = os.path.join(MODELS_PATH, EnvName + ".pth")

#     total = 0

#     env = gym.make(EnvName, render_mode="human" if RENDER else None)
#     agent = PPOAgent(
#             TRAIN, env=env,
#             lr=LR, c2=C2,
#             net_size=NETSIZE,
#             net_std=STD,
#             y=GAMMA,
#             traj=TRAJECTORIES,
#             bs=BATCHSIZE,
#             ep=EPOCHS
#     )
#     if not TRAIN: agent.load(save_model_name)

#     for i in range(EPISODES):
#         state, _ = env.reset()

#         for t in range(MAX_STEPS+1):
#             # RL Step
#             action = agent(state)
#             new_state, reward, done, _, _ = env.step(action)
            
#             # Impose done=True if last-step
#             if t == MAX_STEPS: done = True

#             agent.observe(state, reward, new_state, done, i)

#             total += reward
#             state  = new_state

#             if done: break

#         # Print Performance
#         print(f"[{i}] Steps: {t}\tReward: {total}")
#         # writer.add_scalar('Reward', total, i)
#         total = 0

#         if TRAIN and (i % SAVE_STEP) == SAVE_STEP -1:
#             agent.save(save_model_name)
#             print("Model Checkpoint saved")

#     env.close()


# # envs_names = glob(f'{MODELS_PATH}/*.json')
# # envs_names = [x.split('/')[-1].split('.')[0] for x in envs_names]

# # parser = argparse.ArgumentParser(description="Train PPO models and run Gym environments")
# # parser.add_argument('env', type=str, metavar="environment", help=", ".join(envs_names),
# #                     choices=envs_names, default="MountainCarContinuous-v0")
# # parser.add_argument('--train', action='store_true')
# # args = parser.parse_args()

# # EnvName = EnvName
# # TRAIN   = TRAIN
# # RENDER  = not(TRAIN)

# # main()


In [26]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

class PPOAgent(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=64, std=0.1):
        super(PPOAgent, self).__init__()

        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, action_dim),
        )
        self.log_std = nn.Parameter(torch.ones(1, action_dim) * std)

        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1)
        )

    def forward(self, x):
        actor_output = self.actor(x)
        critic_output = self.critic(x)
        std = self.log_std.exp().expand_as(actor_output)
        dist = Normal(actor_output, std)
        return dist, critic_output

class PPO:
    def __init__(self, env_name, epochs=200, lr=1e-3, gamma=0.99, c1=0.5, c2=0.01, batch_size=500):
        self.env = gym.make(env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.agent = PPOAgent(self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.agent.parameters(), lr=lr)
        self.epochs = epochs
        self.gamma = gamma
        self.c1 = c1
        self.c2 = c2
        self.batch_size = batch_size

    def train(self):
        for epoch in range(self.epochs):
            states, actions, rewards, log_probs, values = self.generate_trajectories()

            for _ in range(self.epochs):
                self.update(states, actions, rewards, log_probs, values)


    def generate_trajectories(self):
        states = []
        actions = []
        rewards = []
        log_probs = []
        values = []

        state = self.env.reset()[0]

        for _ in range(self.batch_size):
            state_tensor = torch.FloatTensor(state).unsqueeze(0)  # Ensure the state tensor is of type float
            dist, value = self.agent(state_tensor)
            action = dist.sample()

            next_state, reward, done, _, _ = self.env.step(action.numpy()[0])
            
            states.append(state_tensor)  # Append the state tensor
            actions.append(action)
            rewards.append(reward)
            log_probs.append(dist.log_prob(action).unsqueeze(0))
            values.append(value)

            state = next_state
            if done:
                state = self.env.reset()[0]

        return torch.cat(states), torch.cat(actions), torch.tensor(rewards), torch.cat(log_probs), torch.cat(values)



    def update(self, states, actions, rewards, log_probs, values):
        returns = self.compute_returns(rewards)
        advantages = self.compute_advantages(rewards, values)

        for _ in range(self.epochs):
            print(states.shape, self.agent)
            dist, new_values = self.agent(states)
            new_log_probs = dist.log_prob(actions)

            ratio = (new_log_probs - log_probs).exp()
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1.0 - self.c2, 1.0 + self.c2) * advantages

            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (values - returns).pow(2).mean()
            entropy = dist.entropy().mean()

            loss = actor_loss + self.c1 * critic_loss - self.c2 * entropy

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def compute_returns(self, rewards):
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        return torch.tensor(returns)

    def compute_advantages(self, rewards, values):
        advantages = self.compute_returns(rewards) - values.squeeze(1)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

# ppo = PPO("InvertedPendulum-v4")
# ppo.train()


In [40]:
#this one sorta, but doesnt use correct batch and stuyff
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

class PPOAgent(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=64, std=0.1):
        super(PPOAgent, self).__init__()

        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, action_dim),
        )
        self.log_std = nn.Parameter(torch.ones(1, action_dim) * std)

        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1)
        )

    def forward(self, x):
        actor_output = self.actor(x)
        critic_output = self.critic(x)
        std = self.log_std.exp().expand_as(actor_output)
        dist = Normal(actor_output, std)
        return dist, critic_output

class PPO:
    def __init__(self, env_name, epochs=1000, lr=3e-4, gamma=0.99, c1=0.5, c2=0.01, batch_size=64):
        self.env = gym.make(env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.agent = PPOAgent(self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.agent.parameters(), lr=lr)
        self.epochs = epochs
        self.gamma = gamma
        self.c1 = c1
        self.c2 = c2
        self.batch_size = batch_size

    def train(self):
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}')
            states, actions, rewards, log_probs, values = self.generate_trajectories()

            # Detach tensors before passing them to the update method
            states_detached = states.detach()
            actions_detached = actions.detach()
            rewards_detached = rewards.detach()
            log_probs_detached = log_probs.detach()
            values_detached = values.detach()
            print('Updating')
            self.update(states_detached, actions_detached, rewards_detached, log_probs_detached, values_detached)

    def generate_trajectories(self):
        states = []
        actions = []
        rewards = []
        log_probs = []
        values = []

        state = self.env.reset()[0]

        for _ in range(self.batch_size):
            state_tensor = torch.FloatTensor(state).unsqueeze(0)  # Ensure the state tensor is of type float
            dist, value = self.agent(state_tensor)
            action = dist.sample()

            next_state, reward, done, _, _ = self.env.step(action.numpy()[0])
            
            states.append(state_tensor)  # Append the state tensor
            actions.append(action)
            rewards.append(reward)
            log_probs.append(dist.log_prob(action).unsqueeze(0))
            values.append(value)

            state = next_state
            if done:
                state = self.env.reset()[0]

        return torch.cat(states), torch.cat(actions), torch.tensor(rewards), torch.cat(log_probs), torch.cat(values)

    def update(self, states, actions, rewards, log_probs, values):
        returns = self.compute_returns(rewards)
        advantages = self.compute_advantages(rewards, values)

        self.optimizer.zero_grad()  # Clear gradients

        dist, new_values = self.agent(states)
        new_log_probs = dist.log_prob(actions)

        ratio = (new_log_probs - log_probs).exp()
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.c2, 1.0 + self.c2) * advantages

        actor_loss = -torch.min(surr1, surr2).mean()
        critic_loss = (values - returns).pow(2).mean()
        entropy = dist.entropy().mean()

        loss = actor_loss + self.c1 * critic_loss - self.c2 * entropy

        loss.backward(retain_graph=True)  # Retain computational graph
        self.optimizer.step()

    def compute_returns(self, rewards):
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        return torch.tensor(returns)

    def compute_advantages(self, rewards, values):
        advantages = self.compute_returns(rewards) - values.squeeze(1)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    
    def evaluate(self, env, num_episodes=10):
        total_rewards = 0
        for _ in range(num_episodes):
            state = env.reset()[0]
            done = False
            episode_reward = 0

            while not done:
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                dist, _ = self.agent(state_tensor)
                action = dist.mean  # Using mean action for evaluation
                next_state, reward, done, _, _ = env.step(action.detach().numpy()[0])
                state = next_state
                episode_reward += reward

            total_rewards += episode_reward

        average_reward = total_rewards / num_episodes
        return average_reward

ppo = PPO("InvertedPendulum-v4")
ppo.train()


Epoch 1
Updating
Epoch 2
Updating
Epoch 3
Updating
Epoch 4
Updating
Epoch 5
Updating
Epoch 6
Updating
Epoch 7
Updating
Epoch 8
Updating
Epoch 9
Updating
Epoch 10
Updating
Epoch 11
Updating
Epoch 12
Updating
Epoch 13
Updating
Epoch 14
Updating
Epoch 15
Updating
Epoch 16
Updating
Epoch 17
Updating
Epoch 18
Updating
Epoch 19
Updating
Epoch 20
Updating
Epoch 21
Updating
Epoch 22
Updating
Epoch 23
Updating
Epoch 24
Updating
Epoch 25
Updating
Epoch 26
Updating
Epoch 27
Updating
Epoch 28
Updating
Epoch 29
Updating
Epoch 30
Updating
Epoch 31
Updating
Epoch 32
Updating
Epoch 33
Updating
Epoch 34
Updating
Epoch 35
Updating
Epoch 36
Updating
Epoch 37
Updating
Epoch 38
Updating
Epoch 39
Updating
Epoch 40
Updating
Epoch 41
Updating
Epoch 42
Updating
Epoch 43
Updating
Epoch 44
Updating
Epoch 45
Updating
Epoch 46
Updating
Epoch 47
Updating
Epoch 48
Updating
Epoch 49
Updating
Epoch 50
Updating
Epoch 51
Updating
Epoch 52
Updating
Epoch 53
Updating
Epoch 54
Updating
Epoch 55
Updating
Epoch 56
Updating
E

KeyboardInterrupt: 

In [28]:



# # Create a new environment for evaluation
# eval_env = gym.make('InvertedPendulum-v4', render_mode='human')

# average_reward = ppo.evaluate(eval_env)
# print(f"Average reward over 10 episodes: {average_reward}")

# # Don't forget to close the evaluation environment when done
# eval_env.close()


In [29]:
# eval_env.close()

In [30]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

class PPOAgent(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=64, std=0.1):
        super(PPOAgent, self).__init__()

        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, action_dim),
        )
        self.log_std = nn.Parameter(torch.ones(1, action_dim) * std)

        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1)
        )

    def forward(self, x):
        actor_output = self.actor(x)
        critic_output = self.critic(x)
        std = self.log_std.exp().expand_as(actor_output)
        dist = Normal(actor_output, std)
        return dist, critic_output

class PPO:
    def __init__(self, env_name, horizon=2048, lr=3e-4, epochs=10, minibatch_size=64, gamma=0.99, gae_lambda=0.95):
        self.env = gym.make(env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.agent = PPOAgent(self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.agent.parameters(), lr=lr)
        self.horizon = horizon
        self.epochs = epochs
        self.minibatch_size = minibatch_size
        self.gamma = gamma
        self.gae_lambda = gae_lambda

    def train(self, total_timesteps=1000):
        timesteps_so_far = 0
        while timesteps_so_far < total_timesteps:
            states, actions, rewards, log_probs, values = self.generate_trajectories()

            # Update timesteps_so_far
            print(states.shape)
            timesteps_so_far += states.shape[0]

            self.update(states, actions, rewards, log_probs, values)

    def generate_trajectories(self):
        states = []
        actions = []
        rewards = []
        log_probs = []
        values = []

        timesteps_so_far = 0
        while timesteps_so_far < self.horizon:
            state = self.env.reset()[0]

            for _ in range(self.minibatch_size):
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                dist, value = self.agent(state_tensor)
                action = dist.sample()

                next_state, reward, done, _, _ = self.env.step(action.numpy()[0])
                
                states.append(state_tensor)
                actions.append(action)
                rewards.append(reward)
                log_probs.append(dist.log_prob(action).unsqueeze(0))
                values.append(value)

                state = next_state
                if done:
                    state = self.env.reset()[0]

            timesteps_so_far += self.minibatch_size

        return torch.cat(states), torch.cat(actions), torch.tensor(rewards), torch.cat(log_probs), torch.cat(values)

    def update(self, states, actions, rewards, log_probs, values):
        returns = self.compute_returns(rewards)
        advantages = self.compute_advantages(rewards, values)

        self.optimizer.zero_grad()

        dist, new_values = self.agent(states)
        new_log_probs = dist.log_prob(actions)

        ratio = (new_log_probs - log_probs).exp()
        print(ratio.shape, advantages.shape)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.gae_lambda, 1.0 + self.gae_lambda) * advantages

        actor_loss = -torch.min(surr1, surr2).mean()
        critic_loss = (values - returns).pow(2).mean()
        entropy = dist.entropy().mean()

        loss = actor_loss + critic_loss - 0.01 * entropy

        loss.backward()
        self.optimizer.step()

    def compute_returns(self, rewards):
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        return torch.tensor(returns)

    def compute_advantages(self, rewards, values):
        advantages = self.compute_returns(rewards) - values.squeeze(1)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

# ppo = PPO("InvertedPendulum-v4")
# ppo.train()


In [31]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

class PPOAgent(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=64, std=0.1):
        super(PPOAgent, self).__init__()

        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, action_dim),
        )
        self.log_std = nn.Parameter(torch.ones(1, action_dim) * std)

        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1)
        )

    def forward(self, x):
        actor_output = self.actor(x)
        critic_output = self.critic(x)
        std = self.log_std.exp().expand_as(actor_output)
        dist = Normal(actor_output, std)
        return dist, critic_output

class PPO:
    def __init__(self, env_name, horizon=2048, lr=3e-4, epochs=10, minibatch_size=64, gamma=0.99, gae_lambda=0.95):
        self.env = gym.make(env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.agent = PPOAgent(self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.agent.parameters(), lr=lr)
        self.horizon = horizon
        self.epochs = epochs
        self.minibatch_size = minibatch_size
        self.gamma = gamma
        self.gae_lambda = gae_lambda

    def train(self, total_timesteps=1000000):
        timesteps_so_far = 0
        while timesteps_so_far < total_timesteps:
            states, actions, rewards, log_probs, values = self.generate_trajectories()

            # Update timesteps_so_far
            timesteps_so_far += states.shape[0]

            self.update(states, actions, rewards, log_probs, values)

    def generate_trajectories(self):
        states = []
        actions = []
        rewards = []
        log_probs = []
        values = []

        timesteps_so_far = 0
        while timesteps_so_far < self.horizon:
            state = self.env.reset()[0]

            for _ in range(self.minibatch_size):
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                dist, value = self.agent(state_tensor)
                action = dist.sample()

                next_state, reward, done, _, _ = self.env.step(action.numpy()[0])
                
                states.append(state_tensor)
                actions.append(action)
                rewards.append(reward)
                log_probs.append(dist.log_prob(action).unsqueeze(0))
                values.append(value)

                state = next_state
                if done:
                    state = self.env.reset()[0]

            timesteps_so_far += self.minibatch_size

        return torch.cat(states), torch.cat(actions), torch.tensor(rewards), torch.cat(log_probs), torch.cat(values)

    def update(self, states, actions, rewards, log_probs, values):
        returns = self.compute_returns(rewards)
        advantages = self.compute_advantages(rewards, values)

        self.optimizer.zero_grad()

        dist, new_values = self.agent(states)
        new_log_probs = dist.log_prob(actions)

        # Expand ratio to match the shape of advantages
        ratio = (new_log_probs - log_probs).exp().unsqueeze(1)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.gae_lambda, 1.0 + self.gae_lambda) * advantages

        actor_loss = -torch.min(surr1, surr2).mean()
        critic_loss = (values - returns).pow(2).mean()
        entropy = dist.entropy().mean()

        loss = actor_loss + critic_loss - 0.01 * entropy

        loss.backward()
        self.optimizer.step()


    def compute_returns(self, rewards):
        returns = []
        G = 0
        for r in reversed(rewards):
            G = r + self.gamma * G
            returns.insert(0, G)
        return torch.tensor(returns)

    def compute_advantages(self, rewards, values):
        advantages = []
        G = 0
        for r, v in zip(reversed(rewards), reversed(values)):
            G = r + self.gamma * G
            advantages.insert(0, G - v.item())
        advantages = torch.tensor(advantages)
        
        # Split advantages into minibatches
        minibatch_advantages = []
        minibatch_size = len(advantages) // self.minibatch_size
        for i in range(self.minibatch_size):
            start_idx = i * minibatch_size
            end_idx = (i + 1) * minibatch_size
            minibatch_advantages.append(advantages[start_idx:end_idx])
        
        # Stack minibatch advantages along a new dimension
        minibatch_advantages = torch.stack(minibatch_advantages, dim=1)
        
        # Calculate mean and standard deviation batch-wise
        advantages_mean = minibatch_advantages.mean(dim=1)
        advantages_std = minibatch_advantages.std(dim=1) + 1e-8
        
        # Normalize advantages using batch-wise mean and std
        normalized_advantages = (minibatch_advantages - advantages_mean.unsqueeze(1)) / advantages_std.unsqueeze(1)
        
        return normalized_advantages

# ppo = PPO("InvertedPendulum-v4")
# ppo.train()


# joakim:

In [32]:
class Actor(nn.Module):
    def __init__(self, input_dim, action_dim):
        super(Actor, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim)
        )
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        mu = self.fc(x)
        std = self.log_std.exp().expand_as(mu)
        return mu, std

class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.fc(x)

class PPOAgent:
    def __init__(self, input_dim, action_dim, actor_lr=1e-4, critic_lr=1e-3, gamma=0.99, epsilon=0.2, k_epochs=10, c1=0.5, c2=0.01):
        self.gamma = gamma
        self.epsilon = epsilon
        self.k_epochs = k_epochs  # Number of optimization epochs per batch
        self.c1 = c1  # Value function coefficient
        self.c2 = c2  # Entropy coefficient
        self.actor = Actor(input_dim, action_dim)
        self.critic = Critic(input_dim)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.mse_loss = nn.MSELoss()

    def select_action(self, state):
        state = torch.from_numpy(state).float()
        mu, std = self.actor(state)
        distribution = torch.distributions.Normal(mu, std)
        action = distribution.sample()
        log_prob = distribution.log_prob(action).sum(dim=-1)
        return action.detach().numpy(), log_prob

    def compute_gae(self, rewards, masks, values, next_value):
        gae = 0
        returns = []
        values = values + [next_value]
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + self.gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + self.gamma * 0.95 * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    def update(self, trajectory):
        states, actions, log_probs_old, returns, advantages = trajectory

        # Convert lists or arrays to tensors outside the loop
        log_probs_old = torch.stack(log_probs_old).detach()
        states = torch.stack(states).detach()
        actions = torch.tensor(actions).detach()
        returns = torch.tensor(returns).unsqueeze(-1).detach()
        advantages = torch.tensor(advantages)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)

        for _ in range(self.k_epochs):  # Multiple optimization epochs
            # Optimization steps for both actor and critic
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()

            mu, std = self.actor(states)
            dist = torch.distributions.Normal(mu, std)
            log_probs_new = dist.log_prob(actions).sum(dim=-1)
            entropy = dist.entropy().mean()

            ratios = torch.exp(log_probs_new - log_probs_old)
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * advantages

            actor_loss = -torch.min(surr1, surr2).mean() - self.c2 * entropy  # Include entropy bonus
            critic_loss = self.mse_loss(self.critic(states), returns) * self.c1  # Apply value loss coefficient directly

            actor_loss.backward()
            self.actor_optimizer.step()

            critic_loss.backward()
            self.critic_optimizer.step()

    def train(self, env, episodes=250000, batch_size=2048):
        all_rewards = []
        step_counter = 0
        
        # Initialize empty lists to collect data until batch size is reached
        states, actions, rewards, log_probs, values, masks = [], [], [], [], [], []

        for episode in range(episodes):
            state = env.reset()[0]
            episode_rewards = 0

            for _ in range(1000):
                action, log_prob = self.select_action(state)
                value = self.critic(torch.from_numpy(state).float()).item()
                next_state, reward, done, _, _ = env.step(action)

                # Accumulate data in the lists
                states.append(torch.from_numpy(state).float())
                actions.append(action)
                rewards.append(reward)
                log_probs.append(log_prob)
                values.append(value)
                masks.append(1 - done)

                state = next_state
                episode_rewards += reward
                step_counter += 1

                if step_counter % batch_size == 0:
                    # Update policy with the accumulated data once the batch size is reached
                    next_value = self.critic(torch.from_numpy(state).float()).item()
                    returns = self.compute_gae(rewards, masks, values, next_value)
                    advantages = [ret - val for ret, val in zip(returns, values)]
                    trajectory = (states, actions, log_probs, returns, advantages)
                    self.update(trajectory)

                    # Clear the accumulated data for the next batch
                    states, actions, rewards, log_probs, values, masks = [], [], [], [], [], []

                if done:
                    break

            all_rewards.append(episode_rewards)
            print(f"Episode {episode + 1}, Total Reward = {episode_rewards}, Total Steps = {step_counter}")

            # Check termination condition (if needed)
            if step_counter >= 1000000:
                return all_rewards

        return all_rewards

# chat 4:

In [33]:
import gym
import torch
import numpy as np
import multiprocessing as mp
import torch.nn as nn
from collections import deque
from torch.distributions import Categorical



class AgentNet(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(AgentNet, self).__init__()
        self.affine = nn.Linear(num_inputs, 128)
        self.action_head = nn.Linear(128, num_outputs)
        self.value_head = nn.Linear(128, 1)

    def forward(self, x):
        x = torch.tanh(self.affine(x))
        action_probs = torch.softmax(self.action_head(x), dim=-1)
        state_values = self.value_head(x)
        return action_probs, state_values

class PPOAgent:
    def __init__(self, env, config, policy_params):
        self.env = env
        self.config = config
        self.model = AgentNet(env.observation_space.shape[0], env.action_space.n)
        self.model.load_state_dict(policy_params)

    def step(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs, state_value = self.model(state)
        m = Categorical(probs)
        action = m.sample()
        log_prob = m.log_prob(action)
        next_state, reward, done, _ = self.env.step(action.item())
        return action.item(), log_prob, state_value, next_state, reward, done

    def update_policy(self, states, actions, old_log_probs, rewards, values, optimizer):
        actions = torch.tensor(actions)
        old_log_probs = torch.stack(old_log_probs)
        rewards = torch.tensor(rewards)
        values = torch.cat(values)
        masks = torch.tensor([1.0] * len(rewards))

        # Adding last value for advantage calculation
        _, last_value = self.model(torch.from_numpy(states[-1]).float().unsqueeze(0))
        values = torch.cat([values, last_value.detach()])

        advantages = self.compute_advantages(rewards, masks, values)

        # Convert advantages to tensor and standardize
        advantages = torch.tensor(advantages)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Convert rewards to returns
        returns = advantages + values[:-1]

        # Optimization loop
        for _ in range(self.config['epochs']):
            idx = torch.randperm(len(states))
            for batch_indices in idx.split(self.config['batch_size']):
                sampled_states = torch.tensor(states)[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_old_log_probs = old_log_probs[batch_indices]
                sampled_advantages = advantages[batch_indices]

                # Forward pass
                new_probs, new_values = self.model(sampled_states)
                new_dist = Categorical(new_probs)
                new_log_probs = new_dist.log_prob(sampled_actions)

                # Calculating the ratio (pi_theta / pi_theta_old):
                ratio = torch.exp(new_log_probs - sampled_old_log_probs)

                # Clipped surrogate loss
                surr1 = ratio * sampled_advantages
                surr2 = torch.clamp(ratio, 1.0 - self.epsilon, 1.0 + self.epsilon) * sampled_advantages
                loss = -torch.min(surr1, surr2).mean()  # Focus only on the clipping part

                # take gradient step
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            


In [38]:
def worker(worker_id, policy_params, config, return_dict):
    """Worker process to collect data from the environment."""
    np.random.seed(worker_id)
    torch.manual_seed(worker_id)
    env = gym.make(config['env_name'])
    agent = PPOAgent(env, config, policy_params)

    state = env.reset()
    rewards, log_probs, states, actions, values = [], [], [], [], []
    for _ in range(config['horizon']):
        action, log_prob, value, next_state, reward, done = agent.step(state)
        states.append(state)
        actions.append(action)
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(reward)
        state = next_state if not done else env.reset()

    return_dict[worker_id] = {
        'states': states,
        'actions': actions,
        'log_probs': log_probs,
        'values': values,
        'rewards': rewards
    }

def main():
    config = {
        'env_name': 'InvertedPendulum-v4',
        'horizon': 2048,
        'learning_rate': 3e-4,
        'batch_size': 64,
        'epochs': 10,
        'gamma': 0.99,
        'gae_lambda': 0.95,
        'num_workers': 4
    }

    env = gym.make(config['env_name'])
    model = AgentNet(env.observation_space.shape[0], env.action_space.shape[0])
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])

    manager = mp.Manager()
    return_dict = manager.dict()

    for iteration in range(10):  # run for 10 iterations
        processes = []
        for i in range(config['num_workers']):
            p = mp.Process(target=worker, args=(i, model.state_dict(), config, return_dict))
            p.start()
            processes.append(p)
        
        for p in processes:
            p.join()

        # Aggregate data from all workers
        print(return_dict)
        aggregated_data = {k: [] for k in return_dict[0].keys()}
        for i in range(config['num_workers']):
            for key in aggregated_data.keys():
                aggregated_data[key].extend(return_dict[i][key])
        
        # Convert lists to tensors and perform PPO update
        states = torch.FloatTensor(aggregated_data['states'])
        actions = torch.LongTensor(aggregated_data['actions'])
        old_log_probs = torch.stack(aggregated_data['log_probs'])
        rewards = torch.FloatTensor(aggregated_data['rewards'])
        values = torch.stack(aggregated_data['values'])

        # Example PPO update, assuming `update_policy` is implemented
        agent = PPOAgent(env, config, model.state_dict())
        agent.update_policy(states, actions, old_log_probs, rewards, values, optimizer)

        print(f'Iteration {iteration + 1} complete.')

if __name__ == "__main__":
    main()





{}


KeyError: 0