In [14]:
import sys
import os
import gymnasium as gym
import rware
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

# Make the parent of the current working directory importable so the local
# `src` package can be found.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    # Guarded so that re-running this cell does not keep adding duplicate entries.
    sys.path.append(PROJECT_ROOT)

# Import your architecture (must stay below the sys.path tweak above)
from src.worker_mlp import WorkerNetwork

# Initialize Environment
ENV_ID = "rware:rware-tiny-2ag-v2"
env = gym.make(ENV_ID)
print("Environment and WorkerNetwork successfully loaded.")

Environment and WorkerNetwork successfully loaded.


In [16]:
class RolloutStorage:
    """Plain container holding one Python list per recorded rollout field."""

    def __init__(self):
        # Each field gets its own, initially empty, list.
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []
        self.values = []

    def clear(self):
        """Empty every list in place; the list objects themselves are kept."""
        buffers = (
            self.states,
            self.actions,
            self.logprobs,
            self.rewards,
            self.is_terminals,
            self.values,
        )
        for buffer in buffers:
            del buffer[:]

class PPOAgent:
    """Clipped-surrogate PPO learner built around two ``WorkerNetwork`` copies.

    ``policy`` is the network being optimised. ``policy_old`` is the snapshot
    used to sample actions; it is re-synchronised with ``policy`` at the end of
    every non-empty ``update`` call. Both networks are called as
    ``logits, value = net(states)``.
    """

    def __init__(self, obs_shape=71, action_dim=5, lr=3e-4, gamma=0.99, eps_clip=0.2,
                 k_epochs=10, value_coef=0.5, entropy_coef=0.0):
        """Build the networks, the optimiser and store the hyper-parameters.

        Args:
            obs_shape: First argument forwarded to ``WorkerNetwork``.
            action_dim: Second argument forwarded to ``WorkerNetwork``.
            lr: Learning rate of the Adam optimiser.
            gamma: Discount factor used when computing returns.
            eps_clip: Half-width of the clipping interval around a ratio of 1.
            k_epochs: Number of gradient steps taken per ``update`` call.
            value_coef: Weight of the value (MSE) loss in the total loss.
            entropy_coef: Weight of the entropy bonus; ``0.0`` disables it.
        """
        self.policy = WorkerNetwork(obs_shape, action_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.policy_old = WorkerNetwork(obs_shape, action_dim)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.gamma, self.eps_clip = gamma, eps_clip
        self.k_epochs = k_epochs
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.mse_loss = nn.MSELoss()

    def select_action(self, state, storage):
        """Sample an action from ``policy_old`` and record the step in ``storage``.

        Appends the batched state, the sampled action, its log-probability and
        the value estimate to the matching lists of ``storage``. The caller is
        responsible for appending the reward and terminal flag of the step.

        Args:
            state: Observation accepted by ``torch.FloatTensor``.
            storage: Object exposing ``states``, ``actions``, ``logprobs`` and
                ``values`` lists.

        Returns:
            The sampled action as a Python int.
        """
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            logits, val = self.policy_old(state)
            # Build the distribution straight from logits instead of a
            # softmax -> probs round trip.
            dist = Categorical(logits=logits)
            action = dist.sample()
            logprob = dist.log_prob(action)
        storage.states.append(state)
        storage.actions.append(action)
        storage.logprobs.append(logprob)
        storage.values.append(val)
        return action.item()

    @staticmethod
    def _discounted_returns(rewards, is_terminals, gamma):
        """Return the discounted return of every step, oldest step first.

        A truthy ``is_terminals[i]`` marks step ``i`` as the last step of its
        episode, so nothing recorded after it contributes to its return.
        """
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
            if is_terminal:
                running = 0
            running = reward + gamma * running
            returns.append(running)
        # Built newest-first with O(1) appends, then flipped once.
        returns.reverse()
        return returns

    def update(self, storage):
        """Run ``k_epochs`` PPO gradient steps on the stored rollout.

        Afterwards ``policy_old`` is synchronised with ``policy`` and ``storage``
        is cleared. If no rewards have been recorded the call does nothing.

        Args:
            storage: Rollout buffer filled by ``select_action`` plus the
                caller-appended ``rewards`` and ``is_terminals`` lists.
        """
        if not storage.rewards:
            # torch.cat on an empty list raises; there is nothing to learn from.
            return

        old_states = torch.cat(storage.states).detach()
        old_actions = torch.cat(storage.actions).detach()
        old_logprobs = torch.cat(storage.logprobs).detach()
        returns = torch.tensor(
            self._discounted_returns(storage.rewards, storage.is_terminals, self.gamma),
            dtype=torch.float32,
        )
        # reshape(-1) keeps a 1-D batch axis even for a single transition,
        # where a bare squeeze() would collapse the tensor to 0-d.
        old_values = torch.cat(storage.values).detach().reshape(-1)
        advantages = returns - old_values

        for _ in range(self.k_epochs):
            logits, state_values = self.policy(old_states)
            dist = Categorical(logits=logits)
            logprobs = dist.log_prob(old_actions)
            ratios = torch.exp(logprobs - old_logprobs)
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = self.mse_loss(state_values.reshape(-1), returns)
            loss = policy_loss + self.value_coef * value_loss
            if self.entropy_coef:
                loss = loss - self.entropy_coef * dist.entropy().mean()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.policy_old.load_state_dict(self.policy.state_dict())
        storage.clear()

# --- Training Execution ---
agent = PPOAgent()
storage = RolloutStorage()
max_episodes, update_timestep, timestep = 2000, 2000, 0
max_steps_per_episode = 500

# Reward shaping: a Manhattan-distance penalty towards a fixed goal cell plus a
# constant per-step cost, both subtracted from the environment reward.
goal_x, goal_y = 5, 5
distance_penalty, step_penalty = 0.005, 0.01

# Only agent 0 is driven by the policy; every other agent is sent action 0
# (presumably rware's no-op -- confirm against the environment's action enum).
n_agents = env.unwrapped.n_agents

print("Starting Training with Safe State Access...")

for episode in range(max_episodes):
    state, info = env.reset()
    current_state = state[0]
    episode_reward = 0

    for t in range(max_steps_per_episode):
        timestep += 1
        action = agent.select_action(current_state, storage)

        # Apply actions
        all_actions = tuple([action] + [0] * (n_agents - 1))
        next_state, reward, terminated, truncated, info = env.step(all_actions)

        # Read the controlled agent's position from the unwrapped environment.
        agent_obj = env.unwrapped.agents[0]
        agent_x, agent_y = agent_obj.x, agent_obj.y
        goal_distance = abs(agent_x - goal_x) + abs(agent_y - goal_y)

        # `reward` is indexed per agent; keep agent 0's entry as a plain float.
        base_reward = float(reward[0])
        shaped_reward = base_reward - (distance_penalty * goal_distance) - step_penalty

        # An episode also ends on truncation or when this loop reaches its step
        # cap. Flag all three cases so PPOAgent.update resets the discounted
        # return at every episode boundary instead of letting rewards from the
        # next episode leak into this one.
        episode_over = bool(terminated or truncated) or t == max_steps_per_episode - 1

        storage.rewards.append(shaped_reward)
        storage.is_terminals.append(episode_over)
        current_state = next_state[0]
        episode_reward += shaped_reward

        if timestep % update_timestep == 0:
            agent.update(storage)
            print(f"Step {timestep} | Policy Updated")

        if episode_over:
            break

    if episode % 50 == 0:
        print(f"Episode {episode} | Reward: {episode_reward:.2f} | Pos: ({agent_x}, {agent_y})")

Starting Training with Safe State Access...
Episode 0 | Reward: -14.79 | Pos: (1, 0)
Step 2000 | Policy Updated
Step 4000 | Policy Updated
Step 6000 | Policy Updated
Step 8000 | Policy Updated
Step 10000 | Policy Updated
Step 12000 | Policy Updated
Step 14000 | Policy Updated
Step 16000 | Policy Updated
Step 18000 | Policy Updated
Step 20000 | Policy Updated
Step 22000 | Policy Updated
Step 24000 | Policy Updated
Episode 50 | Reward: -15.95 | Pos: (3, 0)
Step 26000 | Policy Updated
Step 28000 | Policy Updated
Step 30000 | Policy Updated
Step 32000 | Policy Updated
Step 34000 | Policy Updated
Step 36000 | Policy Updated
Step 38000 | Policy Updated
Step 40000 | Policy Updated
Step 42000 | Policy Updated
Step 44000 | Policy Updated
Step 46000 | Policy Updated
Step 48000 | Policy Updated
Step 50000 | Policy Updated
Episode 100 | Reward: -22.53 | Pos: (3, 1)
Step 52000 | Policy Updated
Step 54000 | Policy Updated
Step 56000 | Policy Updated
Step 58000 | Policy Updated
Step 60000 | Policy Up

KeyboardInterrupt: 

In [19]:
import torch
import os

# 1. Resolve the project root. It defaults to the original machine-specific
#    location, but can be redirected by setting the SWARM_ROOT environment
#    variable so the cell also works on other machines.
project_root = os.environ.get("SWARM_ROOT", r"D:\SWARM")

# 2. Define the specific target path for the worker model
# Using os.path.join is safer for Windows paths
save_dir = os.path.join(project_root, "models", "workers")
# NOTE(review): the "850k" in the file name is a literal, not derived from the
# training loop's step counter -- confirm it matches the run being saved.
model_name = "worker_v1_850k.pth"
full_path = os.path.join(save_dir, model_name)

# 3. Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# 4. Save the trained weights (state_dict of the network being optimised)
torch.save(agent.policy.state_dict(), full_path)

print(f"Success! Model saved to: {full_path}")

Success! Model saved to: D:\SWARM\models\workers\worker_v1_850k.pth
