In [None]:
#Notebook track - Extending PPO to a multi agent environment pistonballv6
#Reinforcement Learning (RL) is a powerful approach for teaching agents to solve tasks through interaction with an environment. In this tutorial, we explore multi-agent RL and single-agent RL in the Pistonball environment using the PPO (Proximal Policy Optimization) algorithm. This tutorial highlights the versatility of RL in multi-agent and single-agent environments. By leveraging libraries like PettingZoo, SuperSuit, and Stable-Baselines3, we explore the nuances of policy learning in collaborative and isolated scenarios. The Pistonball environment provides an excellent playground for testing cooperative strategies and debugging individual agent behavior, making it ideal for RL experimentation and research.

#The tutorial covers:

#1)Single-Agent Training and Evaluation: Isolating a single piston and training it to act effectively in the environment.
#2)Multi-Agent Training and Evaluation: Training all pistons collaboratively and recording their performance.

#The Pistonball environment is a multi-agent environment from the PettingZoo library designed for collaborative tasks. Each agent (piston) moves vertically to bounce a ball toward the right. The goal is to keep the ball in the air and help it travel as far as possible while penalizing actions that waste energy. All pistons share a common goal, so performance depends on teamwork. Each agent sees a small window of the environment, and the action space is treated as continuous.

#Part one: This section focuses on isolating and training a single agent, piston_3, from the Pistonball environment. Using a custom SingleAgentWrapper, the multi-agent environment is adapted for single-agent training by isolating the observation, action, and reward spaces of the specified agent. The wrapper also records rendered frames for visualization. PPO is used to train the agent to maximize its individual contribution to the global task of moving the ball to the right. Training focuses solely on the selected agent's local interactions, simplifying the problem and reducing computational complexity. After training, the agent is evaluated in the same wrapped environment, and its performance is recorded as a video. This process provides insight into the agent's learned behavior, helping to debug and optimize its policy within the larger multi-agent system.
#Part two: This section trains all agents in the Pistonball environment collaboratively using PPO. The environment is preprocessed with SuperSuit wrappers to standardize observation and action spaces, reduce complexity, and ensure compatibility with multi-agent RL. PPO trains a shared policy for all agents, optimizing their collective performance in moving the ball to the right while minimizing energy wastage. The algorithm balances exploration and exploitation, leveraging image-based inputs and temporal context provided by stacked frames. After training, the policy is evaluated, and the agents' collaborative behavior is recorded as a video. This allows for a visual assessment of the learned strategies and how well the agents work together to achieve the global goal.

#Incomplete portion: extending the framework, through a custom policy, to MADDPG (Multi-Agent Deep Deterministic Policy Gradient) from RLlib. A problem was encountered with the discrete space; DQN might have been a better alternative. More work is needed, but I did not have enough time to finish it.



In [None]:
import numpy as np
import matplotlib.pyplot as plt
!pip install pettingzoo
!pip install stable_baselines3
!pip install gym
!pip install pymunk
!pip install gymnasium
!pip install supersuit
!pip install pettingzoo stable-baselines3 supersuit
!pip install pettingzoo supersuit stable-baselines3 gym
!pip install 'shimmy>=2.0'
!pip install pettingzoo stable-baselines3 gymnasium shimmy supersuit

import numpy as np
import matplotlib.pyplot as plt
from pettingzoo.butterfly import pistonball_v6
from stable_baselines3 import PPO
from supersuit import pad_observations_v0, pad_action_space_v0
from gymnasium import Env, spaces
from stable_baselines3.common.vec_env import DummyVecEnv
import os
import cv2
from IPython.display import HTML
from base64 import b64encode

# Steps 1-2: build the PettingZoo AEC environment with rgb_array rendering (so
# frames can be captured for the video) and wrap it, innermost first, with the
# SuperSuit observation-padding and action-space-padding wrappers.
env = pad_action_space_v0(
    pad_observations_v0(
        pistonball_v6.env(render_mode="rgb_array")
    )
)

# Step 3: Single-Agent Wrapper for AEC Environment
class SingleAgentWrapper(Env):
    """Expose one agent of a PettingZoo AEC environment through the Gymnasium API.

    The wrapped AEC environment hands control to its agents one at a time.
    This wrapper only returns observations and rewards for ``agent_id`` and only
    applies the caller's action to ``agent_id``; every other agent is stepped
    internally until it is ``agent_id``'s turn again.

    Args:
        env: PettingZoo AEC environment (already wrapped with any preprocessing).
        agent_id: Name of the agent to control, e.g. ``"piston_3"``.
        record_frames: When True (default) a rendered frame is stored after
            ``reset`` and after every ``step`` so ``render_video`` can be used.
            Set to False to avoid the rendering cost and memory use.
        other_agents_policy: Optional callable ``(agent_name, observation) -> action``
            used for the agents that are not controlled. When None, their
            actions are sampled from their own action space.
    """

    def __init__(self, env, agent_id, record_frames=True, other_agents_policy=None):
        self.env = env
        self.agent_id = agent_id
        self.action_space = self.env.action_space(self.agent_id)
        self.observation_space = self.env.observation_space(self.agent_id)
        self.record_frames = record_frames
        self.other_agents_policy = other_agents_policy
        self.frames = []  # Rendered frames of the current episode

    def _action_for_other(self, agent, observation):
        """Choose an action for an agent that is not the controlled one."""
        if self.other_agents_policy is not None:
            return self.other_agents_policy(agent, observation)
        return self.env.action_space(agent).sample()

    def _advance_to_controlled_agent(self):
        """Step the other agents until the controlled agent is selected."""
        while self.env.agents and self.env.agent_selection != self.agent_id:
            other = self.env.agent_selection
            observation, _, terminated, truncated, _ = self.env.last()
            if terminated or truncated:
                # A finished agent must be stepped with None in the AEC API.
                other_action = None
            else:
                other_action = self._action_for_other(other, observation)
            self.env.step(other_action)

    def _capture_frame(self):
        """Store the current rendered frame when recording is enabled."""
        if self.record_frames:
            self.frames.append(self.env.render())

    def reset(self, seed=None, options=None):
        """Reset the underlying environment and return ``(observation, info)``.

        Frames recorded for the previous episode are discarded.
        """
        self.frames = []  # Clear previous frames
        self.env.reset(seed=seed, options=options)
        self._advance_to_controlled_agent()
        obs, _, _, _, _ = self.env.last()
        self._capture_frame()  # Save initial frame
        return obs, {}

    def step(self, action):
        """Apply ``action`` for the controlled agent.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` for the
            controlled agent. ``terminated`` and ``truncated`` are passed through
            separately instead of being merged into a single flag.
        """
        self.env.step(action)
        self._advance_to_controlled_agent()
        obs, reward, terminated, truncated, _ = self.env.last()
        self._capture_frame()  # Save current frame
        return obs, reward, terminated, truncated, {}

    def render_video(self, video_path="output.mp4", fps=30):
        """Save the collected frames as a video file.

        Raises:
            ValueError: If no frames have been recorded.
        """
        if len(self.frames) == 0:
            raise ValueError("No frames recorded; call reset()/step() with record_frames=True first.")
        height, width, _ = self.frames[0].shape
        out = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
        try:
            for frame in self.frames:
                frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR
                out.write(frame_bgr)
        finally:
            # Always release so the file handle is closed even if a write fails.
            out.release()
        print(f"Video saved to {video_path}")

# Step 4: Wrap the Environment for Single Agent
agent_id = "piston_3"  # Focused agent
single_agent_env = SingleAgentWrapper(env, agent_id)

# Ensure compatibility with Stable Baselines3
vec_env = DummyVecEnv([lambda: single_agent_env])

# Step 5: Train PPO Model
model = PPO("MlpPolicy", vec_env, verbose=1)
print("Training PPO model...")
model.learn(total_timesteps=50000) #configurable param for better training

# Step 6: Test the Trained Agent and Record Video
# The rollout runs on the wrapper directly rather than through vec_env: the
# vectorised env resets automatically when an episode ends, and the wrapper's
# reset() clears the recorded frames, so a finished episode would be lost
# before it could be written to disk.
print("Testing PPO model and recording video...")
obs, _ = single_agent_env.reset()
for _ in range(1000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _ = single_agent_env.step(action)
    done = terminated or truncated
    if done:
        # Stop at the end of the episode so the video holds one full episode.
        break

# Save the recorded video
video_path = "pistonball_output.mp4"
single_agent_env.render_video(video_path)

# Step 7: Display Video in Colab
def show_video(video_path):
    """Return an IPython ``HTML`` object embedding the video file as base64.

    The file at ``video_path`` is read fully into memory and inlined in a
    ``<video>`` tag as a ``data:video/mp4`` URI.
    """
    with open(video_path, "rb") as handle:
        encoded = b64encode(handle.read()).decode()
    markup = f'''
        <video width="600" controls>
            <source src="data:video/mp4;base64,{encoded}" type="video/mp4">
        </video>
    '''
    return HTML(markup)

# Last expression of the cell, so the notebook renders the player inline.
show_video(video_path)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
!pip install pettingzoo
!pip install stable_baselines3
!pip install gym
!pip install pymunk
!pip install gymnasium
!pip install supersuit
!pip install pettingzoo stable-baselines3 supersuit
!pip install pettingzoo supersuit stable-baselines3 gym
!pip install 'shimmy>=2.0'
!pip install pettingzoo stable-baselines3 gymnasium shimmy supersuit



from pettingzoo.butterfly import pistonball_v6
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3 import PPO
import supersuit as ss
from stable_baselines3.common.vec_env import VecVideoRecorder
import os

#Step 1: Choose the run mode for this cell
# False: call train(), which trains the PPO policy and saves it to /content/policy
# True: call evaluate(), which loads /content/policy and records a video
# (evaluation therefore needs a policy saved by an earlier training run)
is_evaluation = True

#Step 2: Training the model
def train(total_timesteps=200000, save_path="/content/policy"):
    """Train a PPO policy on Pistonball and save it.

    Args:
        total_timesteps: Number of environment steps passed to ``model.learn``.
        save_path: Destination passed to ``model.save``.
    """
    # Set up the environment for training
    env = pistonball_v6.parallel_env(n_pistons=20, time_penalty=-0.1, continuous=True,
                                    random_drop=True, random_rotate=True, ball_mass=0.75, ball_friction=0.3,
                                    ball_elasticity=1.5, max_cycles=125)

    # Preprocessing: keep one colour channel, resize to 84x84, stack 3 frames,
    # then expose the agents as a vectorised env that Stable-Baselines3 accepts.
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 3)
    env = ss.pettingzoo_env_to_vec_env_v1(env)
    env = ss.concat_vec_envs_v1(env, 8, num_cpus=4, base_class="stable_baselines3")

    try:
        # Define PPO model
        model = PPO(CnnPolicy, env, verbose=3, gamma=0.95, n_steps=512, ent_coef=0.0905168,
                    learning_rate=0.00062211, vf_coef=0.042202, max_grad_norm=0.9, gae_lambda=0.99,
                    n_epochs=5, clip_range=0.3, batch_size=256)

        # Training the model
        print("\nTraining is starting...\n")
        model.learn(total_timesteps=total_timesteps)

        # Save the trained model
        model.save(save_path)
        print(f"\nModel saved to {save_path}\n")
    finally:
        # Release the vectorised environment (created with num_cpus=4) even if
        # training fails part-way through.
        env.close()

#Step 3: Record Video
def record_video(env, model, video_length=500, prefix="eval", video_folder="/content/videos/"):
    """Roll out ``model`` in ``env`` and record the rollout as a video.

    Args:
        env: Vectorised environment to wrap with ``VecVideoRecorder``.
        model: Object with a ``predict(obs, deterministic=True)`` method.
        video_length: Number of steps to run and record.
        prefix: File-name prefix passed to the recorder.
        video_folder: Directory the video is written to; created if missing.
    """
    os.makedirs(video_folder, exist_ok=True)
    # Trigger on step 0 only: a single recording that starts immediately.
    recorder = VecVideoRecorder(env, video_folder, record_video_trigger=lambda step: step == 0,
                                video_length=video_length, name_prefix=prefix)

    try:
        obs = recorder.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs, deterministic=True)
            obs, _, _, _ = recorder.step(action)
    finally:
        # Close even when the rollout fails so the recorder and the wrapped
        # environment are released.
        recorder.close()
    print(f"\nVideo recorded at {video_folder}\n")

#Step 4: Set up the environment for evaluation
def evaluate():
    """Load the policy saved at /content/policy and record an evaluation video.

    Builds a renderable Pistonball environment with the same preprocessing
    chain used in ``train`` (single colour channel, 84x84, 3 stacked frames),
    vectorises it as a single copy, and hands it to ``record_video``.
    """
    print("\nEvaluation is starting...\n")

    pistonball_kwargs = dict(
        n_pistons=20,
        time_penalty=-0.1,
        continuous=True,
        random_drop=True,
        random_rotate=True,
        ball_mass=0.75,
        ball_friction=0.3,
        ball_elasticity=1.5,
        max_cycles=125,
        render_mode="rgb_array",  # needed so the recorder can grab frames
    )
    eval_env = pistonball_v6.parallel_env(**pistonball_kwargs)

    # Apply preprocessing
    eval_env = ss.color_reduction_v0(eval_env, mode="B")
    eval_env = ss.resize_v1(eval_env, x_size=84, y_size=84)
    eval_env = ss.frame_stack_v1(eval_env, 3)
    eval_env = ss.pettingzoo_env_to_vec_env_v1(eval_env)
    eval_env = ss.concat_vec_envs_v1(eval_env, 1, num_cpus=1, base_class="stable_baselines3")

    # Load the trained model and record a video of the evaluation
    trained_model = PPO.load("/content/policy")
    record_video(eval_env, trained_model, video_length=1000, prefix="pistonball_eval")


if __name__ == "__main__":
    # Dispatch on the mode flag: evaluate the saved policy, or train a new one.
    (evaluate if is_evaluation else train)()


In [None]:
#UNFINISHED

!pip install pettingzoo supersuit ray[rllib]
!pip install ray[rllib]==2.0.0
!pip install open-cv-python-headless

#!pip uninstall pydantic
!pip install pydantic==1.10.12
from torch import nn

from stable_baselines3 import PPO
from pettingzoo.butterfly import pistonball_v6
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
from supersuit import color_reduction_v0, resize_v1, frame_stack_v1, normalize_obs_v0
from ray.rllib.algorithms.maddpg import MADDPGConfig
from ray.tune.registry import register_env
import ray
from ray.rllib.algorithms.dqn import DQN
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from supersuit import pad_observations_v0, pad_action_space_v0


import supersuit as ss


class CNNModelV2(TorchModelV2, nn.Module):
    """Convolutional policy/value network registered with RLlib as a custom model.

    Three convolution layers feed a 512-unit dense layer; two linear heads on
    top produce the policy outputs (``num_outputs`` values) and a scalar value
    estimate. The first dense layer expects 3136 flattened features, which is
    what the convolution stack yields (64 x 7 x 7) for 84x84 inputs with 3
    channels.
    """

    def __init__(self, obs_space, act_space, num_outputs, *args, **kwargs):
        TorchModelV2.__init__(self, obs_space, act_space, num_outputs, *args, **kwargs)
        nn.Module.__init__(self)

        feature_layers = [
            nn.Conv2d(3, 32, [8, 8], stride=(4, 4)),
            nn.ReLU(),
            nn.Conv2d(32, 64, [4, 4], stride=(2, 2)),
            nn.ReLU(),
            nn.Conv2d(64, 64, [3, 3], stride=(1, 1)),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
        ]
        self.model = nn.Sequential(*feature_layers)
        self.policy_fn = nn.Linear(512, num_outputs)
        self.value_fn = nn.Linear(512, 1)

    def forward(self, input_dict, state, seq_lens):
        """Return ``(policy_outputs, state)`` and cache the value estimate."""
        # Observations arrive channels-last; Conv2d wants channels-first.
        channels_first = input_dict["obs"].permute(0, 3, 1, 2)
        features = self.model(channels_first)
        # Stored so value_function() can return it after this forward pass.
        self._value_out = self.value_fn(features)
        return self.policy_fn(features), state

    def value_function(self):
        """Return the value estimate of the latest ``forward`` call, flattened."""
        return self._value_out.flatten()





def create_environment():
    """Build the preprocessed Pistonball AEC environment.

    The raw environment is created with rgb_array rendering and then passed
    through the wrappers below in order.
    """
    pistonball_kwargs = dict(
        n_pistons=20,
        time_penalty=-0.1,
        continuous=True,
        random_drop=True,
        random_rotate=True,
        ball_mass=0.75,
        ball_friction=0.3,
        ball_elasticity=1.5,
        max_cycles=125,
        render_mode="rgb_array",
    )
    env = pistonball_v6.env(**pistonball_kwargs)

    # Applied first to last; observation padding is intentionally left out.
    wrapper_chain = (
        lambda e: ss.color_reduction_v0(e, mode="B"),
        lambda e: ss.dtype_v0(e, "float32"),
        lambda e: ss.resize_v1(e, x_size=84, y_size=84),
        lambda e: ss.normalize_obs_v0(e, env_min=0, env_max=1),
        lambda e: ss.frame_stack_v1(e, 3),
        lambda e: pad_action_space_v0(e),
    )
    for apply_wrapper in wrapper_chain:
        env = apply_wrapper(env)
    return env


# Create the environment and apply Supersuit wrappers
#def create_environment():
    # Load Pistonball environment
    #env = pistonball_v6.parallel_env()
    # Apply Supersuit wrappers to modify the environment (preprocessing)
    #env = color_reduction_v0(env, mode="B")  # Reduce observation complexity (e.g., grayscale)
    #env = resize_v1(env, x_size=84, y_size=84)  # Resize observation space
    #env = frame_stack_v1(env, stack_size=4)  # Stack frames for temporal context
    #return env

# Wrapper to register the environment with RLlib
def env_creator(config):
    """Environment factory taking a config argument; ``config`` is ignored."""
    environment = create_environment()
    return environment

# Training configuration and loop
def train_marl_agents():
    """Configure and train one MADDPG policy per piston with RLlib.

    Registers the environment and the custom CNN model, builds one policy per
    agent (``policy_<index>``, index taken from ``possible_agents``), trains for
    1000 iterations with a checkpoint every 100, saves a final checkpoint, and
    shuts Ray down.

    Returns:
        The trained RLlib algorithm object.

    NOTE(review): this cell is marked UNFINISHED by its author. Confirm that
    RLlib's MADDPG supports ``framework("torch")``, a custom Torch model and
    image observations before relying on it.
    """
    # Initialize Ray
    ray.init(ignore_reinit_error=True)

    # Register the environment with RLlib. create_environment() builds an AEC
    # environment (pistonball_v6.env), so the AEC wrapper is the matching one.
    env_name = "Pistonball-v6"
    register_env(env_name, lambda config: PettingZooEnv(env_creator(config)))
    ModelCatalog.register_custom_model("custom_cnn", CNNModelV2)

    # Temporary environment, used only to read agent names and their spaces.
    temp_env = create_environment()
    try:
        temp_env.reset()
        string_to_int_mapping = {
            agent_id: idx for idx, agent_id in enumerate(temp_env.possible_agents)
        }

        # Define policies with string-based IDs
        policies = {
            f"policy_{idx}": (
                None,
                temp_env.observation_space(agent_id),
                temp_env.action_space(agent_id),
                {
                    "agent_id": idx,  # Integer-based agent ID for MADDPG
                    "model": {
                        "custom_model": "custom_cnn",  # Custom CNN model
                    },
                },
            )
            for agent_id, idx in string_to_int_mapping.items()
        }
    finally:
        temp_env.close()

    def map_agent_to_policy(agent_id, *args, **kwargs):
        """Map each agent to its own policy."""
        return f"policy_{string_to_int_mapping[agent_id]}"

    config = (
        MADDPGConfig()
        .environment(env=env_name)  # must match the name passed to register_env
        .framework("torch")
        .rollouts(num_rollout_workers=2)
        .training(
            model={
                "custom_model": "custom_cnn",  # Reference the registered custom CNN
            },
        )
        .multi_agent(
            policies=policies,
            policy_mapping_fn=map_agent_to_policy,
        )
    )

    algo = config.build()

    # Training loop
    for i in range(1000):  # Adjust iterations as needed
        result = algo.train()
        print(f"Iteration {i}: reward = {result['episode_reward_mean']}")

        # Save the model periodically
        if i % 100 == 0:
            algo.save(f"checkpoint_{i}")

    # Save the final model
    algo.save("final_marl_model")
    print("Training complete. Model saved.")

    # NOTE(review): Ray is shut down before the algorithm is returned; verify
    # the returned object is still usable for inference by the caller.
    ray.shutdown()
    return algo

# Evaluation loop with video rendering
def evaluate_marl_agents(algo, num_episodes=5, video_dir="videos"):
    """Roll out the trained policies and save one video per episode.

    Args:
        algo: Trained RLlib algorithm whose policies are named
            ``policy_<index>``, with the index taken from ``possible_agents``.
        num_episodes: Number of episodes to run.
        video_dir: Directory for the ``episode_<n>.mp4`` files; created if missing.
    """
    import os
    import cv2

    # Create the evaluation environment
    env = create_environment()

    # Ensure video directory exists
    os.makedirs(video_dir, exist_ok=True)

    try:
        for episode in range(num_episodes):
            env.reset()
            frames = []

            for agent in env.agent_iter():
                # last() yields five values, as in the rest of this notebook.
                observation, reward, terminated, truncated, info = env.last()
                if terminated or truncated:
                    # Finished agents are stepped with None.
                    action = None
                else:
                    # Each agent has its own policy, so the policy id must be given.
                    policy_id = f"policy_{env.possible_agents.index(agent)}"
                    action = algo.compute_single_action(observation, policy_id=policy_id)
                env.step(action)
                # render_mode is fixed at construction; render() takes no mode.
                # NOTE(review): one frame is stored per agent step, which may be
                # large for long episodes.
                frames.append(env.render())

            # Save the episode as a video
            video_path = os.path.join(video_dir, f"episode_{episode + 1}.mp4")
            save_video(frames, video_path)
            print(f"Saved episode {episode + 1} video at {video_path}")
    finally:
        # Release the environment even if a rollout or video write fails.
        env.close()

# Function to save video from frames
def save_video(frames, video_path, fps=30):
    """Write a sequence of frames to an mp4 file.

    Args:
        frames: Non-empty sequence of H x W x 3 image arrays. They are treated as
            RGB and converted to BGR before writing, matching
            ``SingleAgentWrapper.render_video``.
        video_path: Output file path.
        fps: Frames per second of the written video.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    # Checked before anything else so an empty rollout gives a clear error.
    if len(frames) == 0:
        raise ValueError("save_video() needs at least one frame")

    # Imported here because this cell does not import cv2 at module level.
    import cv2

    height, width, _ = frames[0].shape
    video = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
    try:
        for frame in frames:
            video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # Convert RGB to BGR
    finally:
        # Always release so the file is closed even if a write fails.
        video.release()
    print(f"Video saved to {video_path}")

# Main script
if __name__ == "__main__":
    # Train the agents; the returned algorithm object is kept in `algo`
    algo = train_marl_agents()

    # Evaluate the trained model and render videos (default: 5 episodes, saved under "videos")
    evaluate_marl_agents(algo)