In [None]:
!pip install gymnasium numpy matplotlib stable-baselines3[extra] tensorflow --quiet

In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import os
import time

In [None]:
class WarehouseEnvGeneralized(gym.Env):
    """Gymnasium environment for optimizing the item order of a warehouse.

    The warehouse is a row of ``num_items`` positions holding ``num_items``
    distinct items.  ``layout[p]`` is the ID of the item stored at position
    ``p`` and ``demand_frequencies[i]`` is the demand of item ``i``.  The cost
    of a layout is ``sum((p + 1) * demand_frequencies[layout[p]])``, so
    high-demand items are cheapest when stored at low positions.

    * Observation: ``{'layout': int32[num_items], 'demands': int32[num_items]}``.
    * Action: an integer indexing the list of position pairs ``(i, j)`` with
      ``i < j``; the items at those two positions are swapped.
    * Reward: the negative cost of the layout after the swap.

    New demand frequencies are drawn on every ``reset``.  The environment has
    no terminal state; an episode only ends (via ``truncated``) when
    ``max_episode_steps`` is given.
    """

    # Rendering metadata; only the text-based 'human' mode is implemented.
    metadata = {'render_modes': ['human'], 'render_fps': 4}

    def __init__(self, num_items=10, render_mode=None, max_episode_steps=None):
        """Set up the action/observation spaces.

        Args:
            num_items: Number of items (and positions); must be at least 2 so
                that at least one swap exists.
            render_mode: ``"human"`` prints the state after every reset/step;
                ``None`` disables rendering.
            max_episode_steps: If given, ``step`` reports ``truncated=True``
                once this many steps have been taken since the last reset.
                ``None`` (the default) means the episode never ends.

        Raises:
            ValueError: If ``num_items < 2`` or ``max_episode_steps < 1``.
        """
        super().__init__()
        if num_items < 2:
            raise ValueError(f"num_items must be at least 2 to allow a swap, got {num_items}")
        if max_episode_steps is not None and max_episode_steps < 1:
            raise ValueError(f"max_episode_steps must be positive or None, got {max_episode_steps}")

        # --- Environment Parameters ---
        self.num_items = num_items
        self.render_mode = render_mode
        self.max_episode_steps = max_episode_steps
        self._elapsed_steps = 0  # Steps taken since the last reset.

        # --- Action Space Definition ---
        # Action k maps to the k-th unordered pair of positions (i, j), i < j;
        # there are n * (n - 1) / 2 of them.
        self._action_to_pair = [
            (i, j)
            for i in range(self.num_items)
            for j in range(i + 1, self.num_items)
        ]
        self._num_actions = len(self._action_to_pair)
        self.action_space = spaces.Discrete(self._num_actions)

        # --- Observation Space Definition ---
        self.observation_space = spaces.Dict({
            # Item ID stored at each position.
            'layout': spaces.Box(low=0, high=self.num_items - 1, shape=(self.num_items,), dtype=np.int32),
            # Demand frequency of each item ID, in [1, 100].
            'demands': spaces.Box(low=1, high=100, shape=(self.num_items,), dtype=np.int32)
        })

        # Access cost of each position (1, 2, ..., num_items).  Kept as int64 so
        # the cost sum cannot wrap around the int32 range for large layouts.
        self._position_weights = np.arange(1, self.num_items + 1, dtype=np.int64)

        # --- Internal State Variables (populated by `reset`) ---
        self.layout = None
        self.demand_frequencies = None

        # --- Rendering Variables (only relevant for a graphical renderer) ---
        self.window = None
        self.clock = None

    def _compute_cost(self):
        """Return the cost of the current layout as a Python ``int``."""
        demands_by_position = self.demand_frequencies[self.layout].astype(np.int64)
        return int(np.dot(self._position_weights, demands_by_position))

    def _get_obs(self):
        """Return the observation dict, holding copies of the internal arrays."""
        return {'layout': self.layout.copy(), 'demands': self.demand_frequencies.copy()}

    def _get_info(self):
        """Return diagnostic info: ``{'cost': <cost of the current layout>}``."""
        return {'cost': self._compute_cost()}

    def reset(self, seed=None, options=None):
        """Start a new episode with the identity layout and fresh demands.

        Args:
            seed: Optional seed for the environment's random generator.
            options: Unused; accepted for Gymnasium API compatibility.

        Returns:
            ``(observation, info)`` for the initial state.
        """
        # Seeds `self.np_random` when a seed is supplied.
        super().reset(seed=seed)
        self._elapsed_steps = 0

        # Default order [0, 1, ..., num_items - 1].
        self.layout = np.arange(self.num_items, dtype=np.int32)
        # New demands in [1, 100] for every episode (upper bound is exclusive).
        self.demand_frequencies = self.np_random.integers(1, 101, size=self.num_items, dtype=np.int32)

        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, info

    def step(self, action):
        """Swap the two positions selected by ``action`` and score the layout.

        Args:
            action: Index into the list of position pairs.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``reward`` is the negative layout cost, ``terminated`` is always
            ``False`` and ``truncated`` becomes ``True`` only once
            ``max_episode_steps`` (if set) has been reached.
        """
        idx1, idx2 = self._action_to_pair[action]
        self.layout[idx1], self.layout[idx2] = self.layout[idx2], self.layout[idx1]
        self._elapsed_steps += 1

        # The cost is computed once and shared by the reward and the info dict.
        info = self._get_info()
        # Agents maximize reward, so minimizing cost means rewarding -cost.
        reward = -float(info['cost'])

        terminated = False
        truncated = (
            self.max_episode_steps is not None
            and self._elapsed_steps >= self.max_episode_steps
        )

        observation = self._get_obs()

        if self.render_mode == "human":
            self._render_frame()

        return observation, reward, terminated, truncated, info

    def render(self):
        """Print the current state when ``render_mode`` is ``"human"``."""
        if self.render_mode == "human":
            self._render_frame()

    def _render_frame(self):
        """Print layout, demands and cost to stdout (``"human"`` mode only)."""
        if self.render_mode == "human":
            print(f"Layout: {self.layout}, Demands: {self.demand_frequencies}, Cost: {self._get_info()['cost']}")

    def close(self):
        """Release rendering resources, if a window was ever created."""
        if self.window is not None:
            import pygame
            pygame.display.quit()
            pygame.quit()
            # Drop the references so a second close() is a no-op.
            self.window = None
            self.clock = None


In [None]:
# Run configuration shared by the training and evaluation code below.
# These are defined unconditionally because the later top-level code reads
# them without a `__main__` guard; guarding only the definitions would raise
# NameError whenever this file runs under any other module name.
NUM_ITEMS = 10                      # Items/positions in the warehouse.
TOTAL_TIMESTEPS = 1_000_000         # Environment steps used for PPO training.
MODEL_FILENAME = "ppo_warehouse_generalized_v2.zip"   # Saved PPO model.
STATS_FILENAME = "vecnormalize_generalized_v2.pkl"    # Saved VecNormalize statistics.
LOG_DIR = "./ppo_warehouse_gen_logs_v2/"              # TensorBoard log directory.
os.makedirs(LOG_DIR, exist_ok=True)

In [None]:
# Length of one training episode.  The environment itself never terminates or
# truncates, so without a limit DummyVecEnv would never auto-reset it and the
# whole training run would see a single demand vector.  200 matches the step
# budget the evaluation code gives the agent per scenario.
TRAIN_EPISODE_STEPS = 200


def make_env():
    """Build one training environment for DummyVecEnv.

    The environment is wrapped in ``TimeLimit`` so that each episode is
    truncated after ``TRAIN_EPISODE_STEPS`` steps; the resulting reset draws
    new demand frequencies, which is what exposes the agent to many demand
    scenarios during training.
    """
    env = WarehouseEnvGeneralized(num_items=NUM_ITEMS)
    return gym.wrappers.TimeLimit(env, max_episode_steps=TRAIN_EPISODE_STEPS)


# DummyVecEnv takes a list of environment factories, not environment instances.
vec_env = DummyVecEnv([make_env])

print("Wrapping environment with VecNormalize for reward scaling.")
# Only rewards are normalized; observations are passed through unchanged.
# gamma must match the discount factor given to PPO.
vec_env = VecNormalize(vec_env, norm_obs=False, norm_reward=True, gamma=0.99)

In [None]:
 # --- Agent Training ---
print(f"\n--- Training PPO with MultiInputPolicy for {TOTAL_TIMESTEPS} timesteps ---")
start_time = time.time()  # Wall-clock start, taken before the model is built.

# Tunable PPO hyperparameters, grouped so they can be adjusted in one place.
ppo_hyperparams = dict(
    learning_rate=1e-4,  # Step size of the policy/value updates.
    n_steps=2048,        # Environment steps collected per update cycle.
    batch_size=64,       # Minibatch size within each update epoch.
    n_epochs=10,         # Passes over the collected rollout per update.
    gamma=0.99,          # Discount factor; must match the VecNormalize gamma.
    gae_lambda=0.95,     # Generalized Advantage Estimation factor.
    clip_range=0.2,      # PPO clipping range for the policy ratio.
    ent_coef=0.01,       # Entropy bonus coefficient (encourages exploration).
)

# "MultiInputPolicy" is required because the observation space is a Dict.
model = PPO(
    "MultiInputPolicy",
    vec_env,                  # Vectorized, reward-normalized environment.
    verbose=1,                # Print training progress.
    tensorboard_log=LOG_DIR,  # Where TensorBoard logs are written.
    **ppo_hyperparams,
)

# Run the training loop for TOTAL_TIMESTEPS environment steps.
model.learn(total_timesteps=TOTAL_TIMESTEPS, progress_bar=True)
end_time = time.time()
print(f"Training complete. Time taken: {end_time - start_time:.2f} seconds")

In [None]:
# Persist the trained PPO model to MODEL_FILENAME.
print("Saving the trained model...")
model.save(MODEL_FILENAME)
print(f"Model saved to {MODEL_FILENAME}")

print("Saving VecNormalize statistics...")
# `vec_env` is the VecNormalize wrapper here, so this saves the wrapper (and
# its running statistics), not the base environment.  The evaluation code
# loads this file together with the model.
vec_env.save(STATS_FILENAME)
print(f"Normalization statistics saved to {STATS_FILENAME}")

# Close the training environment only after the statistics have been saved.
vec_env.close()
print("Training environment closed.")

In [None]:
print(f"\n--- Evaluating the trained model ---")

# --- Parameters ---
N_EVAL_EPISODES = 50  # How many different demand scenarios to test on
EVAL_STEPS_PER_EPISODE = 200 # How many optimization steps the agent gets per scenario
SEED = 42 # Fixed seed intended to make the evaluation scenarios reproducible

# --- Load Model and Environment ---
if not os.path.exists(MODEL_FILENAME) or not os.path.exists(STATS_FILENAME):
    print("Error: Model or normalization statistics file not found.")
    print("Please train the model first (run the script without modification).")
    # `exit()` is a helper meant for the interactive shell and reports
    # success (status 0); raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

print(f"Loading model from {MODEL_FILENAME}")
model = PPO.load(MODEL_FILENAME)

print(f"Loading normalization statistics from {STATS_FILENAME}")
# A fresh, unwrapped environment instance with the same num_items as training.
eval_env_raw = WarehouseEnvGeneralized(num_items=NUM_ITEMS)
# DummyVecEnv expects factories; the lambda hands back the instance above.
eval_vec_env = DummyVecEnv([lambda: eval_env_raw])
# Re-create the VecNormalize wrapper from the saved statistics.
eval_vec_env = VecNormalize.load(STATS_FILENAME, eval_vec_env)

# Evaluation mode: do not update the running statistics ...
eval_vec_env.training = False
# ... and do not normalize rewards, so the true cost scale is visible.
eval_vec_env.norm_reward = False

print("Evaluation environment ready.")

# --- Evaluation Loop ---
initial_costs = []
final_costs = []
optimal_costs = []

# Seed the vectorized environment once.  The seed is handed to the underlying
# environment at its next reset; later resets keep drawing from the same
# generator, so every episode gets different demands while the whole sequence
# of scenarios is reproducible from run to run.
eval_vec_env.seed(SEED)

for i in range(N_EVAL_EPISODES):
    print(f"\n--- Evaluation Episode {i+1}/{N_EVAL_EPISODES} ---")
    # Each reset draws a new random demand vector.
    obs = eval_vec_env.reset()

    # Reach through the wrappers to read the raw environment state.
    current_env = eval_vec_env.envs[0]
    initial_demands = current_env.demand_frequencies.copy()
    initial_layout = current_env.layout.copy()

    # Cost of the starting layout, before the agent acts (true, unnormalized cost).
    initial_cost = sum((pos + 1) * initial_demands[item_id] for pos, item_id in enumerate(initial_layout))
    initial_costs.append(initial_cost)
    print(f"Episode {i+1}: Initial Demands: {initial_demands}")
    print(f"Episode {i+1}: Initial Layout: {initial_layout}")
    print(f"Episode {i+1}: Initial Cost: {initial_cost}")

    # Theoretical optimum: place items in order of decreasing demand.
    sorted_item_ids_by_demand = np.argsort(initial_demands)[::-1]
    optimal_cost = sum((pos + 1) * initial_demands[item_id] for pos, item_id in enumerate(sorted_item_ids_by_demand))
    optimal_costs.append(optimal_cost)
    print(f"Episode {i+1}: Optimal Possible Cost: {optimal_cost}")

    # Let the agent rearrange the layout for a fixed number of steps.
    current_cost = initial_cost
    # To watch the optimization, set `current_env.render_mode = "human"` here
    # (slow) and reset it to None after the loop.

    for step in range(EVAL_STEPS_PER_EPISODE):
        # deterministic=True: take the policy's preferred action, no sampling.
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_vec_env.step(action)

        # `info` is a list with one dict per environment; read the true cost
        # from it rather than relying on the reward signal.
        current_cost = info[0]['cost']

        # The evaluation environment is not expected to end an episode.
        if done[0]:
            print("Warning: Episode ended unexpectedly during evaluation step.")
            break # Should not happen with current env logic

    # Record the final cost achieved by the agent.
    final_costs.append(current_cost)
    print(f"Episode {i+1}: Final Layout: {current_env.layout}")
    print(f"Episode {i+1}: Final Cost after {EVAL_STEPS_PER_EPISODE} steps: {current_cost}")

# Close the evaluation environment
eval_vec_env.close()

# --- Report Results ---
print("\n\n--- Evaluation Summary ---")
# Convert the per-episode lists to arrays (also used by the plots below).
initial_costs, final_costs, optimal_costs = map(
    np.array, (initial_costs, final_costs, optimal_costs)
)

avg_initial_cost = initial_costs.mean()
avg_final_cost = final_costs.mean()
avg_optimal_cost = optimal_costs.mean()

# Absolute and relative cost reduction achieved by the agent.
avg_improvement = avg_initial_cost - avg_final_cost
if avg_initial_cost > 0:
    avg_improvement_percent = (avg_improvement / avg_initial_cost) * 100
else:
    avg_improvement_percent = 0

# How much of the distance between the starting and optimal cost was covered.
avg_gap_to_optimal = avg_final_cost - avg_optimal_cost
avg_initial_gap_to_optimal = avg_initial_cost - avg_optimal_cost
if avg_initial_gap_to_optimal > 0:
    avg_percent_of_optimal_gap_closed = ((avg_initial_gap_to_optimal - avg_gap_to_optimal) / avg_initial_gap_to_optimal) * 100
else:
    avg_percent_of_optimal_gap_closed = float('inf')

summary_lines = [
    f"Evaluated on {N_EVAL_EPISODES} episodes.",
    f"Average Initial Cost (random layout): {avg_initial_cost:.2f}",
    f"Average Final Cost (after agent optimization): {avg_final_cost:.2f}",
    f"Average Optimal Possible Cost: {avg_optimal_cost:.2f}",
    "-" * 30,
    f"Average Cost Reduction per Episode: {avg_improvement:.2f}",
    f"Average Cost Reduction Percentage: {avg_improvement_percent:.2f}%",
    f"Average Final Gap to Optimal Cost: {avg_gap_to_optimal:.2f}",
    f"Average Percentage of Optimal Gap Closed: {avg_percent_of_optimal_gap_closed:.2f}%",
]
for line in summary_lines:
    print(line)

# --- Optional Plotting ---
plt.figure(figsize=(12, 6))

# Left panel: overlaid histograms of the three cost distributions.
plt.subplot(1, 2, 1)
for costs, hist_label in (
    (initial_costs, 'Initial Costs'),
    (final_costs, 'Final Costs (Agent)'),
    (optimal_costs, 'Optimal Costs'),
):
    plt.hist(costs, bins=15, alpha=0.7, label=hist_label)
plt.xlabel("Cost")
plt.ylabel("Frequency")
plt.title("Distribution of Costs across Episodes")
plt.legend()

# Right panel: the three costs for each evaluation episode.
plt.subplot(1, 2, 2)
episodes = range(1, N_EVAL_EPISODES + 1)
for costs, line_style, line_label in (
    (initial_costs, 'o-', 'Initial Cost'),
    (final_costs, 's-', 'Final Cost (Agent)'),
    (optimal_costs, '^-', 'Optimal Cost'),
):
    plt.plot(episodes, costs, line_style, label=line_label, alpha=0.6)
plt.xlabel("Evaluation Episode")
plt.ylabel("Cost")
plt.title("Cost per Episode")
plt.legend()
plt.grid(True)

plt.tight_layout()
# Write the figure to disk before showing it.
plt.savefig("evaluation_results_v2.png")
print("\nSaved evaluation plot to evaluation_results_v2.png")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from scipy import stats # Needed for Kendall's Tau

# --- Assuming these are defined from the previous part ---
# from stable_baselines3 import PPO
# from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
# from your_environment_file import WarehouseEnvGeneralized # Import your env class
# NUM_ITEMS = 10
# MODEL_FILENAME = "ppo_warehouse_generalized_v2.zip"
# STATS_FILENAME = "vecnormalize_generalized_v2.pkl"
# ---------------------------------------------------------

# --- Evaluation Focused on Layout Structure ---
print(f"\n--- Evaluating Model based on High-Frequency Item Placement ---")

# --- Parameters ---
N_EVAL_EPISODES = 50  # How many different demand scenarios to test on
EVAL_STEPS_PER_EPISODE = 200 # How many optimization steps the agent gets per scenario
TOP_K_VALUES = [1, 3, 5] # Evaluate placement accuracy for top 1, 3, and 5 items

# --- Load Model and Environment ---
if not os.path.exists(MODEL_FILENAME) or not os.path.exists(STATS_FILENAME):
    print("Error: Model or normalization statistics file not found.")
    print("Please train the model first.")
    # `exit()` is a helper meant for the interactive shell and reports
    # success (status 0); raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

print(f"Loading model from {MODEL_FILENAME}")
model = PPO.load(MODEL_FILENAME)

print(f"Loading normalization statistics from {STATS_FILENAME}")
# Fresh environment, vectorized and wrapped with the saved VecNormalize state.
eval_env_raw = WarehouseEnvGeneralized(num_items=NUM_ITEMS)
eval_vec_env = DummyVecEnv([lambda: eval_env_raw])
eval_vec_env = VecNormalize.load(STATS_FILENAME, eval_vec_env)
# Evaluation mode: freeze the running statistics and report raw rewards.
eval_vec_env.training = False
eval_vec_env.norm_reward = False
print("Evaluation environment ready.")

# --- Evaluation Loop ---
kendall_taus = []
top_k_matches = {k: [] for k in TOP_K_VALUES} # {1: [matches_ep1, matches_ep2,...], 3: [...], ...}
final_costs_agent = []
optimal_costs_ideal = []


for i in range(N_EVAL_EPISODES):
    print(f"\n--- Structure Evaluation Episode {i+1}/{N_EVAL_EPISODES} ---")
    obs = eval_vec_env.reset()
    current_env = eval_vec_env.envs[0] # Get the underlying environment instance
    episode_demands = current_env.demand_frequencies.copy()
    print(f"Episode {i+1}: Demands: {episode_demands}")

    # 1. Calculate the IDEAL/OPTIMAL layout based on demands
    # argsort gives the item IDs in ascending demand order; [::-1] reverses it
    # so the highest-demand item comes first.
    optimal_item_order = np.argsort(episode_demands)[::-1]
    # Layout array representing the optimal placement (item ID per position).
    optimal_layout = optimal_item_order.astype(np.int32)
    optimal_cost = sum((pos + 1) * episode_demands[item_id] for pos, item_id in enumerate(optimal_layout))
    optimal_costs_ideal.append(optimal_cost)
    print(f"Episode {i+1}: Optimal Layout (by demand): {optimal_layout}")
    print(f"Episode {i+1}: Optimal Cost: {optimal_cost:.2f}")


    # 2. Let the AGENT run and find its final layout
    agent_final_cost = -1 # Placeholder, overwritten on the first step
    for step in range(EVAL_STEPS_PER_EPISODE):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_vec_env.step(action)
        agent_final_cost = info[0]['cost'] # Get true cost from info
        if done[0]: break # Should not happen here

    agent_final_layout = current_env.layout.copy()
    final_costs_agent.append(agent_final_cost)
    print(f"Episode {i+1}: Agent Final Layout: {agent_final_layout}")
    print(f"Episode {i+1}: Agent Final Cost: {agent_final_cost:.2f}")

    # 3. Compare Agent's layout to the Optimal layout

    # Metric 1: Kendall's Tau Rank Correlation
    # Ranges from -1 (reversed order) to +1 (identical order).
    # The layouts list item IDs by position, and item IDs are arbitrary labels,
    # so correlating the raw arrays would not measure ordering similarity.
    # Instead compare, item by item, the position each layout assigns to it.
    # Both layouts are permutations of 0..n-1, so argsort yields the inverse
    # permutation: entry `item_id` is the position of that item.
    agent_positions = np.argsort(agent_final_layout)
    optimal_positions = np.argsort(optimal_layout)
    tau, p_value = stats.kendalltau(agent_positions, optimal_positions)
    kendall_taus.append(tau)
    print(f"Episode {i+1}: Kendall's Tau: {tau:.4f}")

    # Metric 2: Top-K Placement Accuracy
    # Checks whether the K highest-demand items occupy the first K slots
    # (in any order) of the agent's layout.
    for k in TOP_K_VALUES:
        # Item IDs that *should* be in the first K positions
        ideal_top_k_items = set(optimal_layout[:k])
        # Item IDs the agent *actually* placed in the first K positions
        agent_top_k_items = set(agent_final_layout[:k])

        # Count how many of the ideal items ended up within the top K
        matches = len(ideal_top_k_items.intersection(agent_top_k_items))
        top_k_matches[k].append(matches)
        print(f"Episode {i+1}: Top-{k} Matches: {matches}/{k}")

# The evaluation environment is no longer needed once all episodes have run.
eval_vec_env.close()

# --- Report Structure Evaluation Results ---
print("\n\n--- Structure Evaluation Summary ---")
avg_kendall_tau = np.mean(kendall_taus)
avg_final_cost_agent = np.mean(final_costs_agent)
avg_optimal_cost_ideal = np.mean(optimal_costs_ideal)
# Positive values mean the agent's layouts cost more than the optimum.
avg_cost_difference = avg_final_cost_agent - avg_optimal_cost_ideal

separator = "-" * 30
header_lines = [
    f"Evaluated on {N_EVAL_EPISODES} episodes with {EVAL_STEPS_PER_EPISODE} agent steps each.",
    f"Average Agent Final Cost: {avg_final_cost_agent:.2f}",
    f"Average Optimal (Demand-Sorted) Cost: {avg_optimal_cost_ideal:.2f}",
    f"Average Cost Difference (Agent - Optimal): {avg_cost_difference:.2f}",
    separator,
    f"Average Kendall's Tau Rank Correlation: {avg_kendall_tau:.4f}",
    " (Closer to 1.0 means the agent's layout order is more similar to the optimal demand-based order)",
    separator,
    "Average Top-K Item Placement Accuracy:",
]
for line in header_lines:
    print(line)

# One line per K: mean number of matching items and the same as a percentage.
for k in TOP_K_VALUES:
    avg_matches = np.mean(top_k_matches[k])
    percent_accuracy = (avg_matches / k) * 100
    print(f"  - Top-{k}: Agent placed an average of {avg_matches:.2f} / {k} correct items ({percent_accuracy:.1f}%)")

# --- Optional Plotting for Structure Evaluation ---
plt.figure(figsize=(18, 6))

# Panel 1: distribution of Kendall's Tau with its mean marked.
plt.subplot(1, 3, 1)
plt.hist(kendall_taus, bins=10, alpha=0.7, color='skyblue')
plt.axvline(
    avg_kendall_tau,
    color='red',
    linestyle='dashed',
    linewidth=1,
    label=f'Avg: {avg_kendall_tau:.3f}',
)
plt.xlabel("Kendall's Tau")
plt.ylabel("Frequency")
plt.title("Distribution of Layout Rank Correlation (Agent vs Optimal)")
plt.xlim([-1.1, 1.1])  # Slightly wider than Tau's [-1, 1] range.
plt.legend()
plt.grid(axis='y', linestyle='--')

# Panel 2: average number of matching items for each K.
plt.subplot(1, 3, 2)
k_labels = [f'Top-{k}' for k in TOP_K_VALUES]
avg_matches_values = [np.mean(top_k_matches[k]) for k in TOP_K_VALUES]
bars = plt.bar(k_labels, avg_matches_values, color='lightgreen')
plt.ylabel("Average Number of Correct Items")
plt.title("Average Top-K Item Placement Accuracy")
# Annotate every bar with its accuracy as a percentage of K.
for bar, k, avg_val in zip(bars, TOP_K_VALUES, avg_matches_values):
    percent = (avg_val / k) * 100
    label_x = bar.get_x() + bar.get_width() / 2
    plt.text(label_x, bar.get_height(), f'{percent:.1f}%', ha='center', va='bottom')
plt.ylim(0, max(TOP_K_VALUES) * 1.1)  # Leave headroom above the largest K.
plt.grid(axis='y', linestyle='--')

# Panel 3: agent cost against optimal cost, one point per episode.
plt.subplot(1, 3, 3)
plt.scatter(optimal_costs_ideal, final_costs_agent, alpha=0.6, label='Episode Result')
# Reference diagonal y = x: points on it mean the agent reached the optimum.
all_costs = optimal_costs_ideal + final_costs_agent
max_cost = max(all_costs)
min_cost = min(all_costs)
diagonal = [min_cost, max_cost]
plt.plot(diagonal, diagonal, 'r--', label='Optimal Cost Line (y=x)')
plt.xlabel("Optimal (Demand-Sorted) Cost")
plt.ylabel("Agent's Final Cost")
plt.title("Agent Cost vs. Theoretical Optimal Cost")
plt.legend()
plt.grid(True)

plt.tight_layout()
# Write the figure to disk before showing it.
plt.savefig("evaluation_structure_results_v2.png")
print("\nSaved structure evaluation plot to evaluation_structure_results_v2.png")
plt.show()