<a href="https://colab.research.google.com/github/shravanimamidala/LunarLanding_DLProject/blob/main/LunarLander_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Cell 0

!pip install Box2D
!pip install stable_baselines3
!pip install 'shimmy>=2.0'
!pip install pygame
!pip install moviePy
!pip install --upgrade stable-baselines3 gymnasium

Collecting Box2D
  Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (573 bytes)
Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Box2D
Successfully installed Box2D-2.3.10
Collecting stable_baselines3
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable_baselines3)
  Downloading nvidia_cuda_c

In [None]:
# Cell 1: Imports
import os
import random
import time
import datetime
import math
import uuid
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import moviepy
try:
    import gymnasium as gym
    print(f"Gymnasium version: {gym.__version__}")
except ImportError:
    print("⚠ Error: 'gymnasium' package is not installed. Install it using: !pip install gymnasium")
    raise ImportError("Gymnasium is required for this notebook.")
from gymnasium.wrappers import RecordVideo
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
# Verify Stable Baselines3
print(f"Stable Baselines3 version: {PPO.__module__.split('.')[0]}")

Gymnasium version: 1.1.1
Stable Baselines3 version: stable_baselines3


In [None]:
# Cell 2: Configuration & Hyperparameters
# Gymnasium environment id shared by every agent in this notebook.
ENV_NAME = "LunarLander-v3"

# One independent training run is performed per seed.
SEEDS = [0, 1, 2]


def set_global_seed(seed: int):
    """Seed Python's `random`, NumPy and PyTorch (plus CUDA when available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    # Seed every visible GPU so CUDA-side sampling is reproducible as well.
    torch.cuda.manual_seed_all(seed)

# Step budgets.
# TRAIN_MAX_STEPS caps one DQN training episode and is also the number of
# timesteps requested per PPO `learn()` call; EVAL_MAX_STEPS caps one
# evaluation episode and is used as the recorded video length for PPO.
TRAIN_MAX_STEPS = 1000
EVAL_MAX_STEPS = 250

# Training schedule
# EPISODES is the number of training iterations per seed; MILESTONES lists the
# episode numbers at which evaluation metrics and a video are produced.
EPISODES = 250
MILESTONES = [10,50,100,150,250]

# # Smoke Test schedule
# EPISODES = 5
# MILESTONES = [1,5]

# Replay buffer and optimization
# NOTE(review): the DQN agents read `batch_size` from CONFIG below; BATCH_SIZE
# itself does not appear to be referenced in the visible code.
BATCH_SIZE = 64
BUFFER_SIZE = 50_000

# Epsilon-greedy schedule
# eps = EPS_END + (EPS_START - EPS_END) * exp(-steps_done * EPS_DECAY)
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 5e-4

# Discount factor and target update
# The target network is synced whenever steps_done is a multiple of
# TARGET_UPDATE_FREQ.
GAMMA = 0.99
TARGET_UPDATE_FREQ = 1_000

# Algorithm-specific configurations
# DQN entries: `lr` feeds the Adam optimizer, `batch_size` the replay sample
# size, and `prioritized`/`alpha`/`beta` configure the replay buffer.
# The `ppo` entry is forwarded to the Stable-Baselines3 PPO constructor.
CONFIG = {
    'vanilla_dqn': {'lr': 1e-3, 'batch_size': 64, 'prioritized': False},
    'double_dqn': {'lr': 1e-3, 'batch_size': 64, 'prioritized': False},
    'dueling_dqn': {'lr': 1e-3, 'batch_size': 64, 'prioritized': False},
    'per_dqn': {'lr': 1e-3, 'batch_size': 64, 'prioritized': True, 'alpha': 0.6, 'beta': 0.4},
    'ppo': {'n_steps': 1000, 'learning_rate': 3e-4, 'batch_size' : 50}
}
# Directories
# Output folders for CSV logs/checkpoints, recorded videos and saved figures;
# they are created on import of this cell if missing.
BASE_LOG_DIR = "logs"
BASE_VIDEO_DIR = "videos"
PLOTS_DIR = "plots"
os.makedirs(BASE_LOG_DIR, exist_ok=True)
os.makedirs(BASE_VIDEO_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

In [None]:
import shutil
# Start from a clean slate: delete the whole log directory tree left by any
# previous run (errors such as "directory missing" are ignored), then recreate
# it empty so later cells can write into it.
shutil.rmtree(BASE_LOG_DIR, ignore_errors=True)
os.makedirs(BASE_LOG_DIR, exist_ok=True)

In [None]:
# Cell 3: Replay Buffer
from collections import namedtuple

# One environment step as stored in the replay memory.
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions with optional prioritized sampling.

    Once `capacity` entries are stored, new transitions overwrite the oldest
    ones. A parallel list of priorities is always maintained; it is only used
    for sampling when `prioritized` is True.
    """

    def __init__(self, capacity: int, prioritized: bool = False, alpha: float = 0.6, beta: float = 0.4):
        self.capacity = capacity
        self.prioritized = prioritized
        self.alpha = alpha  # exponent applied to priorities when sampling
        self.beta = beta    # exponent of the importance-sampling weights
        self.buffer = []
        self.priorities = []
        self.position = 0   # next slot to overwrite once the buffer is full

    def push(self, *args):
        """Store one transition; `args` are the `Transition` fields in order."""
        entry = Transition(*args)
        # New entries receive the current maximum priority (1.0 for the very
        # first one) so they are likely to be replayed at least once.
        top_priority = max(self.priorities, default=1.0)
        slot = self.position
        if len(self.buffer) >= self.capacity:
            self.buffer[slot] = entry
            self.priorities[slot] = top_priority
        else:
            self.buffer.append(entry)
            self.priorities.append(top_priority)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size: int):
        """
        Draw `batch_size` transitions.

        Returns a `(transitions, indices, weights)` triple. For uniform
        sampling (buffer not prioritized, or holding fewer than `batch_size`
        items) `indices` and `weights` are both None.
        """
        use_priorities = self.prioritized and len(self.buffer) >= batch_size
        if not use_priorities:
            return random.sample(self.buffer, batch_size), None, None

        # Sampling distribution: priority ** alpha, normalized to sum to 1.
        scaled = np.array(self.priorities, dtype=np.float32) ** self.alpha
        scaled /= scaled.sum()
        indices = np.random.choice(len(self.buffer), batch_size, p=scaled)
        # Importance-sampling weights, rescaled so the largest weight is 1.
        weights = (len(self.buffer) * scaled[indices]) ** (-self.beta)
        weights /= weights.max()
        picked = [self.buffer[i] for i in indices]
        return picked, indices, torch.tensor(weights, dtype=torch.float32)

    def update_priorities(self, indices, errors):
        """Set each sampled slot's priority to |error| plus a small epsilon."""
        for slot, td_error in zip(indices, errors):
            self.priorities[slot] = abs(td_error) + 1e-6

    def __len__(self):
        return len(self.buffer)

In [None]:
# Cell 4: Q-Networks
class QNetwork(nn.Module):
    """Plain MLP (two hidden ReLU layers) producing one Q-value per action."""

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int = 128):
        super().__init__()
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        # Kept under the attribute name `net` so saved state_dict keys stay
        # "net.<index>.<param>".
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Q-values for the batch of states `x`."""
        q_values = self.net(x)
        return q_values

class DuelingQNetwork(nn.Module):
    """Dueling head: separate value and advantage MLPs combined into Q-values."""

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int = 128):
        super().__init__()

        def build_stream(out_features: int) -> nn.Sequential:
            # Both streams share the same shape: state -> hidden -> outputs.
            return nn.Sequential(
                nn.Linear(state_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, out_features),
            )

        # The value stream is built first, then the advantage stream.
        self.value_net = build_stream(1)
        self.adv_net = build_stream(action_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Q = V + (A - mean_a A) for a 2-D batch of states."""
        advantage = self.adv_net(x)
        state_value = self.value_net(x)
        # Centre the advantages per sample; the (batch, 1) value broadcasts
        # across the action dimension.
        centred = advantage - advantage.mean(dim=1, keepdim=True)
        return state_value + centred

In [None]:
# Cell 5: DQN Agent Base
class DQNAgent:
    """DQN agent covering the vanilla, double, dueling and PER variants.

    Args:
        state_dim: Size of the observation vector (must be 8).
        action_dim: Number of discrete actions (must be 4).
        variant: "dueling" selects the dueling network, "double" the double-DQN
            target, "per" enables beta annealing; any other value behaves as
            vanilla DQN.
        config: Dict with at least `lr` and `batch_size`; optional
            `prioritized`, `alpha` and `beta` configure the replay buffer.
    """

    def __init__(self, state_dim: int, action_dim: int, variant: str, config: dict):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.variant = variant
        self.config = config
        # Verify dimensions
        assert state_dim == 8, f"Expected state_dim=8, got {state_dim}"
        assert action_dim == 4, f"Expected action_dim=4, got {action_dim}"
        # Choose network architecture (online network first, then target).
        net_cls = DuelingQNetwork if variant == "dueling" else QNetwork
        self.q_net = net_cls(state_dim, action_dim)
        self.target_net = net_cls(state_dim, action_dim)
        self.target_net.load_state_dict(self.q_net.state_dict())
        # Replay buffer
        prioritized = config.get('prioritized', False)
        alpha = config.get('alpha', 0.6)
        beta = config.get('beta', 0.4)
        # Starting point of the PER beta annealing schedule.
        self.beta_start = beta
        self.buffer = ReplayBuffer(BUFFER_SIZE, prioritized=prioritized, alpha=alpha, beta=beta)
        # Optimizer
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=config['lr'])
        # Counters: incremented on every `select_action` call; drives the
        # epsilon schedule, beta annealing and target-network syncing.
        self.steps_done = 0

    def select_action(self, state):
        """Return an epsilon-greedy action and advance the step counter."""
        eps = EPS_END + (EPS_START - EPS_END) * math.exp(-self.steps_done * EPS_DECAY)
        self.steps_done += 1
        if random.random() < eps:
            return random.randrange(self.action_dim)
        state_t = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            return int(self.q_net(state_t).argmax(dim=1).item())

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.buffer.push(state, action, reward, next_state, done)

    def train_step(self):
        """Run one gradient step on a replay batch.

        Returns:
            The scalar loss, or None while the buffer holds fewer than
            `batch_size` transitions.
        """
        # Beta annealing for PER-DQN: linearly increase buffer.beta from its
        # configured starting value to 1.0 over 250 000 steps.
        if self.variant == "per":
            b0 = self.beta_start
            self.buffer.beta = min(1.0, b0 + (1.0 - b0) * (self.steps_done / 250_000))
        if len(self.buffer) < self.config['batch_size']:
            return None
        transitions, indices, weights = self.buffer.sample(self.config['batch_size'])
        states, actions, rewards, next_states, dones = zip(*transitions)
        s_v = torch.tensor(np.stack(states), dtype=torch.float32)
        a_v = torch.tensor(actions, dtype=torch.int64).unsqueeze(-1)
        r_v = torch.tensor(rewards, dtype=torch.float32).unsqueeze(-1)
        ns_v = torch.tensor(np.stack(next_states), dtype=torch.float32)
        d_v = torch.tensor(dones, dtype=torch.float32).unsqueeze(-1)
        q_current = self.q_net(s_v).gather(1, a_v)
        with torch.no_grad():
            if self.variant == "double":
                # Double DQN: the online net picks the action, the target net
                # evaluates it.
                next_actions = self.q_net(ns_v).argmax(dim=1, keepdim=True)
                q_next = self.target_net(ns_v).gather(1, next_actions)
            else:
                q_next = self.target_net(ns_v).max(1, keepdim=True)[0]
            # No bootstrapping from transitions flagged as done.
            q_target = r_v + GAMMA * (1 - d_v) * q_next
        td_error = q_current - q_target
        if self.buffer.prioritized and indices is not None:
            # reshape(-1) (rather than squeeze()) keeps a 1-D array even for a
            # batch of one, so it can always be zipped with `indices`.
            errors = td_error.detach().abs().reshape(-1).cpu().numpy()
            self.buffer.update_priorities(indices, errors)
            loss = (td_error.pow(2) * weights.unsqueeze(1)).mean()
        else:
            loss = F.mse_loss(q_current, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Periodic target update
        if self.steps_done % TARGET_UPDATE_FREQ == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())
        return loss.item()

In [None]:
# Cell 6: Vanilla DQN Factory
def make_vanilla_dqn_agent(env=None):
    """Build a vanilla DQN agent sized from `env`'s observation/action spaces."""
    dims = (env.observation_space.shape[0], env.action_space.n)
    # LunarLander is expected to expose 8 observations and 4 discrete actions.
    for label, expected, actual in (("state_dim", 8, dims[0]), ("action_dim", 4, dims[1])):
        assert actual == expected, f"Expected {label}={expected}, got {actual}"
    return DQNAgent(*dims, variant="vanilla", config=CONFIG['vanilla_dqn'])

In [None]:
# Cell 7: Double DQN Factory
def make_double_dqn_agent(env=None):
    """Build a Double DQN agent sized from `env`'s observation/action spaces."""
    dims = (env.observation_space.shape[0], env.action_space.n)
    # LunarLander is expected to expose 8 observations and 4 discrete actions.
    for label, expected, actual in (("state_dim", 8, dims[0]), ("action_dim", 4, dims[1])):
        assert actual == expected, f"Expected {label}={expected}, got {actual}"
    return DQNAgent(*dims, variant="double", config=CONFIG['double_dqn'])

In [None]:
# Cell 8: Dueling DQN Factory
def make_dueling_dqn_agent(env=None):
    """Build a Dueling DQN agent sized from `env`'s observation/action spaces."""
    dims = (env.observation_space.shape[0], env.action_space.n)
    # LunarLander is expected to expose 8 observations and 4 discrete actions.
    for label, expected, actual in (("state_dim", 8, dims[0]), ("action_dim", 4, dims[1])):
        assert actual == expected, f"Expected {label}={expected}, got {actual}"
    return DQNAgent(*dims, variant="dueling", config=CONFIG['dueling_dqn'])

In [None]:
# Cell 9: PER DQN Factory
def make_per_dqn_agent(env=None):
    """Build a prioritized-replay DQN agent sized from `env`'s spaces."""
    dims = (env.observation_space.shape[0], env.action_space.n)
    # LunarLander is expected to expose 8 observations and 4 discrete actions.
    for label, expected, actual in (("state_dim", 8, dims[0]), ("action_dim", 4, dims[1])):
        assert actual == expected, f"Expected {label}={expected}, got {actual}"
    return DQNAgent(*dims, variant="per", config=CONFIG['per_dqn'])

In [None]:
# # Cell 10: PPO Agent Factory

# from stable_baselines3 import PPO
# from stable_baselines3.common.vec_env import DummyVecEnv

# class PPOAgentWrapper:
#     def __init__(self, env):
#         """env: either a callable factory or a Gym env instance"""
#         # Verify dimensions
#         state_dim = env.observation_space.shape[0]
#         action_dim = env.action_space.n
#         assert state_dim == 8, f"Expected state_dim=8, got {state_dim}"
#         assert action_dim == 4, f"Expected action_dim=4, got {action_dim}"
#         # Wrap env for DummyVecEnv
#         if callable(env):
#             env_fn = env
#         else:
#             env_fn = lambda: env
#         self.vec_env = DummyVecEnv([env_fn])
#         self.model = PPO(
#             "MlpPolicy",
#             self.vec_env,
#             verbose=0,
#             n_steps=CONFIG['ppo']['n_steps'],
#             learning_rate=CONFIG['ppo']['learning_rate']
#         )


#     def learn(self, total_timesteps, reset_num_timesteps=False):
#         """Train the model for a specified number of timesteps."""
#         self.model.learn(total_timesteps=total_timesteps, reset_num_timesteps=reset_num_timesteps)

#     def predict(self, state, deterministic=True):
#         # SB3 VecEnv expects batched inputs
#         state_vec = np.array(state, dtype=np.float32)[None]
#         action, _ = self.model.predict(state_vec, deterministic=deterministic)
#         return int(action[0]), _

# def make_ppo_agent(env=None):
#     """Instantiate a PPO agent wrapper."""
#     return PPOAgentWrapper(env)

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

def make_ppo_agent(env=None, seed=None):
    """Create an SB3 PPO model on a freshly built, seeded single-env VecEnv.

    `env` is accepted for signature parity with the DQN factories but is not
    used; the environment is always created through `make_env(seed=seed)`.
    """
    vec_env = DummyVecEnv([lambda: make_env(seed=seed)])
    ppo_cfg = CONFIG['ppo']
    hyperparams = dict(
        verbose=0,
        n_steps=ppo_cfg['n_steps'],
        batch_size=ppo_cfg.get('batch_size', 64),
        learning_rate=ppo_cfg['learning_rate'],
        seed=seed,  # the model shares the seed used for the environment
    )
    return PPOAgentWrapper(PPO("MlpPolicy", vec_env, **hyperparams), vec_env)

class PPOAgentWrapper:
    """Thin adapter pairing an SB3-style model with the VecEnv it trains on."""

    def __init__(self, model, vec_env):
        self.model, self.vec_env = model, vec_env

    def learn(self, total_timesteps: int, reset_num_timesteps: bool = False, callback = None, **kwargs):
        """Delegate training to the wrapped model, passing callbacks and extras through."""
        learn_args = dict(
            total_timesteps=total_timesteps,
            reset_num_timesteps=reset_num_timesteps,
            callback=callback,
        )
        return self.model.learn(**learn_args, **kwargs)

    def predict(self, state, deterministic=True):
        """Predict for a single state; returns `(int action, model's second output)`."""
        # The model is queried with a batch of one observation.
        batched = np.array(state, dtype=np.float32)[None]
        actions, extra = self.model.predict(batched, deterministic=deterministic)
        return int(actions[0]), extra

In [None]:
# Cell 11: Environment Factory
class FuelTrackingWrapper(gym.Wrapper):
    """Gym wrapper that reports remaining fuel in `info['fuel']`.

    Fuel use is derived purely from the chosen action; observations and
    rewards are passed through untouched.
    """

    def __init__(self, env):
        super().__init__(env)
        self.initial_fuel = 1000.0  # tank size used for the remaining-fuel metric
        self.fuel = 0.0             # cumulative (negative) fuel delta this episode
        # Per-action fuel delta. Actions: none, left, main, right.
        self.fuel_costs = {
            0: 0.0,
            1: -0.03,
            2: -0.3,
            3: -0.03,
        }

    def step(self, action):
        """Step the wrapped env and attach the remaining fuel to `info`."""
        transition = self.env.step(action)
        obs, reward, terminated, truncated, info = transition
        self.fuel += self.fuel_costs[action]
        remaining = self.initial_fuel + self.fuel
        info['fuel'] = remaining
        return obs, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped env and restore a full tank."""
        self.fuel = 0.0
        obs, info = self.env.reset(**kwargs)
        info['fuel'] = self.initial_fuel
        return obs, info

def make_env(seed=None, record=False, record_path=None, video_prefix=None):
    """Create a fuel-tracking LunarLander environment.

    Args:
        seed: If not None, the environment is reset once with this seed.
        record: When True (and `record_path` is given) wrap the env in
            `RecordVideo`, recording episodes whose index is in MILESTONES.
        record_path: Folder that receives the recorded videos.
        video_prefix: File-name prefix for videos; defaults to ENV_NAME.

    Returns:
        The wrapped environment, created with `render_mode="rgb_array"`.
    """
    # The package is imported as `gym` in this notebook; the bare name
    # `gymnasium` is never bound, so it must not be used here.
    env = gym.make(ENV_NAME, render_mode="rgb_array")
    # Verify state and action spaces
    assert env.observation_space.shape[0] == 8, f"Expected state_dim=8, got {env.observation_space.shape[0]}"
    assert env.action_space.n == 4, f"Expected action_dim=4, got {env.action_space.n}"
    # Wrap with fuel tracking
    env = FuelTrackingWrapper(env)
    if seed is not None:
        env.reset(seed=seed)
    if record and record_path:
        prefix = video_prefix or ENV_NAME
        env = RecordVideo(env, video_folder=record_path, name_prefix=prefix, episode_trigger=lambda x: x in MILESTONES)
    return env

In [None]:
# Cell 12: Evaluation Utility (Updated)

def evaluate(agent, is_ppo, episodes=10, record=False, record_path=None, video_prefix=None, return_raw=False, eval_seed: int = None):
    """Run evaluation episodes and summarize reward, success, distance and fuel.

    The policy is evaluated greedily: PPO through `agent.predict` and DQN
    through an argmax over `agent.q_net`, so evaluation neither takes random
    exploratory actions nor advances the DQN agent's `steps_done` counter.

    Args:
        agent: A `PPOAgentWrapper` (when `is_ppo`) or a DQN agent exposing `q_net`.
        is_ppo: Selects the VecEnv (PPO) or single-env (DQN) code path.
        episodes: Number of evaluation episodes.
        record: Record video when True and `record_path` is given.
        record_path: Folder that receives the video files.
        video_prefix: Video file-name prefix; defaults to ENV_NAME.
        return_raw: If True, return the list of per-episode rewards instead of
            the summary dict.
        eval_seed: Base seed; episode `i` is seeded with `eval_seed + i` so runs
            are reproducible without every episode being identical.

    Returns:
        Either the raw reward list, or a dict with keys `mean_reward`,
        `success_rate`, `mean_dist`, `mean_fuel`, `std_reward`, `std_dist`.
    """
    stats = {"reward": [], "success": [], "dist": [], "fuel": []}
    recording = bool(record and record_path)
    prefix = video_prefix or ENV_NAME

    # Build the environment once and add at most ONE video wrapper.
    if is_ppo:
        base_env = agent.vec_env
        env = base_env
        if recording:
            env = VecVideoRecorder(
                base_env,
                video_folder=record_path,
                record_video_trigger=lambda step: step == 0,
                video_length=EVAL_MAX_STEPS,
                name_prefix=prefix,
            )
    else:
        # Recording is handled here, so the factory is asked for a plain env;
        # wrapping in both places would stack two RecordVideo wrappers.
        base_env = make_env(seed=eval_seed)
        env = base_env
        if recording:
            env = RecordVideo(
                base_env,
                video_folder=record_path,
                name_prefix=prefix,
            )

    try:
        for ep in range(episodes):
            ep_seed = None if eval_seed is None else eval_seed + ep

            if is_ppo:
                if ep_seed is not None:
                    env.seed(ep_seed)   # VecEnv takes its seed via .seed()
                obs = env.reset()       # no seed arg on VecEnv
                state = obs[0]
            else:
                state, _ = env.reset(seed=ep_seed)

            total_r, fuel_left = 0.0, 0.0
            done = False
            final_state = state

            for _ in range(EVAL_MAX_STEPS):
                if is_ppo:
                    action, _ = agent.predict(state)
                    obs, rewards, dones, infos = env.step([action])
                    ns, r, done, info = obs[0], rewards[0], dones[0], infos[0]
                    # A VecEnv resets automatically when an episode ends, so
                    # `ns` is then the first observation of the NEXT episode;
                    # the true last observation is stored in the info dict.
                    final_state = info.get("terminal_observation", ns) if done else ns
                else:
                    state_t = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                    with torch.no_grad():
                        action = int(agent.q_net(state_t).argmax(dim=1).item())
                    ns, r, term, trunc, info = env.step(action)
                    done = term or trunc
                    final_state = ns

                total_r += r
                fuel_left = info.get("fuel", 0)
                state = ns
                if done:
                    break

            # An episode cut off by EVAL_MAX_STEPS never counts as a success.
            stats["success"].append(1 if (done and total_r >= 200) else 0)
            stats["dist"].append(math.hypot(final_state[0], final_state[1]))
            stats["reward"].append(total_r)
            stats["fuel"].append(fuel_left)
    finally:
        if is_ppo:
            # The agent's own VecEnv must stay usable for further training, so
            # only a recorder created here is closed.
            # NOTE(review): VecVideoRecorder.close() presumably also closes the
            # wrapped VecEnv - confirm this is harmless for continued training.
            if env is not base_env:
                env.close()
        else:
            # Closing the outermost wrapper also closes the env it wraps.
            env.close()

    if return_raw:
        return stats["reward"]

    mean_r, std_r = np.mean(stats["reward"]), np.std(stats["reward"])
    mean_d, std_d = np.mean(stats["dist"]),   np.std(stats["dist"])
    return {
        "mean_reward":  f"{mean_r:.2f} ± {std_r:.2f}",
        "success_rate": np.mean(stats["success"]),
        "mean_dist":    f"{mean_d:.2f} ± {std_d:.2f}",
        "mean_fuel":    np.mean(stats["fuel"]),
        "std_reward":   std_r,
        "std_dist":     std_d,
    }

In [None]:
# Cell 13: Training & Milestone Logging

from stable_baselines3.common.callbacks import BaseCallback
class StepCounterCallback(BaseCallback):
    """SB3 callback that counts how many times the per-step hook fires."""

    def __init__(self):
        super().__init__()
        self.total_steps = 0

    def _on_step(self) -> bool:
        self.total_steps = self.total_steps + 1
        # Returning True tells SB3 to keep training.
        return True

class TrainRewardCallback(BaseCallback):
    """SB3 callback recording the summed reward of every rollout.

    After each rollout one value is appended to `episode_rewards`: the sum of
    all rewards collected during that rollout (across all envs of the VecEnv).
    """

    def __init__(self):
        super().__init__()
        self.episode_rewards = []
        self._rollout_return = 0.0

    def _on_rollout_start(self) -> None:
        # Start a fresh accumulator for the rollout that is about to begin.
        self._rollout_return = 0.0

    def _on_step(self) -> bool:
        # `locals["rewards"]` only ever holds the rewards of the CURRENT step,
        # so the rollout total has to be accumulated step by step; reading it
        # once at rollout end would yield just the final step's reward.
        self._rollout_return += float(sum(self.locals.get("rewards", [])))
        return True

    def _on_rollout_end(self) -> bool:
        # Publish the total reward gathered over this rollout segment.
        self.episode_rewards.append(self._rollout_return)
        return True

def train_and_snapshot(name, agent_ctor, is_ppo):
    """Train one algorithm over all SEEDS, logging metrics and milestone evaluations.

    For every seed a fresh agent is trained for EPISODES iterations. Each
    iteration appends a row to `master_full_training.csv`; at every episode in
    MILESTONES the agent is evaluated, a video is recorded and a row is appended
    to `master_milestone_metrics.csv`. Final weights are saved under
    `BASE_LOG_DIR/<name>/`.

    Args:
        name: Algorithm label used in log rows and file names.
        agent_ctor: Factory; called as `agent_ctor(seed=seed)` for PPO and as
            `agent_ctor(env)` for DQN variants.
        is_ppo: Selects the PPO or DQN training path.

    Returns:
        The agent trained for the last seed (None if SEEDS is empty). Its
        environment has already been closed.
    """
    master_full_log = os.path.join(BASE_LOG_DIR, "master_full_training.csv")
    master_milestone_log = os.path.join(BASE_LOG_DIR, "master_milestone_metrics.csv")

    # Create header-only CSV files on first use; rows are appended afterwards.
    full_columns = [
        "algorithm", "seed", "episode", "reward",
        "steps", "time_sec", "fuel",
        "sample_efficiency", "training_std",
    ]
    milestone_columns = [
        "algorithm", "seed", "episode", "mean_reward",
        "success_rate", "mean_dist", "mean_fuel",
        "sample_efficiency", "training_std", "eval_variance",
        "time_sec", "gpu_bytes",
    ]
    for path, columns in ((master_full_log, full_columns), (master_milestone_log, milestone_columns)):
        if not os.path.exists(path):
            pd.DataFrame(columns=columns).to_csv(path, index=False)

    # Directory for milestone videos
    tl_base = os.path.join(BASE_VIDEO_DIR, "timeline_comparisons")
    os.makedirs(tl_base, exist_ok=True)
    for m in MILESTONES:
        os.makedirs(os.path.join(tl_base, f"milestone_{m}_eps"), exist_ok=True)

    gpu_timer = GPUTimer()
    agent = None
    for seed in SEEDS:
        set_global_seed(seed)
        env = None
        if is_ppo:
            agent = agent_ctor(seed=seed)
        else:
            env = make_env(seed)
            agent = agent_ctor(env)

        log_dir = os.path.join(BASE_LOG_DIR, name)
        os.makedirs(log_dir, exist_ok=True)

        episode_rewards = []
        first_success_steps = float('inf')
        # Every seed is an independent run, so environment steps are counted
        # from zero for each one; otherwise sample efficiency for later seeds
        # would include the steps spent on earlier seeds.
        total_steps = 0

        try:
            for epi in range(1, EPISODES + 1):
                start = time.time()
                if is_ppo:
                    step_cb = StepCounterCallback()
                    reward_cb = TrainRewardCallback()
                    agent.learn(total_timesteps=TRAIN_MAX_STEPS, reset_num_timesteps=False, callback=[step_cb, reward_cb])
                    # get reward from callback instead of separate eval
                    ep_reward = reward_cb.episode_rewards[-1]
                    # get fuel via a quick evaluation run
                    metrics = evaluate(agent, True, episodes=1)
                    ep_fuel = metrics["mean_fuel"]
                    steps = step_cb.total_steps
                    total_steps += step_cb.total_steps
                else:
                    state, _ = env.reset()
                    ep_reward = 0
                    for t in range(TRAIN_MAX_STEPS):
                        action = agent.select_action(state)
                        ns, r, term, trunc, info = env.step(action)
                        agent.store_transition(state, action, r, ns, term or trunc)
                        agent.train_step()
                        state = ns
                        ep_reward += r
                        total_steps += 1
                        if term or trunc:
                            break
                    steps = t + 1
                    ep_fuel = info['fuel']
                elapsed = time.time() - start
                episode_rewards.append(ep_reward)

                # Sample efficiency: steps taken (this seed) until the first
                # episode with reward >= 200; stays inf until that happens.
                if ep_reward >= 200 and first_success_steps == float('inf'):
                    first_success_steps = total_steps
                sample_eff = first_success_steps

                # Training stability: standard deviation over the last 50
                # episodes (or over all of them while fewer are available).
                training_std = float(np.std(episode_rewards[-50:]))

                # Append to master log
                pd.DataFrame([{
                    "algorithm": name,
                    "seed": seed,
                    "episode": epi,
                    "reward": ep_reward,
                    "steps": steps,
                    "time_sec": elapsed,
                    "fuel": ep_fuel,
                    "sample_efficiency": sample_eff,
                    "training_std": training_std
                }]).to_csv(master_full_log, mode="a", header=False, index=False)

                # Save the trained model after the final episode.
                if epi == EPISODES:
                    if is_ppo:
                        # save the SB3 PPO model
                        agent.model.save(os.path.join(log_dir, f"{name}_seed{seed}_model.zip"))
                    else:
                        # The DQN weights are written under both historical
                        # file names so existing consumers keep working.
                        weights = agent.q_net.state_dict()
                        torch.save(weights, os.path.join(log_dir, f"{name}_seed{seed}_final.pth"))
                        torch.save(weights, os.path.join(log_dir, f"{name}_seed{seed}_qnet.pth"))

                # Milestone evaluation
                if epi in MILESTONES:
                    vid_dir = os.path.join(tl_base, f"milestone_{epi}_eps", f"seed{seed}")
                    os.makedirs(vid_dir, exist_ok=True)
                    with gpu_timer.track():
                        metrics = evaluate(agent, is_ppo, episodes=10)
                        _ = evaluate(agent, is_ppo, episodes=1, record=True, record_path=vid_dir, video_prefix=f"{name}_milestone_{epi}_seed{seed}")

                        # get raw rewards for 10 evaluation episodes
                        raw_rewards = evaluate(
                            agent,
                            is_ppo,
                            episodes=10,
                            return_raw=True,
                            eval_seed=seed
                        )
                        # compute variance over those 10 episodes
                        eval_variance = float(np.var(raw_rewards))
                    gpu_stats = gpu_timer.summary()

                    pd.DataFrame([{
                        "algorithm": name,
                        "seed": seed,
                        "episode": epi,
                        "mean_reward": metrics["mean_reward"],
                        "success_rate": metrics["success_rate"],
                        "mean_dist": metrics["mean_dist"],
                        "mean_fuel": metrics["mean_fuel"],
                        "sample_efficiency": sample_eff,
                        "training_std": training_std,
                        "eval_variance": eval_variance,
                        "time_sec": gpu_stats["mean_time_sec"],
                        "gpu_bytes": gpu_stats["mean_peak_gpu_bytes"]
                    }]).to_csv(master_milestone_log, mode="a", header=False, index=False)
        finally:
            # Release this seed's environment before moving on, so earlier
            # seeds do not leak their environments.
            if is_ppo:
                agent.vec_env.close()
            else:
                env.close()

    return agent

In [None]:
# Cell 14: Trajectory Visualization
def plot_trajectory(agent, is_ppo, seed=0):
    """Roll out one greedy episode and save a plot of the lander's (x, y) path.

    The figure is written to PLOTS_DIR as `trajectory_<ppo|dqn>_seed<seed>.png`.
    """
    label = 'PPO' if is_ppo else 'DQN'

    def greedy_action(obs):
        if is_ppo:
            chosen, _ = agent.predict(obs, deterministic=True)
            return chosen
        # DQN: bypass epsilon-greedy and take the argmax of the online network.
        obs_t = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            return int(agent.q_net(obs_t).argmax(dim=1).item())

    # Prepare environment (explicitly reset with the seed)
    env = make_env(seed=seed)
    state, _ = env.reset(seed=seed)

    trail = []
    for _ in range(TRAIN_MAX_STEPS):
        trail.append((state[0], state[1]))  # x, y before acting
        state, _, term, trunc, _ = env.step(greedy_action(state))
        if term or trunc:
            # Include the final resting position as well.
            trail.append((state[0], state[1]))
            break
    env.close()

    # Plot trajectory
    xs, ys = zip(*trail)
    plt.figure(figsize=(8, 6))
    plt.plot(xs, ys, marker='o', markersize=3)
    plt.title(f"Descent Trajectory: {label} (Seed {seed})")
    plt.xlabel('X position')
    plt.ylabel('Y position')
    plt.grid(True)
    out_path = os.path.join(PLOTS_DIR, f"trajectory_{label.lower()}_seed{seed}.png")
    plt.savefig(out_path)
    plt.close()

In [None]:
# # Cell 15: Q-Value & Angular Distribution Histograms
# def plot_histograms(agent, seed=0, num_samples=1000):
#     """Sample random states to plot histograms of Q-values and angular velocities."""
#     # Collect random states from environment
#     env = make_env(seed=seed)
#     states = []
#     for _ in range(num_samples):
#         st, _ = env.reset()
#         assert len(st) == 8, f"Expected state_dim=8, got {len(st)}"
#         states.append(st)
#     env.close()
#     states_arr = np.array(states, dtype=np.float32)

#     # Q-value distribution (only for DQN agents)
#     if not isinstance(agent, PPOAgentWrapper):
#         with torch.no_grad():
#             q_vals = agent.q_net(torch.tensor(states_arr))
#             q_vals = q_vals.cpu().numpy().flatten()
#         plt.figure(figsize=(8, 6))
#         plt.hist(q_vals, bins=50, alpha=0.7)
#         plt.title('Q-Value Distribution')
#         plt.xlabel('Q value')
#         plt.ylabel('Frequency')
#         plt.grid(True)
#         plt.savefig(os.path.join(PLOTS_DIR, 'q_value_distribution.png'))
#         plt.close()

#     # Angular velocity distribution (state index 4)
#     ang_vel = states_arr[:, 4]
#     plt.figure(figsize=(8, 6))
#     plt.hist(ang_vel, bins=50, alpha=0.7)
#     plt.title('Angular Velocity Distribution')
#     plt.xlabel('Angular Velocity')
#     plt.ylabel('Frequency')
#     plt.grid(True)
#     plt.savefig(os.path.join(PLOTS_DIR, 'angular_velocity_distribution.png'))
#     plt.close()

# Cell 15 (updated): Q‐Value & Angular Distribution Histograms
def plot_histograms(agent, algorithm: str, seed=0, num_samples=1000):
    """Plot Q-value and angular-velocity histograms over sampled reset states.

    The environment is reset ``num_samples`` times and every observation
    returned by ``reset()`` is kept, so both histograms describe
    start-of-episode states only (no state reached by acting is sampled).

    Args:
        agent: Trained agent. Unless it is a ``PPOAgentWrapper`` it must expose
            a callable ``q_net`` mapping a batch of states to Q-values.
        algorithm: Label used in the plot titles and as output-file prefix.
        seed: Seed forwarded to ``make_env``.
        num_samples: Number of reset observations to collect (must be >= 1).

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.

    Side effects:
        Writes ``{algorithm}_q_value_distribution.png`` (non-PPO agents only)
        and ``{algorithm}_angular_velocity_distribution.png`` to ``PLOTS_DIR``.
    """
    if num_samples < 1:
        # An empty sample would otherwise fail later with an opaque IndexError
        # when the (0,)-shaped array is indexed by column.
        raise ValueError(f"num_samples must be >= 1, got {num_samples}")

    # 1) Collect states; always release the environment, even if reset() fails.
    env = make_env(seed=seed)
    try:
        states = [env.reset()[0] for _ in range(num_samples)]
    finally:
        env.close()
    states_arr = np.array(states, dtype=np.float32)

    # 2) Q-value distribution (only for DQN-style agents)
    if not isinstance(agent, PPOAgentWrapper):
        with torch.no_grad():
            batch = torch.tensor(states_arr)
            # Run the batch on whatever device the network lives on; feeding a
            # CPU tensor to a CUDA-resident network raises a RuntimeError.
            params = getattr(agent.q_net, "parameters", None)
            first_param = next(iter(params()), None) if callable(params) else None
            if first_param is not None:
                batch = batch.to(first_param.device)
            q_v = agent.q_net(batch).cpu().numpy().flatten()
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.hist(q_v, bins=50, alpha=0.7)
        ax.set_title(f"Q‐Value Distribution: {algorithm}")
        ax.set_xlabel("Q value")
        ax.set_ylabel("Frequency")
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(os.path.join(PLOTS_DIR, f"{algorithm}_q_value_distribution.png"))
        plt.close(fig)

    # 3) Angular velocity distribution (state index 5)
    # NOTE(review): index 5 is taken to be angular velocity, following the
    # original comment here — confirm against the environment's observation layout.
    ang_vel = states_arr[:, 5]
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(ang_vel, bins=50, alpha=0.7)
    ax.set_title(f"Angular Velocity Distribution: {algorithm}")
    ax.set_xlabel("Angular velocity (rad/s)")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(os.path.join(PLOTS_DIR, f"{algorithm}_angular_velocity_distribution.png"))
    plt.close(fig)

In [None]:
# Cell 16: GPU Memory Tracking Utility
import contextlib

class GPUTimer:
    """Accumulate elapsed time and peak CUDA memory of tracked code blocks.

    Each successfully completed ``track()`` block appends one entry to
    ``times`` (seconds) and one to ``peak_mem`` (bytes; ``0`` when CUDA is
    unavailable). A block that raises records nothing, so failed runs do not
    skew the averages returned by ``summary()``.
    """
    def __init__(self):
        # Parallel lists: entry i of each belongs to the i-th completed block.
        self.peak_mem = []
        self.times = []

    @staticmethod
    def _cuda_ready():
        """Return True when at least one CUDA device is usable."""
        return torch.cuda.is_available() and torch.cuda.device_count() > 0

    @contextlib.contextmanager
    def track(self):
        """Track GPU memory and runtime for a code block.

        Uses the monotonic ``time.perf_counter`` clock so system clock
        adjustments cannot distort the measurement, and synchronises CUDA at
        both ends because kernels are launched asynchronously: without it,
        work still queued on the GPU would fall outside the measured window.
        Memory statistics are those of the current CUDA device.
        """
        use_cuda = self._cuda_ready()
        if use_cuda:
            torch.cuda.synchronize()
            torch.cuda.reset_peak_memory_stats()
        t0 = time.perf_counter()
        yield
        if use_cuda:
            torch.cuda.synchronize()
        self.times.append(time.perf_counter() - t0)
        # Record peak memory if available, else 0
        self.peak_mem.append(torch.cuda.max_memory_allocated() if use_cuda else 0)

    def summary(self):
        """Return mean elapsed time and mean peak GPU memory over all tracked blocks.

        Both values are ``0`` when nothing has been tracked yet.
        """
        return {
            'mean_time_sec': np.mean(self.times) if self.times else 0,
            'mean_peak_gpu_bytes': np.mean(self.peak_mem) if self.peak_mem else 0
        }

In [None]:
# Cell 17: Orchestrator
import gymnasium

# Shared timer: accumulates one entry per trained agent across the whole run.
gpu_timer = GPUTimer()

# Each entry: display name, agent factory, and whether the factory builds a PPO
# agent (PPO factories take a seed; the others take an environment).
agents = [
    {"name": "Vanilla_DQN", "factory": make_vanilla_dqn_agent, "is_ppo": False},
    {"name": "Double_DQN", "factory": make_double_dqn_agent, "is_ppo": False},
    {"name": "Dueling_DQN", "factory": make_dueling_dqn_agent, "is_ppo": False},
    {"name": "PER_DQN", "factory": make_per_dqn_agent, "is_ppo": False},
    {"name": "PPO", "factory": make_ppo_agent, "is_ppo": True},
]

# Train each agent with GPU monitoring
for agent_info in agents:
    name = agent_info["name"]
    factory = agent_info["factory"]
    is_ppo = agent_info["is_ppo"]
    seed = 0
    env = None  # only the DQN branch opens one; released in `finally`
    print(f"Starting training for {name}...")

    try:
        # NOTE(review): the agent built here is replaced by the one returned
        # from train_and_snapshot() below and `env` is not passed on; the
        # construction is kept only in case it has side effects (e.g. seeding)
        # — confirm and remove if it has none.
        if is_ppo:
            # let the PPO factory build its own seeded VecEnv
            agent = factory(seed=seed)
        else:
            env = make_env(seed=seed)
            agent = factory(env)

        # A fresh timer per agent gives this agent's own figures; the shared
        # timer's summary() is a running mean over every agent trained so far
        # and must not be printed under a single agent's name.
        agent_timer = GPUTimer()
        with gpu_timer.track(), agent_timer.track():
            agent = train_and_snapshot(name, factory, is_ppo=is_ppo)

        gpu_summary = agent_timer.summary()
        print(f"{name} GPU Summary: {gpu_summary}")

        # — Post‐training 10‐episode evaluation —
        post_train_metrics = evaluate(agent, is_ppo, episodes=10, eval_seed=seed)
        print(f"{name} Post Training → mean_reward: {post_train_metrics['mean_reward']}, "
              f"success_rate: {post_train_metrics['success_rate']:.2f}, "
              f"mean_dist: {post_train_metrics['mean_dist']}")

        plot_trajectory(agent, is_ppo, seed=seed)
        # plot_histograms(agent, seed=0)
        plot_histograms(agent, name, seed=seed)

    except Exception as e:
        # Best effort: one failing agent must not abort the remaining runs.
        print(f"⚠ Training failed for {name}: {e}")
    finally:
        # Close the DQN env on success and on failure alike.
        if env is not None:
            env.close()
print("✅ Training completed for all agents.")


Starting training for Vanilla_DQN...


  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  logger.warn(
  """
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Vanilla_DQN GPU Summary: {'mean_time_sec': np.float64(2102.551782608032), 'mean_peak_gpu_bytes': np.float64(0.0)}
Vanilla_DQN Post Training → mean_reward: 112.84 ± 23.23, success_rate: 0.00, mean_dist: 0.35 ± 0.10
Starting training for Double_DQN...


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Double_DQN GPU Summary: {'mean_time_sec': np.float64(1927.8222323656082), 'mean_peak_gpu_bytes': np.float64(0.0)}
Double_DQN Post Training → mean_reward: 87.51 ± 2.93, success_rate: 0.00, mean_dist: 0.69 ± 0.03
Starting training for Dueling_DQN...
Dueling_DQN GPU Summary: {'mean_time_sec': np.float64(2433.436619202296), 'mean_peak_gpu_bytes': np.float64(0.0)}
Dueling_DQN Post Training → mean_reward: 30.99 ± 3.26, success_rate: 0.00, mean_dist: 1.00 ± 0.02
Starting training for PER_DQN...
PER_DQN GPU Summary: {'mean_time_sec': np.float64(2325.4268735051155), 'mean_peak_gpu_bytes': np.float64(0.0)}
PER_DQN Post Training → mean_reward: -521.99 ± 22.11, success_rate: 0.00, mean_dist: 1.01 ± 0.04
Starting training for PPO...
Moviepy - Building video /content/videos/timeline_comparisons/milestone_10_eps/seed0/PPO_milestone_10_seed0-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_10_eps/seed0/PPO_milestone_10_seed0-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_10_eps/seed0/PPO_milestone_10_seed0-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_50_eps/seed0/PPO_milestone_50_seed0-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_50_eps/seed0/PPO_milestone_50_seed0-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_50_eps/seed0/PPO_milestone_50_seed0-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_50_eps/seed0/PPO_milestone_50_seed0-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_100_eps/seed0/PPO_milestone_100_seed0-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_100_eps/seed0/PPO_milestone_100_seed0-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_100_eps/seed0/PPO_milestone_100_seed0-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_100_eps/seed0/PPO_milestone_100_seed0-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_150_eps/seed0/PPO_milestone_150_seed0-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_150_eps/seed0/PPO_milestone_150_seed0-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_150_eps/seed0/PPO_milestone_150_seed0-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_150_eps/seed0/PPO_milestone_150_seed0-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_250_eps/seed0/PPO_milestone_250_seed0-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_250_eps/seed0/PPO_milestone_250_seed0-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_250_eps/seed0/PPO_milestone_250_seed0-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_250_eps/seed0/PPO_milestone_250_seed0-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_10_eps/seed1/PPO_milestone_10_seed1-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_10_eps/seed1/PPO_milestone_10_seed1-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_10_eps/seed1/PPO_milestone_10_seed1-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_10_eps/seed1/PPO_milestone_10_seed1-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_50_eps/seed1/PPO_milestone_50_seed1-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_50_eps/seed1/PPO_milestone_50_seed1-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_50_eps/seed1/PPO_milestone_50_seed1-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_50_eps/seed1/PPO_milestone_50_seed1-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_100_eps/seed1/PPO_milestone_100_seed1-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_100_eps/seed1/PPO_milestone_100_seed1-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_100_eps/seed1/PPO_milestone_100_seed1-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_100_eps/seed1/PPO_milestone_100_seed1-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_150_eps/seed1/PPO_milestone_150_seed1-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_150_eps/seed1/PPO_milestone_150_seed1-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_150_eps/seed1/PPO_milestone_150_seed1-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_250_eps/seed1/PPO_milestone_250_seed1-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_250_eps/seed1/PPO_milestone_250_seed1-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_250_eps/seed1/PPO_milestone_250_seed1-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_250_eps/seed1/PPO_milestone_250_seed1-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_10_eps/seed2/PPO_milestone_10_seed2-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_10_eps/seed2/PPO_milestone_10_seed2-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_10_eps/seed2/PPO_milestone_10_seed2-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_10_eps/seed2/PPO_milestone_10_seed2-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_50_eps/seed2/PPO_milestone_50_seed2-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_50_eps/seed2/PPO_milestone_50_seed2-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_50_eps/seed2/PPO_milestone_50_seed2-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_50_eps/seed2/PPO_milestone_50_seed2-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_100_eps/seed2/PPO_milestone_100_seed2-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_100_eps/seed2/PPO_milestone_100_seed2-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_100_eps/seed2/PPO_milestone_100_seed2-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_100_eps/seed2/PPO_milestone_100_seed2-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_150_eps/seed2/PPO_milestone_150_seed2-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_150_eps/seed2/PPO_milestone_150_seed2-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_150_eps/seed2/PPO_milestone_150_seed2-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_150_eps/seed2/PPO_milestone_150_seed2-step-0-to-step-250.mp4
Saving video to /content/videos/timeline_comparisons/milestone_250_eps/seed2/PPO_milestone_250_seed2-step-0-to-step-250.mp4
Moviepy - Building video /content/videos/timeline_comparisons/milestone_250_eps/seed2/PPO_milestone_250_seed2-step-0-to-step-250.mp4.
Moviepy - Writing video /content/videos/timeline_comparisons/milestone_250_eps/seed2/PPO_milestone_250_seed2-step-0-to-step-250.mp4





Moviepy - Done !
Moviepy - video ready /content/videos/timeline_comparisons/milestone_250_eps/seed2/PPO_milestone_250_seed2-step-0-to-step-250.mp4
PPO GPU Summary: {'mean_time_sec': np.float64(2432.4859722137453), 'mean_peak_gpu_bytes': np.float64(0.0)}
PPO Post Training → mean_reward: 56.72 ± 0.00, success_rate: 0.00, mean_dist: 0.98 ± 0.00
✅ Training completed for all agents.


In [None]:
# Cell 18: Utilities
def compute_sample_efficiency(log_df, reward_threshold=200):
    """Return the cumulative environment steps used to first reach a reward threshold.

    Steps are accumulated over every episode (row) up to and including the
    first one whose reward is at least ``reward_threshold``. Rows are assumed
    to be in chronological order.

    Args:
        log_df: Episode log with ``'reward'`` and ``'steps'`` columns.
        reward_threshold: Reward an episode must reach to count as a success.

    Returns:
        The cumulative step count at the first successful episode, or
        ``float('inf')`` when no episode reaches the threshold.
    """
    # Accumulate BEFORE filtering: summing only the successful rows would
    # report just the first success's own length and ignore all earlier steps.
    cumulative_steps = log_df['steps'].cumsum()
    # Positional mask, so a non-unique index cannot trip label alignment.
    reached = (log_df['reward'] >= reward_threshold).to_numpy()
    steps_at_success = cumulative_steps[reached]
    return steps_at_success.iloc[0] if not steps_at_success.empty else float('inf')

def compute_training_stability(log_df, window=10):
    """Compute variance of rewards over the last N episodes."""
    if len(log_df) < window:
        return 0.0
    return np.var(log_df['reward'].tail(window))

def validate_log_file(file_path):
    """Load a CSV training log and return it only if it looks usable.

    Returns the parsed DataFrame, or ``None`` (after printing a warning) when
    the file cannot be parsed, lacks one of the columns ``algorithm``,
    ``seed``, ``episode``, ``reward``, has no rows, or has no non-NaN reward.
    """
    expected_columns = ('algorithm', 'seed', 'episode', 'reward')

    try:
        frame = pd.read_csv(file_path)
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        print(f"⚠ Corrupted or empty file: {file_path}")
        return None

    absent = [name for name in expected_columns if name not in frame.columns]
    if absent:
        print(f"⚠ Missing required columns in {file_path}")
        return None

    no_usable_rows = frame.empty or frame['reward'].isna().all()
    if no_usable_rows:
        print(f"⚠ Empty or invalid data in {file_path}")
        return None

    return frame

In [None]:
# Cell 19: Post-Training Evaluation & Plots (v1)
import glob

# ---- Per-episode training log ------------------------------------------------
all_logs = glob.glob(os.path.join(BASE_LOG_DIR, "master_full_training.csv"))

if not all_logs:
    print(f"⚠ No full training logs in '{BASE_LOG_DIR}'. Run training cells first.")
else:
    df_train = pd.read_csv(all_logs[0])
    # Convert to numeric; unparsable cells become NaN and rows without a
    # usable episode/reward are dropped.
    for col in ['episode', 'reward', 'time_sec', 'sample_efficiency', 'training_std']:
        df_train[col] = pd.to_numeric(df_train[col], errors='coerce')
    df_train = df_train.dropna(subset=['episode', 'reward'])

    # Plot reward learning curve.
    # The axes are created explicitly and handed to pandas: DataFrame.plot()
    # without `ax=` opens its own figure, which left an empty 10x6 figure
    # behind and saved the plot at the default size instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    learning_curve = df_train.groupby(['episode', 'algorithm'])['reward'].mean().unstack('algorithm')
    learning_curve.plot(ax=ax, title='Mean Reward vs Episode')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Mean Reward')
    fig.savefig(os.path.join(PLOTS_DIR, 'learning_curve_reward.png'))
    plt.close(fig)

    # Plot fuel vs reward (fuel is coerced so stray text cells cannot turn the
    # y-axis categorical)
    df_train['fuel'] = pd.to_numeric(df_train['fuel'], errors='coerce')
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df_train['reward'], df_train['fuel'])
    ax.set_title('Reward vs Fuel per Episode')
    ax.set_xlabel('Reward')
    ax.set_ylabel('Remaining Fuel')
    fig.savefig(os.path.join(PLOTS_DIR, 'reward_vs_fuel.png'))
    plt.close(fig)

    # Scatter: reward vs time. This plot reads only the training log, so it
    # lives in this branch; under the milestone branch it raised NameError
    # whenever the training log was missing.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df_train['reward'], df_train['time_sec'])
    ax.set_title('Reward vs Time per Episode')
    ax.set_xlabel('Reward')
    ax.set_ylabel('Time (s)')
    fig.savefig(os.path.join(PLOTS_DIR, 'reward_vs_time.png'))
    plt.close(fig)

# ---- Milestone metrics -------------------------------------------------------
m_files = glob.glob(os.path.join(BASE_LOG_DIR, "master_milestone_metrics.csv"))

if not m_files:
    print("⚠ No milestone metrics found. Run Cell 13 first.")
else:
    df_milestones = pd.read_csv(m_files[0])
    # Drop rows that are repeated copies of the header line.
    df_milestones = df_milestones[df_milestones['episode'] != 'episode'].reset_index(drop=True)
    # 'mean_reward' is stored as "<mean> ± <std>"; split it into two columns.
    raw = df_milestones['mean_reward'].astype(str)
    reward_parts = raw.str.split(' ± ')
    df_milestones['mean_reward'] = reward_parts.str[0].astype(float)
    df_milestones['std_reward'] = reward_parts.str[1].astype(float)

    # Convert to numeric
    for col in ['episode', 'mean_reward', 'std_reward', 'success_rate', 'mean_fuel',
                'sample_efficiency', 'training_std', 'eval_variance', 'time_sec', 'gpu_bytes']:
        df_milestones[col] = pd.to_numeric(df_milestones[col], errors='coerce')
    df_milestones = df_milestones.dropna(subset=['episode', 'mean_reward'])

    # Grouped bar charts: one group per milestone, one bar per algorithm.
    milestone_bars = [
        ('mean_reward', 'Mean Reward by Milestone', 'Mean Reward', 'milestones_reward.png'),
        ('success_rate', 'Success Rate by Milestone', 'Success Rate', 'milestones_success_rate.png'),
        ('sample_efficiency', 'Sample Efficiency by Milestone', 'Steps to Reward ≥ 200',
         'milestones_sample_efficiency.png'),
    ]
    for metric, title, ylabel, filename in milestone_bars:
        per_milestone = df_milestones.groupby(['episode', 'algorithm'])[metric].mean().unstack('algorithm')
        # A non-finite mean (e.g. sample efficiency of inf when the threshold
        # was never reached) cannot be drawn as a bar height; show it as a gap.
        per_milestone = per_milestone.replace([np.inf, -np.inf], np.nan)
        fig, ax = plt.subplots(figsize=(10, 6))
        per_milestone.plot(kind='bar', ax=ax, title=title)
        ax.set_xlabel('Milestone')
        ax.set_ylabel(ylabel)
        fig.savefig(os.path.join(PLOTS_DIR, filename))
        plt.close(fig)

    # Bar: Compute cost by time. One bar per algorithm (mean over its rows);
    # passing every row to bar() stacked several bars on the same x position.
    mean_time = df_milestones.groupby('algorithm')['time_sec'].mean()
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(mean_time.index.astype(str), mean_time.to_numpy())
    ax.set_title('Compute Cost: Wall-Clock Time')
    ax.set_xlabel('Algorithm')
    ax.set_ylabel('Time (s)')
    fig.savefig(os.path.join(PLOTS_DIR, 'compute_cost_time.png'))
    plt.close(fig)


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [None]:
# # Cell 20: Post-Training Evaluation & Plots (v2)
# import glob
# import matplotlib.pyplot as plt

# # ——————— Load full‐training log ———————
# all_logs = glob.glob(os.path.join(BASE_LOG_DIR, "master_full_training.csv"))
# if not all_logs:
#     print(f"⚠ No full training logs in '{BASE_LOG_DIR}'. Run training first.")
# else:
#     df_train = pd.read_csv(all_logs[0])
#     # numeric conversions
#     for col in ['episode','reward','time_sec','sample_efficiency','training_std','fuel']:
#         df_train[col] = pd.to_numeric(df_train[col], errors='coerce')
#     df_train.dropna(subset=['episode','reward'], inplace=True)

#     # 1) Learning curve (unchanged)
#     plt.figure(figsize=(10,6))
#     df_train.groupby(['episode','algorithm'])['reward'] \
#             .mean().unstack('algorithm') \
#             .plot(title="Mean Reward vs Episode")
#     plt.xlabel("Episode")
#     plt.ylabel("Mean Reward")
#     plt.tight_layout()
#     plt.savefig(os.path.join(PLOTS_DIR,"learning_curve_reward.png"))
#     plt.close()

#     # 2) Reward vs Fuel, split by algorithm
#     plt.figure(figsize=(8,6))
#     for algo, grp in df_train.groupby("algorithm"):
#         plt.scatter(
#             grp["reward"],
#             grp["fuel"],
#             s=10, alpha=0.6,
#             label=algo
#         )
#     plt.title("Reward vs Remaining Fuel per Episode")
#     plt.xlabel("Reward")
#     plt.ylabel("Remaining Fuel")
#     plt.legend()
#     plt.tight_layout()
#     plt.savefig(os.path.join(PLOTS_DIR,"reward_vs_fuel_by_algo.png"))
#     plt.close()

#     # 3) Reward vs Time, split by algorithm
#     plt.figure(figsize=(8,6))
#     for algo, grp in df_train.groupby("algorithm"):
#         plt.scatter(
#             grp["reward"],
#             grp["time_sec"],
#             s=10, alpha=0.6,
#             label=algo
#         )
#     plt.title("Reward vs Time per Episode")
#     plt.xlabel("Reward")
#     plt.ylabel("Time (s)")
#     plt.legend()
#     plt.tight_layout()
#     plt.savefig(os.path.join(PLOTS_DIR,"reward_vs_time_by_algo.png"))
#     plt.close()


# # ——————— Load milestone metrics ———————
# m_files = glob.glob(os.path.join(BASE_LOG_DIR, "master_milestone_metrics.csv"))
# if not m_files:
#     print(f"⚠ No milestone metrics found. Run Cell 13 first.")
# else:
#     df_m = pd.read_csv(m_files[0])
#     # drop possible header‐row artifacts
#     df_m = df_m[df_m['episode']!='episode'].reset_index(drop=True)

#     # parse out mean ± std
#     parts = df_m['mean_reward'].astype(str).str.split(' ± ', expand=True)
#     df_m['mean_reward'] = parts[0].astype(float)
#     df_m['std_reward']  = parts[1].astype(float)

#     # numeric conversions
#     for col in ['episode','mean_reward','std_reward','success_rate',
#                 'mean_fuel','sample_efficiency','training_std',
#                 'eval_variance','time_sec','gpu_bytes']:
#         df_m[col] = pd.to_numeric(df_m[col], errors='coerce')
#     df_m.dropna(subset=['episode','mean_reward'], inplace=True)

#     # 4) Mean reward by milestone with error bars
#     plt.figure(figsize=(10,6))
#     for algo, grp in df_m.groupby("algorithm"):
#         plt.errorbar(
#             grp["episode"],
#             grp["mean_reward"],
#             yerr=grp["std_reward"],
#             marker='o', capsize=3,
#             label=algo
#         )
#     plt.title("Mean Reward ± Std by Milestone")
#     plt.xlabel("Milestone Episode")
#     plt.ylabel("Mean Reward")
#     plt.legend()
#     plt.tight_layout()
#     plt.savefig(os.path.join(PLOTS_DIR,"milestones_mean_reward_errorbar.png"))
#     plt.close()

#     # 5) Success rate trend
#     plt.figure(figsize=(10,6))
#     for algo, grp in df_m.groupby("algorithm"):
#         plt.plot(
#             grp["episode"],
#             grp["success_rate"],
#             marker='o', linestyle='-',
#             label=algo
#         )
#     plt.title("Success Rate by Milestone")
#     plt.xlabel("Milestone Episode")
#     plt.ylabel("Success Rate")
#     plt.legend()
#     plt.tight_layout()
#     plt.savefig(os.path.join(PLOTS_DIR,"milestones_success_rate_line.png"))
#     plt.close()

#     # 6) Sample efficiency trend (log‐scale to handle inf)
#     plt.figure(figsize=(10,6))
#     for algo, grp in df_m.groupby("algorithm"):
#         plt.plot(
#             grp["episode"],
#             grp["sample_efficiency"],
#             marker='o', linestyle='-',
#             label=algo
#         )
#     plt.yscale('log')
#     plt.title("Sample Efficiency by Milestone (log scale)")
#     plt.xlabel("Milestone Episode")
#     plt.ylabel("Steps to Reward ≥ 200 (log)")
#     plt.legend()
#     plt.tight_layout()
#     plt.savefig(os.path.join(PLOTS_DIR,"milestones_sample_efficiency_log.png"))
#     plt.close()

#     # 7) Compute cost (bar)
#     plt.figure(figsize=(8,6))
#     cost = df_m.groupby("algorithm")["time_sec"].mean()
#     cost.plot(kind='bar')
#     plt.title("Compute Cost: Wall-Clock Time")
#     plt.xlabel("Algorithm")
#     plt.ylabel("Time (s)")
#     plt.tight_layout()
#     plt.savefig(os.path.join(PLOTS_DIR,"compute_cost_time_bar.png"))
#     plt.close()

<Figure size 1000x600 with 0 Axes>