In [None]:
!pip install gymnasium
# !pip install "gymnasium[atari, accept-rom-license]"
# !pip install ale-py




In [None]:
#!/usr/bin/env python3
%%writefile MountainCar-DQN-a.py
"""
dqn_part1a_random.py

Part (a) for the DQN assignment: load MountainCar-v0, print spaces, run a random agent,
record rewards and successes, and save a plot of performance + action-choice scatter.

Usage:
    python3 dqn_part1a_random.py --environment "MountainCar-v0" --episodes 40 --mean_n 5 --seed 0

"""

import argparse
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import random
from itertools import count
import os

def parse_args(argv=None):
    """Parse the command-line options for the random-agent experiment.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, exactly as before. Pass ``[]`` to
            get the defaults when calling from a notebook cell (e.g. Colab),
            where the process arguments are not meant for this parser.

    Returns:
        argparse.Namespace with ``environment``, ``episodes``, ``mean_n``,
        ``seed`` and ``render`` attributes.
    """
    p = argparse.ArgumentParser(description="Running with different episodic counts and mean rewards")
    p.add_argument("--environment", type=str, default="MountainCar-v0", help="Gym environment id")
    p.add_argument("--episodes", type=int, default=100, help="Number of episodes to run")
    p.add_argument("--mean_n", type=int, default=5, help="n for rolling mean plot")
    p.add_argument("--seed", type=int, default=None, help="Random seed (optional)")
    p.add_argument("--render", action="store_true", help="Render environment (slows down execution)")
    return p.parse_args(argv)


def safe_reset(env):
    """Reset ``env`` and return only the initial observation.

    A non-empty tuple result (the ``(obs, info)`` form) is unwrapped to its
    first element; any other result is returned unchanged.
    """
    result = env.reset()
    is_nonempty_tuple = isinstance(result, tuple) and len(result) > 0
    return result[0] if is_nonempty_tuple else result

def safe_step(env, action):
    """Step ``env`` and normalise the result to ``(obs, reward, done, info)``.

    Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
    ``(obs, reward, terminated, truncated, info)`` forms; in the latter the
    two end-of-episode flags are merged with ``or``.

    Raises:
        RuntimeError: if the step result has neither 4 nor 5 elements.
    """
    result = env.step(action)
    n_fields = len(result)

    if n_fields == 5:
        next_obs, rew, terminated, truncated, extra = result
        return next_obs, rew, terminated or truncated, extra

    if n_fields == 4:
        next_obs, rew, finished, extra = result
        return next_obs, rew, finished, extra

    raise RuntimeError("Unexpected step output format: len = {}".format(n_fields))

def get_state_tensor(obs):
    """Copy ``obs`` into a new float32 NumPy array (not a torch tensor, despite the name)."""
    as_float32 = np.array(obs, dtype="float32")
    return as_float32

def select_random_action(action_space):
    """Draw one action from ``action_space`` via its own ``sample()`` method."""
    chosen = action_space.sample()
    return chosen

def plot_results(rewards_mean, steps, best_rewards_mean, env_name, file_name, action_scatter, mean_n=None):
    """Draw and save a 2-panel figure: reward curves and sampled action choices.

    Args:
        rewards_mean: Rolling-mean reward values, one per entry in ``steps``.
        steps: Cumulative environment step count at which each mean was recorded.
        best_rewards_mean: Running maximum of ``rewards_mean`` (same length).
        env_name: Environment id, used in the plot titles.
        file_name: Path the figure is saved to.
        action_scatter: Sequence of ``(position, velocity, action)`` samples;
            may be empty, in which case the right panel stays blank.
        mean_n: Optional rolling-window size, used only in the legend label.
            When omitted the label does not mention a window size.
    """
    fig = plt.figure(figsize=(12, 5))

    # Performance subplot
    ax1 = fig.add_subplot(121)
    # The window size is mean_n, not len(steps) (that is the number of plotted points).
    if mean_n is not None:
        mean_label = f"{mean_n}-episode rolling mean"
    else:
        mean_label = "Rolling mean reward"
    ax1.plot(steps, rewards_mean, label=mean_label)
    ax1.plot(steps, best_rewards_mean, label="Best mean reward")
    ax1.grid(True)
    ax1.set_xlabel("Total environment steps")
    ax1.set_ylabel("Reward (higher is better)")
    ax1.legend()
    ax1.set_title(f"Performance of random agent on {env_name}")

    # Action choices subplot
    ax2 = fig.add_subplot(122)
    if len(action_scatter) > 0:
        arr = np.array(action_scatter)
        positions = arr[:, 0].astype(float)
        velocities = arr[:, 1].astype(float)
        actions = arr[:, 2].astype(int)
        # Colour map for the 3 discrete MountainCar actions; any other action
        # index is drawn in gray instead of raising KeyError.
        cmap = {0: 'lime', 1: 'red', 2: 'blue'}
        colors = [cmap.get(int(a), 'gray') for a in actions]
        ax2.scatter(positions, velocities, c=colors, s=12, alpha=0.7)
        action_names = ['Left (0)', 'No-Op (1)', 'Right (2)']
        # Proxy rectangles so the legend shows one swatch per action
        legend_recs = [mpatches.Rectangle((0, 0), 1, 1, fc=cmap[i]) for i in range(3)]
        ax2.legend(legend_recs, action_names, loc='best')
    ax2.set_title("Random agent action choices (sampled states)")
    ax2.set_xlabel("Position")
    ax2.set_ylabel("Velocity")

    plt.suptitle(f"{env_name} - Random Agent Analysis")
    # Leave headroom at the top for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig(file_name, dpi=200)
    print(f"Saved plot to {file_name}")
    plt.show()
    plt.close(fig)

def run_random_agent(env_id, episodes, mean_n, seed=None, render=False):
    """Run a uniformly random policy and collect reward / success statistics.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        episodes: Number of episodes to run.
        mean_n: Window (in episodes) of the rolling mean reward.
        seed: Optional seed. When given it seeds ``numpy``, ``random``, the
            environment and the action space so a run can be repeated.
        render: When true the environment is created with
            ``render_mode="human"``.

    Returns:
        dict with keys ``total_episodes``, ``mean_n``, ``final_mean_reward``,
        ``best_mean_reward``, ``success_count`` and ``total_steps``. The two
        mean entries are ``None`` when fewer than ``mean_n`` episodes ran.
    """
    # Gymnasium picks the renderer at construction time; calling env.render()
    # on an env made without a render_mode does not draw anything.
    env = gym.make(env_id, render_mode="human") if render else gym.make(env_id)

    try:
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
            # Actions come from action_space.sample() and start states from the
            # env's own RNG; neither is affected by the two global seeds above.
            env.action_space.seed(seed)
            env.reset(seed=seed)

        # print spaces and ranges
        print("Environment:", env_id)
        print("Observation space:", env.observation_space)
        print("Action space:", env.action_space)

        episode_rewards = []
        best_reward = -float('inf')
        rewards_mean = []
        best_rewards_mean = []
        steps = []

        total_steps = 0
        success_count = 0

        action_scatter = []  # (pos, vel, action) samples for plotting
        # Sampling stride for the scatter; guarded so episodes == 0 cannot divide by zero.
        sample_every = max(1, int(100 / episodes)) if episodes > 0 else 1

        for ep in range(episodes):
            obs = safe_reset(env)
            state = get_state_tensor(obs)
            total_reward = 0.0

            for t in count():
                action = select_random_action(env.action_space)
                # Keep at most 2000 (state, action) samples for the scatter plot
                if len(action_scatter) < 2000 and total_steps % sample_every == 0:
                    action_scatter.append((state[0], state[1], action))

                obs, reward, done, info = safe_step(env, action)
                state = get_state_tensor(obs)
                total_reward += reward
                total_steps += 1

                if done or t >= 10000:  # safety cap
                    # MountainCar success condition: position >= 0.5 at the end
                    try:
                        pos = state[0]
                    except (IndexError, TypeError):
                        # Observation is not indexable; success cannot be judged.
                        pos = None
                    if pos is not None and pos >= 0.5:
                        success_count += 1
                    break

            episode_rewards.append(total_reward)

            if len(episode_rewards) >= mean_n:
                present_mean = float(np.mean(episode_rewards[-mean_n:]))
                rewards_mean.append(present_mean)
                best_reward = max(present_mean, best_reward)
                best_rewards_mean.append(best_reward)
                steps.append(total_steps)

            print(f"Episode {ep+1}/{episodes} | Reward = {total_reward:.2f} | Successes so far = {success_count}")
    finally:
        # Release the environment even if an episode raised.
        env.close()

    # prepare plot filename
    fn = f"{env_id}_random_{episodes}ep_mean{mean_n}.png"
    plot_results(rewards_mean, steps, best_rewards_mean, env_id, fn, action_scatter)

    # Summarize observations
    summary = {
        "total_episodes": episodes,
        "mean_n": mean_n,
        "final_mean_reward": rewards_mean[-1] if rewards_mean else None,
        "best_mean_reward": best_rewards_mean[-1] if best_rewards_mean else None,
        "success_count": success_count,
        "total_steps": total_steps
    }
    return summary

if __name__ == "__main__":
    args = parse_args()
    summary = run_random_agent(args.environment, args.episodes, args.mean_n, args.seed, args.render)
    print("\n=== Summary ===")
    for k, v in summary.items():
        print(f"{k}: {v}")

Writing MountainCar-DQN-a.py


In [None]:
!python3 MountainCar-DQN-a.py --environment "MountainCar-v0" --episodes 100 --mean_n 5 --seed 303


Environment: MountainCar-v0
Observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Action space: Discrete(3)
Episode 1/100 | Reward = -200.00 | Successes so far = 0
Episode 2/100 | Reward = -200.00 | Successes so far = 0
Episode 3/100 | Reward = -200.00 | Successes so far = 0
Episode 4/100 | Reward = -200.00 | Successes so far = 0
Episode 5/100 | Reward = -200.00 | Successes so far = 0
Episode 6/100 | Reward = -200.00 | Successes so far = 0
Episode 7/100 | Reward = -200.00 | Successes so far = 0
Episode 8/100 | Reward = -200.00 | Successes so far = 0
Episode 9/100 | Reward = -200.00 | Successes so far = 0
Episode 10/100 | Reward = -200.00 | Successes so far = 0
Episode 11/100 | Reward = -200.00 | Successes so far = 0
Episode 12/100 | Reward = -200.00 | Successes so far = 0
Episode 13/100 | Reward = -200.00 | Successes so far = 0
Episode 14/100 | Reward = -200.00 | Successes so far = 0
Episode 15/100 | Reward = -200.00 | Successes so far = 0
Episode 16/100 | Reward = -200.00 

## part (b)

In [None]:
#!/usr/bin/env python3
%%writefile MountainCar-DQN-b.py
"""
Part (b) — Deep Q-Network (DQN) implementation for MountainCar-v0
Based on assignment guidelines and optimized for reward shaping and convergence.
"""

import math
import random
from itertools import count
from collections import namedtuple, deque
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import argparse
import pandas as pd

# ------------------------- #
# Argument Parsing
# ------------------------- #
parser = argparse.ArgumentParser(description="DQN for MountainCar-v0")
# (flag, type, default) for every supported command-line option
for _flag, _type, _default in (
    ("--environment", str, "MountainCar-v0"),
    ("--num_episodes", int, 200),
    ("--batch", int, 64),
    ("--gamma", float, 0.99),
    ("--learning_rate", float, 1e-3),
    ("--mean_n", int, 5),
    ("--seed", int, 0),
):
    parser.add_argument(_flag, type=_type, default=_default)
args = parser.parse_args()

# ------------------------- #
# Setup
# ------------------------- #
# Build the environment and seed every random source the script draws from.
env = gym.make(args.environment)
# Seeds the environment's own RNG; later env.reset() calls pass no seed.
env.reset(seed=args.seed)
np.random.seed(args.seed)
# `random` drives epsilon-greedy exploration and replay-memory sampling.
random.seed(args.seed)
# Controls the initial network weights.
torch.manual_seed(args.seed)
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ------------------------- #
# DQN Model Definition
# ------------------------- #
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers, one output per action."""

    def __init__(self, state_dim=2, hidden_dim=200, action_dim=3):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as before.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-value for every action, for each row of ``x``."""
        q_values = self.net(x)
        return q_values

# ------------------------- #
# Replay Memory
# ------------------------- #
# One stored experience; next_state is None when the transition ended the episode.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Fixed-capacity FIFO buffer of ``Transition`` records."""

    def __init__(self, capacity=10000):
        # A bounded deque discards the oldest record once it is full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition built from ``(state, action, next_state, reward)``."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# ------------------------- #
# Helper Functions
# ------------------------- #
def get_state(obs):
    """Turn one observation into a float32 tensor on ``device`` with a leading batch axis."""
    flat = torch.tensor(obs, dtype=torch.float32, device=device)
    return torch.unsqueeze(flat, 0)

def select_action(state, eps_threshold):
    """Pick an action ε-greedily and return it as a ``(1, 1)`` long tensor.

    With probability ``eps_threshold`` a uniformly random action index is
    used; otherwise the index with the highest ``policy_net`` Q-value.
    """
    greedy = random.random() > eps_threshold
    if not greedy:
        random_index = random.randrange(n_actions)
        return torch.tensor([[random_index]], device=device, dtype=torch.long)

    # Inference only: no gradient tracking needed for action selection.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.argmax(dim=1).view(1, 1)

def optimize_model():
    """Perform a single optimization step on ``policy_net``.

    Samples ``batch_size`` transitions from ``memory``, builds the one-step
    TD target ``reward + gamma * max_a target_net(next_state)`` (zero future
    value for transitions whose ``next_state`` is ``None``) and minimises the
    MSE between that target and ``policy_net``'s Q-value for the action taken.
    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    """
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose: list of Transitions -> one Transition of per-field tuples
    batch = Transition(*zip(*transitions))

    # Prepare batch tensors
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a). Squeeze only the action axis so that a batch of one
    # keeps shape (1,) instead of collapsing to a 0-d tensor.
    state_action_values = policy_net(state_batch).gather(1, action_batch).squeeze(1)

    # Compute V(s_{t+1}); it stays 0 for final transitions.
    next_state_values = torch.zeros(batch_size, device=device)
    non_final_next_states = [s for s in batch.next_state if s is not None]
    # torch.cat rejects an empty list, so skip the target-network pass when
    # every sampled transition is final.
    if non_final_next_states:
        non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                      device=device, dtype=torch.bool)
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(torch.cat(non_final_next_states)).max(1)[0]
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    loss = F.mse_loss(state_action_values, expected_state_action_values)

    optimizer.zero_grad()
    loss.backward()
    # Cap the global gradient norm to keep updates stable
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
    optimizer.step()

# ------------------------- #
# Training Loop
# ------------------------- #
# Hyper-parameters taken from the command line
n_actions = env.action_space.n
batch_size = args.batch
gamma = args.gamma
num_episodes = args.num_episodes
lr = args.learning_rate
mean_n = args.mean_n

# The target network starts as a frozen copy of the policy network
policy_net = DQN().to(device)
target_net = DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=lr)
memory = ReplayMemory(10000)

# Epsilon decays exponentially from eps_start to eps_end with step constant eps_decay
eps_start, eps_end, eps_decay = 1.0, 0.02, 800
target_update = 10      # episodes between target-network syncs
initial_memory = 1000   # environment steps collected before learning starts
total_steps = 0

rewards_list, mean_rewards, best_mean_rewards, steps_list = [], [], [], []
best_mean = -float("inf")
successes = 0

for i_episode in range(num_episodes):
    obs, _ = env.reset()
    state = get_state(obs)
    total_reward = 0.0

    for t in count():
        eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-1. * total_steps / eps_decay)
        action = select_action(state, eps_threshold)
        obs_next, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        # --- Reward shaping ---
        # The environment reward is -1 per step. The shaped reward grows with
        # position (the goal is at position >= 0.5) and with speed in EITHER
        # direction (abs), since building momentum requires swinging both ways.
        shaped_reward = obs_next[0] + 0.5 + abs(obs_next[1]*10)

        # Additional reward for reaching the goal
        if obs_next[0] >= 0.5:
            shaped_reward += 100 # Large bonus for success

        # Explicit float32 so the replay tensors match the network output dtype
        # regardless of how NumPy promoted the scalar arithmetic above.
        reward_tensor = torch.tensor([shaped_reward], dtype=torch.float32, device=device)

        # Only a true termination has zero future value. A time-limit
        # truncation is not a terminal state, so its next state is stored and
        # the TD target still bootstraps from it.
        next_state = None if terminated else get_state(obs_next)
        memory.push(state, action, next_state, reward_tensor)

        state = next_state
        total_reward += reward # Keep track of the TRUE environment reward
        total_steps += 1

        if total_steps > initial_memory:
            optimize_model()

        if done:
            if obs_next[0] >= 0.5:
                successes += 1
            break

    rewards_list.append(total_reward)

    # The rolling mean is defined as soon as mean_n episodes have finished
    if len(rewards_list) >= mean_n:
        mean_r = np.mean(rewards_list[-mean_n:])
        mean_rewards.append(mean_r)
        best_mean = max(best_mean, mean_r)
        best_mean_rewards.append(best_mean)
        steps_list.append(total_steps)

    if i_episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {i_episode+1}/{num_episodes} | Reward: {total_reward:.2f} | "
          f"Success: {successes} | ε={eps_threshold:.3f}")

# ------------------------- #
# Plotting
# ------------------------- #
env.close()

file_name = f"{args.environment}_DQN_{num_episodes}_episodes.png"

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Performance Plot
# Nothing is drawn when fewer than mean_n episodes were run; the legend is
# skipped too because it would have no entries.
if steps_list and mean_rewards:
    ax1.plot(steps_list, mean_rewards, label=f"{mean_n}-episode mean")
    ax1.plot(steps_list, best_mean_rewards, label="Best mean reward")
    ax1.legend()
ax1.set_xlabel("Steps")
ax1.set_ylabel("Reward")
ax1.set_title(f"DQN Performance on {args.environment}")
ax1.grid(True)

# Action-choice Plot
# Evaluate the greedy policy on a regular grid over MountainCar-v0's
# observation bounds (position in [-1.2, 0.6], velocity in [-0.07, 0.07]).
# A wider grid would mostly show states the car can never be in.
position_range = (-1.2, 0.6)
velocity_range = (-0.07, 0.07)
X = np.linspace(position_range[0], position_range[1], 50)
Y = np.linspace(velocity_range[0], velocity_range[1], 50)
X, Y = np.meshgrid(X, Y)
X, Y = X.flatten(), Y.flatten()

# Use policy_net to determine the greedy action for each grid state
states_to_evaluate = torch.tensor(np.column_stack((X, Y)), dtype=torch.float32, device=device)
with torch.no_grad():
    Z = policy_net(states_to_evaluate).argmax(dim=1).cpu().numpy()  # index of max Q-value

colors = ['lime', 'red', 'blue'] # 0: left, 1: no-op, 2: right
ax2.scatter(X, Y, c=[colors[z] for z in Z], s=1, alpha=0.7)
ax2.set_xlabel("Position")
ax2.set_ylabel("Velocity")
ax2.set_title("Trained DQN Action Choices")
legend_recs = [mpatches.Patch(color=colors[i], label=f"Action {i}") for i in range(3)]
ax2.legend(handles=legend_recs)
ax2.set_xlim(position_range)
ax2.set_ylim(velocity_range)


plt.tight_layout()
plt.savefig(file_name, dpi=200)
print(f"Saved plot to {file_name}")

Writing MountainCar-DQN-b.py


In [None]:
!python3 MountainCar-DQN-b.py --environment="MountainCar-v0" --num_episodes=100 --batch=8 --gamma=0.9 --learning_rate=1e-4 --mean_n=5


Episode 1/100 | Reward: -200.00 | Success: 0 | ε=0.784
Episode 2/100 | Reward: -200.00 | Success: 0 | ε=0.615
Episode 3/100 | Reward: -200.00 | Success: 0 | ε=0.483
Episode 4/100 | Reward: -200.00 | Success: 0 | ε=0.381
Episode 5/100 | Reward: -200.00 | Success: 0 | ε=0.301
Episode 6/100 | Reward: -200.00 | Success: 0 | ε=0.239
Episode 7/100 | Reward: -200.00 | Success: 0 | ε=0.191
Episode 8/100 | Reward: -200.00 | Success: 0 | ε=0.153
Episode 9/100 | Reward: -200.00 | Success: 0 | ε=0.123
Episode 10/100 | Reward: -200.00 | Success: 0 | ε=0.101
Episode 11/100 | Reward: -200.00 | Success: 0 | ε=0.083
Episode 12/100 | Reward: -200.00 | Success: 0 | ε=0.069
Episode 13/100 | Reward: -200.00 | Success: 0 | ε=0.058
Episode 14/100 | Reward: -200.00 | Success: 0 | ε=0.050
Episode 15/100 | Reward: -200.00 | Success: 0 | ε=0.043
Episode 16/100 | Reward: -200.00 | Success: 0 | ε=0.038
Episode 17/100 | Reward: -200.00 | Success: 0 | ε=0.034
Episode 18/100 | Reward: -200.00 | Success: 0 | ε=0.031
E

# part (c)

In [None]:
# !pip install gymnasium
# !pip install "gymnasium[atari, accept-rom-license]"
# !pip install ale-py


In [None]:
# import gymnasium as gym
# import time

# def inspect_environment(env_name):
#     """Loads an environment and prints its action and observation space."""
#     print(f"--- Inspecting Environment: {env_name} ---")

#     # Load the environment
#     # For Pong, render_mode='human' will open a window
#     try:
#         env = gym.make(env_name, render_mode='human' if 'Pong' in env_name else None)
#     except Exception as e:
#         print(f"Error loading {env_name}: {e}")
#         print("For 'Pong-v0', did you run: pip install \"gymnasium[atari, accept-rom-license]\"?")
#         print("-" * 40 + "\n")
#         return

#     # 1. Print Observation Space (State)
#     print(f"Observation Space (State): {env.observation_space}")
#     print(f"  - Shape: {env.observation_space.shape}")
#     print(f"  - Sample State: {env.observation_space.sample()}\n")

#     # 2. Print Action Space
#     print(f"Action Space: {env.action_space}")
#     print(f"  - Type: {type(env.action_space)}")

#     # For Discrete spaces, print the number of actions
#     if isinstance(env.action_space, gym.spaces.Discrete):
#         print(f"  - Number of Actions: {env.action_space.n}")

#     # For Atari envs, we can get action meanings
#     if hasattr(env.unwrapped, 'get_action_meanings'):
#         print(f"  - Action Meanings: {env.unwrapped.get_action_meanings()}")

#     env.close()
#     print("-" * 40 + "\n")

# # --- Run the Inspection ---
# inspect_environment('MountainCar-v0')
# # inspect_environment('Pong-v0')

In [None]:
# import gymnasium as gym
# import time

# def run_random_agent(env_name, episodes=10):
#     """Runs a random agent for a few episodes and prints the total reward."""
#     print(f"--- Running Random Agent on: {env_name} ---")

#     # Using render_mode='human' to watch the agent play
#     env = gym.make(env_name) # Removed render_mode='human'

#     for episode in range(episodes):
#         # Reset the environment to get the initial state
#         observation, info = env.reset() # Modified to unpack both observation and info
#         total_reward = 0
#         done = False

#         while not done:
#             # 1. Choose a random action
#             # This is the correct way to select a random action
#             action = env.action_space.sample()

#             # 2. Take the action and get the new state, reward, and done flag
#             # Note: gym.step() returns (observation, reward, terminated, truncated, info)
#             # 'done' is True if either terminated or truncated is True.
#             observation, reward, terminated, truncated, info = env.step(action)
#             done = terminated or truncated

#             # 3. Accumulate the reward
#             total_reward += reward

#             # 4. Render the environment to watch
#             # env.render() # Removed render as it might not be supported

#             # A small delay so we can see what's happening
#             if "Pong" in env_name:
#                 time.sleep(0.01)

#         print(f"Episode {episode + 1}: Total Reward = {total_reward}")

#     env.close()
#     print("-" * 40 + "\n")

# # --- Run the Random Agent Test ---
# run_random_agent('MountainCar-v0', episodes=400)
# # run_random_agent('Pong-v0', episodes=5) # 5 is enough for Pong

In [None]:
# import gymnasium as gym
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import matplotlib.patches as mpatches
# import random
# import time
# import warnings

# # Suppress warnings from pandas/matplotlib if any
# warnings.filterwarnings("ignore")

# def plot_episode_rewards(rewards_list, env_name):
#     """
#     Plots the total reward received in each episode.
#     For a random agent, this should be a noisy, flat line.
#     """
#     plt.figure(figsize=(10, 5))
#     plt.plot(rewards_list, alpha=0.7, label='Episode Reward')
#     plt.title(f"Total Reward per Episode (Random Agent) - {env_name}")
#     plt.xlabel("Episode Number")
#     plt.ylabel("Total Reward")
#     plt.grid(True)

#     # Calculate and show the mean reward
#     mean_reward = np.mean(rewards_list)
#     plt.axhline(mean_reward, color='red', linestyle='--', label=f'Mean Reward: {mean_reward:.2f}')
#     plt.legend()

#     filename = f"{env_name}_random_agent_rewards.png"
#     # plt.savefig(filename)
#     # print(f"Saved reward plot to {filename}")
#     plt.show()

# def plot_random_policy_mountaincar():
#     """
#     Visualizes the policy of a random agent for MountainCar-v0.
#     It shows which action (left, nothing, right) a random agent would pick
#     at various randomly sampled states.
#     """
#     print("Generating random policy plot for MountainCar-v0...")
#     plt.figure(figsize=(10, 6))
#     ax = plt.subplot(111)

#     # Define the action colors and labels
#     colour_map = {0: 'red', 1: 'lime', 2: 'blue'}
#     action_choices = ['Push Left (0)', 'No Push (1)', 'Push Right (2)']

#     # Sample 1000 random states from the observation space
#     # State = [position, velocity]
#     # Pos range: -1.2 to 0.6
#     # Vel range: -0.07 to 0.07
#     X_pos = np.random.uniform(-1.2, 0.6, 1000)
#     Y_vel = np.random.uniform(-0.07, 0.07, 1000)

#     # For each state, get a random action (0, 1, or 2)
#     # This IS the random policy
#     Z_action = [random.randrange(3) for _ in range(1000)]

#     # Map actions to colors
#     colors = [colour_map[action] for action in Z_action]

#     # Create the scatter plot
#     ax.scatter(X_pos, Y_vel, c=colors, alpha=0.6, s=10) # s=10 for smaller dots

#     ax.set_title("Random Agent 'Policy' for MountainCar-v0")
#     ax.set_xlabel("Position (Car's x-coordinate)")
#     ax.set_ylabel("Velocity (Car's velocity)")

#     # Create a custom legend
#     legend_patches = [mpatches.Patch(color=colour_map[i], label=action_choices[i]) for i in range(3)]
#     ax.legend(handles=legend_patches, loc='best')

#     filename = "MountainCar-v0_random_policy.png"
#     # plt.savefig(filename)
#     # print(f"Saved policy plot to {filename}")
#     plt.show()


# def run_random_agent_with_plotting(env_name, episodes=100):
#     """
#     Runs a random agent for 'episodes' episodes and gathers data for plotting.
#     Set episodes higher (e.g., 100) for a better average.
#     """
#     print(f"--- Running Random Agent on: {env_name} ---")

#     # Set render_mode=None to run faster. No need to watch it.
#     try:
#         env = gym.make(env_name, render_mode=None)
#     except Exception as e:
#         print(f"Error loading {env_name}: {e}")
#         print("For 'Pong-v0', make sure you have run: pip install \"gymnasium[atari, accept-rom-license]\"")
#         print("-" * 40 + "\n")
#         return

#     episode_rewards = [] # Store rewards for plotting

#     for episode in range(episodes):
#         env.reset()
#         total_reward = 0
#         done = False

#         while not done:
#             # Get a random action from the environment's action space
#             action = env.action_space.sample()

#             observation, reward, terminated, truncated, info = env.step(action)
#             done = terminated or truncated
#             total_reward += reward

#         episode_rewards.append(total_reward)
#         if (episode + 1) % 20 == 0: # Print progress every 20 episodes
#             print(f"  ... Episode {episode + 1} finished. Total Reward: {total_reward}")

#     env.close()
#     print(f"Finished {episodes} episodes.")

#     # --- Generate Plots ---
#     plot_episode_rewards(episode_rewards, env_name)

#     # Only generate the policy plot for MountainCar
#     if env_name == 'MountainCar-v0':
#         plot_random_policy_mountaincar()

#     print("-" * 40 + "\n")

# # --- Run the Analysis ---
# run_random_agent_with_plotting('MountainCar-v0', episodes=400) # 400 episodes for a smooth average
# run_random_agent_with_plotting('Pong-v0', episodes=50)       # 50 episodes is enough for Pong

In [None]:
# import gymnasium as gym
# import math
# import random
# import numpy as np
# import matplotlib.pyplot as plt
# import matplotlib.patches as mpatches
# from collections import namedtuple, deque
# from itertools import count
# import pandas as pd
# import warnings

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F

# # Suppress warnings
# warnings.filterwarnings("ignore")

# # --- Setup Device ---
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # --- Replay Memory ---
# # (Using your sample's structure, it's good)
# Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

# class ReplayMemory(object):
#     def __init__(self, capacity):
#         # Use a deque for efficient memory management
#         self.memory = deque([], maxlen=capacity)

#     def push(self, *args):
#         """Save a transition"""
#         self.memory.append(Transition(*args))

#     def sample(self, batch_size):
#         return random.sample(self.memory, batch_size)

#     def __len__(self):
#         return len(self.memory)

# # --- DQN Model ---
# # (Using your sample's model, it's perfect for MountainCar)
# class DQN(nn.Module):
#     def __init__(self, n_observations, n_actions):
#         super(DQN, self).__init__()
#         self.layer1 = nn.Linear(n_observations, 128)
#         self.layer2 = nn.Linear(128, 128)
#         self.layer3 = nn.Linear(128, n_actions)

#     def forward(self, x):
#         x = F.relu(self.layer1(x))
#         x = F.relu(self.layer2(x))
#         return self.layer3(x)

# # --- Plotting Function ---
# # (Using your sample's plotting logic, it's great)
# def plot_results(all_episode_rewards, all_steps, mean_n, policy_net, env_name, n_actions):
#     """Plots the reward learning curve and the learned policy."""

#     # Calculate n-episode moving average
#     rewards_t = torch.tensor(all_episode_rewards, dtype=torch.float)
#     steps_t = torch.tensor(all_steps, dtype=torch.float)

#     means = rewards_t.unfold(0, mean_n, 1).mean(1).view(-1)
#     means = torch.cat((torch.zeros(mean_n - 1), means))

#     # Get steps corresponding to the means
#     mean_steps = steps_t.unfold(0, mean_n, 1).mean(1).view(-1)
#     mean_steps = torch.cat((torch.zeros(mean_n - 1), mean_steps)) # Pad for alignment

#     best_mean_reward = -float('inf')
#     best_means = []
#     for m in means:
#         best_mean_reward = max(best_mean_reward, m)
#         best_means.append(best_mean_reward)

#     # Create figure
#     fig = plt.figure(figsize=(15, 6))

#     # 1. Performance Plot
#     ax1 = fig.add_subplot(121)
#     ax1.plot(all_steps, all_episode_rewards, label='Episode Reward', alpha=0.3)
#     ax1.plot(mean_steps.numpy(), means.numpy(), label=f'{mean_n}-Episode Mean Reward', color='orange')
#     ax1.plot(mean_steps.numpy(), best_means, label='Best Mean Reward', color='red', linestyle='--')
#     ax1.grid()
#     ax1.set_xlabel("Total Steps")
#     ax1.set_ylabel("Reward")
#     ax1.legend()
#     ax1.set_title(f"DQN Performance on {env_name}")

#     # 2. Policy Plot
#     ax2 = fig.add_subplot(122)
#     colour_map = {0: 'lime', 1: 'red', 2: 'blue'}
#     action_choices = ['Push Left', 'No Push', 'Push Right']

#     # Sample 1000 random states
#     X_pos = np.random.uniform(-1.2, 0.6, 1000)
#     Y_vel = np.random.uniform(-0.07, 0.07, 1000)

#     # Get the greedy action for each state
#     states = torch.tensor(np.array(list(zip(X_pos, Y_vel))), dtype=torch.float).to(device)
#     with torch.no_grad():
#         actions_tensor = policy_net(states).max(1)[1] # Get index of max Q-value
#     Z_action = actions_tensor.cpu().numpy()

#     # Map actions to colors
#     colors = [colour_map[a] for a in Z_action]

#     ax2.scatter(X_pos, Y_vel, c=colors, alpha=0.6, s=10)
#     ax2.set_title(f"Learned Policy (Action vs. State) - {env_name}")
#     ax2.set_xlabel("Position")
#     ax2.set_ylabel("Velocity")

#     legend_patches = [mpatches.Patch(color=colour_map[i], label=action_choices[i]) for i in range(n_actions)]
#     ax2.legend(handles=legend_patches, loc='best')

#     plt.tight_layout()
#     filename = f"{env_name}_dqn_results.png"
#     plt.savefig(filename)
#     print(f"Saved final plot to {filename}")
#     plt.show()


# # --- Action Selection ---
# def select_action(state, steps_done):
#     """Selects an action using an epsilon-greedy policy."""
#     sample = random.random()
#     eps_threshold = EPS_END + (EPS_START - EPS_END) * \
#                     math.exp(-1. * steps_done / EPS_DECAY)

#     if sample > eps_threshold:
#         with torch.no_grad():
#             # t.max(1) returns (value, index) for each row
#             # We want the index (the action)
#             return policy_net(state).max(1)[1].view(1, 1)
#     else:
#         # Select a random action
#         return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

# # --- Model Optimization ---
# def optimize_model():
#     """Performs one step of optimization on the policy network."""
#     if len(memory) < BATCH_SIZE:
#         return  # Not enough samples in memory yet

#     transitions = memory.sample(BATCH_SIZE)
#     # Transpose the batch (see https://stackoverflow.com/a/19343/3343043)
#     # Converts batch-array of Transitions to Transition of batch-arrays.
#     batch = Transition(*zip(*transitions))

#     # Compute a mask of non-final states and concatenate the batch elements
#     non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
#                                           batch.next_state)), device=device, dtype=torch.bool)
#     non_final_next_states = torch.cat([s for s in batch.next_state
#                                        if s is not None])

#     state_batch = torch.cat(batch.state)
#     action_batch = torch.cat(batch.action)
#     reward_batch = torch.cat(batch.reward)

#     # Compute Q(s_t, a) - the model computes Q(s_t) for every action, then
#     # gather() picks the column of the action that was actually taken in each
#     # sampled transition (action_batch), not the action policy_net would pick now.
#     state_action_values = policy_net(state_batch).gather(1, action_batch)

#     # Compute V(s_{t+1}) for all next states.
#     # Values for non_final_next_states come from the "older" target_net,
#     # taking the largest Q-value over actions with max(1)[0].
#     # They are written back through the mask, so each entry holds either that
#     # value or 0 when the next state was final (next_state is None).
#     next_state_values = torch.zeros(BATCH_SIZE, device=device)
#     with torch.no_grad():
#         next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]

#     # Compute the expected Q values
#     # Q_target = r + gamma * max_a Q(s', a)
#     expected_state_action_values = (next_state_values * GAMMA) + reward_batch

#     # Compute Huber loss
#     criterion = nn.SmoothL1Loss()
#     loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

#     # Optimize the model
#     optimizer.zero_grad()
#     loss.backward()
#     # In-place gradient clipping
#     torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
#     optimizer.step()


# # --- Main Training Loop ---

# # Hyperparameters
# ENV_NAME = 'MountainCar-v0'
# NUM_EPISODES = 600
# BATCH_SIZE = 128
# GAMMA = 0.99
# EPS_START = 0.9
# EPS_END = 0.05
# EPS_DECAY = 1000
# TARGET_UPDATE = 5  # Update target network every 5 episodes
# LR = 1e-2             # Learning rate
# MEMORY_SIZE = 10000   # Replay buffer size
# MEAN_N = 25           # For plotting mean reward

# # Setup environment
# env = gym.make(ENV_NAME)
# n_actions = env.action_space.n
# state, info = env.reset()
# n_observations = len(state)

# # Initialize networks
# policy_net = DQN(n_observations, n_actions).to(device)
# target_net = DQN(n_observations, n_actions).to(device)
# target_net.load_state_dict(policy_net.state_dict())
# target_net.eval() # Target network is only for evaluation

# optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
# memory = ReplayMemory(MEMORY_SIZE)

# steps_done = 0
# all_episode_rewards = []
# all_steps = []
# total_steps = 0

# print(f"Starting training on {ENV_NAME}...")

# for i_episode in range(NUM_EPISODES):
#     # Initialize the environment and state
#     state, info = env.reset()
#     state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

#     episode_reward = 0

#     for t in count():  # count() is an infinite iterator; the loop ends at the break on done
#         # Select and perform an action
#         action = select_action(state, steps_done)
#         steps_done += 1
#         total_steps += 1

#         """observation, reward, terminated, truncated, _ = env.step(action.item())
#         episode_reward += reward
#         done = terminated or truncated

#         reward = torch.tensor([reward], device=device)

#         if terminated:
#             next_state = None
#         else:
#             next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

#         # Store the transition in memory
#         memory.push(state, action, next_state, reward)"""
#         observation, reward, terminated, truncated, _ = env.step(action.item())
#         episode_reward += reward # Track the TRUE score
#         done = terminated or truncated

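#         # --- REWARD SHAPING (from the provided sample) ---
#         # The learner is given a position-based reward instead of the environment's:
#         # position + 0.5 normally, position + 1.5 once position >= 0.5.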
#         position = observation[0]
#         if position >= 0.5: # Goal reached (same as terminated)
#             shaped_reward = position + 1.5
#         else:
#             shaped_reward = position + 0.5

#         reward_tensor = torch.tensor([shaped_reward], device=device)
#         # --- END OF TWEAK ---

#         if terminated:
#             next_state = None
#         else:
#             next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

#         # Store the SHAPED reward in memory for learning
#         memory.push(state, action, next_state, reward_tensor)


#         # Move to the next state
#         state = next_state

#         # Perform one step of the optimization (on the policy network)
#         optimize_model()

#         if done:
#             break

#     all_episode_rewards.append(episode_reward)
#     all_steps.append(total_steps)

#     # Update the target network, copying all weights and biases in DQN
#     if i_episode % TARGET_UPDATE == 0:
#         target_net.load_state_dict(policy_net.state_dict())

#     if i_episode % MEAN_N == 0 and i_episode > 0:
#         mean_reward = np.mean(all_episode_rewards[-MEAN_N:])
#         print(f'Episode {i_episode} | Total Steps: {total_steps} | {MEAN_N}-Ep Mean Reward: {mean_reward:.2f}')

# print('Training complete')
# env.close()

# # --- Plot Final Results ---
# plot_results(all_episode_rewards, all_steps, MEAN_N, policy_net, ENV_NAME, n_actions)