In [None]:
%%writefile MountainCar-DQN-a.py

import argparse
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import random
from itertools import count

def parse_args(argv=None):
    """Parse the command-line options of the random-agent baseline.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the original behaviour. Passing a list makes the parser usable
            from tests or from other Python code.

    Returns:
        argparse.Namespace with the attributes ``environment``, ``episodes``,
        ``mean_n``, ``seed`` and ``render``.
    """
    parser = argparse.ArgumentParser(description="Running with different episodic counts and mean rewards")
    parser.add_argument("--environment", type=str, default="MountainCar-v0", help="Gym environment id")
    parser.add_argument("--episodes", type=int, default=100, help="Number of episodes to run")
    parser.add_argument("--mean_n", type=int, default=5, help="n for rolling mean plot")
    parser.add_argument("--seed", type=int, default=None, help="Random seed (optional)")
    parser.add_argument("--render", action="store_true", help="Render environment (slows down execution)")
    return parser.parse_args(argv)


def safe_reset(env):
    """Reset ``env`` and return only the initial observation.

    ``env.reset()`` may return either the observation itself or a tuple whose
    first element is the observation; a non-empty tuple is unwrapped, anything
    else is returned unchanged.
    """
    result = env.reset()
    is_nonempty_tuple = isinstance(result, tuple) and len(result) > 0
    return result[0] if is_nonempty_tuple else result

def safe_step(env, action):
    """Step ``env`` with ``action`` and normalise the result to four values.

    Returns:
        ``(obs, reward, done, info)``. A 5-element step result
        ``(obs, reward, terminated, truncated, info)`` is collapsed by setting
        ``done = terminated or truncated``; a 4-element result is passed
        through as is.

    Raises:
        RuntimeError: if the step result has neither 4 nor 5 elements.
    """
    result = env.step(action)
    n_fields = len(result)

    if n_fields == 5:
        obs, reward, terminated, truncated, info = result
        return obs, reward, terminated or truncated, info

    if n_fields == 4:
        obs, reward, done, info = result
        return obs, reward, done, info

    raise RuntimeError("Unexpected step output format: len = {}".format(n_fields))

def get_state_tensor(obs):
    """Convert an observation into a float32 NumPy array (a fresh copy)."""
    state = np.array(obs, dtype=np.float32)
    return state

def select_random_action(action_space):
    """Draw one action by delegating to ``action_space.sample()``."""
    sampled_action = action_space.sample()
    return sampled_action

def plot_results(rewards_mean, steps, best_rewards_mean, env_name, file_name, action_scatter, mean_n=None):
    """Save and show a 2-panel figure: reward curves and sampled action choices.

    Args:
        rewards_mean: Rolling-mean rewards, one value per entry of ``steps``.
        steps: Cumulative environment step counts used as the x axis.
        best_rewards_mean: Running maximum of ``rewards_mean``.
        env_name: Environment id, used in the titles.
        file_name: Path the figure is written to (200 dpi).
        action_scatter: Sequence of ``(position, velocity, action)`` samples;
            may be empty, in which case the right panel stays blank.
        mean_n: Optional rolling-window size shown in the legend. The
            original label printed ``len(steps)`` (the number of plotted
            points) as if it were the window size, which was misleading.
            When omitted a neutral label is used instead.
    """
    # Colours/names for the three discrete actions handled by this plot.
    action_colors = {0: 'lime', 1: 'red', 2: 'blue'}
    action_names = ['Left (0)', 'No-Op (1)', 'Right (2)']

    if mean_n is not None:
        mean_label = f"{mean_n}-episode rolling mean"
    else:
        mean_label = "Rolling mean reward"

    fig = plt.figure(figsize=(12, 5))
    try:
        ax1 = fig.add_subplot(121)
        ax1.plot(steps, rewards_mean, label=mean_label)
        ax1.plot(steps, best_rewards_mean, label="Best mean reward")
        ax1.grid(True)
        ax1.set_xlabel("Total environment steps")
        ax1.set_ylabel("Reward (higher is better)")
        ax1.legend()
        ax1.set_title(f"Performance of random agent on {env_name}")

        ax2 = fig.add_subplot(122)
        if len(action_scatter) > 0:
            samples = np.array(action_scatter)
            positions = samples[:, 0].astype(float)
            velocities = samples[:, 1].astype(float)
            actions = samples[:, 2].astype(int)
            # Unknown action ids are drawn in gray instead of raising KeyError.
            colors = [action_colors.get(int(a), 'gray') for a in actions]
            ax2.scatter(positions, velocities, c=colors, s=12, alpha=0.7)
            legend_recs = [mpatches.Rectangle((0, 0), 1, 1, fc=action_colors[i]) for i in range(3)]
            ax2.legend(legend_recs, action_names, loc='best')
        ax2.set_title("Random agent action choices (sampled states)")
        ax2.set_xlabel("Position")
        ax2.set_ylabel("Velocity")

        fig.suptitle(f"{env_name} - Random Agent Analysis")
        # Leave room at the top for the suptitle.
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.savefig(file_name, dpi=200)
        print(f"Saved plot to {file_name}")
        plt.show()
    finally:
        # Release the figure even if saving or showing fails.
        plt.close(fig)

def run_random_agent(env_id, episodes, mean_n, seed=None, render=False):
    """Run a uniformly random policy and plot/summarise its performance.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        episodes: Number of episodes to run.
        mean_n: Window size (in episodes) of the rolling mean reward.
        seed: Optional seed. Besides NumPy and ``random`` it now also seeds
            the action space and the first environment reset; previously the
            sampled actions and initial states were not affected by it.
        render: When true the environment is created with
            ``render_mode="human"``. Gymnasium selects the render mode at
            construction time, so the earlier bare ``env.render()`` calls did
            not display anything.

    Returns:
        dict with ``total_episodes``, ``mean_n``, ``final_mean_reward``,
        ``best_mean_reward``, ``success_count`` and ``total_steps``.
    """
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    # In "human" mode rendering happens during step()/reset(); no explicit
    # render() call is needed inside the loop.
    env = gym.make(env_id, render_mode="human") if render else gym.make(env_id)

    episode_rewards = []
    best_reward = -float('inf')
    rewards_mean = []
    best_rewards_mean = []
    steps = []

    total_steps = 0
    success_count = 0

    action_scatter = []  # store (pos, vel, action) samples for plotting

    # Sample every step for >= 100 episodes, more sparsely for short runs.
    sample_stride = max(1, 100 // episodes) if episodes > 0 else 1

    try:
        print("Environment:", env_id)
        print("Observation space:", env.observation_space)
        print("Action space:", env.action_space)

        if seed is not None:
            # action_space.sample() uses the space's own RNG.
            env.action_space.seed(seed)

        for ep in range(episodes):
            if ep == 0 and seed is not None:
                # Seeding the first reset fixes the env RNG for the whole run.
                reset_out = env.reset(seed=seed)
                if isinstance(reset_out, tuple) and len(reset_out) >= 1:
                    obs = reset_out[0]
                else:
                    obs = reset_out
            else:
                obs = safe_reset(env)
            state = get_state_tensor(obs)
            total_reward = 0.0

            for t in count():
                action = select_random_action(env.action_space)
                if len(action_scatter) < 2000 and total_steps % sample_stride == 0:
                    action_scatter.append((state[0], state[1], action))

                obs, reward, done, info = safe_step(env, action)
                state = get_state_tensor(obs)
                total_reward += reward
                total_steps += 1

                if done or t >= 10000:  # safety cap
                    # success condition: position >= 0.5 at termination
                    try:
                        pos = state[0]
                    except (IndexError, TypeError):
                        # Observation without a first component: no success check.
                        pos = None
                    if pos is not None and pos >= 0.5:
                        success_count += 1
                    break

            episode_rewards.append(total_reward)

            if len(episode_rewards) >= mean_n:
                present_mean = float(np.mean(episode_rewards[-mean_n:]))
                rewards_mean.append(present_mean)
                best_reward = max(present_mean, best_reward)
                best_rewards_mean.append(best_reward)
                steps.append(total_steps)

            print(f"Episode {ep+1}/{episodes} | Reward = {total_reward:.2f} | Successes so far = {success_count}")
    finally:
        # Always release the environment, even if an episode raised.
        env.close()

    fn = f"{env_id}_random_{episodes}ep_mean{mean_n}.png"
    plot_results(rewards_mean, steps, best_rewards_mean, env_id, fn, action_scatter)

    # Summary
    summary = {
        "total_episodes": episodes,
        "mean_n": mean_n,
        "final_mean_reward": rewards_mean[-1] if rewards_mean else None,
        "best_mean_reward": best_rewards_mean[-1] if best_rewards_mean else None,
        "success_count": success_count,
        "total_steps": total_steps
    }
    return summary

if __name__ == "__main__":
    # Script entry point: run the random baseline and print its summary dict.
    cli = parse_args()
    results = run_random_agent(cli.environment, cli.episodes, cli.mean_n, cli.seed, cli.render)

    rule = "=" * 10
    print()
    print(f"{rule}Summary{rule}")
    for key in results:
        print(f"{key}: {results[key]}")


Writing MountainCar-DQN-a.py


In [None]:
# !python3 MountainCar-DQN-a.py --environment "MountainCar-v0" --episodes 100 --mean_n 5 --seed 303
# run the python CMD via CLI


Environment: MountainCar-v0
Observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Action space: Discrete(3)
Episode 1/100 | Reward = -200.00 | Successes so far = 0
Episode 2/100 | Reward = -200.00 | Successes so far = 0
Episode 3/100 | Reward = -200.00 | Successes so far = 0
Episode 4/100 | Reward = -200.00 | Successes so far = 0
Episode 5/100 | Reward = -200.00 | Successes so far = 0
Episode 6/100 | Reward = -200.00 | Successes so far = 0
Episode 7/100 | Reward = -200.00 | Successes so far = 0
Episode 8/100 | Reward = -200.00 | Successes so far = 0
Episode 9/100 | Reward = -200.00 | Successes so far = 0
Episode 10/100 | Reward = -200.00 | Successes so far = 0
Episode 11/100 | Reward = -200.00 | Successes so far = 0
Episode 12/100 | Reward = -200.00 | Successes so far = 0
Episode 13/100 | Reward = -200.00 | Successes so far = 0
Episode 14/100 | Reward = -200.00 | Successes so far = 0
Episode 15/100 | Reward = -200.00 | Successes so far = 0
Episode 16/100 | Reward = -200.00 

In [None]:
%%writefile MountainCar-DQN-b.py
import gymnasium as gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
import argparse

class QNetwork(nn.Module):
    """Deep Q-Network: 3-layer MLP mapping a state to one Q-value per action.

    Args:
        state_dimension: Size of the input state vector.
        action_dimension: Number of discrete actions (size of the output).
        hidden_dimension: Width of the two hidden layers. Defaults to 128,
            the value that was previously hard-coded.
    """
    def __init__(self, state_dimension, action_dimension, hidden_dimension=128):
        super().__init__()
        # The attribute name and layer order are kept so existing state dicts
        # ("network_layers.<idx>.*") still load.
        self.network_layers = nn.Sequential(
            nn.Linear(state_dimension, hidden_dimension), nn.ReLU(),
            nn.Linear(hidden_dimension, hidden_dimension), nn.ReLU(),
            nn.Linear(hidden_dimension, action_dimension)
        )

    def forward(self, state):
        """Return the Q-values produced by the MLP for ``state``."""
        return self.network_layers(state)


def calculate_shaped_reward(state, environment_reward):
    """Return the environment reward plus a bonus of 10 * |state[1]|.

    ``state[1]`` is read as the car's velocity, so faster motion in either
    direction earns a larger bonus.
    """
    speed = abs(state[1])
    bonus = speed * 10  # reward magnitude of velocity, not its direction
    return environment_reward + bonus


def train_agent(config):
    """Train a DQN agent with experience replay and a target network.

    Changes relative to the previous version:
      * the environment is created from ``config.environment`` instead of a
        hard-coded ``'MountainCar-v0'`` (falls back to that id if the
        attribute is missing);
      * only ``terminated`` is stored as the terminal flag used for the TD
        target. Time-limit truncation still ends the episode, but no longer
        zeroes the bootstrapped value of the next state;
      * epsilon is clamped so it never decays below ``config.epsilon_min``;
      * the environment is closed even if training raises.

    Args:
        config: Namespace providing ``learning_rate``, ``gamma``,
            ``epsilon_start``, ``epsilon_min``, ``epsilon_decay_factor``,
            ``memory_size``, ``batch_size``, ``target_update_episodes``,
            ``num_episodes``, ``n_mean_episodes`` and (optionally)
            ``environment``.

    Returns:
        Tuple ``(episode_rewards, online_network, device)``.
        ``episode_rewards`` holds the per-episode sum of *shaped* rewards
        (see ``calculate_shaped_reward``), not the raw environment return.
    """
    warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.functional')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Environment setup
    env = gym.make(getattr(config, "environment", "MountainCar-v0"))
    try:
        state_dimension = env.observation_space.shape[0]
        action_dimension = env.action_space.n

        # Initialize networks; the target network starts as a frozen copy.
        online_network = QNetwork(state_dimension, action_dimension).to(device)
        target_network = QNetwork(state_dimension, action_dimension).to(device)
        target_network.load_state_dict(online_network.state_dict())
        target_network.eval()

        optimizer = optim.Adam(online_network.parameters(), lr=config.learning_rate)
        criterion = nn.MSELoss()
        experience_buffer = deque(maxlen=config.memory_size)

        # Training metrics
        episode_rewards = []
        exploration_rate = config.epsilon_start

        for episode in range(config.num_episodes):
            if episode % 10 == 0:
                print(f"Episode {episode}/{config.num_episodes}")

            state_np, _ = env.reset()
            state_tensor = torch.tensor(state_np, dtype=torch.float32, device=device).unsqueeze(0)
            episode_reward = 0

            while True:
                # Epsilon-greedy action selection
                if random.random() < exploration_rate:
                    action = env.action_space.sample()  # Random exploration
                else:
                    with torch.no_grad():
                        action = online_network(state_tensor).argmax().item()  # Greedy exploitation

                next_state_np, env_reward, terminated, truncated, _ = env.step(action)
                shaped_reward = calculate_shaped_reward(next_state_np, env_reward)
                next_state_tensor = torch.tensor(next_state_np, dtype=torch.float32, device=device).unsqueeze(0)

                # Store `terminated` only: a truncated transition is not a
                # true terminal state, so its target must still bootstrap.
                experience_buffer.append(
                    (state_tensor, action, shaped_reward, next_state_tensor, bool(terminated))
                )

                state_tensor = next_state_tensor
                episode_reward += shaped_reward

                if len(experience_buffer) > config.batch_size:
                    batch = random.sample(experience_buffer, config.batch_size)
                    batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminals = zip(*batch)

                    states_tensor = torch.cat(batch_states).to(device)
                    next_states_tensor = torch.cat(batch_next_states).to(device)
                    actions_tensor = torch.tensor(batch_actions).unsqueeze(1).to(device)
                    rewards_tensor = torch.tensor(batch_rewards, dtype=torch.float32).to(device)
                    terminals_tensor = torch.tensor(batch_terminals, dtype=torch.bool).to(device)

                    # Q(s, a) of the actions actually taken, shape (batch, 1)
                    current_q_values = online_network(states_tensor).gather(1, actions_tensor)

                    # TD target from the target network; no bootstrap on terminal states
                    with torch.no_grad():
                        next_q_values = target_network(next_states_tensor).max(1)[0]
                        target_q_values = rewards_tensor + config.gamma * next_q_values * (~terminals_tensor).float()

                    # Compute loss and update online network
                    loss = criterion(current_q_values, target_q_values.unsqueeze(1))
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                if terminated or truncated:
                    break

            # Decay exploration once per episode, never dropping below the floor.
            if exploration_rate > config.epsilon_min:
                exploration_rate = max(config.epsilon_min, exploration_rate * config.epsilon_decay_factor)

            if (episode + 1) % config.target_update_episodes == 0:
                target_network.load_state_dict(online_network.state_dict())

            episode_rewards.append(episode_reward)

            if len(episode_rewards) >= config.n_mean_episodes and episode % 10 == 0:
                recent_mean_reward = np.mean(episode_rewards[-config.n_mean_episodes:])
                print(f"Mean {config.n_mean_episodes}-episode reward: {recent_mean_reward:.2f}")
    finally:
        env.close()

    return episode_rewards, online_network, device




def visualize_training_results(config, training_rewards, trained_network, device,
                               position_bounds=(-1.2, 0.6), velocity_bounds=(-0.07, 0.07)):
    """Plot the learning curve and the greedy policy map, then save them as a PNG.

    Changes relative to the previous version:
      * the rolling mean at episode ``k`` now covers the ``window_size``
        episodes ending at ``k`` (inclusive). Before, the window excluded the
        current episode and the first complete window was skipped;
      * the policy map is evaluated inside the environment's observation
        bounds. The former grid used velocities in [-1, 1] and positions
        down to -1.5, far outside MountainCar-v0's
        Box([-1.2, -0.07], [0.6, 0.07]), so most of the map showed states
        the agent can never visit.

    Args:
        config: Namespace providing ``num_episodes``, ``n_mean_episodes``,
            ``batch_size`` and ``environment``.
        training_rewards: Per-episode rewards returned by training.
        trained_network: Network mapping a ``(N, 2)`` float tensor of
            (position, velocity) states to per-action values.
        device: Torch device the network lives on.
        position_bounds: ``(low, high)`` position range of the policy grid.
        velocity_bounds: ``(low, high)`` velocity range of the policy grid.
    """
    episode_count = config.num_episodes
    window_size = max(1, config.n_mean_episodes)  # guard against a non-positive window
    batch_size = config.batch_size

    episodes = list(range(1, len(training_rewards) + 1))
    smoothed_rewards = []
    peak_rewards = []

    highest_mean = -float('inf')
    for end in range(1, len(training_rewards) + 1):
        if end >= window_size:
            window_mean = float(np.mean(training_rewards[end - window_size:end]))
            highest_mean = max(highest_mean, window_mean)
            smoothed_rewards.append(window_mean)
            peak_rewards.append(highest_mean)
        else:
            # Not enough episodes yet: NaN leaves a gap in the line plot.
            smoothed_rewards.append(np.nan)
            peak_rewards.append(np.nan)

    output_filename = f"{config.environment}_DQN_{episode_count}_episodes_{batch_size}batch.png"

    fig, (performance_plot, policy_plot) = plt.subplots(1, 2, figsize=(12, 5))

    performance_plot.plot(episodes, smoothed_rewards, label=f"{window_size}-episode mean", color="deepskyblue")
    performance_plot.plot(episodes, peak_rewards, label="Best mean reward", color="orange", linestyle="--")
    performance_plot.set_xlabel("Episode")
    performance_plot.set_ylabel("Reward")
    performance_plot.set_title(f"Learning Progress on {config.environment}")
    performance_plot.grid(True)
    performance_plot.legend()

    # 50 x 50 grid of (position, velocity) states covering the given bounds.
    position_range = np.linspace(position_bounds[0], position_bounds[1], 50)
    velocity_range = np.linspace(velocity_bounds[0], velocity_bounds[1], 50)
    positions, velocities = np.meshgrid(position_range, velocity_range)
    states = np.column_stack([positions.ravel(), velocities.ravel()])

    state_tensor = torch.tensor(states, dtype=torch.float32, device=device)
    with torch.no_grad():
        # Greedy action (argmax over per-action values) for every grid state.
        action_choices = trained_network(state_tensor).argmax(dim=1).cpu().numpy()

    action_colors = ['lime', 'red', 'blue']  # left, no-op, right
    policy_plot.scatter(states[:, 0], states[:, 1],
                        c=[action_colors[a] for a in action_choices],
                        s=1, alpha=0.7)
    policy_plot.set_xlabel("Car Position")
    policy_plot.set_ylabel("Car Velocity")
    policy_plot.set_title("Learned Policy Map")

    action_labels = ["Left", "No Action", "Right"]
    legend_patches = [mpatches.Patch(color=action_colors[i],
                                     label=action_labels[i])
                      for i in range(3)]
    policy_plot.legend(handles=legend_patches)
    policy_plot.set_xlim(list(position_bounds))
    policy_plot.set_ylim(list(velocity_bounds))

    fig.tight_layout()
    fig.savefig(output_filename, dpi=200)
    plt.close(fig)
    print(f"Visualization saved as {output_filename}")

if __name__ == "__main__":
    # Each entry: (flag, value type, default, help text), registered in order.
    option_specs = [
        ("--environment", str, "MountainCar-v0", "Gymnasium environment to train on"),
        ("--learning_rate", float, 5e-4, "Learning rate for the neural network"),
        ("--gamma", float, 0.99, "Discount factor for future rewards"),
        ("--epsilon_start", float, 1.0, "Initial exploration rate"),
        ("--epsilon_min", float, 0.01, "Minimum exploration rate"),
        ("--epsilon_decay_factor", float, 0.997, "Rate at which exploration decreases"),
        ("--memory_size", int, 10000, "Size of experience replay buffer"),
        ("--batch_size", int, 64, "Number of experiences to learn from at once"),
        ("--target_update_episodes", int, 20, "Episodes between target network updates"),
        ("--num_episodes", int, 1000, "Total number of training episodes"),
        ("--n_mean_episodes", int, 50, "Window size for calculating mean reward"),
    ]

    parser = argparse.ArgumentParser(description="Deep Q-Learning for Mountain Car Environment")
    for flag, value_type, default_value, help_text in option_specs:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    config = parser.parse_args()

    # Train first, then render the learning curve and policy map to disk.
    training_history, trained_network, device = train_agent(config)
    visualize_training_results(config, training_history, trained_network, device)


Writing MountainCar-DQN-b.py


In [None]:
# !python3 MountainCar-DQN-b.py --environment="MountainCar-v0" --num_episodes=100 --batch_size=8 --gamma=0.9 --learning_rate=1e-4 --n_mean_episodes=5
# run the python CMD via CLI


Episode 1/100 | Reward: -200.00 | Success: 0 | ε=0.784
Episode 2/100 | Reward: -200.00 | Success: 0 | ε=0.615
Episode 3/100 | Reward: -200.00 | Success: 0 | ε=0.483
Episode 4/100 | Reward: -200.00 | Success: 0 | ε=0.381
Episode 5/100 | Reward: -200.00 | Success: 0 | ε=0.301
Episode 6/100 | Reward: -200.00 | Success: 0 | ε=0.239
Episode 7/100 | Reward: -200.00 | Success: 0 | ε=0.191
Episode 8/100 | Reward: -200.00 | Success: 0 | ε=0.153
Episode 9/100 | Reward: -200.00 | Success: 0 | ε=0.123
Episode 10/100 | Reward: -200.00 | Success: 0 | ε=0.101
Episode 11/100 | Reward: -200.00 | Success: 0 | ε=0.083
Episode 12/100 | Reward: -200.00 | Success: 0 | ε=0.069
Episode 13/100 | Reward: -200.00 | Success: 0 | ε=0.058
Episode 14/100 | Reward: -200.00 | Success: 0 | ε=0.050
Episode 15/100 | Reward: -200.00 | Success: 0 | ε=0.043
Episode 16/100 | Reward: -200.00 | Success: 0 | ε=0.038
Episode 17/100 | Reward: -200.00 | Success: 0 | ε=0.034
Episode 18/100 | Reward: -200.00 | Success: 0 | ε=0.031
E

In [None]:
%%writefile MountainCar-DQN-c.py

import subprocess
import sys
import os

# Fixed settings
environment = "MountainCar-v0"
num_episodes = 1000

# Locate the training script next to this file so the sweep does not depend
# on the current working directory (a bare relative path only worked when
# launched from the script's own folder).
training_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MountainCar-DQN-b.py")

# Batch sizes to compare, one full training run each.
batch_sizes = [16, 32, 64, 128]

# Captured stdout of each training run, keyed by batch size.
batch_rewards = {}

for b in batch_sizes:
    print(f"\nRunning with batch size = {b}")
    # Run the trainer with the same interpreter; output is only available
    # once the child process has finished because it is captured.
    result = subprocess.run(
        [
            sys.executable,
            training_script,
            "--environment", environment,
            "--num_episodes", str(num_episodes),
            "--batch_size", str(b),
        ],
        capture_output=True, text=True
    )

    batch_rewards[b] = result.stdout
    print("-"*10)
    print(result.stdout)
    if result.returncode != 0:
        # Surface the child's error output instead of failing silently.
        print("--- STDERR (error) ---")
        print(result.stderr)


In [None]:
# run the python CMD via CLI


# Part(a)
A random agent in the MountainCar-v0 environment performs poorly, consistently receiving rewards around -200 without any improvement over time. Its actions are uniformly random across all states, which is an ineffective strategy for building the momentum required to escape the valley and reach the goal.

--- 

![Random Agent Performance and Policy Map](part(a)/MountainCar-v0_random_100ep_mean5.png)
<center>
Figure-1: Random Agent Performance (left) and its Policy Map (right) for 100 episodes of random actions.
</center>

---

# Part(b)
The DQN agent successfully learned to solve the MountainCar-v0 environment. This is demonstrated by three key outcomes:

- Training Loss: The loss initially rose and then steadily decreased, indicating successful convergence of the Q-network.
- Reward Improvement: The agent's average reward consistently increased throughout training, showing it learned to perform better.
- Effective Policy: The final policy map reveals the agent learned the essential momentum strategy, choosing actions that build momentum to reach the goal.

--- 

![DQN Agent Performance and Policy Map](part(b)/MountainCar-v0_DQN_1000_episodes_64batch.png)
<center>
Figure-2: DQN Agent Performance (left) and its Policy Map (right) after training for 1000 episodes with a batch size of 64.
</center>

---

![Agent's Training Loss](part(b)/MountainCar-v0_DQN_1000_episodes_64batch_loss.png)
<center>
Figure-3: DQN Agent Training Loss over 1000 episodes with a batch size of 64.
</center>

---

# Part(c)
- Small batch sizes yield greater update randomness and learning curve variance, potentially helping escape local minima but also risking instability.
- Large batch sizes offer steadier convergence and cleaner policy maps, but it is observed that the 'No action' option isn't learned as effectively with larger batches.
- For MountainCar-v0, all tested batch sizes enable successful DQN convergence.

---

![](part(c)/MountainCar-v0_DQN_1000_episodes_16batch.png)
![](part(c)/MountainCar-v0_DQN_1000_episodes_32batch.png)
![](part(c)/MountainCar-v0_DQN_1000_episodes_64batch.png)
![](part(c)/MountainCar-v0_DQN_1000_episodes_128batch.png)
<center>
Figures 4,5,6,7: DQN Agent Performance for batch sizes of 16, 32, 64, and 128 over 1000 episodes.
</center>

---