🎒 Step 1: Replay Buffer Code Block

In [1]:
# 🎒 Replay Buffer - Experience Replay Memory

import random
from collections import deque
import numpy as np

class ReplayBuffer:
    """
    A fixed-size buffer that stores agent experiences for training the Q-network.

    Experiences are kept in a ``deque`` created with ``maxlen=capacity``, so once
    the buffer is full each new experience evicts the oldest one.

    Attributes:
        buffer (deque): A double-ended queue to store experience tuples.
        capacity (int): Maximum number of experiences the buffer can hold.
    """

    def __init__(self, capacity=100_000):
        """
        Initialize the replay buffer.

        Args:
            capacity (int): Maximum number of transitions to store in the buffer.
        """
        # Keep the configured limit on the instance so the documented
        # `capacity` attribute actually exists for callers to inspect.
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """
        Store a new experience in the buffer.

        Args:
            state (np.ndarray): The current state.
            action (int): The action taken.
            reward (float): The reward received.
            next_state (np.ndarray): The resulting next state.
            done (bool): Whether the episode has ended.
        """
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """
        Sample a batch of experiences from the buffer, without replacement.

        Args:
            batch_size (int): The number of samples to return.

        Returns:
            tuple of np.ndarrays: Batched (states, actions, rewards, next_states, dones).
            Rewards are returned as float32 and dones as uint8.

        Raises:
            ValueError: If batch_size is larger than the number of stored
                experiences (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        # Transpose a list of 5-tuples into five per-field tuples.
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(next_states),
            np.array(dones, dtype=np.uint8)
        )

    def __len__(self):
        """
        Return the current size of the buffer.

        Returns:
            int: Number of stored experiences.
        """
        return len(self.buffer)


In [2]:
# 🧪 Testing the ReplayBuffer functionality

# Create a buffer with small capacity for testing
# A tiny capacity is enough for this smoke test
buffer = ReplayBuffer(capacity=10)

# Fill the buffer with five synthetic transitions
for i in range(5):
    state = np.full(8, float(i))  # 8 features, every entry equal to i
    next_state = state + 1
    action = i % 4
    reward = float(i)
    done = (i % 2) == 0
    buffer.push(state, action, reward, next_state, done)

print(f"✅ Buffer size after pushing: {len(buffer)}")

# Draw a mini-batch of 3 and report the shape of every returned array
sampled = buffer.sample(3)
print("✅ Sampled batch shapes:")
shape_labels = (
    "States shape:",
    "Actions shape:",
    "Rewards shape:",
    "Next States shape:",
    "Dones shape:",
)
for label, batch_array in zip(shape_labels, sampled):
    print(label, batch_array.shape)


✅ Buffer size after pushing: 5
✅ Sampled batch shapes:
States shape: (3, 8)
Actions shape: (3,)
Rewards shape: (3,)
Next States shape: (3, 8)
Dones shape: (3,)


🧠 Step 2: Q-Network (Neural Net Brain)

In [3]:
# 🧠 Q-Network - The Deep Neural Network Approximator

import torch
import torch.nn as nn

class QNetwork(nn.Module):
    """
    A feedforward neural network that estimates Q-values for each action
    given a state input from the environment.

    The architecture is two hidden fully connected layers with ReLU
    activations, followed by a linear output layer with one unit per action.

    Attributes:
        model (nn.Sequential): The sequential network architecture.
    """

    def __init__(self, state_size, action_size, hidden_size=128):
        """
        Initialize the Q-network layers.

        Args:
            state_size (int): Dimension of the input state.
            action_size (int): Number of possible discrete actions.
            hidden_size (int): Width of both hidden layers. Defaults to 128,
                which reproduces the original fixed architecture.
        """
        super().__init__()

        # Layer order (and therefore the state_dict keys under `model.*`)
        # is kept exactly as before; only the width is now configurable.
        self.model = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size)
        )

    def forward(self, x):
        """
        Forward pass of the network.

        Args:
            x (torch.Tensor): The input state tensor.

        Returns:
            torch.Tensor: The predicted Q-values for each action.
        """
        return self.model(x)


In [4]:
# 🧪 Testing the QNetwork with dummy input

# LunarLander dimensions: 8 state features, 4 discrete actions
state_size, action_size = 8, 4

net = QNetwork(state_size, action_size)

# Push a single random state (batch of size 1) through the untrained network
dummy_input = torch.randn(1, state_size)
output = net(dummy_input)

for label, value in (
    ("✅ Output Q-values:", output),
    ("✅ Output shape:", output.shape),
):
    print(label, value)


✅ Output Q-values: tensor([[ 0.1017,  0.1722, -0.2195,  0.0700]], grad_fn=<AddmmBackward0>)
✅ Output shape: torch.Size([1, 4])


🤖 Step 3: DQN Agent Class

In [5]:
# 🤖 DQN Agent - The Reinforcement Learning Engine

import torch.optim as optim

class DQNAgent:
    """
    Deep Q-Learning agent: picks actions epsilon-greedily and trains an online
    Q-network against a separately held target network.

    Attributes:
        q_network (QNetwork): Online network that is optimized and used to act.
        target_network (QNetwork): Copy of the online network used to compute
            bootstrap targets; only changed via update_target_network().
        optimizer (torch.optim.Adam): Optimizer over the q_network parameters.
        buffer (ReplayBuffer): Replay memory that mini-batches are drawn from.
        gamma (float): Discount factor for future rewards.
        epsilon (float): Current exploration probability.
        epsilon_decay (float): Multiplicative factor applied by update_epsilon().
        epsilon_min (float): Lower bound for epsilon.
        action_size (int): Number of discrete actions.
    """

    def __init__(self, state_size, action_size, buffer, lr=1e-3, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        """
        Build the online/target networks, the optimizer, and store hyperparameters.

        Args:
            state_size (int): Number of features in the environment state.
            action_size (int): Number of discrete actions.
            buffer (ReplayBuffer): Replay buffer instance.
            lr (float): Learning rate for the optimizer.
            gamma (float): Discount factor.
            epsilon (float): Initial exploration rate.
            epsilon_decay (float): Rate at which epsilon decays.
            epsilon_min (float): Minimum value of epsilon.
        """
        # Online network first, then the target network (same creation order
        # as before so random initialization is consumed identically).
        online_net = QNetwork(state_size, action_size)
        frozen_net = QNetwork(state_size, action_size)
        frozen_net.load_state_dict(online_net.state_dict())
        frozen_net.eval()

        self.q_network = online_net
        self.target_network = frozen_net
        self.optimizer = optim.Adam(online_net.parameters(), lr=lr)
        self.buffer = buffer

        # Hyperparameters
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

    def act(self, state):
        """
        Choose an action with an epsilon-greedy policy.

        With probability epsilon a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value is returned.

        Args:
            state (np.ndarray): Current state.

        Returns:
            int: Action index.
        """
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.randint(self.action_size)

        # Greedy branch: add a batch dimension and query the online network.
        state_batch = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = self.q_network(state_batch)
        greedy_action = q_values.argmax()
        return greedy_action.item()

    def learn(self, batch_size):
        """
        Draw a mini-batch from the replay buffer and take one optimizer step.

        Args:
            batch_size (int): Size of the mini-batch for learning.

        Returns:
            float or None: The MSE loss of this step, or None when the buffer
            holds fewer than batch_size experiences and no update is done.
        """
        not_enough_data = len(self.buffer) < batch_size
        if not_enough_data:
            return None

        batch = self.buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = batch

        # Convert the numpy batch to tensors; per-sample scalars become
        # column vectors so they line up with the gathered Q-values.
        state_t = torch.FloatTensor(states)
        action_t = torch.LongTensor(actions).unsqueeze(1)
        reward_t = torch.FloatTensor(rewards).unsqueeze(1)
        next_state_t = torch.FloatTensor(next_states)
        done_t = torch.FloatTensor(dones).unsqueeze(1)

        # Bootstrap target from the target network; no gradients flow here.
        with torch.no_grad():
            target_out = self.target_network(next_state_t)
            best_next_q = target_out.max(dim=1, keepdim=True).values
            not_done = 1 - done_t
            td_target = reward_t + not_done * self.gamma * best_next_q

        # Q-value the online network assigns to the action actually taken.
        predicted_q = self.q_network(state_t).gather(1, action_t)

        loss = nn.functional.mse_loss(predicted_q, td_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def update_epsilon(self):
        """
        Multiply epsilon by epsilon_decay, never going below epsilon_min.
        """
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = decayed if decayed > self.epsilon_min else self.epsilon_min

    def update_target_network(self):
        """
        Overwrite the target network's weights with the online network's weights.
        """
        online_weights = self.q_network.state_dict()
        self.target_network.load_state_dict(online_weights)


🔁 Step 4: Training Loop

In [7]:
# 🔁 Training Loop - Interact, Learn, and Track Progress

import gymnasium as gym
from tqdm import trange

# Environment setup
env = gym.make("LunarLander-v3")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Agent setup
buffer = ReplayBuffer(capacity=100_000)
agent = DQNAgent(state_size, action_size, buffer)

# Training hyperparameters
num_episodes = 1000
batch_size = 64
target_update_freq = 10  # how often (in episodes) to sync the target network

# Tracking: one entry per episode
reward_history = []
loss_history = []

# Training loop
for episode in trange(num_episodes, desc="🚀 Training"):
    state, _ = env.reset()
    total_reward = 0
    losses = []

    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode ends on either signal, so `done` controls the loop.
        done = terminated or truncated

        # Store only `terminated` as the terminal flag. A truncated episode
        # is cut off by the environment rather than reaching a terminal
        # state, so the learning target for that transition must still
        # bootstrap from next_state; storing `done` here would zero it out.
        agent.buffer.push(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward

        # Learning step (returns None until the buffer holds a full batch)
        loss = agent.learn(batch_size)
        if loss is not None:
            losses.append(loss)

    # End-of-episode updates
    agent.update_epsilon()
    if episode % target_update_freq == 0:
        agent.update_target_network()

    # Track performance
    reward_history.append(total_reward)
    avg_loss = np.mean(losses) if losses else 0
    loss_history.append(avg_loss)

    # Print episode summary every 100
    if (episode + 1) % 100 == 0:
        print(f"🎯 Episode {episode+1} - Reward: {total_reward:.2f} | Epsilon: {agent.epsilon:.3f} | Loss: {avg_loss:.4f}")


🚀 Training:  10%|█         | 100/1000 [00:27<05:12,  2.88it/s]

🎯 Episode 100 - Reward: -91.49 | Epsilon: 0.606 | Loss: 24.8011


🚀 Training:  20%|██        | 200/1000 [02:02<26:04,  1.96s/it]

🎯 Episode 200 - Reward: -33.73 | Epsilon: 0.367 | Loss: 16.8870


🚀 Training:  30%|███       | 300/1000 [05:17<23:26,  2.01s/it]

🎯 Episode 300 - Reward: -104.36 | Epsilon: 0.222 | Loss: 7.6364


🚀 Training:  33%|███▎      | 329/1000 [06:07<12:30,  1.12s/it]


KeyboardInterrupt: 