In [None]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

"""
🚀 LUNAR LANDER DQN TUTORIAL 🌙
===============================
A comprehensive implementation of Deep Q-Learning for the Lunar Lander environment!

🎮 GAME OBJECTIVE:
----------------
Land the lunar module safely on the landing pad!
- Control thrust and rotation
- Manage fuel consumption
- Avoid crashes and achieve smooth landings

🧠 LEARNING COMPONENTS:
--------------------
1. 🤖 Deep Q-Network (DQN)
   - Neural network for Q-value approximation
   - Experience replay for stable learning
   - Target network for reduced variance

2. 🎯 Training Process
   - Epsilon-greedy exploration
   - Bellman equation updates
   - Gradient descent optimization

3. 🔄 Key Algorithms
   - Deep Q-Learning
   - Experience Replay
   - Soft Target Updates
"""

# 📚 IMPORT REQUIRED LIBRARIES 📚
import os                           # For handling file paths and directory operations
import random                       # For generating random numbers in exploration
import numpy as np                  # For efficient numerical operations
import torch                        # The main PyTorch library for deep learning
import torch.nn as nn              # Neural network modules from PyTorch
import torch.optim as optim        # Optimization algorithms from PyTorch
import gymnasium as gym            # The environment simulation library
from collections import deque      # For implementing the replay buffer
import matplotlib.pyplot as plt    # For plotting training results

# 🎮 HYPERPARAMETERS 🎮
BUFFER_SIZE = int(1e5)    # Replay buffer capacity (deque maxlen); the oldest transitions are dropped first
BATCH_SIZE = 64           # Transitions drawn per learning update; learning waits until the buffer holds more than this
GAMMA = 0.99              # Discount factor applied to the bootstrapped next-state value
TAU = 1e-3               # Soft-update weight: share of the local network blended into the target network
LR = 5e-4                # Learning rate passed to the Adam optimizer
UPDATE_EVERY = 4         # Run one learning update every this many DQNAgent.step() calls
EPSILON_START = 1.0      # Initial exploration probability (fully random actions)
EPSILON_END = 0.01       # Floor that epsilon is never decayed below
EPSILON_DECAY = 0.995    # Multiplicative decay applied in DQNAgent.learn(), i.e. once per learning update

# 🧠 NEURAL NETWORK ARCHITECTURE 🧠
class DQN(nn.Module):
    """
    Q-Network: a small multilayer perceptron that scores every action
    ==================================================================
    Given a batch of states, the network returns one estimated Q-value per
    discrete action, i.e. Q(s, a) ≈ network(s)[a].

    LAYOUT:
    ------
    Input(state_size) -> Linear(64) -> ReLU -> Linear(64) -> ReLU
                      -> Linear(action_size)

    - The output layer is linear (no activation), so Q-values are unbounded.
    - There is no dropout or normalisation layer in this model.
    - For Lunar Lander the tutorial uses state_size=8 and action_size=4.
    """

    def __init__(self, state_size, action_size):
        """
        🏗️ Build the three fully connected layers.

        Parameters:
        - state_size: Number of features in one state vector
        - action_size: Number of discrete actions (one output per action)
        """
        super().__init__()
        hidden_units = 64
        # The attribute names fc1/fc2/fc3 become the state_dict keys, so they
        # must stay stable for saved checkpoints to remain loadable.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """
        🔄 Map a batch of states to a batch of Q-value rows.

        Parameter:
        - x: State tensor of shape (batch_size, state_size)

        Returns:
        - Tensor of shape (batch_size, action_size) holding one Q-value
          per action
        """
        hidden = x
        for layer in (self.fc1, self.fc2):
            # Hidden layers: affine transform followed by ReLU.
            hidden = torch.relu(layer(hidden))
        q_values = self.fc3(hidden)  # Linear read-out, no activation
        return q_values

# 🗃️ REPLAY BUFFER IMPLEMENTATION 🗃️
class ReplayBuffer:
    """
    Experience Replay Buffer
    =======================
    Bounded FIFO store of transitions with uniform random mini-batch sampling.

    HOW IT WORKS:
    ------------
    - Each transition is kept as a plain tuple
      (state, action, reward, next_state, done).
    - Storage is a deque created with maxlen=buffer_size, so once the buffer
      is full every append discards the oldest transition.
    - sample() draws transitions without replacement and stacks each field
      into its own 2-D PyTorch tensor with one row per transition.
    """

    def __init__(self, buffer_size):
        """
        🏗️ Create an empty buffer.

        Parameter:
        - buffer_size: Maximum number of transitions kept at any time
        """
        self.buffer_size = buffer_size
        self.memory = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        """
        ➕ Append one transition (evicting the oldest one when full).

        Parameters:
        - state: Observation before the action
        - action: Action that was taken
        - reward: Reward received for the action
        - next_state: Observation after the action
        - done: Flag marking the transition as terminal
        """
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """
        🎲 Draw a random mini-batch and return it as tensors.

        Parameter:
        - batch_size: Number of transitions to draw (without replacement);
          must not exceed the current buffer length

        Returns:
        - (states, actions, rewards, next_states, dones); actions is a long
          tensor, every other field is a float tensor, and each has one row
          per sampled transition
        """
        batch = random.sample(self.memory, k=batch_size)

        # Transpose the list of transitions into one column per field.
        state_col, action_col, reward_col, next_state_col, done_col = zip(*batch)

        def stacked(column):
            # np.vstack lifts scalars to shape (1, 1), so scalar fields end up
            # as (batch_size, 1) column tensors.
            return np.vstack(column)

        states = torch.from_numpy(stacked(state_col)).float()
        actions = torch.from_numpy(stacked(action_col)).long()
        rewards = torch.from_numpy(stacked(reward_col)).float()
        next_states = torch.from_numpy(stacked(next_state_col)).float()
        # Flags are cast to uint8 first so they convert cleanly to 0.0 / 1.0.
        done_flags = stacked(done_col).astype(np.uint8)
        dones = torch.from_numpy(done_flags).float()

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """
        📏 Number of transitions currently stored.
        """
        return len(self.memory)

# 🤖 DQN AGENT IMPLEMENTATION 🤖
class DQNAgent:
    """
    Deep Q-Learning Agent
    ====================
    Learns a discrete-action policy with DQN: an online ("local") Q-network
    is trained from replayed transitions, while a slowly tracking target
    network supplies the bootstrap values for the TD targets.

    Attributes:
    - local_net / target_net: Online and target Q-networks (same architecture)
    - optimizer: Adam optimizer over the local network's parameters
    - memory: Replay buffer holding past transitions
    - t_step: Step counter modulo UPDATE_EVERY, used to schedule learning
    - epsilon: Current exploration probability for epsilon-greedy acting
    """

    def __init__(self, state_size, action_size):
        """
        🏗️ Initialize the DQN Agent

        Parameters:
        - state_size: Dimension of each state
        - action_size: Number of discrete actions
        """
        self.state_size = state_size
        self.action_size = action_size

        # 🧠 Create the Neural Networks (local and target)
        self.local_net = DQN(state_size, action_size)    # Network for selecting actions
        self.target_net = DQN(state_size, action_size)   # Target network for stable Q-values
        # Start the target network as an exact copy of the local network.
        # Without this the two networks are independently initialised and,
        # with a small TAU, the first TD targets come from an unrelated
        # random function.
        self.target_net.load_state_dict(self.local_net.state_dict())
        self.optimizer = optim.Adam(self.local_net.parameters(), lr=LR)  # Adam optimizer

        # 🗃️ Create Replay Buffer
        self.memory = ReplayBuffer(BUFFER_SIZE)

        # 📊 Step counter (mod UPDATE_EVERY) that schedules learning updates
        self.t_step = 0

        # 🎲 Initialize epsilon for exploration
        self.epsilon = EPSILON_START

    def step(self, state, action, reward, next_state, done):
        """
        👣 Record one transition and learn when an update is due

        A learning update runs on every UPDATE_EVERY-th call, provided the
        replay buffer already holds more than BATCH_SIZE transitions.

        Parameters:
        - state: Observation before the action
        - action: Action taken in that state
        - reward: Reward received from the environment
        - next_state: Observation after the action
        - done: Flag stored with the transition; when true the TD target
          for this transition does not bootstrap from next_state
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample(BATCH_SIZE)
            self.learn(experiences)

    def act(self, state, eval_mode=False):
        """
        🎯 Select an action using an epsilon-greedy policy

        With probability epsilon (and only when eval_mode is False) a
        uniformly random action is returned; otherwise the action with the
        highest Q-value from the local network is returned.

        Parameters:
        - state: Current state observation (array-like of floats)
        - eval_mode: If True, always act greedily (no exploration)

        Returns:
        - action: Selected action index as a plain Python int
        """
        # Decide on exploration first: a random action needs no forward pass.
        if not eval_mode and random.random() < self.epsilon:
            return random.randrange(self.action_size)    # Random action (exploration)

        # Add a leading batch dimension of 1 for the network.
        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

        self.local_net.eval()
        with torch.no_grad():  # Inference only; no graph needed
            action_values = self.local_net(state_tensor)
        self.local_net.train()  # Restore training mode for subsequent updates

        # int(...) keeps the return type identical to the exploration branch.
        return int(np.argmax(action_values.cpu().numpy()))    # Best action (exploitation)

    def learn(self, experiences):
        """
        🧠 Run one gradient update from a sampled mini-batch

        STEPS:
        -----
        1. Compute TD targets from the target network (no gradients):
               Q_target = R + γ * max_a' Q_target_net(s', a') * (1 - done)
        2. Gather the local network's Q-values for the actions actually taken.
        3. Minimise the mean squared error between the two with one
           optimizer step.
        4. Soft-update the target network toward the local network.
        5. Decay epsilon by EPSILON_DECAY, never going below EPSILON_END.
           Note that this happens once per learning update, not once per
           episode.

        Parameters:
        - experiences: Tuple of (states, actions, rewards, next_states, dones)
          tensors; actions must be a long tensor of shape (batch, 1) so it
          can index the Q-values with gather
        """
        states, actions, rewards, next_states, dones = experiences

        # 🎯🔮 TD targets are constants with respect to the optimisation, so they are
        # built without tracking gradients.
        with torch.no_grad():
            next_q_max = self.target_net(next_states).max(dim=1, keepdim=True)[0]
            q_targets = rewards + (GAMMA * next_q_max * (1 - dones))

        # 🎭 Q-values of the actions that were actually taken
        q_expected = self.local_net(states).gather(1, actions)

        # 📈 Compute loss and perform optimization step
        loss = nn.functional.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # 🔄 Update target network
        self.soft_update()

        # 🎭 Update epsilon
        self.epsilon = max(EPSILON_END, EPSILON_DECAY * self.epsilon)

    def soft_update(self):
        """
        🔄 Soft update of target network parameters
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        # In-place parameter writes must not be recorded by autograd.
        with torch.no_grad():
            for target_param, local_param in zip(self.target_net.parameters(),
                                                 self.local_net.parameters()):
                target_param.copy_(TAU * local_param + (1.0 - TAU) * target_param)

# 🎮 TRAINING FUNCTION 🎮
def train_agent(n_episodes=2000):
    """
    🏃‍♂️ Train the DQN agent on LunarLander-v3

    Runs n_episodes full episodes, feeding every transition to the agent,
    prints the 100-episode average score every 100 episodes, and finally
    writes the local network's weights to 'lunar_lander_dqn.pth' in the
    current working directory.

    Parameter:
    - n_episodes: Number of episodes to train

    Returns:
    - (scores, agent): the list of per-episode total rewards and the
      trained DQNAgent, matching how the script's entry point unpacks
      the result
    """
    # Create environment
    env = gym.make('LunarLander-v3')
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Create agent
        agent = DQNAgent(state_size, action_size)

        # Per-episode total rewards
        scores = []

        # Training loop
        for i_episode in range(1, n_episodes + 1):
            state, _ = env.reset()
            score = 0
            done = False

            # Episode loop
            while not done:
                # Select and perform action
                action = agent.act(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                # The episode loop stops on either signal...
                done = terminated or truncated

                # ...but only a true termination is stored as terminal. A
                # time-limit truncation is not an end state of the task, so
                # its TD target should still bootstrap from next_state.
                agent.step(state, action, reward, next_state, terminated)

                # Update state and score
                state = next_state
                score += reward

            # Store score
            scores.append(score)

            # Print progress
            if i_episode % 100 == 0:
                avg_score = np.mean(scores[-100:])
                print(f'Episode {i_episode}\tAverage Score: {avg_score:.2f}')
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    # Save trained model
    torch.save(agent.local_net.state_dict(), 'lunar_lander_dqn.pth')

    return scores, agent

# 📊 VISUALIZATION TIME - LET'S SEE THOSE AWESOME RESULTS! 📈
def plot_scores(scores):
    """
    📊 Plot per-episode scores together with their running average 📈

    Two curves are drawn on matplotlib figure number 2 (cleared first):
    1. The raw score of every episode.
    2. A trailing mean over at most the last 100 episodes; for the first
       episodes the window simply contains everything seen so far.

    The x-axis is the position in the scores list and the y-axis the score.
    A very short plt.pause() is issued at the end so the figure gets drawn.

    Parameter:
    - scores: List of scores from all episodes
    """
    window = 100

    # Trailing mean ending at (and including) each episode.
    running_mean = []
    for end in range(1, len(scores) + 1):
        start = max(0, end - window)
        running_mean.append(np.mean(scores[start:end]))

    plt.figure(2)
    plt.clf()
    plt.title('🚀 Training Progress 📈')
    plt.xlabel('Episode Number 🎮')
    plt.ylabel('Score 🎯')
    plt.plot(scores)
    plt.plot(running_mean)
    plt.pause(0.001)

# 🎥 SHOWTIME - RECORD OUR AWESOME AGENT IN ACTION! 🎬
def record_trained_agent(agent, num_episodes=3, video_folder=None):
    """
    🎥 Run the trained agent greedily and optionally record videos 🎬

    PROCESS:
    -------
    1. 🎮 Create a LunarLander-v3 environment (the same version used for
       training) in "rgb_array" render mode.
    2. 🎯 Play num_episodes episodes with exploration disabled
       (eval_mode=True) and print each episode's total reward.
    3. 📽️ Only when video_folder is given, wrap the environment in
       gym.wrappers.RecordVideo so that every episode is saved there.
       NOTE(review): RecordVideo usually needs the optional `moviepy`
       package — confirm it is installed before enabling this.

    Parameters:
    - agent: Trained agent exposing act(state, eval_mode=True)
    - num_episodes: Number of episodes to play (default: 3)
    - video_folder: Directory for recorded videos; None (default) plays
      the episodes without writing any video files

    Returns:
    - scores: List of total rewards, one per played episode
    """
    env = gym.make("LunarLander-v3", render_mode="rgb_array")
    scores = []

    try:
        if video_folder is not None:
            # Record every episode rather than the wrapper's default schedule.
            env = gym.wrappers.RecordVideo(
                env,
                video_folder=video_folder,
                episode_trigger=lambda episode_index: True,
            )

        for i in range(num_episodes):
            state, _ = env.reset()
            score = 0

            while True:
                action = agent.act(state, eval_mode=True)  # No exploration in evaluation
                state, reward, terminated, truncated, _ = env.step(action)
                score += reward
                if terminated or truncated:
                    break

            scores.append(score)
            print(f'🎬 Episode {i+1} Score: {score:.2f}')
    finally:
        # Closing also lets a recording wrapper flush its last video.
        env.close()

    return scores

# 🚀 MAIN EXECUTION 🚀
if __name__ == "__main__":
    # 🎮 LET'S TRAIN OUR LUNAR LANDER! 🌙
    #
    # Script entry point. Steps, in order:
    #   1. 🎓 Train the agent (train_agent creates the environment itself)
    #   2. 📊 Plot the per-episode scores
    #   3. 🎥 Replay the trained agent for a few greedy episodes

    # 🌟 Training: this call site expects train_agent() to hand back both the
    # score history and the trained agent.
    print("🚀 Starting training... Hold onto your spacesuits! 🧑‍🚀")
    scores, agent = train_agent()

    # 📊 Learning curve
    print("\n📈 Plotting training results...")
    plot_scores(scores)

    # 🎥 Greedy roll-outs with the trained agent
    print("\n🎬 Recording agent performances...")
    record_trained_agent(agent)

    print("\n🎉 Training complete! Check out those smooth landings! 🌙")

Collecting ale-py>=0.9 (from gymnasium[accept-rom-license,atari])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ale-py
Successfully installed ale-py-0.10.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 49 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 