# Reinforcement Learning Demo: Lunar Lander

## Setup

In [None]:
!pip install gymnasium[box2d] numpy matplotlib pandas tqdm

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random
from tqdm import tqdm
import pandas as pd

## Environment

In [None]:
# Create the Lunar Lander environment.
# Gymnasium 1.0 removed the `LunarLander-v2` ID in favour of `LunarLander-v3`,
# and the unpinned install above pulls the latest release, so ask for `v3`
# first. Fall back to `v2` only when this Gymnasium release does not know `v3`;
# any other error (e.g. Box2D missing) is left to propagate unchanged.
try:
    env = gym.make('LunarLander-v3')
except gym.error.VersionNotFound:
    env = gym.make('LunarLander-v2')
print(f"Actions: {env.action_space.n}")
print(f"Observations: {env.observation_space.shape}")

## Q-Learning Agent

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Continuous observations are mapped to table keys by scaling and rounding
    each component (see ``discretize_state``). Q-values live in a
    ``defaultdict`` so unseen states start with all-zero action values.

    Args:
        n_actions: Number of discrete actions.
        lr: Learning rate applied to the temporal-difference error.
        gamma: Discount factor for the bootstrapped next-state value.
        epsilon: Initial probability of choosing a random action.
        epsilon_decay: Multiplicative factor applied to ``epsilon`` once per
            finished episode.
        epsilon_min: Lower bound that ``epsilon`` never drops below.
        state_scale: Multiplier applied to the observation before rounding;
            larger values give finer-grained state bins.
    """

    def __init__(self, n_actions=4, lr=0.1, gamma=0.99, epsilon=1.0,
                 epsilon_decay=0.995, epsilon_min=0.01, state_scale=10):
        self.n_actions = n_actions
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.state_scale = state_scale
        # Unseen states lazily get a zero vector with one entry per action.
        self.q_table = defaultdict(lambda: np.zeros(n_actions))

    def discretize_state(self, state):
        """Return a hashable tuple of ints identifying the bin of ``state``.

        ``state`` may be any array-like; it is converted with ``np.asarray``
        so that a plain list is scaled element-wise instead of being repeated.
        """
        scaled = np.asarray(state) * self.state_scale
        return tuple(np.round(scaled).astype(int))

    def choose_action(self, state):
        """Pick an action epsilon-greedily and return it as a Python ``int``."""
        if random.random() < self.epsilon:
            return random.randrange(self.n_actions)
        # Only the greedy branch needs the table key.
        discrete_state = self.discretize_state(state)
        return int(np.argmax(self.q_table[discrete_state]))

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update and anneal exploration at episode end.

        Args:
            state: Observation the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Observation that followed the action.
            done: Whether the episode ended on this transition. When true,
                the target does not bootstrap from ``next_state`` and
                ``epsilon`` is decayed once.
        """
        discrete_state = self.discretize_state(state)
        discrete_next_state = self.discretize_state(next_state)

        current_q = self.q_table[discrete_state][action]
        next_max_q = 0 if done else np.max(self.q_table[discrete_next_state])
        target = reward + self.gamma * next_max_q

        self.q_table[discrete_state][action] = current_q + self.lr * (target - current_q)

        # Decay once per episode, not once per step: a per-step factor of
        # 0.995 takes epsilon from 1.0 to the 0.01 floor in about 920 steps,
        # which ends exploration within the first few episodes.
        if done:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

## Training

In [None]:
agent = QLearningAgent()
scores = []

# Train for 500 episodes, recording the summed reward of each one.
for episode in tqdm(range(500)):
    state, _ = env.reset()
    total_reward = 0
    done = False

    # Keep stepping until the environment reports termination or truncation.
    while not done:
        action = agent.choose_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        agent.learn(state, action, reward, next_state, done)

        total_reward += reward
        state = next_state

    scores.append(total_reward)

    # Every 100th episode, report the mean score over the latest 100 episodes.
    if (episode + 1) % 100 == 0:
        avg_score = np.mean(scores[-100:])
        print(f"Episode {episode + 1}: Avg Score = {avg_score:.2f}")

## Results

In [None]:
# Two side-by-side panels: raw per-episode scores and their 50-episode mean.
fig, (raw_ax, smooth_ax) = plt.subplots(1, 2, figsize=(12, 4))

raw_ax.plot(scores)
raw_ax.set_title('Training Scores')
raw_ax.set_xlabel('Episode')
raw_ax.set_ylabel('Score')

# The rolling mean is NaN for the first 49 episodes, so that part is not drawn.
moving_avg = pd.Series(scores).rolling(50).mean()
smooth_ax.plot(moving_avg)
smooth_ax.set_title('Moving Average (50 episodes)')
smooth_ax.set_xlabel('Episode')
smooth_ax.set_ylabel('Average Score')

fig.tight_layout()
plt.show()

print(f"Final average score: {np.mean(scores[-100:]):.2f}")