In [2]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import matplotlib.pyplot as plt
from itertools import product
%matplotlib inline

# Neural Network for Q-Value Approximation
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    Architecture: ``input_dim -> hidden_dim`` followed by ``num_layers - 1``
    additional ``hidden_dim -> hidden_dim`` layers (each followed by ReLU) and
    a final linear read-out ``hidden_dim -> output_dim``. At least one hidden
    layer is always created.

    Args:
        input_dim: Size of the state vector.
        output_dim: Number of discrete actions (one output per action).
        hidden_dim: Width of every hidden layer.
        num_layers: Number of hidden layers.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256, num_layers=2):
        super().__init__()
        # Layer widths from the input through every hidden layer.
        widths = [input_dim] + [hidden_dim] * max(num_layers, 1)

        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
        # Linear read-out: no activation on the Q-values.
        modules.append(nn.Linear(widths[-1], output_dim))

        self.net = nn.Sequential(*modules)
        self._init_weights()

    def _init_weights(self):
        """Re-initialise all weights and biases uniformly in [-0.001, 0.001]."""
        linear_layers = (m for m in self.net if isinstance(m, nn.Linear))
        for linear in linear_layers:
            for param in (linear.weight, linear.bias):
                nn.init.uniform_(param, -0.001, 0.001)

    def forward(self, x):
        """Return the Q-values predicted for state(s) ``x``."""
        return self.net(x)

# Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Args:
        capacity: Maximum number of transitions kept; once full, pushing a new
            transition discards the oldest one.
    """

    def __init__(self, capacity):
        # A deque with a maxlen evicts from the left automatically when full.
        self.buffer = deque([], capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored transitions.
        """
        return random.sample(self.buffer, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return self.buffer.__len__()

# Training Function
def _reset_env(env, seed=None):
    """Reset ``env`` and return only the observation.

    Works with both Gym reset conventions: versions that return the bare
    observation and versions (>= 0.26) that return ``(observation, info)``.
    When ``seed`` is given the environment is seeded as part of the reset.
    """
    if seed is None:
        out = env.reset()
    else:
        try:
            out = env.reset(seed=seed)
        except TypeError:
            # Older Gym: reset() accepts no seed keyword; seed via env.seed().
            env.seed(seed)
            out = env.reset()
    if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
        return out[0]
    return out


def _step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, terminal, episode_over)``.

    ``terminal`` masks bootstrapping in the TD target; ``episode_over`` ends
    the rollout. With the 4-tuple step API both equal ``done``. With the
    5-tuple API ``terminal`` is ``terminated`` and ``episode_over`` is
    ``terminated or truncated``.
    """
    out = env.step(action)
    if len(out) == 5:
        next_state, reward, terminated, truncated, _ = out
        return next_state, reward, bool(terminated), bool(terminated or truncated)
    next_state, reward, done, _ = out
    return next_state, reward, bool(done), bool(done)


def _td_update(q_net, optimizer, algorithm, epsilon, gamma,
               states, actions, rewards, next_states, dones):
    """Run one gradient step on the mean squared TD error of a batch.

    The sequence arguments hold one entry per transition (a batch of one is
    used for the online, no-replay update).
    """
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
    actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64)
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
    not_done = 1.0 - torch.as_tensor(np.asarray(dones, dtype=np.float32))

    # Targets are constants for the optimiser, so no graph is built for them.
    with torch.no_grad():
        next_q = q_net(next_states)
        greedy_value = next_q.max(dim=1)[0]
        if algorithm == 'q_learning':
            next_value = greedy_value
        else:  # expected_sarsa
            # Expectation under the epsilon-greedy policy:
            # sum_a pi(a) Q(a) = eps * mean(Q) + (1 - eps) * max(Q).
            next_value = epsilon * next_q.mean(dim=1) + (1.0 - epsilon) * greedy_value
        targets = rewards + gamma * next_value * not_done

    # The prediction must be computed OUTSIDE no_grad so that backward() has a
    # graph to differentiate through.
    predicted = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = nn.functional.mse_loss(predicted, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


def train_agent(env_name, algorithm, use_replay, epsilon, lr, num_episodes=1000, num_seeds=50,
                gamma=0.99, batch_size=64, buffer_capacity=1000000):
    """Train an epsilon-greedy Q-network agent and record per-episode returns.

    For every seed in ``range(num_seeds)`` a fresh environment, network and
    Adam optimiser are created and trained for ``num_episodes`` episodes.

    Args:
        env_name: Gym environment id passed to ``gym.make``. Ids containing
            ``'Assault'`` are treated as 128-dimensional inputs whose
            observations are divided by 255.
        algorithm: ``'q_learning'`` or ``'expected_sarsa'``.
        use_replay: If true, transitions are stored in a ``ReplayBuffer`` and
            each step trains on a sampled mini-batch once ``batch_size``
            transitions are available; otherwise each step trains on the
            transition just observed.
        epsilon: Exploration probability of the epsilon-greedy policy.
        lr: Adam learning rate.
        num_episodes: Episodes per seed.
        num_seeds: Number of independent runs.
        gamma: Discount factor.
        batch_size: Mini-batch size for replay updates.
        buffer_capacity: Maximum number of transitions held by the buffer.

    Returns:
        ``np.ndarray`` of shape ``(num_seeds, num_episodes)`` with the total
        reward of every episode.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    if algorithm not in ('q_learning', 'expected_sarsa'):
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected 'q_learning' or 'expected_sarsa'"
        )

    is_assault = 'Assault' in env_name

    def preprocess(state):
        return state / 255.0 if is_assault else state

    all_rewards = []
    for seed in range(num_seeds):
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        env = gym.make(env_name)
        try:
            # action_space.sample() uses its own RNG, which the global seeds
            # above do not touch; seed it so exploration is reproducible.
            env.action_space.seed(seed)

            state_dim = env.observation_space.shape[0] if not is_assault else 128
            action_dim = env.action_space.n

            q_net = QNetwork(state_dim, action_dim)
            optimizer = optim.Adam(q_net.parameters(), lr=lr)
            buffer = ReplayBuffer(buffer_capacity) if use_replay else None

            episode_rewards = []
            for episode in range(num_episodes):
                # Seed the environment once per run, on its first reset.
                state = preprocess(_reset_env(env, seed if episode == 0 else None))
                total_reward = 0
                episode_over = False

                while not episode_over:
                    # Epsilon-greedy action
                    if np.random.rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        with torch.no_grad():
                            state_tensor = torch.as_tensor(np.asarray(state), dtype=torch.float32)
                            action = q_net(state_tensor).argmax().item()

                    next_state, reward, terminal, episode_over = _step_env(env, action)
                    next_state = preprocess(next_state)
                    total_reward += reward

                    if use_replay:
                        buffer.push(state, action, reward, next_state, terminal)
                        if len(buffer) >= batch_size:
                            batch = buffer.sample(batch_size)
                            _td_update(q_net, optimizer, algorithm, epsilon, gamma, *zip(*batch))
                    else:
                        # Online update on the single transition just seen.
                        _td_update(q_net, optimizer, algorithm, epsilon, gamma,
                                   [state], [action], [reward], [next_state], [terminal])

                    state = next_state

                episode_rewards.append(total_reward)
                print(f'Seed {seed}, Episode {episode}: Reward {total_reward}')

            all_rewards.append(episode_rewards)
        finally:
            # Release the environment even if training is interrupted.
            env.close()

    return np.array(all_rewards)

# Plotting Results
def plot_curves(env_name, use_replay, data):
    """Plot mean +/- std learning curves for every configuration and save a PNG.

    Args:
        env_name: Environment id; used in the title and the output file name.
        use_replay: Whether the runs used a replay buffer (title / file name).
        data: Mapping ``(algorithm, epsilon, lr) -> rewards`` where ``rewards``
            is array-like of shape ``(num_seeds, num_episodes)``.

    The figure is written to ``<env>_replay.png`` or ``<env>_no_replay.png`` in
    the current directory, with path separators in the environment id replaced
    by ``_``, and is then displayed.
    """
    fig = plt.figure(figsize=(12, 8))
    line_styles = ['-', '--', ':']
    # One line style per learning rate, largest first (1/4 '-', 1/8 '--',
    # 1/16 ':'); styles cycle if there are more learning rates than styles.
    lr_order = sorted({lr for _, _, lr in data}, reverse=True)
    style_for_lr = {
        lr: line_styles[rank % len(line_styles)] for rank, lr in enumerate(lr_order)
    }

    for (algo, eps, lr), rewards in data.items():
        mean = np.mean(rewards, axis=0)
        std = np.std(rewards, axis=0)
        x = np.arange(len(mean))
        color = 'green' if algo == 'q_learning' else 'red'
        plt.plot(x, mean, color=color, linestyle=style_for_lr[lr],
                 label=f'{algo}, ε={eps}, lr={lr}')
        plt.fill_between(x, mean - std, mean + std, color=color, alpha=0.1)

    plt.title(f'{env_name} {"with" if use_replay else "without"} Replay Buffer')
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    if data:
        plt.legend()

    # Ids such as 'ALE/Assault-ram-v5' contain a path separator, which would
    # otherwise point the file into a non-existent directory.
    safe_name = env_name.replace('/', '_').replace('\\', '_')
    # Save before show(): the figure may be cleared once it has been shown.
    plt.savefig(f'{safe_name}_{"replay" if use_replay else "no_replay"}.png')
    plt.show()
    plt.close(fig)

In [3]:
# Experiment grid: each environment is trained with and without a replay
# buffer, and for each of those every (algorithm, epsilon, learning-rate)
# combination is trained and then drawn on one figure.
envs = ['Acrobot-v1', 'ALE/Assault-ram-v5']
algorithms = ['q_learning', 'expected_sarsa']
epsilons = [0.1, 0.3, 0.5]
lrs = [0.25, 0.125, 0.0625]
buffers = [True, False]

for env, use_replay in product(envs, buffers):
    # Learning curves for this figure, keyed by (algorithm, epsilon, lr).
    results = {}
    for algo in algorithms:
        for eps in epsilons:
            for lr in lrs:
                print(f'Training {env} {algo} ε={eps} lr={lr} buffer={use_replay}')
                rewards = train_agent(env, algo, use_replay, eps, lr, 1000, 50)
                results[(algo, eps, lr)] = rewards
    plot_curves(env, use_replay, results)

Training Acrobot-v1 q_learning ε=0.1 lr=0.25 buffer=True
Seed 0, Episode 0: Reward -500.0
Seed 0, Episode 1: Reward -500.0
Seed 0, Episode 2: Reward -500.0
Seed 0, Episode 3: Reward -500.0
Seed 0, Episode 4: Reward -500.0
Seed 0, Episode 5: Reward -500.0
Seed 0, Episode 6: Reward -500.0
Seed 0, Episode 7: Reward -500.0
Seed 0, Episode 8: Reward -500.0
Seed 0, Episode 9: Reward -500.0
Seed 0, Episode 10: Reward -500.0
Seed 0, Episode 11: Reward -500.0
Seed 0, Episode 12: Reward -500.0
Seed 0, Episode 13: Reward -500.0
Seed 0, Episode 14: Reward -382.0
Seed 0, Episode 15: Reward -189.0
Seed 0, Episode 16: Reward -500.0
Seed 0, Episode 17: Reward -500.0
Seed 0, Episode 18: Reward -500.0
Seed 0, Episode 19: Reward -500.0
Seed 0, Episode 20: Reward -500.0
Seed 0, Episode 21: Reward -500.0
Seed 0, Episode 22: Reward -500.0
Seed 0, Episode 23: Reward -451.0
Seed 0, Episode 24: Reward -500.0
Seed 0, Episode 25: Reward -500.0
Seed 0, Episode 26: Reward -500.0
Seed 0, Episode 27: Reward -346.0
S

KeyboardInterrupt: 