# In [None]:
import gymnasium as gym
import numpy as np
import minigrid
from minigrid.wrappers import FullyObsWrapper
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
 
# Fully observable view of the empty 16x16 MiniGrid environment.
env = FullyObsWrapper(gym.make("MiniGrid-Empty-16x16-v0"))

# Hyperparameters
gamma = 0.99         # discount applied to the bootstrapped next-state value
epsilon = 1.0        # initial probability of taking a random action
min_epsilon = 0.01   # lower bound that epsilon is clamped to
decay_rate = 0.001   # rate used inside the exponential epsilon decay
episodes = 500       # number of training episodes
batch_size = 64      # transitions sampled from memory per gradient step
lr = 1e-3            # learning rate handed to the Adam optimizer
memory_size = 10000  # maximum number of transitions kept in the replay memory
 
# Q-network
class DQN(nn.Module):
    """Small fully connected Q-network.

    The input is flattened, passed through a single 128-unit hidden layer
    with a ReLU activation, and projected to ``output_dim`` values per
    sample.

    Args:
        input_dim: Number of features per sample after flattening.
        output_dim: Number of values produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Flatten(),
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        # Keep the attribute name and layer order so state_dict keys stay stable.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.fc(x)
 
# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
memory = deque(maxlen=memory_size)  # replay memory; oldest entries drop out first

# Reset once to read the observation shape and the number of actions.
obs, info = env.reset()
img_shape = obs['image'].shape
input_dim = np.prod(img_shape)
n_actions = env.action_space.n

# The target network starts as an exact copy of the policy network;
# only the policy network's parameters are given to the optimizer.
policy_net = DQN(input_dim, n_actions).to(device)
target_net = DQN(input_dim, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(policy_net.parameters(), lr=lr)
 
def preprocess(obs):
    """Turn an observation dict into a float32 row tensor on ``device``.

    The ``'image'`` array is flattened to one dimension and given a leading
    axis of size 1. Values are only cast to float32; no rescaling or
    normalisation is applied.

    Args:
        obs: Mapping with an ``'image'`` entry exposing ``.flatten()``
            (a NumPy array in this script).

    Returns:
        A tensor of shape ``(1, number_of_elements)``.
    """
    flat_pixels = obs['image'].flatten()
    row = torch.tensor(flat_pixels, dtype=torch.float32, device=device)
    return row.unsqueeze(0)
 
def calculate_reward(next_obs, done):
    """Return a shaped reward for the transition that produced ``next_obs``.

    When ``done`` is truthy the reward is ``1.0``. Otherwise the reward is a
    small penalty proportional to the Euclidean distance between the first
    cell equal to 10 and the first cell equal to 8 in the image.

    NOTE(review): 10 and 8 are presumably MiniGrid's object codes for the
    agent and the goal; confirm against the minigrid constants.

    Args:
        next_obs: Mapping with an ``'image'`` NumPy array.
        done: Flag selecting the terminal reward.

    Returns:
        ``1.0`` when ``done`` is truthy, else ``-0.00001 * distance``.
    """
    if done:
        return 1.0

    def first_cell(mask):
        # First two coordinates of the first match, or (0, 0) when none match.
        hits = np.nonzero(mask)
        if len(hits[0]) == 0:
            return (0, 0)
        return (hits[0][0], hits[1][0])

    grid = next_obs['image']
    agent_xy = np.array(first_cell(grid == 10))
    goal_xy = np.array(first_cell(grid == 8))
    return -0.00001 * np.linalg.norm(agent_xy - goal_xy)
 
# Training metrics. `rewards` and `success_rates` get one entry per episode;
# `losses` gets one entry per gradient step.
rewards = []
losses = []
success_rates = []
window_size = 100  # smoothing window used when plotting the metrics

criterion = nn.MSELoss()  # built once instead of on every gradient step

for episode in range(episodes):
    obs, info = env.reset()
    state = preprocess(obs)
    done = False
    terminated = False
    total_reward = 0

    while not done:
        # Epsilon-greedy action selection.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_values = policy_net(state)
                action = torch.argmax(q_values).item()

        next_obs, _, terminated, truncated, info = env.step(action)
        # The episode loop stops on either signal ...
        done = terminated or truncated
        # ... but only a real termination may earn the terminal reward.
        # Passing `done` here paid +1.0 whenever the time limit was hit.
        reward = calculate_reward(next_obs, terminated)
        next_state = preprocess(next_obs)

        # Store `terminated` as the bootstrap mask: a time-limit truncation
        # is not a terminal state, so its next-state value must still count.
        memory.append((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward

        # Learning step on a random minibatch once enough transitions exist.
        if len(memory) >= batch_size:
            batch = random.sample(memory, batch_size)
            states, actions, rewards_b, next_states, dones = zip(*batch)
            states = torch.cat(states)
            next_states = torch.cat(next_states)
            actions = torch.tensor(actions, dtype=torch.long, device=device)
            rewards_b = torch.tensor(rewards_b, dtype=torch.float32, device=device)
            dones = torch.tensor(dones, dtype=torch.float32, device=device)

            # squeeze(1) keeps the batch axis even if batch_size were 1.
            q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            with torch.no_grad():
                next_q = target_net(next_states).max(1)[0]
                target = rewards_b + gamma * next_q * (1 - dones)
            loss = criterion(q_values, target)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Sync the target network once at the end of every 10th episode.
    # Inside the step loop this copy ran on every single step of those
    # episodes, so the target tracked the policy network step by step.
    if episode % 10 == 0:
        target_net.load_state_dict(policy_net.state_dict())

    rewards.append(total_reward)
    # An episode counts as a success only when the environment terminated it;
    # running out of steps does not count.
    success = 1 if terminated else 0
    success_rates.append(success)
    # NOTE(review): the decay compounds (epsilon is multiplied by a factor that
    # itself shrinks with `episode`), so epsilon hits min_epsilon after roughly
    # 100 episodes. Left unchanged; confirm this schedule is intended.
    epsilon = max(min_epsilon, epsilon * np.exp(-decay_rate * episode))
    print(f"Episode {episode+1}: Reward = {total_reward}")

env.close()
 
# Plots: a 2x2 grid with raw rewards, smoothed rewards, loss and success rate.
plt.figure(figsize=(12, 8))

# Averaging kernel shared by both moving-average curves.
kernel = np.ones(window_size) / window_size

# Reward per episode
plt.subplot(2, 2, 1)
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Evolution des récompenses (DQN)')

# Moving average of the rewards
plt.subplot(2, 2, 2)
moving_avg = np.convolve(rewards, kernel, mode='valid')
plt.plot(moving_avg)
plt.xlabel('Episode')
plt.ylabel('Moyenne des récompenses')
plt.title(f'Moyenne des récompenses (fenêtre de {window_size} épisodes)')

# Loss curve. `losses` holds one value per gradient step, not per episode,
# so the x-axis is labelled as optimisation steps.
plt.subplot(2, 2, 3)
plt.plot(losses)
plt.xlabel("Étape d'optimisation")
plt.ylabel('Perte')
plt.title('Evolution de la perte (DQN)')

# Moving average of the per-episode success flag
plt.subplot(2, 2, 4)
success_rate = np.convolve(success_rates, kernel, mode='valid')
plt.plot(success_rate)
plt.xlabel('Episode')
plt.ylabel('Taux de succès')
plt.title(f'Taux de succès (fenêtre de {window_size} épisodes)')

plt.tight_layout()
plt.show()