# In [None]:
#code version Moussa
import gymnasium as gym
import numpy as np
import minigrid
from minigrid.wrappers import FullyObsWrapper
import matplotlib.pyplot as plt
 
# MiniGrid-Empty-16x16-v0 wrapped by FullyObsWrapper.
env = FullyObsWrapper(gym.make("MiniGrid-Empty-16x16-v0"))

# Hyperparameters
alpha = 0.1           # step size applied to each Q-value update
gamma = 0.99          # discount applied to the bootstrapped next-state value
max_epsilon = 1.0     # exploration rate at the start of the schedule
min_epsilon = 0.01    # floor of the exponential exploration schedule
epsilon = max_epsilon  # current exploration rate, re-computed after every episode
decay_rate = 0.001    # coefficient of the per-episode exponential epsilon decay
episodes = 1000       # number of training episodes

# Tabular Q-function: maps a state tuple to an array of per-action values.
Q = {}
 
def get_state(obs):
    """Encode an observation as a hashable ``(agent_i, agent_j, goal_i, goal_j)`` tuple.

    The positions are indices along the first two axes of ``obs['image']`` of
    the first cell whose object id is 10 (agent) and 8 (goal) respectively.
    When an object is absent from the image its position falls back to
    ``(0, 0)``.

    Args:
        obs: Mapping with an ``'image'`` entry, either a 3-D
            ``(width, height, channels)`` grid encoding or a 2-D grid of
            object ids.

    Returns:
        A 4-tuple of plain ints usable as a dictionary key.

    NOTE(review): the agent's heading is not part of the state. MiniGrid
    appears to store it in the agent cell's last channel (to be confirmed),
    so states differing only by heading are merged here.
    """
    agent_idx = 10
    goal_idx = 8

    grid = np.asarray(obs['image'])
    # In a 3-D encoding only channel 0 carries object ids; the other channels
    # hold colour/state values that must not be mistaken for an object id.
    objects = grid[..., 0] if grid.ndim == 3 else grid

    def first_cell(object_idx):
        """Return the first cell holding ``object_idx``, or (0, 0) if none."""
        cells = np.argwhere(objects == object_idx)
        if len(cells) == 0:
            return (0, 0)
        return (int(cells[0][0]), int(cells[0][1]))

    return first_cell(agent_idx) + first_cell(goal_idx)
 
def calculate_reward(next_obs, done, success_reward=1.0, distance_penalty=0.00001):
    """Return the shaped reward for the transition that produced ``next_obs``.

    Reminder: MiniGrid's own reward is ``1 - 0.9 * (step_count / max_steps)``
    on success and ``0`` on failure. This function ignores it and returns a
    flat ``success_reward`` instead, plus a small distance-based penalty on
    every other step.

    Args:
        next_obs: Mapping with an ``'image'`` entry, either a 3-D
            ``(width, height, channels)`` grid encoding or a 2-D grid of
            object ids. Only read when ``done`` is false.
        done: Success flag. Whenever it is truthy ``success_reward`` is
            returned, so callers that want to exclude time-limit truncation
            should pass only the termination flag.
        success_reward: Value returned when ``done`` is truthy.
        distance_penalty: Factor applied to the Euclidean agent-goal distance.

    Returns:
        ``success_reward`` when ``done`` is truthy, otherwise
        ``-distance_penalty * distance(agent, goal)``.
    """
    if done:
        return success_reward

    agent_idx = 10
    goal_idx = 8

    grid = np.asarray(next_obs['image'])
    # In a 3-D encoding only channel 0 carries object ids; the other channels
    # hold colour/state values that must not be mistaken for an object id.
    objects = grid[..., 0] if grid.ndim == 3 else grid

    positions = []
    for object_idx in (agent_idx, goal_idx):
        cells = np.argwhere(objects == object_idx)
        # Missing objects fall back to the (0, 0) corner.
        positions.append(cells[0] if len(cells) else np.zeros(2, dtype=int))

    agent_xy, goal_xy = positions
    distance = np.linalg.norm(agent_xy - goal_xy)
    return -distance_penalty * distance
# Tabular Q-learning with an epsilon-greedy policy, followed by a plot of the
# per-episode return.
rewards = []  # total shaped reward collected in each episode
n_actions = env.action_space.n

for episode in range(episodes):
    obs, info = env.reset()
    state = get_state(obs)
    done = False
    total_reward = 0

    while not done:
        if state not in Q:
            Q[state] = np.zeros(n_actions)

        # Epsilon-greedy selection: explore with probability epsilon,
        # otherwise take the best known action for this state.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])

        # The environment's own reward is discarded in favour of the shaped one.
        next_obs, _, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Only `terminated` counts as success. An episode that merely ran out
        # of steps (`truncated`) must not receive the success reward.
        # NOTE(review): assumes termination in this environment always means
        # the goal was reached - confirm if the environment is changed.
        reward = calculate_reward(next_obs, terminated)
        if terminated:
            print("Goal atteint !")
        next_state = get_state(next_obs)

        if next_state not in Q:
            Q[next_state] = np.zeros(n_actions)

        # A terminal state has no future value, so do not bootstrap from it;
        # a truncated episode still bootstraps from the next state.
        future_value = 0.0 if terminated else gamma * np.max(Q[next_state])
        Q[state][action] += alpha * (reward + future_value - Q[state][action])

        state = next_state
        total_reward += reward

    rewards.append(total_reward)  # record this episode's total reward
    # Exponential decay of the exploration rate towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    print(f"Episode {episode+1}: Reward = {total_reward}")

env.close()

plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Evolution des récompenses')
plt.show()