In [None]:
%pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt

In [2]:
import numpy as np
import gymnasium as gym
import random
import imageio
import os
import tqdm
import pickle
import math
import numpy as np
from tqdm.notebook import tqdm

In [3]:
# Create the FrozenLake-v1 environment using the 4x4 map, the slippery (stochastic)
# version (is_slippery=True) and render_mode="rgb_array"
env = gym.make('FrozenLake-v1',map_name="4x4",is_slippery=True, render_mode="rgb_array") 

# Size of the discrete observation space (number of rows of the Q-table)
state_space = env.observation_space.n
print("There are ", state_space, " possible states")

# Size of the discrete action space (number of columns of the Q-table)
action_space = env.action_space.n
print("There are ", action_space, " possible actions")

There are  16  possible states
There are  4  possible actions


In [4]:
def initialize_q_table(state_space, action_space):
  """
  Build the initial Q-table.

  :param state_space: Number of states (rows of the table)
  :param action_space: Number of actions (columns of the table)
  :return: A ``(state_space, action_space)`` NumPy array filled with zeros
  """
  table_shape = (state_space, action_space)
  return np.zeros(table_shape)


In [5]:
# Allocate the zero-filled Q-table for FrozenLake (one row per state, one column per action)
Qtable_frozenlake = initialize_q_table(state_space, action_space)

In [6]:
# Training parameters
n_training_episodes = 10000  # Total training episodes
learning_rate = 0.1          # Q-learning step size (alpha)

# Evaluation parameters
n_eval_episodes = 1000       # Total number of test episodes

# Environment parameters
env_id = "FrozenLake-v1"     # Name of the environment
max_steps = 999              # Max steps per episode
gamma = 0.99                 # Discounting rate
eval_seed = []               # Per-episode evaluation seeds; an empty list means unseeded resets

# Boltzmann exploration parameters
init_temp = 1.0              # Initial temperature (high for more exploration)
min_temp = 0.001             # Minimum temperature (floor applied after decay)
temp_decay = 0.0005          # Exponential decay rate of the temperature, per episode

In [7]:
# Boltzmann (softmax) exploration for the slippery / stochastic environment
def boltzmann_exploration(state, Qtable, temperature):
    """
    Select an action using the Boltzmann (softmax) exploration strategy.

    Each action ``a`` is sampled with probability proportional to
    ``exp(Q(state, a) / temperature)``. A high temperature gives a nearly
    uniform choice; a low temperature concentrates the choice on the
    highest-valued actions.

    Args:
        state: Index (or key) of the current state in ``Qtable``.
        Qtable: Q-table indexed by state; ``Qtable[state]`` must yield the
            sequence of Q-values of every action available in that state.
        temperature: Temperature parameter controlling exploration (τ).
            Must be non-negative. ``0`` is treated as the greedy limit of the
            softmax: a uniform draw among the maximal-valued actions.

    Returns:
        Index of the selected action.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    # A negative temperature would invert the preference ordering (or overflow
    # np.exp), so reject it explicitly instead of sampling from garbage.
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    # Q-values for all actions in the current state
    action_values = np.asarray(Qtable[state], dtype=float)
    max_value = np.max(action_values)

    # temperature == 0 would divide by zero below and produce NaN
    # probabilities; use the analytical limit of the softmax instead.
    if temperature == 0:
        greedy_actions = np.flatnonzero(action_values == max_value)
        return np.random.choice(greedy_actions)

    # Subtracting the max keeps every exponent <= 0 (no overflow) and does not
    # change the resulting softmax distribution.
    exp_values = np.exp((action_values - max_value) / temperature)
    probabilities = exp_values / np.sum(exp_values)

    # Sample an action index according to the softmax probabilities
    return np.random.choice(len(action_values), p=probabilities)

In [9]:
def train_boltzmann(n_training_episodes, min_temperature, initial_temperature, temperature_decay, env, max_steps, Qtable, learning_rate, gamma):
    """
    Train the agent using Q-learning with Boltzmann exploration.

    Args:
        n_training_episodes: Number of episodes to train for.
        min_temperature: Lower bound applied to the decayed temperature.
        initial_temperature: Temperature used for the first episode.
        temperature_decay: Rate of the per-episode exponential temperature decay.
        env: Environment exposing ``reset()`` and ``step(action)`` with the
            5-tuple ``(state, reward, terminated, truncated, info)`` return.
        max_steps: Maximum number of steps per episode.
        Qtable: Q-table indexed as ``Qtable[state][action]``; updated in place.
        learning_rate: Step size of the Q-learning update.
        gamma: Discount factor.

    Returns:
        The same ``Qtable`` object, after training.
    """
    for episode in tqdm(range(n_training_episodes)):
        # Exponentially decay the temperature (less and less exploration),
        # never going below min_temperature.
        temperature = max(min_temperature, initial_temperature * np.exp(-temperature_decay * episode))

        # Reset the environment
        state, info = env.reset()

        # Repeat for each step of the episode
        for step in range(max_steps):
            # Choose the action using the Boltzmann policy
            action = boltzmann_exploration(state, Qtable, temperature)

            # Take the action and observe the outcome state and reward
            new_state, reward, terminated, truncated, info = env.step(action)

            # A terminal state has no future return, so do not bootstrap from
            # it. Rows of terminal states are never updated, so bootstrapping
            # would leak their initial values into the target whenever the
            # table is not zero-initialised. Truncation (time limit) is not a
            # true terminal state, so bootstrapping is kept in that case.
            future_value = 0.0 if terminated else np.max(Qtable[new_state])

            # Update Q(s,a) := Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
            td_target = reward + gamma * future_value
            Qtable[state][action] += learning_rate * (td_target - Qtable[state][action])

            # If terminated or truncated, finish the episode
            if terminated or truncated:
                break

            # Our next state is the new state
            state = new_state

    return Qtable

In [10]:
# Run Q-learning with Boltzmann exploration; the returned table replaces the initial one
Qtable_frozenlake = train_boltzmann(
    n_training_episodes=n_training_episodes,
    min_temperature=min_temp,
    initial_temperature=init_temp,
    temperature_decay=temp_decay,
    env=env,
    max_steps=max_steps,
    Qtable=Qtable_frozenlake,
    learning_rate=learning_rate,
    gamma=gamma,
)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [11]:
def greedy_policy(Qtable, state):
  """
  Pure exploitation: pick the best-valued action for a state.

  :param Qtable: The Q-table, indexed by state
  :param state: The state to act in
  :return: Index of the action with the highest Q-value (first one on ties)
  """
  action_values = Qtable[state]
  return np.argmax(action_values)

In [12]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
  """
  Run the greedy policy for ``n_eval_episodes`` episodes and report the mean
  and standard deviation of the per-episode total reward.

  :param env: The evaluation environment
  :param max_steps: Maximum number of steps per episode
  :param n_eval_episodes: Number of episodes to evaluate the agent on
  :param Q: The Q-table
  :param seed: Sequence of per-episode reset seeds; when empty/falsy the
    environment is reset without a seed
  :return: Tuple ``(mean_reward, std_reward)``
  """
  returns_per_episode = []

  for episode_idx in tqdm(range(n_eval_episodes)):
    # Seed the reset only when evaluation seeds were supplied
    reset_kwargs = {"seed": seed[episode_idx]} if seed else {}
    observation, _ = env.reset(**reset_kwargs)

    episode_return = 0
    for _ in range(max_steps):
      # Act greedily with respect to the Q-table
      chosen_action = greedy_policy(Q, observation)
      observation, step_reward, terminated, truncated, _ = env.step(chosen_action)
      episode_return += step_reward

      if terminated or truncated:
        break

    returns_per_episode.append(episode_return)

  return np.mean(returns_per_episode), np.std(returns_per_episode)

In [13]:
# Score the trained Q-table by running the greedy policy on the environment
mean_reward, std_reward = evaluate_agent(
    env=env,
    max_steps=max_steps,
    n_eval_episodes=n_eval_episodes,
    Q=Qtable_frozenlake,
    seed=eval_seed,
)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Mean_reward=0.75 +/- 0.44
