In [1]:
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from tqdm.notebook import tqdm

In [2]:
def initialize_q_table(env):
    """
    Build a zero-filled Q-table sized for a discrete environment.

    Rows index states and columns index actions, so ``q_table[s, a]`` holds the
    current value estimate for taking action ``a`` in state ``s``.

    Parameters:
    - env (gym.Env): Environment whose ``observation_space.n`` and
      ``action_space.n`` give the table dimensions.

    Returns:
    - q_table (np.ndarray): Array of zeros with shape (n_states, n_actions).
    """
    table_shape = (env.observation_space.n, env.action_space.n)
    return np.zeros(table_shape)


def epsilon_greedy_policy(state, q_table, epsilon):
    """
    Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state (int): Row index of the current state in ``q_table``.
        q_table (np.ndarray): Action-value estimates, one row per state.
        epsilon (float): Probability of choosing a uniformly random action
            instead of the greedy one.

    Returns:
        action (int): Index of the selected action.
    """
    # Draw first so the random stream is consumed once per call regardless of branch.
    explore = np.random.random() < epsilon
    if not explore:
        # Exploit: np.argmax returns the first index when several values tie.
        return int(np.argmax(q_table[state]))
    # Explore: uniform draw over the action indices of this state's row.
    n_actions = len(q_table[state])
    return np.random.randint(0, n_actions)


def q_learning_update(q_table, state, action, reward, next_state, alpha, gamma):
    """
    Apply one tabular Q-learning update to ``q_table`` in place.

    The entry for ``(state, action)`` is moved a fraction ``alpha`` toward the
    TD target ``reward + gamma * max_a Q(next_state, a)``.

    Args:
        q_table (np.ndarray): Q-table to update.
        state (int): Current state before taking the action.
        action (int): Action taken.
        reward (float): Reward received after taking the action.
        next_state (int): Next state observed after the action.
        alpha (float): Learning rate (0 < alpha ≤ 1).
        gamma (float): Discount factor (0 ≤ gamma ≤ 1).

    Returns:
        None (the input q_table is modified in-place).
    """
    # Q-learning bootstraps from the greedy (maximum) value of the next state;
    # SARSA would instead use the value of the action actually taken next.
    td_target = reward + gamma * np.max(q_table[next_state])
    q_table[state, action] += alpha * (td_target - q_table[state, action])


def train_q_learning(env, num_episodes, alpha, gamma, max_steps=1000, log_every=5000):
    """
    Train a tabular Q-learning agent for a given number of episodes.

    For each episode:
      - Compute the exploration rate ``epsilon`` for this episode.
      - Reset the environment.
      - At every step, pick an action with the epsilon-greedy policy, step the
        environment, and apply the Q-learning update, until the episode
        terminates, is truncated, or ``max_steps`` steps have been taken.

    Epsilon schedule: ``1 / (episode / 10000 + 1)`` during the first 90% of the
    episodes, then 0 (purely greedy) for the remaining ones.

    Parameters:
    - env (gym.Env): The environment to train on.
    - num_episodes (int): Total number of episodes for training.
    - alpha (float): The learning rate.
    - gamma (float): The discount factor.
    - max_steps (int): Upper bound on the number of steps taken per episode.
    - log_every (int): Print the mean episode reward every ``log_every``
      episodes; 0 or None disables the progress printout.

    Returns:
    - q_table (np.ndarray): The trained Q-table.
    - rewards (list): A list of total rewards per episode (useful for monitoring learning progress).
    """
    q_table = initialize_q_table(env)
    rewards = []
    # From this episode index onward the agent acts greedily (epsilon = 0).
    greedy_from = num_episodes * 0.9

    for eps_id in tqdm(range(num_episodes)):
        epsilon = 1.0 / (eps_id / 10000 + 1) if eps_id < greedy_from else 0
        state, _ = env.reset()
        episode_reward = 0

        # Q-learning is off-policy: the action is chosen fresh at every step,
        # so no action needs to be selected before entering the loop.
        for _step in range(max_steps):
            action = epsilon_greedy_policy(state, q_table, epsilon)
            next_state, reward, is_done, is_trunc, _ = env.step(action)
            q_learning_update(q_table, state, action, reward, next_state, alpha, gamma)
            episode_reward += reward
            if is_done or is_trunc:
                break
            state = next_state

        rewards.append(episode_reward)

        # Print progress every `log_every` episodes
        if log_every and (eps_id + 1) % log_every == 0:
            avg_reward = np.mean(rewards[-log_every:])
            print(f"Episode {eps_id + 1}, Average Reward (last {log_every} episodes): {avg_reward:.2f}")

    return q_table, rewards

# FrozenLake-v1

In [None]:
# Train on FrozenLake with slipping disabled.
num_episodes = 60000

env = gym.make("FrozenLake-v1", is_slippery=False)
try:
    q_table, rewards = train_q_learning(env, num_episodes, alpha=0.1, gamma=0.99)
finally:
    # Release the environment even if training raises or is interrupted.
    env.close()

  0%|          | 0/60000 [00:00<?, ?it/s]

Episode 5000, Average Reward (last 5000 episodes): 0.10
Episode 10000, Average Reward (last 5000 episodes): 0.31
Episode 15000, Average Reward (last 5000 episodes): 0.48
Episode 20000, Average Reward (last 5000 episodes): 0.59
Episode 25000, Average Reward (last 5000 episodes): 0.65
Episode 30000, Average Reward (last 5000 episodes): 0.71
Episode 35000, Average Reward (last 5000 episodes): 0.74
Episode 40000, Average Reward (last 5000 episodes): 0.77
Episode 45000, Average Reward (last 5000 episodes): 0.80
Episode 50000, Average Reward (last 5000 episodes): 0.82
Episode 55000, Average Reward (last 5000 episodes): 0.87
Episode 60000, Average Reward (last 5000 episodes): 1.00


In [6]:
# Record one greedy (epsilon = 0) episode of the trained FrozenLake agent.
env = gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery=False)
try:
    # RecordVideo captures frames itself on reset/step, so no explicit
    # env.render() call is needed inside the loop.
    env = RecordVideo(env, video_folder="./videos")
    obs, _ = env.reset()
    for _ in range(1000):
        action = epsilon_greedy_policy(obs, q_table, 0)
        next_obs, reward, is_done, is_trunc, _ = env.step(action)
        if is_done or is_trunc:
            break
        obs = next_obs
finally:
    # Closing the wrapper finalizes the video file; do it even if the rollout fails.
    env.close()

  logger.warn(


# Taxi-v3

In [3]:
# Gymnasium environment id used by the Taxi training and recording cells below.
env_id = "Taxi-v3"

In [6]:
# Train on the environment named by `env_id`.
num_episodes = 200000

# No render_mode here: training never consumes rendered frames.
env = gym.make(env_id)
try:
    q_table, rewards = train_q_learning(env, num_episodes, alpha=0.1, gamma=0.99)
finally:
    # Release the environment even if training raises or is interrupted.
    env.close()

  0%|          | 0/200000 [00:00<?, ?it/s]

Episode 5000, Average Reward (last 5000 episodes): -354.51
Episode 10000, Average Reward (last 5000 episodes): -73.43
Episode 15000, Average Reward (last 5000 episodes): -36.78
Episode 20000, Average Reward (last 5000 episodes): -22.69
Episode 25000, Average Reward (last 5000 episodes): -15.41
Episode 30000, Average Reward (last 5000 episodes): -11.00
Episode 35000, Average Reward (last 5000 episodes): -7.55
Episode 40000, Average Reward (last 5000 episodes): -5.62
Episode 45000, Average Reward (last 5000 episodes): -3.95
Episode 50000, Average Reward (last 5000 episodes): -2.76
Episode 55000, Average Reward (last 5000 episodes): -1.53
Episode 60000, Average Reward (last 5000 episodes): -0.64
Episode 65000, Average Reward (last 5000 episodes): -0.08
Episode 70000, Average Reward (last 5000 episodes): 0.67
Episode 75000, Average Reward (last 5000 episodes): 1.28
Episode 80000, Average Reward (last 5000 episodes): 1.46
Episode 85000, Average Reward (last 5000 episodes): 1.92
Episode 9000

In [7]:
# Record one greedy (epsilon = 0) episode of the trained agent on `env_id`.
env = gym.make(env_id, render_mode="rgb_array")
try:
    # RecordVideo captures frames itself on reset/step, so no explicit
    # env.render() call is needed inside the loop.
    env = RecordVideo(env, video_folder="./videos")
    obs, _ = env.reset()
    for _ in range(1000):
        action = epsilon_greedy_policy(obs, q_table, 0)
        next_obs, reward, is_done, is_trunc, _ = env.step(action)
        if is_done or is_trunc:
            break
        obs = next_obs
finally:
    # Closing the wrapper finalizes the video file; do it even if the rollout fails.
    env.close()

  logger.warn(
