In [17]:
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from tqdm.notebook import tqdm

# First-visit Monte Carlo control with a fixed learning rate (alpha)

In [None]:
def initialize_q_table(n_states, n_actions):
    """
    Build an all-zero action-value table.

    Parameters:
        n_states (int): Number of rows, one per environment state.
        n_actions (int): Number of columns, one per action.

    Returns:
        numpy.ndarray: Array of shape (n_states, n_actions) filled with zeros.
    """
    table_shape = (n_states, n_actions)
    q_values = np.zeros(table_shape)
    return q_values



def epsilon_greedy_action(state, q_table, epsilon):
    """
    Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is drawn;
    otherwise the index of the largest Q-value in the state's row is used
    (``np.argmax`` returns the first index on ties).

    Parameters:
        state (int): Index of the current state (row of the Q-table).
        q_table (numpy.ndarray): Current Q-table.
        epsilon (float): Probability of taking a random action.
    Returns:
        int: Chosen action index in [0, n_actions).
    """
    # Draw the uniform sample first so the global RNG is consumed in the
    # same order regardless of which branch is taken.
    explore = np.random.random() < epsilon
    action_values = q_table[state]
    if explore:
        return np.random.randint(0, len(action_values))
    return int(np.argmax(action_values))
    

def generate_episode(env, q_table, epsilon, max_steps=100):
    """
    Rolls out one episode by following the epsilon-greedy policy of the Q-table.

    The environment is reset, then stepped until it reports termination or
    truncation, or until ``max_steps`` transitions have been collected.
    Rendering is deliberately not triggered here: the rollout only needs
    observations and rewards, and a frame returned by ``env.render()`` would
    simply be discarded on every step.
    NOTE(review): video wrappers such as RecordVideo are expected to capture
    frames from ``reset``/``step`` on their own - confirm for the installed
    Gymnasium version.

    Parameters:
        env (gym.Env): FrozenLake environment instance.
        q_table (numpy.ndarray): Current Q-table.
        epsilon (float): Exploration rate.
        max_steps (int): Maximum steps allowed per episode.

    Returns:
        list: One ``[state, action, reward]`` entry per step taken, in order.
    """
    episode = []
    obs, _ = env.reset()
    for _ in range(max_steps):
        action = epsilon_greedy_action(obs, q_table, epsilon)
        next_obs, reward, is_done, is_trunc, _ = env.step(action)
        # Store the state the action was taken FROM, not the successor.
        episode.append([obs, action, reward])
        if is_done or is_trunc:
            break
        obs = next_obs

    return episode


def calculate_returns(episode, gamma):
    """
    Calculates the discounted return for each step in an episode.

    Uses the backward recursion G_t = r_t + gamma * G_{t+1}, starting from
    G = 0 after the last step.

    Parameters:
        episode (list): List of (state, action, reward) entries.
        gamma (float): Discount factor (between 0 and 1).

    Returns:
        list: Discounted returns, aligned index-for-index with ``episode``.
            An empty episode yields an empty list.
    """
    returns = []
    G = 0

    # Walk the episode back-to-front, appending in O(1) per step; inserting
    # at index 0 instead would make the whole pass quadratic.
    for _, _, reward in reversed(episode):
        G = reward + gamma * G
        returns.append(G)

    # The values were collected last-step-first; flip once to restore order.
    returns.reverse()
    return returns


def update_q_table(q_table, episode, returns, alpha):
    """
    Applies a first-visit Monte Carlo update with a constant step size.

    For the first occurrence of each (state, action) pair in the episode,
    the stored Q-value is moved a fraction ``alpha`` of the way towards the
    return observed from that step. Later occurrences of the same pair in
    the same episode are skipped.

    Parameters:
        q_table (numpy.ndarray): Q-table, modified in place.
        episode (list): Episode data as (state, action, reward) entries.
        returns (list): Discounted return for each step, aligned with ``episode``.
        alpha (float): Learning rate.

    Returns:
        None (updates Q-table in-place).
    """
    seen_pairs = set()
    for step, G in zip(episode, returns):
        state, action, _ = step
        pair = (state, action)
        if pair in seen_pairs:
            continue  # only the first visit within an episode counts
        seen_pairs.add(pair)
        row = q_table[state]
        row[action] += alpha * (G - row[action])


def decay_epsilon(current_epsilon, decay_rate, min_epsilon):
    """
    Shrinks the exploration rate by a multiplicative factor, with a floor.

    Parameters:
        current_epsilon (float): Exploration rate before decay.
        decay_rate (float): Factor the rate is multiplied by (e.g., 0.99).
        min_epsilon (float): Lower bound; the result never falls below it.

    Returns:
        float: The larger of ``current_epsilon * decay_rate`` and ``min_epsilon``.
    """
    decayed = current_epsilon * decay_rate
    # Clamp at the floor once the decayed value drops beneath it.
    return min_epsilon if min_epsilon > decayed else decayed


def train_monte_carlo(env, num_episodes, gamma, initial_epsilon, epsilon_decay_rate, min_epsilon, alpha, max_steps=100):
    """
    Executes the On-Policy Monte Carlo Control training loop.

    Each iteration generates one epsilon-greedy episode, records its
    undiscounted reward sum, updates the Q-table from the episode's
    discounted returns, and then decays epsilon.

    Parameters:
        env (gym.Env): FrozenLake environment instance.
        num_episodes (int): Total number of training episodes.
        gamma (float): Discount factor.
        initial_epsilon (float): Initial exploration rate.
        epsilon_decay_rate (float): Multiplicative epsilon decay applied after every episode.
        min_epsilon (float): Minimum exploration rate.
        alpha (float): Learning rate.
        max_steps (int): Step cap per episode, forwarded to ``generate_episode``.
            Defaults to 100.

    Returns:
        tuple: (Q-table, list of total rewards per episode)
    """
    q_table = initialize_q_table(env.observation_space.n, env.action_space.n)
    epsilon = initial_epsilon
    episode_rewards = []

    for _ in tqdm(range(num_episodes)):
        episode = generate_episode(env, q_table, epsilon, max_steps=max_steps)
        # Track the raw (undiscounted) episode reward for progress monitoring.
        episode_rewards.append(sum(reward for (_, _, reward) in episode))
        discounted_returns = calculate_returns(episode, gamma)
        update_q_table(q_table, episode, discounted_returns, alpha)
        # Decay only after the episode, so the first episode uses initial_epsilon.
        epsilon = decay_epsilon(epsilon, epsilon_decay_rate, min_epsilon)

    return q_table, episode_rewards

In [28]:
num_episodes = 10000

env = gym.make("FrozenLake-v1", render_mode="rgb_array")
# Ask the recorder for the episode whose index is num_episodes - 1.
env = RecordVideo(env, video_folder="./videos", episode_trigger=lambda eps: eps==num_episodes-1)
try:
    # No reset before training: generate_episode resets the environment at the
    # start of every episode.
    # NOTE(review): recent Gymnasium releases appear to count every reset() as
    # a new episode for episode_trigger, so an extra reset here would shift
    # which episode gets recorded - confirm against the installed version.
    q_table_fixed_alpha, episode_rewards_fixed_alpha = train_monte_carlo(
        env,
        num_episodes,
        gamma=0.99,
        initial_epsilon=1.0,
        epsilon_decay_rate=0.995,
        min_epsilon=0.01,
        alpha=0.1,
    )
finally:
    # Always close, even if training is interrupted, so the recorder can
    # finalise any video it has started.
    env.close()

  logger.warn("Unable to save last video! Did you call close()?")


  0%|          | 0/10000 [00:00<?, ?it/s]

# First-visit Monte Carlo control using a visitation-count table N (step size 1/N)

In [63]:
def initialize_q_table(n_states, n_actions):
    """
    Creates the action-value table with every entry set to zero.

    Parameters:
        n_states (int): How many states the environment has (table rows).
        n_actions (int): How many actions are available (table columns).

    Returns:
        numpy.ndarray: Zero-filled array of shape (n_states, n_actions).
    """
    return np.zeros(shape=(n_states, n_actions))


def initialize_visitation_table(n_states, n_actions):
    """
    Creates the table that counts visits to each (state, action) pair.

    Parameters:
        n_states (int): How many states the environment has (table rows).
        n_actions (int): How many actions are available (table columns).

    Returns:
        numpy.ndarray: Zero-filled array of shape (n_states, n_actions);
            counts are stored with numpy's default floating-point dtype.
    """
    visit_counts = np.zeros([n_states, n_actions])
    return visit_counts


def epsilon_greedy_action(state, q_table, epsilon):
    """
    Chooses between exploring and exploiting for the given state.

    A uniform sample in [0, 1) is compared against ``epsilon``: below it, a
    random action index is returned; otherwise the action with the highest
    Q-value in the state's row is returned (first index on ties).

    Parameters:
        state (int): Current state index.
        q_table (numpy.ndarray): Current Q-table.
        epsilon (float): Exploration rate (probability of a random action).
    Returns:
        int: Selected action (0 to n_actions-1).
    """
    roll = np.random.random()
    if not roll < epsilon:
        # Exploit: greedy action under the current estimates.
        return int(np.argmax(q_table[state]))
    # Explore: uniform over the actions available in this state.
    n_actions = len(q_table[state])
    return np.random.randint(0, n_actions)
    

def generate_episode(env, q_table, epsilon, max_steps=1000):
    """
    Plays a single episode under the epsilon-greedy policy of the Q-table.

    The environment is reset first; actions are then chosen and applied
    until the environment signals termination or truncation, or until
    ``max_steps`` steps have been taken.

    Parameters:
        env (gym.Env): FrozenLake environment instance.
        q_table (numpy.ndarray): Current Q-table.
        epsilon (float): Exploration rate.
        max_steps (int): Maximum steps allowed per episode.

    Returns:
        list: One ``[state, action, reward]`` entry per step, in the order taken.
    """
    trajectory = []
    state, _ = env.reset()
    for _step in range(max_steps):
        chosen = epsilon_greedy_action(state, q_table, epsilon)
        successor, reward, terminated, truncated, _info = env.step(chosen)
        # Record the state the action was taken in, not the one it led to.
        trajectory.append([state, chosen, reward])
        if terminated or truncated:
            return trajectory
        state = successor

    return trajectory


def calculate_returns(episode, gamma):
    """
    Calculates the discounted return for each step in an episode.

    The return of step t is r_t + gamma * (return of step t+1), with the
    return after the final step taken as 0.

    Parameters:
        episode (list): List of (state, action, reward) entries.
        gamma (float): Discount factor (between 0 and 1).

    Returns:
        list: Discounted returns, one per step and in episode order.
            An empty episode yields an empty list.
    """
    n_steps = len(episode)
    # Pre-size the output and fill it from the back: each step is O(1),
    # whereas inserting at the front of a list shifts every element.
    returns = [0] * n_steps
    G = 0

    for t in range(n_steps - 1, -1, -1):
        _, _, reward = episode[t]
        G = reward + gamma * G
        returns[t] = G

    return returns


def update_q_table(q_table, n_table, episode, returns):
    """
    Applies a first-visit Monte Carlo update with a 1/N step size.

    For the first occurrence of each (state, action) pair in the episode,
    its visit count is incremented and the Q-value is moved towards the
    observed return by a fraction 1/count, i.e. an incremental running mean.
    Repeat occurrences within the same episode are skipped.

    Parameters:
        q_table (numpy.ndarray): Q-table, modified in place.
        n_table (numpy.ndarray): Visitation count table, modified in place.
        episode (list): Episode data as (state, action, reward) entries.
        returns (list): Discounted return for each step, aligned with ``episode``.

    Returns:
        None (updates both tables in-place).
    """
    first_seen = set()
    for (state, action, _), G in zip(episode, returns):
        key = (state, action)
        if key in first_seen:
            continue  # not the first visit in this episode
        first_seen.add(key)

        counts = n_table[state]
        values = q_table[state]
        counts[action] += 1
        # The count is bumped before use, so the first update has step size 1.
        step_size = 1 / counts[action]
        values[action] += step_size * (G - values[action])


def train_monte_carlo(env, num_episodes, gamma, max_steps=100, log_every=5000):
    """
    Executes the On-Policy Monte Carlo Control training loop with an inverse epsilon decay.

    Epsilon is recomputed from the episode index on every iteration:
    1 / (eps_id / 10000 + 1) while eps_id < 90000, and 1 / (eps_id + 1)
    from episode 90000 onwards. Each iteration then generates an episode,
    records its undiscounted reward sum, and updates the Q-table and the
    visitation table from the episode's discounted returns.

    Parameters:
        env (gym.Env): FrozenLake environment instance.
        num_episodes (int): Total number of training episodes.
        gamma (float): Discount factor.
        max_steps (int): Step cap per episode, forwarded to ``generate_episode``.
            Defaults to 100.
        log_every (int): Print the mean reward of the last ``log_every`` episodes
            every ``log_every`` episodes. Defaults to 5000; a falsy value
            (0 or None) disables the progress printout.

    Returns:
        tuple: (Q-table, visitation table, list of total rewards per episode)
    """
    q_table = initialize_q_table(env.observation_space.n, env.action_space.n)
    n_table = initialize_visitation_table(env.observation_space.n, env.action_space.n)
    episode_rewards = []

    for eps_id in tqdm(range(num_episodes)):
        # Slow inverse decay for the first 90000 episodes, then a much
        # smaller epsilon (1 / episode count) for the remainder.
        epsilon = 1.0 / (eps_id/10000 + 1) if eps_id < 90000 else 1.0 / (eps_id + 1)
        episode = generate_episode(env, q_table, epsilon, max_steps=max_steps)
        episode_rewards.append(sum(reward for (_, _, reward) in episode))
        discounted_returns = calculate_returns(episode, gamma)
        update_q_table(q_table, n_table, episode, discounted_returns)

        # Periodic progress report over the most recent window of episodes.
        if log_every and (eps_id + 1) % log_every == 0:
            avg_reward = np.mean(episode_rewards[-log_every:])
            print(f"Episode {eps_id + 1}, Average Reward (last {log_every} episodes): {avg_reward:.2f}")

    return q_table, n_table, episode_rewards

In [64]:
num_episodes = 100000

env = gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery=False)
try:
    # No reset before training: generate_episode resets the environment at
    # the start of every episode.
    q_table, n_table, episode_rewards = train_monte_carlo(env, num_episodes, 0.99)
finally:
    # Release the environment even if the long training run is interrupted.
    env.close()

  0%|          | 0/100000 [00:00<?, ?it/s]

Episode 5000, Average Reward (last 5000 episodes): 0.10
Episode 10000, Average Reward (last 5000 episodes): 0.31
Episode 15000, Average Reward (last 5000 episodes): 0.48
Episode 20000, Average Reward (last 5000 episodes): 0.58
Episode 25000, Average Reward (last 5000 episodes): 0.64
Episode 30000, Average Reward (last 5000 episodes): 0.70
Episode 35000, Average Reward (last 5000 episodes): 0.75
Episode 40000, Average Reward (last 5000 episodes): 0.76
Episode 45000, Average Reward (last 5000 episodes): 0.80
Episode 50000, Average Reward (last 5000 episodes): 0.81
Episode 55000, Average Reward (last 5000 episodes): 0.83
Episode 60000, Average Reward (last 5000 episodes): 0.84
Episode 65000, Average Reward (last 5000 episodes): 0.85
Episode 70000, Average Reward (last 5000 episodes): 0.86
Episode 75000, Average Reward (last 5000 episodes): 0.88
Episode 80000, Average Reward (last 5000 episodes): 0.88
Episode 85000, Average Reward (last 5000 episodes): 0.89
Episode 90000, Average Reward (l

In [60]:
# Replay the learned policy greedily (epsilon = 0) and record it to ./videos.
env = gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery=False)
env = RecordVideo(env, video_folder="./videos")
try:
    obs, _ = env.reset()
    for _ in range(1000):
        print(f"Current obs: {obs}")
        # No explicit env.render(): its return value was unused.
        # NOTE(review): RecordVideo is expected to capture frames itself on
        # reset/step - confirm for the installed Gymnasium version.
        action = epsilon_greedy_action(obs, q_table, 0)
        print(f"Current action: {action}")
        print("="*10)
        next_obs, reward, is_done, is_trunc, _ = env.step(action)
        if is_done or is_trunc:
            break
        obs = next_obs
finally:
    # Close even on error so the recorder can finalise the video file.
    env.close()

  logger.warn(


Current obs: 0
Current action: 1
Current obs: 4
Current action: 1
Current obs: 8
Current action: 2
Current obs: 9
Current action: 1
Current obs: 13
Current action: 2
Current obs: 14
Current action: 2
