In [1]:
import numpy as np
import random
import time
import gym
from collections import defaultdict

def initialize_q_table(states, actions):
    """Build a (states x actions) Q-table filled with tiny random values.

    Args:
        states: Number of rows (one per discrete state).
        actions: Number of columns (one per discrete action).

    Returns:
        A float ndarray of shape ``(states, actions)`` whose entries are drawn
        uniformly from [-0.01, 0.01) using NumPy's global random state.
    """
    init_bound = 0.01
    table_shape = (states, actions)
    return np.random.uniform(-init_bound, init_bound, table_shape)

def choose_action(state, q_table, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Row index into ``q_table``.
        q_table: 2-D array of action values; the second axis enumerates actions.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        A random action index when exploring, otherwise the index of the
        largest value in ``q_table[state]``.
    """
    exploring = random.uniform(0, 1) < epsilon
    if not exploring:
        # Exploit: take the currently best-valued action.
        return np.argmax(q_table[state])

    last_action = q_table.shape[1] - 1
    return random.randint(0, last_action)

def choose_action_mc(state, q_table, epsilon, action_space):
    """Choose an action for Monte Carlo control using an epsilon-greedy rule.

    Ties between equally valued greedy actions are broken uniformly at random.
    A plain ``np.argmax`` always returns the lowest index, so a state whose
    action values are all equal (e.g. a fresh all-zero row) would otherwise
    always map to action 0 in the greedy branch.

    Args:
        state: Key/index of the current state in ``q_table``.
        q_table: Mapping or array such that ``q_table[state]`` yields the
            action values for ``state``.
        epsilon: Probability of taking a uniformly random action.
        action_space: Number of available actions.

    Returns:
        The selected action as a built-in ``int``.
    """
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, action_space - 1)

    action_values = np.asarray(q_table[state])
    # Collect every action that attains the maximum, then pick one at random.
    best_actions = np.flatnonzero(action_values == action_values.max()).tolist()
    return int(random.choice(best_actions))

def update_q_table(q_table, state, action, reward, next_state, next_action, alpha, gamma):
    """Apply one in-place SARSA update to ``q_table``.

    The entry for ``(state, action)`` is moved a fraction ``alpha`` of the way
    towards ``reward + gamma * Q(next_state, next_action)``.

    Args:
        q_table: 2-D array of action values, modified in place.
        state, action: Indices of the transition's starting pair.
        reward: Reward observed for the transition.
        next_state, next_action: Indices of the successor pair.
        alpha: Learning rate.
        gamma: Discount factor.
    """
    # Indices may arrive as NumPy scalars or floats; coerce them to ints.
    current = (int(state), int(action))
    successor = (int(next_state), int(next_action))

    bootstrapped_return = reward + gamma * q_table[successor]
    q_table[current] += alpha * (bootstrapped_return - q_table[current])

def sarsa(env, episodes, alpha, gamma, epsilon, epsilon_decay):
    """Train a tabular action-value function with on-policy SARSA.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and discrete
            ``observation_space.n`` / ``action_space.n``. Both the 5-tuple
            ``step`` result (obs, reward, terminated, truncated, info) and
            the legacy 4-tuple (obs, reward, done, info) are accepted.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after each
            episode (floored at 0.01).

    Returns:
        ``(q_table, elapsed_time)``: the learned table and the wall-clock
        training time in seconds.

    Raises:
        ValueError: If ``env.step`` returns a tuple of unexpected length.
    """
    q_table = initialize_q_table(env.observation_space.n, env.action_space.n)
    start_time = time.time()

    for episode in range(episodes):
        state = env.reset()
        # Some API versions return (observation, info) from reset().
        if isinstance(state, tuple):
            state = state[0]

        action = choose_action(state, q_table, epsilon)
        total_reward = 0

        print(f"Episode {episode + 1}/{episodes} (SARSA)")

        while True:
            result = env.step(action)
            if len(result) == 5:
                next_state, reward, terminated, truncated, _ = result
            elif len(result) == 4:
                # Legacy API: the fourth item is the info dict, NOT a
                # truncation flag. Treating it as one ends every episode after
                # a single step whenever the dict is non-empty.
                next_state, reward, done, info = result
                # NOTE(review): legacy TimeLimit wrappers are expected to flag
                # time-outs via info["TimeLimit.truncated"] — confirm for the
                # gym version in use.
                truncated = (
                    bool(info.get("TimeLimit.truncated", False))
                    if isinstance(info, dict)
                    else False
                )
                terminated = bool(done) and not truncated
            else:
                raise ValueError("Unexpected return format from env.step()")

            if isinstance(next_state, tuple):
                next_state = next_state[0]

            next_action = choose_action(next_state, q_table, epsilon)

            # A terminal state has no future return, so do not bootstrap from
            # its (randomly initialised, never-updated) Q-values. A zero
            # discount reduces the TD target to the bare reward. Truncated
            # episodes still bootstrap because the state is not truly terminal.
            step_gamma = 0.0 if terminated else gamma
            update_q_table(q_table, state, action, reward, next_state, next_action, alpha, step_gamma)

            total_reward += reward
            state, action = next_state, next_action

            if terminated or truncated:
                print(f"Episode finished with total reward: {total_reward}\n")
                break

        epsilon = max(epsilon * epsilon_decay, 0.01)

    elapsed_time = time.time() - start_time
    print(f"Total time for SARSA: {elapsed_time:.2f} seconds")
    return q_table, elapsed_time

def monte_carlo(env, episodes, gamma, epsilon, epsilon_decay):
    """First-visit Monte Carlo control with an epsilon-greedy policy.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and a discrete
            ``action_space.n``. Both the 5-tuple ``step`` result and the
            legacy 4-tuple (obs, reward, done, info) are accepted.
        episodes: Number of episodes to run.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after each
            episode (floored at 0.01).

    Returns:
        ``(q_table, elapsed_time)``: a ``defaultdict`` mapping each seen state
        to an array of action values, and the wall-clock training time in
        seconds.

    Raises:
        ValueError: If ``env.step`` returns a tuple of unexpected length.
    """
    # Bind the action count once so the default factory does not keep a
    # reference to the environment after training.
    n_actions = env.action_space.n
    q_table = defaultdict(lambda: np.zeros(n_actions))
    returns = defaultdict(list)  # Store returns for state-action pairs
    start_time = time.time()

    for episode in range(episodes):
        state = env.reset()
        # Some API versions return (observation, info) from reset().
        if isinstance(state, tuple):
            state = state[0]

        episode_data = []  # To store state, action, reward
        total_reward = 0

        print(f"Episode {episode + 1}/{episodes} (Monte Carlo)")

        while True:
            action = choose_action_mc(state, q_table, epsilon, n_actions)
            result = env.step(action)

            if len(result) == 5:
                next_state, reward, done, truncated, _ = result
            elif len(result) == 4:
                # Legacy API: the fourth item is the info dict, NOT a
                # truncation flag; `done` already covers every episode end.
                next_state, reward, done, _ = result
                truncated = False
            else:
                raise ValueError("Unexpected return format from env.step()")

            if isinstance(next_state, tuple):
                next_state = next_state[0]

            episode_data.append((state, action, reward))
            total_reward += reward
            state = next_state

            if done or truncated:
                print(f"Episode finished with total reward: {total_reward}\n")
                break

        # Record the earliest time step at which each pair occurs. A "seen"
        # set applied while walking the episode backwards would select the
        # LAST visit of each pair instead of the first.
        first_visit = {}
        for step, (visited_state, visited_action, _) in enumerate(episode_data):
            first_visit.setdefault((visited_state, visited_action), step)

        # Walk backwards so the discounted return accumulates incrementally.
        G = 0  # Return following the current step
        for step in range(len(episode_data) - 1, -1, -1):
            visited_state, visited_action, reward = episode_data[step]
            G = reward + gamma * G
            pair = (visited_state, visited_action)
            if first_visit[pair] == step:
                returns[pair].append(G)
                q_table[visited_state][visited_action] = np.mean(returns[pair])

        epsilon = max(epsilon * epsilon_decay, 0.01)

    elapsed_time = time.time() - start_time
    print(f"Total time for Monte Carlo: {elapsed_time:.2f} seconds")
    return q_table, elapsed_time

if __name__ == "__main__":
    # Seed both random sources used by the agents (the `random` module and
    # NumPy's global state) so a Restart & Run All reproduces the same tables.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)

    env = gym.make("FrozenLake-v1", is_slippery=False, render_mode="rgb_array")

    # Hyperparameters
    episodes = 500
    alpha = 0.3  # Learning rate (SARSA only)
    gamma = 0.99  # Discount factor
    epsilon = 1.0  # Exploration rate
    epsilon_decay = 0.995

    try:
        # Run SARSA
        sarsa_q_table, sarsa_time = sarsa(env, episodes, alpha, gamma, epsilon, epsilon_decay)

        # Run Monte Carlo (starts again from the same initial epsilon; each
        # trainer decays only its own local copy)
        mc_q_table, mc_time = monte_carlo(env, episodes, gamma, epsilon, epsilon_decay)
    finally:
        # Release the environment's resources even if training fails.
        env.close()

    # Display Comparison Table
    print("\nComparison Table:")
    print("Algorithm    | Time (seconds)")
    print("------------------------------")
    print(f"SARSA        | {sarsa_time:.2f}")
    print(f"Monte Carlo  | {mc_time:.2f}")

    print("\nTrained Q-Table (SARSA):")
    print(sarsa_q_table)

    print("\nTrained Q-Table (Monte Carlo):")
    # Sort by state so the listing is stable and easy to compare with SARSA's.
    for state in sorted(mc_q_table):
        print(f"State {state}: {mc_q_table[state]}")


Episode 1/500 (SARSA)
Episode finished with total reward: 0.0

Episode 2/500 (SARSA)
Episode finished with total reward: 0.0

Episode 3/500 (SARSA)
Episode finished with total reward: 0.0

Episode 4/500 (SARSA)
Episode finished with total reward: 0.0

Episode 5/500 (SARSA)
Episode finished with total reward: 0.0

Episode 6/500 (SARSA)
Episode finished with total reward: 0.0

Episode 7/500 (SARSA)
Episode finished with total reward: 0.0

Episode 8/500 (SARSA)
Episode finished with total reward: 0.0

Episode 9/500 (SARSA)
Episode finished with total reward: 0.0

Episode 10/500 (SARSA)
Episode finished with total reward: 0.0

Episode 11/500 (SARSA)
Episode finished with total reward: 0.0

Episode 12/500 (SARSA)
Episode finished with total reward: 0.0

Episode 13/500 (SARSA)
Episode finished with total reward: 0.0

Episode 14/500 (SARSA)
Episode finished with total reward: 0.0

Episode 15/500 (SARSA)
Episode finished with total reward: 0.0

Episode 16/500 (SARSA)
Episode finished with tota

  if not isinstance(terminated, (bool, np.bool8)):


Episode finished with total reward: 0.0

Episode 447/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 448/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 449/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 450/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 451/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 452/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 453/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 454/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 455/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 456/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 457/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 458/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 459/500 (Monte Carlo)
Episode finished with total reward: 0.0

Episode 460/500 (Monte Carlo)
Episod

In [2]:
# Attempt to visualise FrozenLake in a window (not working yet in this setup).
import os

# Python does not expand shell syntax inside string literals, so a value such
# as "/tmp/runtime-$(id -u)" would be stored verbatim and point at a directory
# that does not exist. Build the per-user path explicitly, create it, and only
# override the variable when it does not already name a real directory.
if not os.path.isdir(os.environ.get("XDG_RUNTIME_DIR", "")):
    runtime_dir = f"/tmp/runtime-{os.getuid()}"
    os.makedirs(runtime_dir, mode=0o700, exist_ok=True)
    os.environ["XDG_RUNTIME_DIR"] = runtime_dir

# NOTE(review): render_mode="human" presumably needs a display server, which a
# headless notebook kernel lacks; render_mode="rgb_array" with env.render()
# returning frames is the usual headless route — confirm for this environment.
env = gym.make("FrozenLake-v1", is_slippery=False, render_mode="human")