# Ambiente GYM
https://gymnasium.farama.org/

In [1]:
# MONTE CARLO POLICY EVALUATION
# Episódios inteiros e cálculo do valor do estado de trás para frente

import gymnasium as gym
import numpy as np

# Initialise the environment and variables
env = gym.make("FrozenLake-v1", is_slippery=False)
V = np.zeros(16) # Value function
returns = [[] for _ in range(16)]  # List of returns for each state
discount = 0.98
max_episodes = 10000
episode_counter = 0 # Episode counter

def get_action(observation):
    # Create the policy (random policy)
    return env.action_space.sample()

def value_update(trajectory):
    # First-visit Monte Carlo update
    global V, returns
    visited = set()
    G = 0

    # Go backwards through the trajectory
    for s, a, r, s1 in reversed(trajectory):
        G = r + discount * G
        if s not in visited:
            returns[s].append(G)
            V[s] = np.mean(returns[s])
            visited.add(s)

while True:
    # Reset trajectory for the new episode
    trajectory = []
    observation, info = env.reset()
    s = observation

    while True:
        action = get_action(s)
        observation, reward, terminated, truncated, info = env.step(action)
        s1 = observation
        trajectory.append((s, action, reward, s1))
        if terminated or truncated:
            value_update(trajectory)
            break
        s = s1
        
    episode_counter += 1
    if episode_counter >= max_episodes:
        break

env.close()

print("V as 4x4 matrix:\n", V.reshape(4, 4))

V as 4x4 matrix:
 [[0.01015825 0.00929785 0.01768505 0.01127781]
 [0.01213049 0.         0.03658819 0.        ]
 [0.02648688 0.07281767 0.13098602 0.        ]
 [0.         0.16393868 0.41979522 0.        ]]
