# In [None]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import matplotlib.pyplot as plt
import numpy as np

# Build a random 4x4 lake layout, then create the slippery FrozenLake on it.
# NOTE(review): render_mode='human' presumably draws every step, which will
# likely make the 5000 training episodes below very slow -- confirm whether
# rendering is wanted during training.
lake_layout = generate_random_map(size=4)
env = gym.make(
    'FrozenLake-v1',
    desc=lake_layout,
    is_slippery=True,
    render_mode='human',
)
observation, info = env.reset()

# State-value table: one estimate per discrete state, all starting at zero.
num_states = env.observation_space.n
VP = np.zeros(num_states)

# Epsilon-greedy policy definition
def epsilon_greedy_policy(state, VP, epsilon=0.5, gamma=0.9, model=None):
    """Pick an action epsilon-greedily with respect to a state-value table.

    With probability ``epsilon`` a random action is sampled from the
    environment's action space. Otherwise the action with the highest
    one-step expected return under the transition model is returned.

    ``VP`` holds one value per *state*, so ``VP[state]`` is a scalar and
    cannot be arg-maxed over actions (that always yields 0). The greedy
    action is therefore derived by looking one step ahead through the model.

    Args:
        state: Index of the current state.
        VP: Indexable collection of state values (``VP[s]`` is the value of s).
        epsilon: Probability of taking a random (exploratory) action.
        gamma: Discount factor applied to the successor state's value.
        model: Transition table where ``model[state][action]`` is an iterable
            of ``(probability, next_state, reward, terminated)`` tuples.
            Defaults to ``env.unwrapped.P`` -- assumed to be the layout
            Gymnasium's FrozenLake exposes; verify against the installed
            version.

    Returns:
        The chosen action. Ties between equally good actions resolve to the
        lowest-numbered action.
    """
    if np.random.rand() < epsilon:
        return env.action_space.sample()  # Choose random action (explore)

    if model is None:
        model = env.unwrapped.P

    outcomes_by_action = model[state]
    actions = sorted(outcomes_by_action)

    # Expected one-step return per action; a terminating transition
    # contributes only its immediate reward (no bootstrap).
    expected_returns = [
        sum(
            prob * (reward + gamma * VP[successor] * (not terminated))
            for prob, successor, reward, terminated in outcomes_by_action[action]
        )
        for action in actions
    ]
    return actions[int(np.argmax(expected_returns))]  # Exploit

# Learning hyperparameters:
#   alpha -- step size applied to the TD error
#   gamma -- discount factor applied to the next state's value
alpha, gamma = 0.5, 0.9

# Snapshot of the value table taken after every episode
values_over_time: list = []

# Run multiple episodes of TD(0) state-value learning
for episode in range(5000):
    state, info = env.reset()  # Reset environment at the start of each episode
    done = False

    while not done:  # Continue until the episode is done
        action = epsilon_greedy_policy(state, VP)  # Choose action using epsilon-greedy policy
        next_state, reward, terminated, truncated, _ = env.step(action)  # Take action and observe result

        done = terminated or truncated  # Either flag ends the episode loop

        # TD(0) target. Only a genuine termination has no future value; an
        # episode that was merely truncated still bootstraps from next_state.
        future_value = 0.0 if terminated else gamma * VP[next_state]

        # Always move VP[state] a fraction alpha towards the target. Assigning
        # the reward directly on the last step would let a single stochastic
        # transition overwrite everything learned so far for this state.
        VP[state] += alpha * (reward + future_value - VP[state])

        # Log before advancing, so "State" is the state that was just updated
        print(f"State: {state}, Next State: {next_state}, Reward: {reward}, VP[{state}]: {VP[state]}")

        # Move to the next state
        state = next_state

    # Track value updates after each episode
    values_over_time.append(VP.copy())

# Plot the value function snapshot taken after every 1000th episode
for snapshot_idx in range(999, len(values_over_time), 1000):
    plt.plot(values_over_time[snapshot_idx], label=f'Episode {snapshot_idx + 1}')

plt.xlabel('State')
plt.ylabel('Value')
plt.title('Value Function Over Time')
plt.legend()  # Without this the per-curve labels set above are never displayed
plt.show()

# Close environment
env.close()