In [1]:
import gym
import numpy as np

# Initialize the environment
env = gym.make('FrozenLake-v1', is_slippery=False)  # Setting is_slippery to False for deterministic behavior

# Set parameters
alpha = 0.8    # Learning rate
gamma = 0.95   # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 2000

# Initialize Q-table
Q = np.zeros((env.observation_space.n, env.action_space.n))

# Function for choosing an action using epsilon-greedy policy
def choose_action(state):
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(Q[state, :])

# Q-Learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    if isinstance(state, tuple):
        state = state[0]  # Ensure state is an integer
    
    done = False
    steps = 0

    while not done:
        action = choose_action(state)
        step_result = env.step(action)
        next_state, reward, done = step_result[0], step_result[1], step_result[2]
        
        if isinstance(next_state, tuple):
            next_state = next_state[0]  # Ensure next_state is an integer

        # Update Q-table
        old_value = Q[state, action]
        next_max = np.max(Q[next_state, :])
        Q[state, action] = old_value + alpha * (reward + gamma * next_max - old_value)

        state = next_state
        steps += 1

    if episode % 100 == 0:
        print(f"Episode {episode}: finished in {steps} steps")

# Display the learned Q-table
print("Learned Q-table:")
print(Q)

# Evaluate the learned policy
state = env.reset()
if isinstance(state, tuple):
    state = state[0]  # Ensure state is an integer

env.render()
done = False
total_reward = 0
while not done:
    action = choose_action(state)
    step_result = env.step(action)
    next_state, reward, done = step_result[0], step_result[1], step_result[2]
    
    if isinstance(next_state, tuple):
        next_state = next_state[0]  # Ensure next_state is an integer

    state = next_state
    total_reward += reward
    env.render()

print("Total Reward:", total_reward)


  if not isinstance(terminated, (bool, np.bool8)):


Episode 0: finished in 88 steps
Episode 100: finished in 203 steps
Episode 200: finished in 36 steps
Episode 300: finished in 12 steps
Episode 400: finished in 19 steps
Episode 500: finished in 4 steps
Episode 600: finished in 145 steps
Episode 700: finished in 60 steps
Episode 800: finished in 101 steps
Episode 900: finished in 170 steps
Episode 1000: finished in 153 steps
Episode 1100: finished in 46 steps
Episode 1200: finished in 80 steps
Episode 1300: finished in 41 steps
Episode 1400: finished in 223 steps
Episode 1500: finished in 110 steps
Episode 1600: finished in 160 steps
Episode 1700: finished in 29 steps
Episode 1800: finished in 89 steps
Episode 1900: finished in 62 steps
Learned Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Total Reward: 0.0


  logger.warn(


**Reinforcement learning is a powerful approach for solving complex decision-making problems across various domains.**