In [4]:
import gym
import numpy as np

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Set the parameters
num_episodes = 10000
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1

# Initialize the Q-table
state_space = env.observation_space.n
action_space = env.action_space.n
q_table = np.zeros((state_space, action_space))

# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False

    while not done:
        if np.random.uniform(0, 1) < epsilon:
            # Explore: Choose a random action
            action = env.action_space.sample()
        else:
            # Exploit: Choose the action with the highest Q-value
            action = np.argmax(q_table[state, :])

        next_state, reward, done, _ = env.step(action)

        # Update the Q-table using the Q-learning update rule
        q_table[state, action] += learning_rate * (
                reward + discount_factor * np.max(q_table[next_state, :]) - q_table[state, action])

        state = next_state

# Print the learned Q-table
print(q_table)

# Close the environment
env.close()


[[0.48130663 0.45538179 0.40459107 0.4269724 ]
 [0.28364326 0.2972368  0.21366365 0.39359471]
 [0.31075468 0.27339823 0.26223331 0.28590036]
 [0.19141684 0.07169671 0.05037005 0.09195343]
 [0.49085176 0.30737112 0.38486896 0.29584442]
 [0.         0.         0.         0.        ]
 [0.18995486 0.18240083 0.12137764 0.18638361]
 [0.         0.         0.         0.        ]
 [0.30108276 0.29479794 0.35622174 0.53969676]
 [0.47245904 0.6311741  0.49260539 0.40020258]
 [0.60595428 0.38741066 0.33555638 0.25839696]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.25874135 0.55277729 0.74046285 0.58866823]
 [0.67387712 0.8852763  0.64569    0.74526304]
 [0.         0.         0.         0.        ]]


In [None]:
import gym
import numpy as np

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Set the parameters
num_episodes = 500
learning_rate = 0.1
discount_factor = 0.99
epsilon = 0.1

# Initialize the Q-table
state_space = env.observation_space.n
action_space = env.action_space.n
q_table = np.zeros((state_space, action_space))

# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False

    while not done:
        if np.random.uniform(0, 1) < epsilon:
            # Explore: Choose a random action
            action = env.action_space.sample()
        else:
            # Exploit: Choose the action with the highest Q-value
            action = np.argmax(q_table[state, :])

        next_state, reward, done, _ = env.step(action)

        # Update the Q-table using the Q-learning update rule
        q_table[state, action] += learning_rate * (
                reward + discount_factor * np.max(q_table[next_state, :]) - q_table[state, action])

        state = next_state

# Print the learned Q-table
print(q_table)

# Close the environment
env.close()


In [3]:
import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1')

# Set the parameters
num_episodes = 1000
max_steps = 100
epsilon = 1.0
epsilon_decay = 0.99
gamma = 0.99
batch_size = 32

# Define the DQN model
model = Sequential()
model.add(Dense(64, input_dim=1, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(env.action_space.n, activation='linear'))
model.compile(loss='mse', optimizer=Adam())

# DQN algorithm
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False

    for step in range(max_steps):
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()
        else:
            q_values = model.predict(state)
            action = np.argmax(q_values[0])

        next_state, reward, done, _ = env.step(action)

        target = reward
        if not done:
            next_q_values = model.predict(next_state)
            target = reward + gamma * np.max(next_q_values[0])

        q_values = model.predict(state)
        q_values[0][action] = target

        model.fit(np.reshape(state, (1, 16)), q_values, verbose=0)

        state = next_state
        total_reward += reward

        if done:
            break

    epsilon *= epsilon_decay

    if episode % 100 == 0:
        print("Episode:", episode, "Total Reward:", total_reward)

# Evaluate the trained model
total_rewards = []
num_evaluations = 100
for _ in range(num_evaluations):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        q_values = model.predict(np.reshape(state, (1, 16)))
        action = np.argmax(q_values[0])
        state, reward, done, _ = env.step(action)
        total_reward += reward

    total_rewards.append(total_reward)

print("Average Reward:", np.mean(total_rewards))

# Close the environment
env.close()


IndexError: ignored