In [7]:
import random

import gymnasium as gym
import numpy as np

# One seed for every source of randomness this notebook touches. Seeding only
# NumPy is not enough: the training loop draws its epsilon test from Python's
# `random` module, samples exploratory actions from the action space, and the
# slippery environment has its own transition RNG.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Slippery 4x4 FrozenLake.
# NOTE(review): success_rate is presumably the probability that the chosen
# move is the one actually executed, and reward_schedule presumably lists the
# (goal, hole, frozen) rewards -- confirm against the installed Gymnasium docs.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=True,
    success_rate=1.0/3.0,
    reward_schedule=(1, 0, 0)
)

# Seed the environment and its action-space sampler once, right after
# construction. Later `env.reset()` calls without a seed keep drawing from the
# same stream, so the whole run becomes repeatable after a kernel restart.
env.action_space.seed(SEED)
env.reset(seed=SEED)

In [8]:
# Initialization
# Initialize Q(s, a) arbitrarily for all s in S, a in A(s), except that
# Q(terminal, .) <- 0. An all-zero table satisfies both requirements.
#
# The table dimensions are read from the environment instead of being
# hard-coded, so the table stays the right size if the map is changed in the
# environment-creation cell. For the 4x4 map this is still a (16, 4) array.
n_states = env.observation_space.n
n_actions = env.action_space.n

Q = np.zeros((n_states, n_actions))
print(Q)



[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [9]:
# Q-learning hyperparameters read by the training loop.
alpha = 0.8      # learning rate: fraction of the TD error applied on each update
gamma = 0.95     # discount factor applied to the bootstrapped next-state value
epsilon = 0.5    # constant epsilon (epsilon-greedy): probability of taking a random action
# NOTE(review): the environment is created with is_slippery=True, so
# transitions are stochastic; a constant learning rate this high makes each
# update largely overwrite the previous estimate. Consider a smaller or
# decaying alpha (and a decaying epsilon) if the learned values look noisy.

episodes = 20000  # number of training episodes
max_steps = 100   # upper bound on steps taken within a single episode


In [10]:
# Tabular Q-learning with a constant-epsilon behaviour policy.
#
# Reproducibility: the notebook seeds NumPy's global RNG, so both exploration
# draws below come from `np.random` rather than from the unseeded `random`
# module / action-space sampler. The environment's own transition RNG is
# seeded on the first reset only; later resets continue the same stream.
ENV_SEED = 0

for ep in range(episodes):
    # Initialize state S (seed the environment once, on the first episode).
    state, _ = env.reset(seed=ENV_SEED if ep == 0 else None)

    for step in range(max_steps):

        # Epsilon-greedy action selection: explore with probability epsilon,
        # otherwise act greedily with respect to the current Q-table.
        if np.random.random() < epsilon:
            action = np.random.randint(Q.shape[1])
        else:
            action = np.argmax(Q[state])

        # `done` is Gymnasium's `terminated` flag (the episode really ended);
        # `truncated` signals a time-limit cut-off.
        next_state, reward, done, truncated, _ = env.step(action)

        # Q-learning update rule. A terminal state has no future return, so
        # the bootstrap term is dropped explicitly instead of relying on the
        # terminal rows of Q happening to be zero. On truncation the episode
        # was merely cut short, so bootstrapping from next_state is kept.
        bootstrap = 0.0 if done else gamma * np.max(Q[next_state])
        Q[state, action] += alpha * (reward + bootstrap - Q[state, action])

        state = next_state

        if done or truncated:
            break

print("Training completed")

Training completed


In [12]:
# Persist the learned Q-table to disk in NumPy's .npy format, then echo a
# confirmation message followed by the table itself.
np.save(file="frozenlake_qtable.npy", arr=Q)
print("Q-table saved successfully", Q, sep="\n")


Q-table saved successfully
[[2.74850399e-01 2.74700879e-01 2.88910771e-01 2.72114709e-01]
 [2.65323545e-01 2.70840352e-01 4.42524648e-02 2.86234344e-01]
 [1.29237147e-01 1.56062490e-01 2.02313039e-01 2.02887432e-01]
 [1.40588920e-01 1.75878060e-01 1.69779874e-01 1.75958008e-01]
 [2.95182790e-01 3.22266128e-01 1.06576018e-02 3.05140109e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.97274340e-01 2.62845161e-03 2.54758455e-03 4.22651930e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.53024709e-02 6.86910072e-02 2.90499032e-01 3.20328541e-01]
 [1.58373835e-01 3.67716678e-01 4.48770711e-03 5.79358263e-02]
 [1.19256066e-01 6.01361220e-03 1.55023581e-01 1.13469269e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.25598509e-01 1.79460232e-01 6.42308014e-01 1.66967032e-01]
 [8.08119503e-01 9.45053962e-01 9.86559432e-01 8.62865062e-01]
 [0.00000000e+00 0.00000000e