# Main

In [1]:
!pip install swig
!pip install gymnasium

import gymnasium as gym
import numpy as np
import random



In [2]:
# --- Q-learning update ---
ALPHA = 0.8    # learning rate (given)
GAMMA = 0.95   # discount factor (given)

# --- Exploration schedule: epsilon decays exponentially per episode ---
EPS_START = 1.0      # epsilon used for the very first episode
EPS_END = 0.05       # floor that epsilon never drops below
EPS_DECAY = 0.9995   # per-episode multiplicative factor; close to 1 => slow decay

# --- Training budget ---
EPISODES = 8_000   # number of training episodes
MAX_STEPS = 100    # cap on the number of steps taken in one episode

In [3]:
# Build the 8x8 FrozenLake map with slipping disabled
env = gym.make(
    "FrozenLake-v1",
    map_name="8x8",
    is_slippery=False,
)

# Sizes of the discrete observation and action spaces
state_space = env.observation_space.n
action_space = env.action_space.n

# Q-table: one row per state, one column per action, every entry starts at zero
Q = np.zeros(shape=(state_space, action_space))

## Training

In [4]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads env, Q, ALPHA, GAMMA, EPS_START, EPS_END, EPS_DECAY, EPISODES and
# MAX_STEPS from the cells above, updates Q in place, and records the
# (shaped) return of every episode in rewards_all_episodes.

SEED = 42                    # fixed seed so that Restart & Run All repeats the same run
random.seed(SEED)            # drives the explore/exploit draw below
env.action_space.seed(SEED)  # drives env.action_space.sample()

# FrozenLake pays 0 both for an ordinary move and for falling into a hole.
# A small negative reward on the fatal step lets the agent tell the two apart.
HOLE_PENALTY = -0.01

rewards_all_episodes = []

for episode in range(EPISODES):
    # Start the episode; the environment's own RNG is seeded on the first reset only
    state, _ = env.reset(seed=SEED if episode == 0 else None)
    done = False
    total_reward = 0

    # Exponentially decayed epsilon for this episode, clamped at EPS_END
    epsilon = max(EPS_END, EPS_START * (EPS_DECAY ** episode))

    for step in range(MAX_STEPS):
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()   # Exploration
        else:
            action = np.argmax(Q[state])         # Exploitation

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Penalise only a real terminal failure. `truncated` means the episode
        # was cut off by a time limit, not that the agent fell into a hole, so
        # it must not receive the hole penalty.
        if terminated and reward == 0:
            reward = HOLE_PENALTY

        # A terminal state has no future return, so do not bootstrap from it.
        # A truncated (non-terminal) step still bootstraps as usual.
        future_value = 0.0 if terminated else np.max(Q[next_state])

        # Temporal Difference error
        td_error = reward + GAMMA * future_value - Q[state, action]

        # Q-value update
        Q[state, action] += ALPHA * td_error

        state = next_state
        total_reward += reward

        if done:
            break

    rewards_all_episodes.append(total_reward)

print("Training finished.")

Training finished.


## Saving Q-table

In [5]:
# Write the trained Q-table to disk, then read it straight back so the cells
# below work with exactly what was persisted.
np.save(file="frozenlake_qtable.npy", arr=Q)
Q = np.load(file="frozenlake_qtable.npy")

## Testing

In [6]:
# Evaluate the learned Q-table with a purely greedy policy (no exploration)
test_episodes = 100
successes = 0

for episode in range(test_episodes):
    state, _ = env.reset()
    done = False

    for step in range(MAX_STEPS):
        action = np.argmax(Q[state])  # always the highest-valued action
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        if not done:
            continue

        # Episode is over: a final reward of 1 is counted as a success
        successes += int(reward == 1)
        break

print(f"Success rate: {successes}/{test_episodes}")
# Note: the policy is greedy and the environment was created with
#       is_slippery=False, so nothing is random here: the agent follows the
#       same path in every test episode, which is why the rate is 100%.

Success rate: 100/100


# Extra Visualisation

In [7]:
# Show the sizes of the two spaces and the tile layout of the map
print("State space:", state_space)
print("Action space:", action_space)
print("Environment:")
desc = env.unwrapped.desc
for tiles in desc:
    # Each tile is stored as bytes; decode it to a one-letter string
    letters = [tile.decode("utf-8") for tile in tiles]
    print("\t", " ".join(letters))
print("\nKey: \tS -> Start\n\tF -> Frozen\n\tH -> Hole\n\tG -> Goal")

State space: 64
Action space: 4
Environment:
	 S F F F F F F F
	 F F F F F F F F
	 F F F H F F F F
	 F F F F F H F F
	 F F F H F F F F
	 F H H F F F H F
	 F H F F H F H F
	 F F F H F F F G

Key: 	S -> Start
	F -> Frozen
	H -> Hole
	G -> Goal


In [8]:
# Policy Visualisation
# One arrow per action index: 0 = left, 1 = down, 2 = right, 3 = up
actions = ["←", "↓", "→", "↑"]

# Take the grid shape from the map itself rather than hard-coding 8x8,
# so this cell keeps working if the environment is created with another map.
n_rows, n_cols = env.unwrapped.desc.shape

policy = np.array([actions[np.argmax(Q[state])] for state in range(state_space)])
policy = policy.reshape((n_rows, n_cols))

# NOTE: a state whose Q-values are all equal (e.g. never updated: holes, the
# goal, unvisited tiles) shows "←" only because argmax returns the first index
# on ties; the arrow carries no information there.
print("Optimal Policy:")
for row in policy:
    print("\t", " ".join(row))

Optimal Policy:
	 ↓ ↓ ← ← ← ← ← ←
	 ↓ ↓ ← ← ← ↓ ← ←
	 ↓ ↓ ↓ ← ↓ ← ← ←
	 → → → → ↓ ← ↓ ↓
	 ↑ ← ↑ ← → → → ↓
	 ← ← ← → ↑ ↑ ← ↓
	 ↓ ← ↓ ← ← ↓ ← ↓
	 → ← ← ← → ← ← ←


In [9]:
# Dump the Q-table, one line per state, rounded to three decimals and
# without scientific notation
np.set_printoptions(suppress=True, precision=3)

for state, action_values in enumerate(Q):
    print(f"State {state}: {action_values}")

State 0: [0.488 0.513 0.513 0.488]
State 1: [0.488 0.54  0.488 0.513]
State 2: [0.513 0.    0.    0.39 ]
State 3: [0.39 0.   0.   0.  ]
State 4: [0. 0. 0. 0.]
State 5: [ 0.  0. -0.  0.]
State 6: [ 0.  0. -0.  0.]
State 7: [ 0. -0.  0.  0.]
State 8: [0.513 0.54  0.54  0.488]
State 9: [0.513 0.569 0.513 0.513]
State 10: [0.54  0.    0.462 0.468]
State 11: [ 0.513 -0.01   0.     0.   ]
State 12: [0.452 0.    0.    0.   ]
State 13: [-0.  0.  0.  0.]
State 14: [0. 0. 0. 0.]
State 15: [ 0.  0. -0. -0.]
State 16: [0.54  0.569 0.569 0.513]
State 17: [0.54  0.599 0.599 0.54 ]
State 18: [ 0.569  0.63  -0.01   0.509]
State 19: [0. 0. 0. 0.]
State 20: [-0.01   0.698  0.     0.237]
State 21: [ 0.635 -0.01   0.     0.   ]
State 22: [0.563 0.    0.    0.   ]
State 23: [0.304 0.    0.    0.   ]
State 24: [0.569 0.54  0.599 0.54 ]
State 25: [0.569 0.513 0.63  0.569]
State 26: [0.599 0.599 0.663 0.599]
State 27: [ 0.63  -0.01   0.698 -0.01 ]
State 28: [ 0.663  0.735 -0.01   0.663]
State 29: [0. 0. 0. 0.