# Q-learning on a 5×5 Gridworld
This notebook implements Q-learning to approximate the optimal value function and policy.

In [1]:
from IPython import get_ipython
from IPython.display import display

import numpy as np
import matplotlib.pyplot as plt

In [2]:
# --- Environment layout ------------------------------------------------------
# States are (x, y) tuples; x is bounded by grid_size[0], y by grid_size[1].
grid_size = (5, 5)
special_states = {'A': (0, 1), 'B': (0, 3)}
# NOTE(review): presumably the cells that A and B jump to (keys are the
# special-state names with a trailing apostrophe) -- confirm against the
# transition function.
next_states = {"A'": (4, 1), "B'": (2, 3)}
special_rewards = {'A': 10, 'B': 5}

# --- Q-learning hyperparameters ----------------------------------------------
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate
alpha = 0.2  # Learning rate
episodes = 5000
steps_per_episode = 5000

# --- Reproducibility -----------------------------------------------------------
# Start states and epsilon-greedy exploration draw from NumPy's global RNG;
# seeding it here makes a "Restart Kernel -> Run All" produce the same Q-table.
random_seed = 42
np.random.seed(random_seed)

# --- Action set and value table ----------------------------------------------
actions = ['north', 'south', 'east', 'west']
# One Q-value per (x, y, action); the last axis follows the order of `actions`.
Q_table = np.zeros((*grid_size, len(actions)))

In [3]:
def get_next_state(state, action):
    """Return the state reached by taking ``action`` in ``state``.

    Transition rules:

    * If ``state`` is one of ``special_states`` (e.g. ``'A'``), every action
      jumps to the matching entry of ``next_states`` (e.g. ``"A'"``). Without
      this rule the special cells behave like ordinary cells and an agent can
      remain in one of them by repeatedly walking into the wall.
    * Otherwise the agent moves one cell: 'north'/'south' decrease/increase
      the first coordinate, 'west'/'east' decrease/increase the second.
      A move that would leave the grid leaves the state unchanged.
    * An unrecognised action leaves the state unchanged.

    Args:
        state: ``(x, y)`` tuple of grid coordinates.
        action: One of 'north', 'south', 'east', 'west'.

    Returns:
        The successor state as an ``(x, y)`` tuple.
    """
    # Special cells override normal movement. Jump targets are keyed by the
    # special state's name followed by an apostrophe ('A' -> "A'").
    for label, position in special_states.items():
        if position == state:
            target = next_states.get(label + "'")
            if target is not None:
                return target
            # No jump target configured: fall back to ordinary movement.
            break

    x, y = state
    if action == 'north' and x > 0:
        x -= 1
    elif action == 'south' and x < grid_size[0] - 1:
        x += 1
    elif action == 'east' and y < grid_size[1] - 1:
        y += 1
    elif action == 'west' and y > 0:
        y -= 1
    return (x, y)

def get_reward(state):
    """Look up the reward attached to ``state``.

    A state listed in ``special_states`` yields its entry in
    ``special_rewards``. The cells ``(0, 0)`` and
    ``(grid_size[0] - 1, grid_size[1] - 1)`` yield -1. Every other state
    yields 0.
    """
    # Special cells take precedence over the corner penalty.
    for label, position in special_states.items():
        if position == state:
            return special_rewards[label]

    penalised_corners = [(0, 0), (grid_size[0] - 1, grid_size[1] - 1)]
    if state in penalised_corners:
        return -1
    return 0


In [None]:
# NOTE(review): an identical training cell appears again further down, after
# the parameter banner. Q_table is not reset in between, so on a full run the
# later cell continues training from the values learned here.
for episode in range(episodes):
    # Every episode starts in a randomly drawn cell.
    state = (np.random.randint(grid_size[0]), np.random.randint(grid_size[1]))

    for step in range(steps_per_episode):
        # Epsilon-greedy selection: explore with probability epsilon,
        # otherwise take the action with the highest current Q-value.
        explore = np.random.uniform(0, 1) < epsilon
        if explore:
            action = np.random.choice(actions)
        else:
            action = actions[np.argmax(Q_table[state])]
        action_idx = actions.index(action)

        # The reward is determined by the cell the agent lands in.
        next_state = get_next_state(state, action)
        reward = get_reward(next_state)

        # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a').
        td_target = reward + gamma * np.max(Q_table[next_state])
        td_error = td_target - Q_table[state][action_idx]
        Q_table[state][action_idx] += alpha * td_error

        state = next_state

In [None]:
# Run banner: each tuple holds the positional arguments of one print() call,
# so the output is space-separated exactly as separate print() calls would be.
banner_lines = [
    ("Initializing Gridworld...",),
    ("Grid size:", grid_size[0], "x", grid_size[1]),
    ("Special_states =", special_states),
    ("Next_to_states =", next_states),
    ("Special_rewards =", special_rewards),
    ("Starting Q-learning with parameters:",),
    ("  γ =", gamma),
    ("  ε =", epsilon),
    ("  α =", alpha),
    ("  Episodes =", episodes),
    ("  Steps =", steps_per_episode),
    (),  # trailing blank line
]
for parts in banner_lines:
    print(*parts)

In [None]:
# NOTE(review): this repeats the training cell that precedes the parameter
# banner. Q_table is not re-initialised here, so this pass refines whatever
# values that earlier cell left behind.
n_rows, n_cols = grid_size
for episode in range(episodes):
    state = (np.random.randint(n_rows), np.random.randint(n_cols))
    for step in range(steps_per_episode):
        # View into the table: writes to q_row land in Q_table[state].
        q_row = Q_table[state]

        # Epsilon-greedy: exploit unless the draw falls below epsilon.
        if np.random.uniform(0, 1) >= epsilon:
            action = actions[np.argmax(q_row)]
        else:
            action = np.random.choice(actions)
        chosen = actions.index(action)

        next_state = get_next_state(state, action)
        reward = get_reward(next_state)

        # Bootstrapped target uses the best Q-value available in next_state.
        best_next = np.max(Q_table[next_state])
        q_row[chosen] += alpha * (reward + gamma * best_next - q_row[chosen])

        state = next_state  # Continue from where the agent landed

In [None]:
# Greedy policy: for every cell, the name of the action with the largest
# Q-value (ties resolve to the earliest entry in `actions`).
greedy_names = []
for cell in np.ndindex(grid_size):
    best_idx = np.argmax(Q_table[cell])
    greedy_names.append(actions[best_idx])
optimal_policy = np.array(greedy_names)
policy_grid = optimal_policy.reshape(grid_size)

# State values are the per-cell maximum over the action axis.
state_values = Q_table.max(axis=-1)

print("Evaluating optimal value function and policy...")
print("Optimal Value Function:")
print(np.array_str(state_values, precision=2, suppress_small=True))
print()
print("Optimal Policy:")
print(np.array_str(policy_grid, max_line_width=100))  # Wide enough for one grid row per line
print()

# Same policy rendered as arrows, one text row per grid row.
policy_arrows = {'north': '↑', 'south': '↓', 'east': '→', 'west': '←'}
visual_policy = np.array(
    [[policy_arrows[name] for name in grid_row] for grid_row in policy_grid]
)

print("Optimal Policy (arrows):")
print("\n".join(" ".join(arrow_row) for arrow_row in visual_policy))