In [4]:
import numpy as np
import random

In [5]:
class Gridworld:
    """Deterministic grid environment with two special "teleport" cells.

    States are ``(row, col)`` tuples with row 0 at the top of the grid.
    From the special cells ``A`` and ``B`` every action moves the agent to
    ``A'`` / ``B'`` and pays the associated reward; every other transition
    pays 0.
    """

    def __init__(self):
        """Set up a 5x5 grid, its action set and the special-state tables."""
        self.height = 5
        self.width = 5
        self.actions = ['north', 'south', 'east', 'west']
        # Cells whose outgoing transition is overridden in step().
        self.special_states = {'A': (0, 1), 'B': (0, 3)}
        # Landing cells for the transitions out of A and B.
        self.next_to_states = {"A'": (4, 1), "B'": (2, 3)}
        # Reward paid when leaving A / B.
        self.rewards = {'A': 10, 'B': 5}

    def reset(self):
        """Return a uniformly random start state ``(row, col)`` inside the grid.

        The bounds come from ``self.height`` / ``self.width`` so the start
        state always agrees with the limits that ``step`` clamps against.
        """
        return (random.randint(0, self.height - 1),
                random.randint(0, self.width - 1))

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

        Args:
            state: current ``(row, col)`` tuple.
            action: one of ``self.actions``. Any other value leaves the
                position unchanged.

        Returns:
            A ``(next_state, reward)`` pair. Leaving ``A`` or ``B`` ignores
            the action, jumps to ``A'`` / ``B'`` and pays 10 / 5. All other
            moves pay 0; a move that would leave the grid is clamped, so the
            agent stays on the border cell.
        """
        if state == self.special_states['A']:
            return self.next_to_states["A'"], self.rewards['A']
        if state == self.special_states['B']:
            return self.next_to_states["B'"], self.rewards['B']

        i, j = state
        if action == 'north':
            i = max(i - 1, 0)
        elif action == 'south':
            i = min(i + 1, self.height - 1)
        elif action == 'west':
            j = max(j - 1, 0)
        elif action == 'east':
            j = min(j + 1, self.width - 1)

        return (i, j), 0

## Q-Learning Implementation

In [6]:
# Q-learning parameters
gamma = 0.9      # discount factor applied to the bootstrapped next-state value
epsilon = 0.1    # probability of taking a random (exploratory) action
alpha = 0.2      # learning rate / step size of the Q update
episodes = 5000  # number of random restarts
steps = 5000     # transitions per episode; step() returns no terminal signal, so episodes are cut off here
seed = 42        # fixed RNG seed so a fresh-kernel run repeats the same trajectory

# Seed the stdlib RNG: it drives Gridworld.reset() and both random draws in the loop below.
random.seed(seed)

grid = Gridworld()
print("Initializing Gridworld...")
print(f"Grid size: {grid.height}x{grid.width}")
print(f"Special_states = {grid.special_states}")
print(f"Next_to_states = {grid.next_to_states}")
print(f"Special_rewards = {grid.rewards}")
print("Starting Q-learning with parameters:")
print(f"  γ = {gamma}")
print(f"  ε = {epsilon}")
print(f"  α = {alpha}")
print(f"  Episodes = {episodes}")
print(f"  Steps = {steps}")

# Initialize Q-table: Q[(row, col)][action] -> estimated action value, all zeros to start.
Q = {(i, j): {a: 0 for a in grid.actions} for i in range(grid.height) for j in range(grid.width)}

# Run Q-learning
for ep in range(episodes):
    state = grid.reset()
    for _ in range(steps):
        action_values = Q[state]
        # Epsilon-greedy behaviour policy. Ties in the greedy branch resolve to
        # the first maximal action in dict order, i.e. the order of grid.actions.
        if random.random() < epsilon:
            action = random.choice(grid.actions)
        else:
            action = max(action_values, key=action_values.get)

        next_state, reward = grid.step(state, action)
        # Off-policy target: bootstrap from the best action value in the next state.
        best_next = max(Q[next_state].values())
        action_values[action] += alpha * (reward + gamma * best_next - action_values[action])
        state = next_state


Initializing Gridworld...
Grid size: 5x5
Special_states = {'A': (0, 1), 'B': (0, 3)}
Next_to_states = {"A'": (4, 1), "B'": (2, 3)}
Special_rewards = {'A': 10, 'B': 5}
Starting Q-learning with parameters:
  γ = 0.9
  ε = 0.1
  α = 0.2
  Episodes = 5000
  Steps = 5000


## Results: Value Function & Policy

In [7]:
print("\nEvaluating optimal value function and policy...")
arrows = {'north':'↑','south':'↓','east':'→','west':'←'}
grid_shape = (grid.height, grid.width)
value_function = np.zeros(grid_shape)
policy = np.empty(grid_shape, dtype=object)
policy_arrows = np.empty(grid_shape, dtype=object)

# Greedy read-out of the learned table: for each cell keep the highest-valued
# action, its value, and the matching arrow glyph.
for r in range(grid.height):
    for c in range(grid.width):
        action_values = Q[(r, c)]
        greedy_action = max(action_values, key=action_values.get)
        value_function[r, c] = action_values[greedy_action]
        policy[r, c] = greedy_action
        policy_arrows[r, c] = arrows[greedy_action]

# Each table is printed one grid row per line, cells separated by two spaces.
print("\nOptimal Value Function:")
for values in value_function:
    print(*(f"{v:.2f}" for v in values), sep='  ')

print("\nOptimal Policy:")
for names in policy:
    print(*names, sep='  ')

print("\nOptimal Policy (arrows):")
for glyphs in policy_arrows:
    print(*glyphs, sep='  ')



Evaluating optimal value function and policy...

Optimal Value Function:
21.98  24.42  21.98  19.42  17.48
19.78  21.98  19.78  17.80  16.02
17.80  19.78  17.80  16.02  14.42
16.02  17.80  16.02  14.42  12.98
14.42  16.02  14.42  12.98  11.68

Optimal Policy:
east  north  west  north  west
north  north  north  west  west
north  north  north  north  west
north  north  north  north  north
north  north  north  north  north

Optimal Policy (arrows):
→  ↑  ←  ↑  ←
↑  ↑  ↑  ←  ←
↑  ↑  ↑  ↑  ←
↑  ↑  ↑  ↑  ↑
↑  ↑  ↑  ↑  ↑
