# In [1]:
import numpy as np
import random

# Environment settings: a square grid and the four moves the agent can attempt
actions = ["up", "down", "left", "right"]
num_actions = len(actions)
grid_size = 5

# Parameters
num_episodes = 500  # Number of training episodes
alpha, gamma, epsilon = 0.1, 0.9, 0.2  # Learning rate, discount factor, exploration rate

# Q-Table initialization: one entry per (row, column, action), all starting at zero
q_table = np.zeros(shape=(grid_size, grid_size, num_actions))

# Reward function (the goal is the bottom-right cell of the grid)
def get_reward(state):
    """Return the reward for arriving at ``state``.

    ``state`` is compared with the tuple ``(grid_size - 1, grid_size - 1)``.
    A match yields the goal reward of 10; every other state yields the
    step penalty of -1.
    """
    goal = (grid_size - 1, grid_size - 1)
    return 10 if state == goal else -1

# Q-Learning
# (row, column) offset applied by each action name.
_MOVE_OFFSETS = {"up": (-1, 0), "down": (1, 0), "left": (0, -1), "right": (0, 1)}


def _pick_action_index(position):
    """Choose an action index for ``position`` with an epsilon-greedy rule.

    A uniform draw in [0, 1] is compared with ``epsilon``: below it a random
    action index is returned, otherwise the argmax of the Q-values stored
    for ``position``.
    """
    if random.uniform(0, 1) >= epsilon:
        return np.argmax(q_table[position[0], position[1]])  # Exploit
    return random.randint(0, num_actions - 1)  # Explore


def _apply_move(position, move_name):
    """Return the cell reached from ``position`` after ``move_name``.

    A move that would leave the grid keeps the agent where it is.
    """
    d_row, d_col = _MOVE_OFFSETS.get(move_name, (0, 0))
    target = (position[0] + d_row, position[1] + d_col)
    inside = 0 <= target[0] < grid_size and 0 <= target[1] < grid_size
    return target if inside else position


for episode in range(num_episodes):
    state = (0, 0)  # Every episode starts in the top-left cell
    done = False
    while not done:
        action_idx = _pick_action_index(state)
        action = actions[action_idx]

        # Take the step and collect the reward of the cell we land on
        next_state = _apply_move(state, action)
        reward = get_reward(next_state)

        # Q-Table update: nudge the visited entry toward
        # reward + gamma * (best Q-value available from the next state)
        best_future_q = np.max(q_table[next_state[0], next_state[1]])
        q_table[state[0], state[1], action_idx] += alpha * (
            reward + gamma * best_future_q - q_table[state[0], state[1], action_idx]
        )

        state = next_state

        # The episode ends once the bottom-right cell is reached
        done = state == (grid_size - 1, grid_size - 1)

print("Training Complete!")

# Output: Training Complete!
