In [1]:
import random
import numpy as np

# Maze grid, indexed as maze[x][y] where x is the row and y is the column.
# 0 marks an open cell; 1 marks a wall (see is_valid_state).
maze = [
    [0, 0, 0, 0, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 0, 0, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 0, 0, 0]
]

# Start and goal positions as (x, y) = (row, column) tuples.
start = (0, 0)
goal = (4, 4)

# Q-learning parameters
learning_rate = 0.1  # step size applied to each temporal-difference update
discount_factor = 0.9  # weight of the best next-state value in the update target
epsilon = 0.1  # probability of picking a random action instead of the greedy one

# Each action is a (dx, dy) offset added to the (x, y) state. Because cells are
# looked up as maze[x][y], dx moves between rows and dy moves between columns,
# so in grid terms the order is: 0 = left, 1 = right, 2 = up, 3 = down.
actions = [(0, -1), (0, 1), (-1, 0), (1, 0)]
num_actions = len(actions)  # number of columns in the Q-table

# Initialize the Q-table with zeros: one row per grid cell (wall cells are
# counted too) and one column per action.
num_states = len(maze) * len(maze[0])
Q = np.zeros((num_states, num_actions))

# Flatten an (x, y) grid position into a Q-table row number
def state_to_index(state):
    """Return the row-major index of ``state`` in the Q-table.

    ``state`` is an (x, y) pair; the index is ``x * <columns in maze> + y``.
    """
    row, col = state
    n_cols = len(maze[0])
    return row * n_cols + col

# Decide whether the agent is allowed to occupy a given cell
def is_valid_state(state):
    """Return True when ``state`` is inside the maze and is not a wall.

    ``state`` is an (x, y) pair used as ``maze[x][y]``; a cell value of 1
    is treated as a wall.
    """
    row, col = state
    # Reject positions that fall outside the grid before indexing into it.
    if not (0 <= row < len(maze)):
        return False
    if not (0 <= col < len(maze[0])):
        return False
    return maze[row][col] != 1

# Apply an action to a state and report where the agent ends up
def take_action(state, action):
    """Return the state reached by applying ``action`` in ``state``.

    ``action`` is an index into ``actions``; the selected offset is added to
    the (x, y) position. If the resulting cell is not valid (out of bounds or
    a wall) the agent does not move and ``state`` is returned unchanged.
    """
    row, col = state
    d_row, d_col = actions[action]
    candidate = (row + d_row, col + d_col)
    return candidate if is_valid_state(candidate) else state

# Reward signal associated with landing on a given cell
def get_reward(state):
    """Return the reward for arriving at ``state``.

    Returns 1 at the goal, -1 if the cell is a wall, and 0 otherwise.
    """
    if state == goal:
        return 1  # reached the goal
    # Note: take_action only ever returns valid (non-wall) cells, so the wall
    # penalty is not triggered by states that come from take_action.
    hit_wall = maze[state[0]][state[1]] == 1
    return -1 if hit_wall else 0

# Q-learning algorithm (tabular, epsilon-greedy)
#
# The loop draws random numbers for exploration, so the stdlib RNG is seeded
# first: with a fixed seed, re-running this cell on a fresh Q-table produces
# the same Q-table and therefore the same extracted policy.
random_seed = 0
num_episodes = 1000  # number of start-to-goal training episodes
random.seed(random_seed)

for episode in range(num_episodes):
    state = start
    # An episode ends as soon as the goal is reached, so the goal's own Q row
    # is never updated and keeps its initial zeros.
    while state != goal:
        state_index = state_to_index(state)

        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, num_actions - 1)  # Exploration
        else:
            # np.argmax returns the first maximal index, so ties (e.g. an
            # all-zero row) resolve to the lowest action index.
            action = np.argmax(Q[state_index])  # Exploitation

        next_state = take_action(state, action)
        next_state_index = state_to_index(next_state)
        reward = get_reward(next_state)

        # Temporal-difference update:
        #   Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + discount_factor * np.max(Q[next_state_index])
        td_error = td_target - Q[state_index][action]
        Q[state_index][action] += learning_rate * td_error

        state = next_state

In [2]:
# Show the learned Q-table (one row per grid cell, one column per action)
print("Q-table:", Q, sep="\n")

Q-table:
[[0.37851382 0.2487329  0.40772438 0.4782969 ]
 [0.38461161 0.         0.02250593 0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.4367319  0.44979114 0.39891109 0.531441  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.49512896 0.37191836 0.43609877 0.59049   ]
 [0.50232296 0.0010193  0.02121818 0.        ]
 [0.05254363 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.55984764 0.5638626  0.50043993 0.6561    ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.58429946 0.729      0.55693107 0.63578606]
 [0.

In [3]:
# Extract the greedy policy: for every state, the index of the action with the
# highest Q-value. Rows that are still all zeros (Q starts as zeros, so this
# includes any cell the agent never updated) yield 0 because np.argmax returns
# the first maximal index -- a 0 there does not mean action 0 was learned.
optimal_policy = [np.argmax(Q[idx]) for idx in range(num_states)]

# Lay the per-state action indices out in the same shape as the maze and print
policy_grid = np.array(optimal_policy).reshape(len(maze), len(maze[0]))
print("Optimal Policy:")
print(policy_grid)


Optimal Policy:
[[3 0 0 0 0]
 [3 0 0 0 0]
 [3 0 0 0 0]
 [3 0 0 0 0]
 [1 1 1 1 0]]
