In [1]:
import numpy as np

# --- Environment layout -----------------------------------------------------
GRID_SIZE = (5, 5)            # (rows, columns) of the grid world
OBSTACLES = [(2, 2), (3, 3)]  # cells that yield the 'obstacle' reward

# --- Action set: (row delta, column delta) ------------------------------------
ACTIONS = [
    (-1, 0),  # up
    (1, 0),   # down
    (0, -1),  # left
    (0, 1),   # right
]
NUM_ACTIONS = len(ACTIONS)

# --- Q-learning hyper-parameters ----------------------------------------------
NUM_EPISODES = 1000
EPSILON = 0.1           # probability of picking a random action
LEARNING_RATE = 0.1     # step size applied to each TD error
DISCOUNT_FACTOR = 0.9   # weight on the best next-state Q-value

# --- Reward signal --------------------------------------------------------------
REWARDS = dict(end=10, obstacle=-10, step=-1)

# --- Q-table: one value per (row, column, action), initialised to zero ----------
Q_values = np.zeros((*GRID_SIZE, NUM_ACTIONS))

# Grid-world environment
class GridEnvironment:
    """Grid world whose size, obstacles and rewards come from module constants.

    The environment does not track the agent's position itself: ``reset``
    returns the start cell and ``step`` is given the current cell by the
    caller. ``self.grid`` only marks the start cell and is not modified by
    ``step``.
    """

    def __init__(self):
        self.grid = np.zeros(GRID_SIZE)

    def reset(self):
        """Rebuild the grid with the start cell marked and return that cell.

        Returns:
            The start position ``(0, 0)``.
        """
        origin = (0, 0)
        fresh_grid = np.zeros(GRID_SIZE)
        fresh_grid[origin] = 1  # flag the start cell
        self.grid = fresh_grid
        return origin

    def step(self, action, current_pos):
        """Apply a move to ``current_pos`` and report the outcome.

        Args:
            action: ``(row delta, column delta)`` pair.
            current_pos: ``(row, column)`` the move starts from.

        Returns:
            ``(next_pos, reward)``. A move that would leave the grid keeps
            ``current_pos`` and earns the 'step' reward. Moving onto an
            obstacle earns the 'obstacle' reward, moving onto the bottom-right
            cell earns the 'end' reward, and any other move earns the 'step'
            reward.
        """
        row = current_pos[0] + action[0]
        col = current_pos[1] + action[1]
        reward = REWARDS['step']  # default unless a special cell is reached

        # Guard clause: off-grid moves leave the position unchanged.
        if not (0 <= row < GRID_SIZE[0] and 0 <= col < GRID_SIZE[1]):
            return current_pos, reward

        next_pos = (row, col)
        goal = (GRID_SIZE[0] - 1, GRID_SIZE[1] - 1)
        if next_pos in OBSTACLES:
            reward = REWARDS['obstacle']
        elif next_pos == goal:
            reward = REWARDS['end']

        return next_pos, reward

# Tabular Q-learning training loop
def q_learning(env):
    """Train the module-level ``Q_values`` table in place with Q-learning.

    Runs ``NUM_EPISODES`` episodes. Each episode starts from ``env.reset()``
    and ends as soon as a step returns the 'end' or the 'obstacle' reward.
    Actions are chosen epsilon-greedily: a uniformly random action with
    probability ``EPSILON``, otherwise the arg-max of the current Q-row.

    Args:
        env: object providing ``reset() -> pos`` and
            ``step(action, pos) -> (next_pos, reward)``.

    Returns:
        None. The result is the mutated ``Q_values`` array.
    """
    for _episode in range(NUM_EPISODES):
        state = env.reset()

        while True:
            row, col = state[0], state[1]

            # Epsilon-greedy action selection
            explore = np.random.uniform(0, 1) < EPSILON
            if explore:
                action = np.random.randint(NUM_ACTIONS)
            else:
                action = np.argmax(Q_values[row, col])

            successor, reward = env.step(ACTIONS[action], state)

            # Temporal-difference update towards reward + discounted best next value
            best_next = np.max(Q_values[successor[0], successor[1]])
            td_target = reward + DISCOUNT_FACTOR * best_next
            td_error = td_target - Q_values[row, col, action]
            Q_values[row, col, action] += LEARNING_RATE * td_error

            state = successor

            # Episode ends on reaching the goal or hitting an obstacle
            if reward in (REWARDS['end'], REWARDS['obstacle']):
                break

# Script entry point: train on a fresh environment, then dump the Q-table
if __name__ == "__main__":
    environment = GridEnvironment()
    q_learning(environment)
    # One call, newline-separated: header line followed by the array
    print("Q-values:", Q_values, sep="\n")


Q-values:
[[[-2.00342081 -0.434062   -1.80071551 -1.432542  ]
  [-2.59115868  0.2347655  -2.60869668 -2.63494206]
  [-2.06076928 -1.99181686 -2.09204652 -1.96759448]
  [-1.39941645 -0.84491141 -1.45628839 -1.37481525]
  [-0.95187454 -0.74970468 -1.07174655 -0.95617925]]

 [[-1.56030552  0.62882    -0.73232017  0.12336005]
  [-2.20724213  1.77604839 -1.98502874 -2.06657648]
  [-1.60182551 -4.0951     -1.62842866 -1.19606368]
  [-1.13027125  1.3088542  -1.27349014 -0.83761882]
  [-0.61708505  1.90425592 -0.62390884 -0.58519851]]

 [[-0.542441    1.8098      0.41564372  1.43442428]
  [-1.49985017  3.12185086 -1.74753092 -4.68559   ]
  [ 0.          0.          0.          0.        ]
  [-0.68997675 -1.         -2.71        4.23359497]
  [-0.15041138  7.29066543 -0.199      -0.199     ]]

 [[ 0.41385731  1.50627563  1.46824324  3.122     ]
  [ 1.45776824  4.58        1.38779831  3.94104721]
  [-1.9         6.1668482  -0.56445941 -3.439     ]
  [ 0.          0.          0.          0.      