In [5]:
import numpy as np
import random

# Environment settings
grid_size = 5
goals = [(0, 0), (4, 4)]  # Goals for each agent
n_actions = 4  # Up, Down, Left, Right
n_states = grid_size * grid_size
n_agents = 2

# Hyperparameters
alpha = 0.1
gamma = 0.95
epsilon = 0.1
episodes = 500

# Initialize Q-tables for each agent
Q = [np.zeros((n_states, n_actions)) for _ in range(n_agents)]

# Convert state tuple to linear state index
def state_to_index(state):
    return state[0] * grid_size + state[1]

# Action to state transition
def step(state, action):
    x, y = state
    if action == 0 and x > 0:  # Up
        x -= 1
    elif action == 1 and x < grid_size - 1:  # Down
        x += 1
    elif action == 2 and y > 0:  # Left
        y -= 1
    elif action == 3 and y < grid_size - 1:  # Right
        y += 1
    return (x, y)

# Reward function considering goals and agent collisions
def reward(state, next_state, goal):
    if next_state == goal:
        return 100  # Reward for reaching the goal
    return -1  # Step penalty

# Main training loop
for episode in range(episodes):
    # Initialize agents at random positions, making sure they're not on their goal
    states = [(random.randint(0, grid_size-1), random.randint(0, grid_size-1)) for _ in range(n_agents)]
    while states[0] == goals[0] or states[1] == goals[1]:
        states = [(random.randint(0, grid_size-1), random.randint(0, grid_size-1)) for _ in range(n_agents)]

    steps = 0
    while states[0] != goals[0] or states[1] != goals[1]:
        next_states = states.copy()
        for i in range(n_agents):
            state_index = state_to_index(states[i])

            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = random.randint(0, n_actions - 1)
            else:
                action = np.argmax(Q[i][state_index])

            next_state = step(states[i], action)
            next_states[i] = next_state

            # Compute reward
            r = reward(states[i], next_state, goals[i])

            # Q-learning update
            next_state_index = state_to_index(next_state)
            Q[i][state_index, action] = Q[i][state_index, action] + \
                                        alpha * (r + gamma * np.max(Q[i][next_state_index]) - Q[i][state_index, action])

        states = next_states
        steps += 1
        if steps > 100:  # Prevent infinite loops
            break

# Display learned Q-values
for i in range(n_agents):
    print(f"Agent {i+1} Q-table:")
    print(Q[i].reshape((grid_size, grid_size, n_actions)))


Agent 1 Q-table:
[[[ 1.95246120e+03  1.41884879e+03  1.62694012e+03  1.55478139e+03]
  [ 1.34950950e+03  5.30521190e+02  1.95178601e+03  9.82822992e+02]
  [ 2.04452268e+02  4.54955729e+02  1.84204366e+03  4.97031141e+02]
  [ 4.82951876e+02  2.10960065e+02  1.72921642e+03 -7.28912893e-02]
  [-1.99500000e-01  9.13894514e+01  1.54196991e+03 -1.00000000e-01]]

 [[ 1.93956359e+03  0.00000000e+00  3.06352780e+02  4.54695487e+02]
  [ 1.84926967e+03  8.55843442e+02  5.82511088e+02  4.57603066e+02]
  [-1.90000000e-01 -2.08050000e-01  1.57022750e+03  1.37441885e+02]
  [ 1.57701119e+03  3.49860946e+02  8.72039663e+01  1.06963035e+02]
  [ 1.05092632e+03 -9.91039400e-02 -3.85243549e-01 -3.97009987e-01]]

 [[ 1.79251550e+03  1.51380408e+02  1.22337723e+02  2.23998049e+02]
  [ 1.75159395e+03  5.86448106e+02  6.94469477e+02  4.54990837e+02]
  [-2.80500000e-01  1.25725322e+03  5.21868049e+02  1.14292524e+02]
  [ 1.39118636e+03 -4.04725896e-01  4.20655709e+02 -3.72286704e-01]
  [ 7.54532187e+01 -6.07753

In [13]:
# Import required libraries
import numpy as np
import random

# Environment settings
grid_size = 5  # Define the size of the grid (5x5 in this case)
goals = [(0, 0), (4, 4)]  # Define the goal positions for each of the two agents
n_actions = 4  # Define the number of possible actions (Up, Down, Left, Right)
n_states = grid_size * grid_size  # Calculate the total number of states in the grid
n_agents = 2  # Define the number of agents

# Hyperparameters for the learning process
alpha = 0.1  # Learning rate, determines to what extent newly acquired information overrides old information
gamma = 0.95  # Discount factor, represents the difference in importance between future rewards and present rewards
epsilon = 0.1  # Exploration rate, the probability of choosing a random action instead of the best one
episodes = 500  # Number of episodes to train the agents

# Initialize Q-tables for each agent
Q = [np.zeros((n_states, n_actions)) for _ in range(n_agents)]  # Create a zero-initialized Q-table for each agent

# Function to convert a state tuple (x, y) into a linear index for the Q-table
def state_to_index(state):
    #print(state)
    #print(state[0],grid_size,state[1])
    return state[0] * grid_size + state[1]

# Function to determine the new state after taking an action
def step(state, action):
    x, y = state
    # Update the state based on the action taken
    if action == 0 and x > 0:  # Move up if not at the top edge
        x -= 1
    elif action == 1 and x < grid_size - 1:  # Move down if not at the bottom edge
        x += 1
    elif action == 2 and y > 0:  # Move left if not at the left edge
        y -= 1
    elif action == 3 and y < grid_size - 1:  # Move right if not at the right edge
        y += 1
    return (x, y)

# Function to compute the reward for moving from one state to another
def reward(state, next_state, goal):
    if next_state == goal:
        return 100  # Provide a high reward for reaching the goal
    return -1  # Otherwise, return a step penalty

# Main training loop
for episode in range(1):
    # Randomly initialize the positions of the agents, ensuring they are not on their goals
    states = [(random.randint(0, grid_size-1), random.randint(0, grid_size-1)) for _ in range(n_agents)]
    while states[0] == goals[0] or states[1] == goals[1]:
        states = [(random.randint(0, grid_size-1), random.randint(0, grid_size-1)) for _ in range(n_agents)]

    steps = 0  # Keep track of the number of steps taken
    # Continue the episode until both agents reach their goals
    while states[0] != goals[0] or states[1] != goals[1]:
        next_states = states.copy()  # Prepare to update the states based on the actions taken
        for i in range(n_agents):  # For each agent
            state_index = state_to_index(states[i])  # Convert the state to an index

            # Decide whether to take a random action or the best known action
            if random.random() < epsilon:  # Exploration: choose a random action
                action = random.randint(0, n_actions - 1)
            else:  # Exploitation: choose the best action based on current Q-values
                action = np.argmax(Q[i][state_index])

            next_state = step(states[i], action)  # Determine the next state after taking the action
            next_states[i] = next_state  # Update the next state for the agent

            # Compute the reward for the action taken
            r = reward(states[i], next_state, goals[i])

            # Update the Q-value for the state-action pair using the Q-learning formula
            next_state_index = state_to_index(next_state)
            Q[i][state_index, action] = Q[i][state_index, action] + \
                                        alpha * (r + gamma * np.max(Q[i][next_state_index]) - Q[i][state_index, action])

        states = next_states  # Update the states for the next iteration
        steps += 1
        if steps > 100:  # Break the loop if it takes too many steps, to prevent infinite loops
            break

# Display
# Display learned Q-values
for i in range(n_agents):
    print(f"Agent {i+1} Q-table:")
    # Reshape the Q-table for easier interpretation, then print it
    # The Q-table is reshaped to (grid_size, grid_size, n_actions) for visualization
    # This shows the Q-values for each action at each position in the grid
    print(Q[i].reshape((grid_size, grid_size, n_actions)))

Agent 1 Q-table:
[[[ 7.19781254e+02  4.10972948e+00  0.00000000e+00 -1.00000000e-01]
  [-1.00000000e-01 -1.00000000e-01  7.27669141e+01  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 7.87043775e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [-1.00000000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[-1.00000000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000

In [12]:
print(len(Q[0]))

25
