In [None]:
import numpy as np

# Grid-world configuration: a 4x4 board whose cells are numbered row by row
num_states, num_actions = 16, 4  # 16 cells; 4 moves (0=up, 1=down, 2=left, 3=right)
gamma = 0.9  # Discount factor applied to future value

# Per-state reward table, indexed by state number
rewards = np.zeros(num_states)
np.put(rewards, [2, 3, 4, 6, 8, 9, 10, 13, 14], 1)  # +1 for these ordinary cells
np.put(rewards, [5, 7, 11, 12], -10)  # Trap cells are penalised
rewards[15] = 10  # Goal cell
# States 0 and 1 appear in none of the lists above, so their reward stays 0.

# Define the transition function
def transition_function(state, action, grid_size=4):
    """Return the state reached by taking ``action`` from ``state``.

    States number the cells of a square ``grid_size`` x ``grid_size`` board
    row by row, so moving up/down changes the index by ``grid_size`` and
    moving left/right changes it by 1. A move that would leave the board
    keeps the agent in its current cell.

    Args:
        state: Index of the current cell.
        action: 0 = up, 1 = down, 2 = left, 3 = right.
        grid_size: Side length of the board. Defaults to 4 (the 16-state grid).

    Returns:
        The index of the next cell, or ``state`` itself when the move is
        blocked by the edge of the board.

    Raises:
        ValueError: If ``action`` is not one of 0, 1, 2, 3. Previously such a
            call silently returned ``None``, which NumPy then treats as
            ``np.newaxis`` when used as an index.
    """
    if action == 0:  # up: blocked on the top row
        blocked = state < grid_size
        step = -grid_size
    elif action == 1:  # down: blocked on the bottom row
        blocked = state >= grid_size * (grid_size - 1)
        step = grid_size
    elif action == 2:  # left: blocked in the first column
        blocked = state % grid_size == 0
        step = -1
    elif action == 3:  # right: blocked in the last column
        blocked = (state + 1) % grid_size == 0
        step = 1
    else:
        raise ValueError(f"Unknown action {action!r}; expected 0, 1, 2 or 3")

    return state if blocked else state + step

# Action-value table: one row per state, one column per action, all zeros to start
Q = np.zeros(shape=(num_states, num_actions))

# Policy Evaluation
def policy_evaluation(Q, rewards, gamma, num_iterations, transition_fn=None):
    """Run ``num_iterations`` in-place Bellman sweeps over the Q-table.

    Despite the name, every backup bootstraps from the best action of the
    successor state (``max`` over ``Q[next_state]``), so this is a
    value-iteration style update rather than the evaluation of a fixed policy.
    The reward used is that of the *current* state. Updates are written
    straight into ``Q``, so later backups in a sweep already see the values
    written earlier in the same sweep.

    Args:
        Q: 2-D array of shape (states, actions); modified in place. The loop
            bounds are taken from this array rather than from module globals.
        rewards: Sequence indexed by state giving the reward for that state.
        gamma: Discount factor.
        num_iterations: Number of full sweeps over all (state, action) pairs.
        transition_fn: Optional callable ``(state, action) -> next_state``.
            Defaults to the module-level ``transition_function``.

    Returns:
        The same ``Q`` array that was passed in, after the updates.
    """
    if transition_fn is None:
        transition_fn = transition_function

    num_states, num_actions = Q.shape
    for _ in range(num_iterations):
        for state in range(num_states):
            reward = rewards[state]  # Same for every action taken from this state
            for action in range(num_actions):
                next_state = transition_fn(state, action)
                Q[state, action] = reward + gamma * np.max(Q[next_state])
    return Q

# Policy Iteration
def policy_iteration(Q, rewards, gamma, num_iterations):
    """Alternate one Q-table sweep with greedy policy extraction until stable.

    Starting from the greedy policy of the incoming ``Q``, each round performs
    a single ``policy_evaluation`` sweep (which updates ``Q`` in place) and then
    re-reads the greedy action of every state. The loop ends as soon as a
    round leaves the policy unchanged, or after ``num_iterations`` rounds.

    The earlier extra exit ``if policy[15] == 0: break`` has been removed: it
    tested whether the greedy action in state 15 happened to be "up", not
    whether any goal had been reached, and could stop the loop before the
    policy had settled.

    Args:
        Q: 2-D array of shape (states, actions); modified in place.
        rewards: Sequence indexed by state giving the reward for that state.
        gamma: Discount factor.
        num_iterations: Maximum number of improvement rounds.

    Returns:
        1-D integer array holding the greedy action index for every state.
    """
    policy = np.argmax(Q, axis=1)
    for _ in range(num_iterations):
        Q = policy_evaluation(Q, rewards, gamma, 1)
        greedy = np.argmax(Q, axis=1)  # First maximal action per state, as before
        policy_stable = np.array_equal(greedy, policy)
        policy = greedy
        if policy_stable:
            break
    return policy

# Sweep the Q-table 1000 times; the table is updated in place and rebound to the returned array
Q = policy_evaluation(Q, rewards, gamma, 1000)

# Derive the greedy policy, refining Q until the policy stops changing (at most 1000 rounds)
policy = policy_iteration(Q, rewards, gamma, 1000)

# Show the action index chosen for every state
print("Learned Policy:", policy)

Learned Policy: [1 3 1 2 1 1 1 1 3 1 1 1 3 3 3 1]


In [None]:
# Define RL agent
def rl_agent(policy, start_state=0, goal_state=15, max_steps=100):
    """Follow ``policy`` through the grid and print every transition.

    At each step the action stored in ``policy`` for the current state is
    applied via ``transition_function``, and the reward of the current state
    (read from the module-level ``rewards`` table) is printed alongside it.

    The walk stops when ``goal_state`` is reached or after ``max_steps`` moves.
    The cap exists because a policy that walks into a wall or cycles between
    cells never reaches the goal and would otherwise loop forever.

    Args:
        policy: Sequence indexed by state giving the action to take there.
        start_state: State the walk begins in. Defaults to 0.
        goal_state: State that ends the walk. Defaults to 15.
        max_steps: Upper bound on the number of moves. Defaults to 100.

    Returns:
        None. All results are reported through ``print``.
    """
    state = start_state
    steps = 0
    while state != goal_state and steps < max_steps:
        action = policy[state]
        next_state = transition_function(state, action)
        reward = rewards[state]  # Reward of the cell being left, not the one entered
        print("Step:", steps, "State:", state, "Action:", action, "Next State:", next_state, "Reward:", reward)
        state = next_state
        steps += 1

    if state == goal_state:
        print("Goal reached in", steps, "steps!")
    else:
        print("Goal not reached within", max_steps, "steps; stopped in state", state)

# Walk the grid with the learned policy, printing each move
rl_agent(policy=policy)


Step: 0 State: 0 Action: 1 Next State: 4 Reward: 0.0
Step: 1 State: 4 Action: 1 Next State: 8 Reward: 1.0
Step: 2 State: 8 Action: 3 Next State: 9 Reward: 1.0
Step: 3 State: 9 Action: 1 Next State: 13 Reward: 1.0
Step: 4 State: 13 Action: 3 Next State: 14 Reward: 1.0
Step: 5 State: 14 Action: 3 Next State: 15 Reward: 1.0
Goal reached in 6 steps!
