Iteration algorithm implementation

In [1]:
import numpy as np

In [3]:
#define the grid world enviroment
grid_size = 5
actions = ["up", "down", "left", "right"]
goal_state = (4,4)
gamma = 0.9
threshold = 1e-6


In [4]:
#initialize value functions

values = np.zeros((grid_size, grid_size))

# define rewards
rewards = np.full((grid_size, grid_size), -1)
rewards[goal_state] = 10

# value iteration

def value_iteration():
    while True:
        delta = 0
        new_values = np.zeros((grid_size, grid_size))
        for i in range(grid_size):
            for j in range(grid_size):
                if (i,j) == goal_state:
                    new_values[i, j] = 0
                    continue
                #calculate the value for each action
                action_Values = []
                for action in actions:
                    if action == 'up':
                        next_i, next_j = max(i-1, 0), j
                    elif action == 'down':
                        next_i, next_j = min(i+1, grid_size-1), j
                    elif action == 'left':
                        next_i, next_j = i, max(j-1, 0)
                    elif action == 'right':
                        next_i, next_j = i, min(j+1, grid_size - 1)
                    action_Values.append(rewards[next_i, next_j] + gamma * values[next_i, next_j])


                #update the value for the current state
                new_values[i, j] = max(action_Values)
                delta = max(delta, abs(new_values[i, j] - values[i, j]))
        values[:] = new_values
        if delta < threshold:
            break

# Extract the optimal policy
def get_optimal_policy():
    policy = np.empty((grid_size, grid_size), dtype=object)
    for i in range(grid_size):
        for j in range(grid_size):
            if (i, j) == goal_state:
                policy[i, j] = 'goal'
                continue
            action_values = []
            for action in actions:
                if action == 'up':
                    next_i, next_j = max(i - 1, 0), j
                elif action == 'down':
                    next_i, next_j = min(i + 1, grid_size - 1), j
                elif action == 'left':
                    next_i, next_j = i, max(j - 1, 0)
                elif action == 'right':
                    next_i, next_j = i, min(j + 1, grid_size - 1)
                action_values.append(rewards[next_i, next_j] + gamma * values[next_i, next_j])
            policy[i, j] = actions[np.argmax(action_values)]
    return policy

# Run value iteration and print results
value_iteration()
optimal_policy = get_optimal_policy()
print("Optimal Policy:")
print(optimal_policy)




Optimal Policy:
[['down' 'down' 'down' 'down' 'down']
 ['down' 'down' 'down' 'down' 'down']
 ['down' 'down' 'down' 'down' 'down']
 ['down' 'down' 'down' 'down' 'down']
 ['right' 'right' 'right' 'right' 'goal']]


In [None]:
Q-learning algorithm implementation

In [5]:
# Define the grid world environment
grid_size = 5
actions = ['up', 'down', 'left', 'right']
goal_state = (4, 4)  # Bottom-right corner
gamma = 0.9  # Discount factor
alpha = 0.5  # Learning rate
num_episodes = 1000


In [6]:
# Initialize Q-table
q_table = np.zeros((grid_size, grid_size, len(actions)))

# Define rewards
rewards = np.full((grid_size, grid_size), -1)
rewards[goal_state] = 10

In [7]:

# Q-learning
def q_learning():
    for episode in range(num_episodes):
        state = (0, 0)  # Start state
        while state != goal_state:
            i, j = state
            # Choose action (epsilon-greedy)
            action_idx = np.argmax(q_table[i, j])
            # Take action
            if actions[action_idx] == 'up':
                next_i, next_j = max(i - 1, 0), j
            elif actions[action_idx] == 'down':
                next_i, next_j = min(i + 1, grid_size - 1), j
            elif actions[action_idx] == 'left':
                next_i, next_j = i, max(j - 1, 0)
            elif actions[action_idx] == 'right':
                next_i, next_j = i, min(j + 1, grid_size - 1)
            # Update Q-value
            reward = rewards[next_i, next_j]
            q_table[i, j, action_idx] += alpha * (
                reward + gamma * np.max(q_table[next_i, next_j]) - q_table[i, j, action_idx]
            )
            state = (next_i, next_j)

# Run Q-learning and print results
q_learning()
print("Learned Q-values:")
print(q_table)


Learned Q-values:
[[[-3.36579569 -0.434062   -3.36579569 -3.30712321]
  [-3.01662704 -2.75268281 -2.90306621 -2.23265769]
  [-2.26219063  0.07293168 -2.38447641 -2.16634297]
  [-1.8549375  -0.7093125  -1.7361875  -1.545     ]
  [-1.42625    -1.2125     -1.7361875  -1.42625   ]]

 [[-2.8193904  -2.68523117 -2.64908109  0.62882   ]
  [-2.52370531 -2.23956563 -2.20562813  1.8098    ]
  [-1.90356563 -1.8659375  -1.78428125  3.122     ]
  [-1.325      -1.20125    -1.18875     4.58      ]
  [-0.975       6.2        -1.18875    -0.975     ]]

 [[-2.5297782  -2.11303125 -2.26219063 -2.3236875 ]
  [-2.04756562 -1.81025    -2.20562813 -2.018375  ]
  [-1.977225   -1.539375   -1.787375   -1.60125   ]
  [-0.86023437 -1.1        -1.2        -0.975     ]
  [-0.975       8.         -0.725      -0.5       ]]

 [[-2.00228438 -1.8861875  -1.8549375  -1.809375  ]
  [-1.5275     -1.6861875  -1.595      -1.60125   ]
  [-1.5280625  -1.20125    -1.2        -0.3125    ]
  [-0.975       4.1875     -0.725      -