In [8]:
import numpy as np
import matplotlib.pyplot as plt

# Define the environment
num_states = 6  # Number of states in the 1D grid
trap_state = 0  # Trap position
goal_state = 5  # Goal position
start_state = 2  # Starting position

actions = [-1, 1]  # Possible actions: left (-1), right (+1)

# Rewards
rewards = np.zeros(num_states)
rewards[trap_state] = -1  # Negative reward for the trap
rewards[goal_state] = 1   # Positive reward for the goal

# Hyperparameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.1  # Exploration rate
num_episodes = 500  # Number of episodes

# Initialize Q-table
Q = np.zeros((num_states, len(actions)))

def epsilon_greedy(state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.choice(len(actions))  # Explore: random action
    else:
        return np.argmax(Q[state])  # Exploit: best action

# Training loop
for episode in range(num_episodes):
    state = start_state
    while state != trap_state and state != goal_state:
        action_idx = epsilon_greedy(state, epsilon)
        action = actions[action_idx]

        # Take action and observe next state and reward
        next_state = max(0, min(num_states - 1, state + action))
        reward = rewards[next_state] - 0.01  # Include move cost

        # Update Q-value
        best_next_action = np.max(Q[next_state])
        Q[state, action_idx] += alpha * (reward + gamma * best_next_action - Q[state, action_idx])

        # Move to the next state
        state = next_state

# Derive policy from Q-table
policy = np.argmax(Q, axis=1)
policy_actions = [actions[a] for a in policy]

# Display results
print("Optimal Q-table:")
print(Q)
print("\nOptimal Policy:")
print(policy_actions)


def visualize_policy(policy):
    grid = [' '] * num_states  # Initialize the grid with empty spaces
    grid[trap_state] = 'Trap'
    grid[goal_state] = 'Goal'
    arrows = ['←' if action == -1 else '→' for action in policy]
    for i, arrow in enumerate(arrows):
        if i == trap_state or i == goal_state:
            continue
        grid[i] = arrow
    return grid


visualized_policy = visualize_policy(policy)
print("\nVisualized Policy:")
print(visualized_policy)


Optimal Q-table:
[[ 0.          0.        ]
 [-0.101       0.6504407 ]
 [ 0.46092552  0.7829    ]
 [ 0.67441166  0.881     ]
 [ 0.73130448  0.99      ]
 [ 0.          0.        ]]

Optimal Policy:
[-1, 1, 1, 1, 1, -1]

Visualized Policy:
['Trap', '→', '→', '→', '→', 'Goal']


In [10]:
import numpy as np
import matplotlib.pyplot as plt

# Define the environment
rows, cols = 3, 4  # Grid size (3x4)
trap_state = (1, 3)  # Trap position
goal_state = (0, 3)  # Goal position
start_state = (2, 0)  # Starting position

# Possible actions
actions = {"UP": (-1, 0), "DOWN": (1, 0), "LEFT": (0, -1), "RIGHT": (0, 1)}

# Rewards
rewards = np.full((rows, cols), -0.01)  # Default move cost
rewards[trap_state] = -1  # Negative reward for the trap
rewards[goal_state] = 1   # Positive reward for the goal

# Stochastic environment probabilities
prob_success = 0.8
prob_lateral = 0.1

# Hyperparameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.1  # Exploration rate
num_episodes = 500  # Number of episodes

# Initialize Q-table
Q = np.zeros((rows, cols, len(actions)))

def epsilon_greedy(state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.choice(list(actions.keys()))  # Explore: random action
    else:
        return list(actions.keys())[np.argmax(Q[state[0], state[1]])]  # Exploit: best action

def step(state, action):
    # Determine next state based on chosen action and stochastic outcomes
    next_state = tuple(np.add(state, actions[action]))

    # Ensure the agent remains within grid bounds
    if next_state[0] < 0 or next_state[0] >= rows or next_state[1] < 0 or next_state[1] >= cols:
        next_state = state  # Bounce back

    # Apply stochasticity
    random_value = np.random.rand()
    if random_value < prob_success:
        return next_state
    elif random_value < prob_success + prob_lateral:
        lateral_action = [a for a in actions if a != action and actions[a][0] == actions[action][1]]
        return tuple(np.add(state, actions[np.random.choice(lateral_action)]))
    else:
        return state

# Training loop
for episode in range(num_enepisodes)]:

    
# Training loop
for episode in range(num_episodes):
    state = start_state
    while state != trap_state and state != goal_state:
        action_idx = epsilon_greedy(state, epsilon)
        action = actions[action_idx]

        # Take action and observe next state and reward
        next_state = max(0, min(num_states - 1, state + action))
        reward = rewards[next_state] - 0.01  # Include move cost

        # Update Q-value
        best_next_action = np.max(Q[next_state])
        Q[state, action_idx] += alpha * (reward + gamma * best_next_action - Q[state, action_idx])

        # Move to the next state
        state = next_state

# Derive policy from Q-table
policy = np.argmax(Q, axis=1)
policy_actions = [actions[a] for a in policy]

# Display results
print("Optimal Q-table:")
print(Q)
print("\nOptimal Policy:")
print(policy_actions)


TypeError: '<' not supported between instances of 'tuple' and 'int'

In [11]:
import numpy as np

# Define the environment
grid_size = (3, 4)  # 3x4 grid
trap_position = (1, 3)  # Position of the trap
goal_position = (0, 3)  # Position of the goal
start_position = (2, 0)  # Starting position

actions = [(0, -1), (0, 1), (-1, 0), (1, 0)]  # LEFT, RIGHT, UP, DOWN

# Rewards
rewards = np.full(grid_size, -0.01)  # Default move cost
rewards[trap_position] = -1  # Negative reward for the trap
rewards[goal_position] = 1   # Positive reward for the goal

# Hyperparameters
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate
epsilon = 0.1  # Exploration rate
num_episodes = 1000  # Number of episodes

# Initialize Q-table
Q = np.zeros((*grid_size, len(actions)))


def epsilon_greedy(state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.choice(len(actions))  # Explore: random action
    else:
        return np.argmax(Q[state])  # Exploit: best action


def is_terminal(state):
    return state == trap_position or state == goal_position


# Training loop
for episode in range(num_episodes):
    state = start_position
    while not is_terminal(state):
        action_idx = epsilon_greedy(state, epsilon)
        action = actions[action_idx]

        # Calculate next state
        next_state = (state[0] + action[0], state[1] + action[1])

        # Check for boundaries (bouncing walls)
        next_state = (
            max(0, min(grid_size[0] - 1, next_state[0])),
            max(0, min(grid_size[1] - 1, next_state[1]))
        )

        # Get reward
        reward = rewards[next_state]

        # Update Q-value
        best_next_action = np.max(Q[next_state])
        Q[state][action_idx] += alpha * (reward + gamma * best_next_action - Q[state][action_idx])

        # Move to next state
        state = next_state

# Derive policy from Q-table
policy = np.argmax(Q, axis=2)

# Visualize results
arrows = {0: '←', 1: '→', 2: '↑', 3: '↓'}

def visualize_policy(policy):
    grid = np.full(grid_size, ' ')
    for i in range(grid_size[0]):
        for j in range(grid_size[1]):
            if (i, j) == trap_position:
                grid[i, j] = 'Trap'
            elif (i, j) == goal_position:
                grid[i, j] = 'Goal'
            else:
                grid[i, j] = arrows[policy[i, j]]
    return grid


print("Optimal Q-table:")
print(Q)
print("\nVisualized Policy:")
visualized_policy = visualize_policy(policy)
print(visualized_policy)


Optimal Q-table:
[[[ 0.074004    0.74293225 -0.0029701   0.05772543]
  [ 0.5329027   0.89        0.75560792  0.64635922]
  [ 0.77022513  1.          0.8644691   0.63218413]
  [ 0.          0.          0.          0.        ]]

 [[ 0.13016274  0.69928865  0.07728472  0.09917792]
  [ 0.54140587  0.60936283  0.791       0.57143315]
  [ 0.06830553 -0.19        0.87704753  0.07444601]
  [ 0.          0.          0.          0.        ]]

 [[ 0.50182575  0.62171     0.55803257  0.50210131]
  [ 0.52719389  0.27432408  0.7019      0.50723312]
  [ 0.52726791 -0.00444341  0.04263039 -0.0029701 ]
  [-0.0039178  -0.0039404  -0.1        -0.0039404 ]]]

Visualized Policy:
[['→' '→' '→' 'G']
 ['→' '↑' '↑' 'T']
 ['→' '↑' '←' '←']]
