In [1]:
import numpy as np
# Define a simple environment
class SimpleEnvironment:
    """One-dimensional walk along the integers, starting at the origin.

    The only piece of state is ``state``, an integer position. Action 0
    moves one step left, action 1 moves one step right, and any other
    action leaves the position unchanged.
    """

    def __init__(self):
        # Position on the number line; every episode begins at the origin.
        self.state = 0

    def reset(self):
        """Put the walker back at the origin and return that position."""
        self.state = 0
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        The reward is the negated distance from the origin after the move,
        ``done`` becomes true once that distance reaches 10, and ``info``
        is always an empty dict.
        """
        if action == 0:
            offset = -1  # move left
        elif action == 1:
            offset = 1  # move right
        else:
            offset = 0  # unrecognised actions do not move the walker

        position = self.state + offset
        self.state = position

        distance = abs(position)
        # The episode terminates at either boundary, +10 or -10.
        return position, -distance, distance >= 10, {}

# Define a base policy
def base_policy(state):
    """Pick action 0 or 1 uniformly at random.

    ``state`` is accepted so the function has the shape of a policy, but
    it is not consulted when choosing the action.
    """
    candidate_actions = [0, 1]
    chosen = np.random.choice(candidate_actions)
    return chosen

# Rollout algorithm
def rollout_algorithm(env, state, num_rollouts=10):
    action_values = np.zeros(2)  # Two possible actions: 0 (left) and 1 (right)

    for action in [0, 1]:
        total_reward = 0
        for _ in range(num_rollouts):
            env.reset()
            env.state = state
            current_state, reward, done, _ = env.step(action)
            total_reward += reward

            while not done:
                action = base_policy(current_state)
                current_state, reward, done, _ = env.step(action)
                total_reward += reward

        action_values[action] = total_reward / num_rollouts

    best_action = np.argmax(action_values)
    return best_action, action_values

# Main function to demonstrate the rollout algorithm
def main():
    """Run the rollout algorithm once from a fresh environment and print
    the starting state, the estimated action values, and the chosen action.
    """
    env = SimpleEnvironment()
    initial_state = env.reset()

    outcome = rollout_algorithm(env, initial_state)
    best_action, action_values = outcome

    report = (
        f"Initial State: {initial_state}",
        f"Action Values: {action_values}",
        f"Best Action: {best_action}",
    )
    for line in report:
        print(line)

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Initial State: 0
Action Values: [   0.  -248.6]
Best Action: 0
