 Implement an Epsilon-Greedy algorithm to solve a Multi-Armed Bandit problem with 5 slot machines. Each machine has a different fixed but unknown probability of payout. Simulate 1000 pulls and track the total reward and cumulative regret.

In [5]:
import random
import numpy as np

true_probabilities = [0.2, 0.4, 0.6, 0.8, 1.0]  # payout probability of each slot machine

slot_machines = len(true_probabilities)  # number of slot machines (arms)
pulls = 1000  # number of pulls to simulate
epsilon = 0.1  # probability of exploring a random machine on any given pull
seed = 42  # fixed seed so that re-running this cell reproduces the same numbers


def run_epsilon_greedy(true_probabilities, pulls, epsilon, seed=None):
    """Simulate an epsilon-greedy agent on a Bernoulli multi-armed bandit.

    On every pull the agent explores (picks a uniformly random arm) with
    probability ``epsilon`` and otherwise exploits the arm whose estimated
    payout probability is currently highest. Estimates start at zero and are
    updated as an incremental sample mean of the rewards seen on each arm.

    Parameters
    ----------
    true_probabilities : sequence of float
        Payout probability of each arm; every value must lie in [0, 1].
    pulls : int
        Number of pulls to simulate; must be non-negative.
    epsilon : float
        Exploration probability; must lie in [0, 1].
    seed : int or None, optional
        Seed for the random number generator. ``None`` draws fresh entropy,
        so results differ between runs.

    Returns
    -------
    estimated_probabilities : numpy.ndarray
        Final sample-mean payout estimate for each arm.
    pull_count : numpy.ndarray
        Number of times each arm was pulled.
    total_reward : int
        Sum of the 0/1 rewards collected over all pulls.
    cumulative_regret : list of float
        One entry per pull: ``pull * max(true_probabilities)`` minus the
        reward actually collected up to and including that pull.

    Raises
    ------
    ValueError
        If there are no arms, a probability or ``epsilon`` lies outside
        [0, 1], or ``pulls`` is negative.
    """
    probabilities = [float(p) for p in true_probabilities]
    if not probabilities:
        raise ValueError("true_probabilities must contain at least one arm")
    if any(not 0.0 <= p <= 1.0 for p in probabilities):
        raise ValueError("every payout probability must lie in [0, 1]")
    if pulls < 0:
        raise ValueError("pulls must be non-negative")
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError("epsilon must lie in [0, 1]")

    # A local generator keeps the simulation reproducible without touching
    # NumPy's global random state.
    rng = np.random.default_rng(seed)
    n_arms = len(probabilities)
    best_probability = max(probabilities)  # loop-invariant, computed once

    estimated = np.zeros(n_arms)  # running payout estimate per arm
    counts = np.zeros(n_arms)  # number of pulls per arm
    total_reward = 0
    cumulative_regret = []

    for pull in range(1, pulls + 1):
        if rng.random() < epsilon:
            # Explore: pick any arm uniformly at random.
            arm = int(rng.integers(n_arms))
        else:
            # Exploit: np.argmax returns the first maximum, so ties (such as
            # the all-zero initial estimates) resolve to the lowest index.
            arm = int(np.argmax(estimated))

        # Bernoulli reward, stored as a plain 0/1 integer.
        reward = int(rng.random() < probabilities[arm])
        total_reward += reward

        # Incremental mean: new = old + (reward - old) / n.
        counts[arm] += 1
        estimated[arm] += (reward - estimated[arm]) / counts[arm]

        # Regret is measured against the expected reward of always pulling
        # the best arm, using the reward actually realised so far.
        cumulative_regret.append(pull * best_probability - total_reward)

    return estimated, counts, total_reward, cumulative_regret


estimated_probabilities, pull_count, total_reward, cumulative_regret = run_epsilon_greedy(
    true_probabilities, pulls, epsilon, seed=seed
)

print("Estimated Probabilities:", estimated_probabilities)
print("Total Reward:", total_reward)
# Guard the final index so the cell still runs when pulls is set to 0.
print("Cumulative Regret:", cumulative_regret[-1] if cumulative_regret else 0.0)


Estimated Probabilities: [0.34210526 0.37662338 0.44827586 0.79464286 1.        ]
Total Reward: 888
Cumulative Regret: 112.0
