In [1]:
import numpy as np

class KArmedBandit:
    def __init__(self, k=10, epsilon=0.1, initial_reward=0.0):
        self.k = k #Number of arms
        self.epsilon = epsilon
        self.q_values = np.full(k, initial_reward)  # Estimated rewards for each arm
        self.action_counts = np.zeros(k)  # Counts of each action's pulls

    def select_action(self):
        if np.random.rand() < self.epsilon:
            # Explore: Randomly select an arm
            return np.random.randint(self.k)
        else:
            # Exploit: Select the arm with the highest estimated reward
            return np.argmax(self.q_values)

    def update_estimates(self, action, reward):
        self.action_counts[action] += 1
        alpha = 1 / self.action_counts[action]  # Learning rate
        self.q_values[action] += alpha * (reward - self.q_values[action])

    def run(self, true_rewards, steps=1000):
        rewards = []
        for step in range(steps):
            action = self.select_action()
            reward = np.random.normal(true_rewards[action], 1)  # Sample reward with some noise
            self.update_estimates(action, reward)
            rewards.append(reward)
        return rewards

In [2]:
k = 15  # Number of arms
epsilon = 0.1  # Exploration rate
true_rewards = np.random.normal(0, 1, k)  

bandit = KArmedBandit(k=k, epsilon=epsilon)
rewards = bandit.run(true_rewards, steps=1000)

print("Estimated rewards:", bandit.q_values)
print("True rewards:", true_rewards)


Estimated rewards: [-0.47206523 -0.71498116  1.15559955  0.58431731  0.31273101  1.27083679
  0.02592271  1.75629445  0.05597507  0.86077529 -1.75213205 -0.85330144
 -0.57378473  0.90737564  0.68687842]
True rewards: [-0.13468281 -0.24536828  0.91654646  0.75565093 -0.1580986   1.28700239
 -0.41827881  1.78239363 -0.0332194   0.60527318 -2.02569577 -0.51996135
 -0.61394526  0.8357007   0.59370834]
