<a href="https://colab.research.google.com/github/saltycookie/RLIntroNotebook/blob/main/K_armed_Bandit_Testbed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def random_argmax(arr, axis=None):
  """Return the index of the maximum of `arr`, breaking ties at random.

  Args:
    arr: NumPy array to search.
    axis: Axis along which the maximum is taken. `None` searches the
      flattened array, as in `np.argmax`.

  Returns:
    The index (or array of indices along `axis`) of a maximal entry. When
    several entries tie for the maximum, each tied entry gets an independent
    random score and the one with the highest score is returned.
  """
  # Keep the reduced axis so the peak values broadcast back against `arr`.
  peak = np.max(arr, axis=axis, keepdims=True)
  noise = np.random.rand(*arr.shape)
  # Maximal entries carry a score in [0, 1); every other entry gets -1 and
  # therefore can never win the argmax below.
  scores = np.where(arr == peak, noise, -1)
  return np.argmax(scores, axis=axis)


In [None]:
class KArmedBanditEnv:
  """k-armed bandit whose rewards are Gaussian around each arm's true value.

  The constructor stores `k` (number of arms), `reward_std` (standard
  deviation of the reward noise) and `q_star` (true mean reward of every arm,
  drawn once from a standard normal distribution and not modified by this
  class afterwards).
  """

  def __init__(self, k, reward_std):
    self.k, self.reward_std = k, reward_std
    # One N(0, 1) draw per arm gives the true action values.
    self.q_star = np.random.normal(loc=0, scale=1, size=k)

  def step(self, action):
    """Sample a reward for `action`.

    Args:
      action: Arm index, or an array of arm indices; it is used directly to
        index `q_star`.

    Returns:
      A normal sample centred on the chosen arm's true value with standard
      deviation `reward_std` (one sample per entry of `action`).
    """
    expected_reward = self.q_star[action]
    return np.random.normal(loc=expected_reward, scale=self.reward_std)


class KArmedBanditAgent:
  """Base agent holding action-value estimates for several parallel runs.

  Args:
    num_runs: Number of independent runs tracked side by side (rows of `q`
      and `n`).
    k: Number of arms (columns of `q` and `n`).
    init_q: Initial value estimate assigned to every arm.
    alpha: Constant step size. A falsy value (the default 0.0) selects
      sample-average updates, i.e. a step size of 1/n.
  """

  def __init__(self, num_runs, k, init_q=0.0, alpha=0.0):
    self.k = k
    self.num_runs = num_runs
    # Value estimates, one row per run; forced to float so that an integer
    # `init_q` does not make later updates truncate.
    self.q = np.full((num_runs, k), init_q, dtype=float)
    # Number of times each arm has been selected in each run.
    self.n = np.zeros(shape=(num_runs, k), dtype=int)
    self.alpha = alpha

  def act(self):
    """Choose one action per run.

    The base class has no policy and returns None; subclasses override this.
    """

  def update(self, action, reward):
    """Move the estimates of the chosen arms towards the observed rewards.

    Applies Q <- Q + step * (R - Q), where `step` is `alpha` when a constant
    step size was configured and 1/n (sample average) otherwise.

    Args:
      action: Integer array with the arm chosen in each run; expected to have
        one entry per row of `q`.
      reward: Rewards observed for those actions, broadcastable against
        `action`.
    """
    # Pair every run index with its chosen arm to address one cell per row.
    selected_indices = np.arange(action.shape[0]), action
    self.n[selected_indices] += 1
    # The counts were just incremented, so they are >= 1 and 1/n is safe.
    step_size = self.alpha if self.alpha else 1.0 / self.n[selected_indices]
    # The step size multiplies the error; dividing by `alpha` would scale the
    # error by 1/alpha and make the estimates diverge.
    self.q[selected_indices] += step_size * (reward - self.q[selected_indices])


class EpsilonGreedyAgent(KArmedBanditAgent):
  """Agent that picks a random arm with probability `epsilon`, else the greedy one.

  Args:
    num_runs: Number of parallel runs, forwarded to the base class.
    k: Number of arms, forwarded to the base class.
    init_q: Initial value estimate, forwarded to the base class.
    alpha: Step-size setting, forwarded to the base class.
    epsilon: Per-run probability of choosing an arm uniformly at random.
  """

  def __init__(self, num_runs, k, init_q=0.0, alpha=0.0, epsilon=0.0):
    super().__init__(num_runs, k, init_q=init_q, alpha=alpha)
    self.epsilon = epsilon

  def act(self):
    """Return one action per run following the epsilon-greedy rule.

    Greedy choices break ties between equally valued arms at random.
    """
    if self.epsilon == 0:
      # Purely greedy: no exploration draws are needed.
      return random_argmax(self.q, axis=1)

    run_count = self.q.shape[0]
    # Decide per run whether this step explores.
    explore = np.random.rand(run_count) < self.epsilon
    uniform_choice = np.random.randint(0, self.k, size=run_count)
    greedy_choice = random_argmax(self.q, axis=1)
    return np.where(explore, uniform_choice, greedy_choice)


def simulate(env, agent, num_steps):
  """Run `agent` against `env` for `num_steps` steps.

  Args:
    env: Object whose `step(action)` returns the reward(s) for an action.
    agent: Object providing `act()` and `update(action, reward)`.
    num_steps: Number of act/step/update iterations to perform.

  Returns:
    A list with one entry per step: the mean (`np.mean`) of the rewards
    returned by the environment at that step.
  """
  def advance_one_step():
    # One interaction: choose, observe, learn, then report the mean reward.
    chosen = agent.act()
    payoff = env.step(chosen)
    agent.update(chosen, payoff)
    return np.mean(payoff)

  return [advance_one_step() for _ in range(num_steps)]



In [None]:
# Experiment: compare greedy, optimistic-greedy and epsilon-greedy agents on
# one 10-armed bandit. Every agent tracks `num_runs` runs in parallel against
# the same `env` (and hence the same `q_star`); each curve is the reward
# averaged over those runs at every step.
env = KArmedBanditEnv(10, 1)
num_steps = 3000
num_runs = 10000

plt.figure(figsize=(15, 6))

# Pure greedy baseline (epsilon defaults to 0, estimates start at 0).
greedy_agent = EpsilonGreedyAgent(num_runs, env.k)
rewards_1 = simulate(env, greedy_agent, num_steps)
plt.plot(rewards_1, label='Greedy')

# Greedy with optimistic initial estimates (init_q=5).
optimistic_greedy_agent = EpsilonGreedyAgent(num_runs, env.k, init_q=5)
rewards_4 = simulate(env, optimistic_greedy_agent, num_steps)
plt.plot(rewards_4, label='Optimistic Greedy')

# Epsilon-greedy with 10% exploration.
epsilon_greedy_agent_1 = EpsilonGreedyAgent(num_runs, env.k, epsilon=0.1)
rewards_2 = simulate(env, epsilon_greedy_agent_1, num_steps)
plt.plot(rewards_2, label='Epsilon Greedy (ε=0.1)')

# Epsilon-greedy with 1% exploration.
epsilon_greedy_agent_2 = EpsilonGreedyAgent(num_runs, env.k, epsilon=0.01)
rewards_3 = simulate(env, epsilon_greedy_agent_2, num_steps)
plt.plot(rewards_3, label='Epsilon Greedy (ε=0.01)')

plt.xlabel('Step')
plt.ylabel('Average reward')
plt.legend()
plt.show()