In [None]:
%matplotlib notebook

from IPython.display import HTML
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

class MultiArmedBandit:
    """Bernoulli bandit with ``k`` arms.

    Every arm pays out 1 with its own fixed success probability and 0
    otherwise. The probabilities are drawn once at construction time and are
    meant to stay hidden from the agent.
    """

    def __init__(self, k):
        # One success probability per arm, sampled uniformly from [0, 1).
        self.probs = np.random.rand(k)
        self.k = k

    def pull(self, arm):
        """Play ``arm`` once; return 1 on a win and 0 on a loss."""
        won = np.random.rand() < self.probs[arm]
        return int(won)

In [None]:
# Stand-alone 10-armed bandit instance.
# NOTE(review): `mab` is not referenced by any other cell shown here;
# run_simulation and animate_bandit each build their own bandit.
mab = MultiArmedBandit(k=10)

In [None]:
class EpsilonGreedyAgent:
    """Epsilon-greedy bandit agent.

    With probability ``epsilon`` a uniformly random arm is played (explore);
    otherwise the arm with the highest estimated value is played (exploit).
    """

    def __init__(self, k, epsilon):
        self.k, self.epsilon = k, epsilon
        # Per-arm play counts and running mean rewards, both starting at zero.
        self.counts = np.zeros(k)
        self.values = np.zeros(k)

    def select_arm(self):
        """Return the index of the arm to play next."""
        explore = np.random.rand() < self.epsilon
        if not explore:
            # Greedy choice; np.argmax resolves ties in favour of the lowest index.
            return np.argmax(self.values)
        return np.random.randint(self.k)

    def update(self, arm, reward):
        """Fold ``reward`` into the running mean estimate of ``arm``."""
        self.counts[arm] += 1
        # Incremental mean: new = old + (reward - old) / n
        error = reward - self.values[arm]
        self.values[arm] += error / self.counts[arm]


In [None]:
class UCBAgent:
    """Upper-confidence-bound bandit agent.

    Each arm is scored by the sum of two factors:

    * ``self.values`` - how well the arm has done so far (mean reward), and
    * ``sqrt(c * ln(total_counts) / counts)`` - an uncertainty bonus that is
      larger the less often the arm has been played.

    Parameters
    ----------
    k : int
        Number of arms.
    c : float, optional
        Exploration constant inside the square root. The default of 2.0
        reproduces the classic ``sqrt(2 * ln(t) / n)`` bonus; 0 disables the
        bonus (pure greedy after the initial sweep).

    Raises
    ------
    ValueError
        If ``c`` is negative (the bonus would be the square root of a
        negative number).
    """

    def __init__(self, k, c=2.0):
        if c < 0:
            raise ValueError(f"Exploration constant c must be non-negative, got {c}")
        self.k = k
        self.c = c
        self.counts = np.zeros(k)  # Number of times each arm was played
        self.values = np.zeros(k)  # Estimated value (mean reward) for each arm
        self.total_counts = 0      # Number of select_arm() calls so far

    def select_arm(self):
        """Return the index of the arm to play next.

        Every call counts as one time step. Arms that have never been played
        are returned first, in index order, so each estimate is initialised
        before the confidence bound is used.
        """
        self.total_counts += 1

        # Play each arm once to initialize (also avoids dividing by zero below).
        unplayed = np.flatnonzero(self.counts == 0)
        if unplayed.size:
            return int(unplayed[0])

        bonus = np.sqrt(self.c * np.log(self.total_counts) / self.counts)
        return np.argmax(self.values + bonus)

    def update(self, arm, reward):
        """Fold ``reward`` into the running mean estimate of ``arm``."""
        self.counts[arm] += 1
        n = self.counts[arm]
        value = self.values[arm]
        # Incremental mean: new = old + (reward - old) / n
        self.values[arm] = value + (reward - value) / n


In [None]:
def run_simulation(k=10, agent='epsilon', epsilon=0.1, steps=1000):
    """Run one bandit experiment and plot the outcome.

    A fresh ``MultiArmedBandit`` with ``k`` arms is created, the chosen agent
    plays it for ``steps`` rounds, and the true arm probabilities and the
    agent's final estimates are printed. Two plots are then shown: the
    cumulative average reward per step and the number of times each arm was
    chosen.

    Parameters
    ----------
    k : int
        Number of arms.
    agent : str
        ``'epsilon'`` for ``EpsilonGreedyAgent`` or ``'ucb'`` for ``UCBAgent``.
    epsilon : float
        Exploration rate; only used by the epsilon-greedy agent.
    steps : int
        Number of rounds to play.

    Raises
    ------
    ValueError
        If ``agent`` is not one of the supported names.
    """
    bandit = MultiArmedBandit(k)

    # Keep the agent *name* and the agent *object* in separate variables, and
    # only mention epsilon in the plot title when it was actually used.
    if agent == 'epsilon':
        learner = EpsilonGreedyAgent(k, epsilon)
        title_detail = f'epsilon={epsilon}'
    elif agent == 'ucb':
        learner = UCBAgent(k)
        title_detail = 'UCB'
    else:
        raise ValueError(f"Unknown agent type: {agent}")

    rewards = np.zeros(steps)
    for t in range(steps):
        arm = learner.select_arm()
        reward = bandit.pull(arm)
        learner.update(arm, reward)
        rewards[t] = reward

    print("True probabilities of arms:", np.round(bandit.probs, 3))
    print("Estimated values:", np.round(learner.values, 3))

    fig, (reward_ax, count_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Plot cumulative average reward
    reward_ax.plot(np.cumsum(rewards) / (np.arange(steps) + 1))
    reward_ax.set_xlabel('Step')
    reward_ax.set_ylabel('Average reward')
    reward_ax.set_title(f'Cumulative average reward ({title_detail})')

    # Plot counts of chosen arms
    count_ax.bar(range(k), learner.counts)
    count_ax.set_xlabel('Arm')
    count_ax.set_ylabel('Times chosen')
    count_ax.set_title('Number of times each arm was chosen')
    plt.show()

In [None]:
# Default run: epsilon-greedy agent (epsilon=0.1), 10 arms, 1000 steps.
run_simulation()

In [None]:
# Repeat the run with the UCB agent on a freshly drawn bandit
# (all other arguments keep their defaults).
run_simulation(agent='ucb')

In [None]:
######### ANIMATED AGENT ###############

def animate_bandit(agent_class, k=10, steps=200, epsilon=0.1):
    """Animate the learning curve of a bandit agent.

    The whole experiment is simulated up front and the animation only reveals
    the precomputed curve frame by frame. This keeps the frame callback free
    of side effects: FuncAnimation may invoke it more often than once per
    frame (for the initial draw when no ``init_func`` is given, and again
    while rendering with ``to_jshtml``), which would otherwise pull the bandit
    extra times and distort the curve.

    Parameters
    ----------
    agent_class : str
        ``'epsilon'`` for ``EpsilonGreedyAgent`` or ``'ucb'`` for ``UCBAgent``.
    k : int
        Number of arms.
    steps : int
        Number of rounds to play, and number of animation frames.
    epsilon : float
        Exploration rate; only used by the epsilon-greedy agent.

    Returns
    -------
    IPython.display.HTML
        JavaScript/HTML rendering of the animation for display in a notebook.

    Raises
    ------
    ValueError
        If ``agent_class`` is not one of the supported names.
    """
    bandit = MultiArmedBandit(k)
    if agent_class == 'epsilon':
        agent = EpsilonGreedyAgent(k, epsilon)
        title = f'Learning Curve ({agent_class}, epsilon={epsilon})'
    elif agent_class == 'ucb':
        agent = UCBAgent(k)
        # UCB has no epsilon, so do not advertise one in the title.
        title = f'Learning Curve ({agent_class})'
    else:
        raise ValueError('Unknown agent')

    # Simulate every step once, then derive the running average in O(steps).
    rewards = np.zeros(steps)
    for t in range(steps):
        arm = agent.select_arm()
        reward = bandit.pull(arm)
        agent.update(arm, reward)
        rewards[t] = reward
    step_index = np.arange(steps)
    running_avg = np.cumsum(rewards) / (step_index + 1)

    fig, ax = plt.subplots()
    ax.set_xlim(0, steps)
    ax.set_ylim(0, 1)
    line, = ax.plot([], [], lw=2)
    ax.set_xlabel('Step')
    ax.set_ylabel('Cumulative Average Reward')
    ax.set_title(title)

    def update(frame):
        # Frame i shows the first i + 1 points; calling it twice is harmless.
        shown = frame + 1
        line.set_data(step_index[:shown], running_avg[:shown])
        return line,

    anim = FuncAnimation(fig, update, frames=steps, blit=True, interval=50)

    return HTML(anim.to_jshtml())  # Return HTML for notebook rendering

# Build the epsilon-greedy animation (10 arms, 300 steps). animate_bandit
# returns an HTML object; as the last expression of the cell it is what the
# notebook displays.
animate_bandit('epsilon', k=10, steps=300, epsilon=0.1)


In [None]:
def ucb_scores(mean_rewards, counts, total_counts):
    """Return the UCB score of every arm as a NumPy array.

    mean_rewards: average reward per arm
    counts:       number of times each arm was played
    total_counts: current time step

    An arm that was never played scores ``inf`` so that it is explored first;
    any other arm scores ``mean + sqrt(2 * ln(total_counts) / count)``.
    """
    def _score(arm):
        plays = counts[arm]
        if plays == 0:
            # Force exploration of unplayed arms
            return np.inf
        exploration_bonus = np.sqrt(2 * np.log(total_counts) / plays)
        return mean_rewards[arm] + exploration_bonus

    return np.array([_score(arm) for arm in range(len(mean_rewards))])

# Example fixed mean rewards
mean_rewards = np.array([0.2, 0.5, 0.8])

# Pull counts per arm at four successive snapshots (one row per arm,
# one column per snapshot).
pulls_per_arm = [
    [1, 5, 10, 20],  # example pulls for arm 0
    [1, 5, 10, 20],  # arm 1
    [1, 5, 10, 20],  # arm 2
]

# Total steps at each snapshot. Derived from the pull counts so the two stay
# consistent: the time step fed to the UCB bonus is the number of pulls made
# across all arms (3, 15, 30, 60 for the counts above).
total_counts = [sum(snapshot) for snapshot in zip(*pulls_per_arm)]

# scores_by_step[s, a] is the UCB score of arm `a` at snapshot `s`; every
# snapshot is scored once instead of once per arm.
scores_by_step = np.array([
    ucb_scores(mean_rewards, np.array(snapshot), total)
    for snapshot, total in zip(zip(*pulls_per_arm), total_counts)
])

plt.figure(figsize=(10,6))

for arm in range(len(mean_rewards)):
    plt.plot(total_counts, scores_by_step[:, arm],
             label=f'Arm {arm} (mean={mean_rewards[arm]:.1f})')

plt.xlabel('Total Steps (t)')
plt.ylabel('UCB Score')
plt.title('UCB Scores for Different Arms Over Time')
plt.legend()
plt.grid(True)
plt.show()
