In [1]:
import numpy as np
import matplotlib.pyplot as plt


# Multi-Armed Bandit related Code

In [2]:
import numpy as np
from abc import ABC, abstractmethod


# Abstract Arm class
class Arm(ABC):
    """Interface every bandit arm must implement.

    Concrete arms in this notebook sample a reward in ``pull``, report their
    expected reward in ``mean`` and clear their bookkeeping in ``reset``.
    """

    @abstractmethod
    def pull(self):
        """Draw one reward from the arm and return it."""

    @abstractmethod
    def mean(self):
        """Return the expected reward of the arm."""

    @abstractmethod
    def reset(self):
        """Clear any per-arm statistics accumulated by ``pull``."""


# NormalArm: Gaussian distributed rewards
class NormalArm(Arm):
    """Arm whose rewards are Gaussian with mean ``mu`` and std-dev ``sigma``.

    The arm also tracks how many times it was pulled and the sum of the
    rewards it handed out.
    """

    def __init__(self, mu, sigma):
        self.mu, self.sigma = mu, sigma
        self.total_reward, self.pull_count = 0.0, 0

    def pull(self):
        """Sample one reward, update the running statistics and return it."""
        # Draws from NumPy's global random state.
        sample = np.random.normal(loc=self.mu, scale=self.sigma)
        self.pull_count += 1
        self.total_reward += sample
        return sample

    def mean(self):
        """Return the expected reward, i.e. ``mu``."""
        return self.mu

    def reset(self):
        """Zero the pull counter and the accumulated reward."""
        self.total_reward, self.pull_count = 0.0, 0


# DiscreteArm: Sample from a discrete distribution
class DiscreteArm(Arm):
    """Arm that pays one of a finite set of rewards with given probabilities.

    Parameters
    ----------
    rewards : sequence of numbers
        The possible reward values.
    probabilities : sequence of numbers
        Probability of each entry in ``rewards``; must have the same length,
        be non-negative and sum to 1.

    Raises
    ------
    AssertionError
        If the lengths differ, a probability is negative, or the
        probabilities do not sum to 1.
    """

    def __init__(self, rewards, probabilities):
        assert len(rewards) == len(probabilities), "Mismatched reward-probability lengths"
        # Without this check, e.g. [-0.5, 1.5] passes the sum test below and the
        # arm only fails later, inside np.random.choice, on the first pull.
        assert all(p >= 0 for p in probabilities), "Probabilities must be non-negative"
        assert np.isclose(sum(probabilities), 1.0), "Probabilities must sum to 1"
        self.rewards = rewards
        self.probabilities = probabilities
        self.total_reward = 0.0
        self.pull_count = 0

    def pull(self):
        """Sample one reward, update the running statistics and return it."""
        # Draws from NumPy's global random state.
        reward = np.random.choice(self.rewards, p=self.probabilities)
        self.total_reward += reward
        self.pull_count += 1
        return reward

    def mean(self):
        """Return the expected reward: sum of reward * probability."""
        return np.dot(self.rewards, self.probabilities)

    def reset(self):
        """Zero the pull counter and the accumulated reward."""
        self.total_reward = 0.0
        self.pull_count = 0


# MultiArmedBandit manager class
class MultiArmedBandit:
    """Holds a collection of arms and knows which one has the highest mean.

    Parameters
    ----------
    arms : sequence
        Objects exposing ``pull()``, ``mean()`` and ``reset()``.

    Raises
    ------
    ValueError
        If ``arms`` is empty.
    """

    def __init__(self, arms):
        if len(arms) == 0:
            raise ValueError("MultiArmedBandit needs at least one arm")
        self.arms = arms
        # Query each arm's mean exactly once so that best_mean is guaranteed to
        # be the very value that won the argmax.
        means = [arm.mean() for arm in arms]
        self.best_arm_index = np.argmax(means)  # first index on ties
        self.best_mean = means[self.best_arm_index]

    def pull(self, index):
        """Pull the arm at ``index`` and return the reward it produced."""
        return self.arms[index].pull()

    def regret(self, reward):
        """Return ``best_mean - reward`` for a single observed reward.

        This is measured against the realised reward, not the pulled arm's
        mean, so it is negative whenever ``reward`` exceeds ``best_mean``.
        """
        return self.best_mean - reward

    def get_best_arm(self):
        """Return the index of the arm with the highest mean."""
        return self.best_arm_index

    def reset_all(self):
        """Call ``reset()`` on every arm."""
        for arm in self.arms:
            arm.reset()

    def all_means(self):
        """Return the list of every arm's mean, in arm order."""
        return [arm.mean() for arm in self.arms]

In [4]:
# Demo: pull uniformly random arms and report the regret of every round.
SEED = 42        # fixed seed so Restart & Run All reproduces the same rounds
N_ROUNDS = 100   # number of pulls in the demo

# The arms and the arm selection below all draw from NumPy's global random
# state, so seeding it here makes the whole cell deterministic.
np.random.seed(SEED)

arms = [
    NormalArm(mu=1.0, sigma=1.0),
    DiscreteArm(rewards=[0, 1, 5], probabilities=[0.2, 0.5, 0.3]),
    NormalArm(mu=2.5, sigma=0.5)
]

bandit = MultiArmedBandit(arms)

for t in range(N_ROUNDS):
    chosen = np.random.randint(len(arms))  # uniform random policy
    reward = bandit.pull(chosen)
    reg = bandit.regret(reward)  # may be negative: measured against the realised reward
    print(f"Round {t}: Pulled arm {chosen}, reward={reward:.2f}, regret={reg:.2f}")

print(f"Best arm is arm {bandit.get_best_arm()} with expected mean {bandit.best_mean:.2f}")

Round 0: Pulled arm 2, reward=2.27, regret=0.23
Round 1: Pulled arm 0, reward=-0.25, regret=2.75
Round 2: Pulled arm 1, reward=5.00, regret=-2.50
Round 3: Pulled arm 0, reward=0.47, regret=2.03
Round 4: Pulled arm 0, reward=1.00, regret=1.50
Round 5: Pulled arm 2, reward=2.74, regret=-0.24
Round 6: Pulled arm 0, reward=2.23, regret=0.27
Round 7: Pulled arm 1, reward=0.00, regret=2.50
Round 8: Pulled arm 1, reward=1.00, regret=1.50
Round 9: Pulled arm 0, reward=0.33, regret=2.17
Round 10: Pulled arm 0, reward=0.83, regret=1.67
Round 11: Pulled arm 1, reward=1.00, regret=1.50
Round 12: Pulled arm 0, reward=1.52, regret=0.98
Round 13: Pulled arm 0, reward=-0.40, regret=2.90
Round 14: Pulled arm 0, reward=-0.41, regret=2.91
Round 15: Pulled arm 0, reward=1.26, regret=1.24
Round 16: Pulled arm 2, reward=2.23, regret=0.27
Round 17: Pulled arm 2, reward=2.76, regret=-0.26
Round 18: Pulled arm 1, reward=5.00, regret=-2.50
Round 19: Pulled arm 2, reward=1.48, regret=1.02
Round 20: Pulled arm 2,