In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class Arm:
    """One Bernoulli arm of a bandit: a pull pays 1 with probability ``p``."""

    def __init__(self, p):
        """Store ``p``, the probability that a single pull yields a reward of 1."""
        self.p = p

    def pull(self):
        """Draw one reward (0 or 1) from a single-trial binomial with success rate ``p``."""
        success_prob = self.p
        return np.random.binomial(n=1, p=success_prob)

class MultiBandit:
    """A multi-armed bandit that tracks the cumulative regret of its pulls.

    Each entry of ``probs`` becomes one ``Arm``. Every pull adds the gap
    between the largest probability in ``probs`` and the pulled arm's own
    probability to a running regret total.
    """

    def __init__(self, probs=(0.1, 0.2, 0.7, 0.5)):
        """Create one arm per probability.

        Args:
            probs: Iterable of per-arm success probabilities. It is copied
                into a list first, so one-shot iterables (e.g. generators)
                work and the caller's object is never shared.

        Raises:
            ValueError: If ``probs`` is empty.
        """
        # Materialise once: the values are needed both to build the arms and
        # to find the best probability.
        probs = list(probs)
        if not probs:
            raise ValueError("MultiBandit needs at least one arm probability")
        self.__arms = [Arm(p) for p in probs]
        self.__regret = 0.0
        self.__maxp = max(probs)

    def num_arms(self):
        """Return the number of arms in this bandit."""
        return len(self.__arms)

    def pull(self, arm_num):
        """Pull arm ``arm_num``, update the regret total and return the reward."""
        arm = self.__arms[arm_num]
        reward = arm.pull()
        # Regret is measured on probabilities, not on the sampled reward.
        self.__regret += self.__maxp - arm.p
        return reward

    def regret(self):
        """Return the regret accumulated over all pulls so far."""
        return self.__regret

In [None]:
class EpsilonGreedyAlgorithm:
    """Epsilon-greedy strategy for a multi-armed bandit.

    With probability ``epsilon`` a uniformly random arm is explored;
    otherwise the arm with the highest empirical mean reward is exploited.
    """

    def __init__(self, num_arms, horizon, epsilon):
        """Initialise the bookkeeping for a run of ``horizon`` pulls."""
        self.num_arms = num_arms                    # Number of arms present in the bandit
        self.horizon = horizon                      # Total horizon of the algorithm
        self.epsilon = epsilon                      # Exploration probability
        self.timestep = 0                           # Number of pulls performed so far
        self.arm_pulls = np.zeros(num_arms)         # How many times each arm was pulled
        self.arm_rewards = np.zeros(num_arms)       # Total reward accumulated by each arm
        self.regrets = np.zeros(horizon)            # Bandit's total regret after each pull

    def _average_rewards(self):
        """Return each arm's empirical mean reward; unpulled arms count as 0."""
        return np.divide(
            self.arm_rewards,
            self.arm_pulls,
            out=np.zeros_like(self.arm_rewards),
            where=self.arm_pulls != 0,
        )

    def give_best_arm(self):
        """Return the index (as ``int``) of the arm with the highest mean reward."""
        return int(np.argmax(self._average_rewards()))

    def select_arm(self):
        """Return the index of the arm to pull at this timestep."""
        # Explore: uniformly random arm with probability epsilon.
        if np.random.rand() < self.epsilon:
            return int(np.random.randint(0, self.num_arms))
        # Exploit: best arm seen so far (ties resolve to the lowest index).
        return int(np.argmax(self._average_rewards()))

    def run_algorithm(self, bandit):
        """Play ``horizon`` pulls against ``bandit``, recording regret per pull.

        ``bandit`` must provide ``pull(arm_index)`` returning the reward and
        ``regret()`` returning its running total regret.
        """
        for step in range(self.horizon):
            arm_to_pull = self.select_arm()             # Select the arm using the algorithm
            reward = bandit.pull(arm_to_pull)           # Pull the arm and find out the reward
            self.arm_pulls[arm_to_pull] += 1            # Update the pull count and reward total
            self.arm_rewards[arm_to_pull] += reward
            self.timestep += 1                          # Update the timestep
            self.regrets[step] = bandit.regret()        # Store the regret after this pull

    def plot(self):
        """Plot the recorded total regret against the timestep."""
        plt.figure(figsize=(8,4))
        plt.plot(self.regrets)
        plt.title("Epsilon-Greedy: Total Regret vs Timesteps")
        plt.xlabel("Timestep")
        plt.ylabel("Total Regret")
        plt.grid(True)
        plt.show()

In [None]:
class UCBAlgorithm:
    """Upper-confidence-bound strategy for a multi-armed bandit.

    Every arm is pulled once first; afterwards the arm maximising
    ``mean_reward + sqrt(2 * ln(timestep) / pulls)`` is chosen.
    """

    def __init__(self, num_arms, horizon):
        """Initialise the bookkeeping for a run of ``horizon`` pulls."""
        self.num_arms = num_arms                    # Number of arms in the bandit
        self.horizon = horizon                      # Number of pulls in one run
        self.timestep = 0                           # Number of pulls performed so far
        self.arm_pulls = np.zeros(num_arms)         # How many times each arm was pulled
        self.arm_rewards = np.zeros(num_arms)       # Total reward accumulated by each arm
        self.regrets = np.zeros(horizon)            # Bandit's total regret after each pull

    def give_best_arm(self):
        """Return the index of the arm with the highest empirical mean reward."""
        # Clamp the divisor so unpulled arms get a mean of 0 instead of NaN.
        average_rewards = self.arm_rewards / np.maximum(self.arm_pulls, 1)
        return int(np.argmax(average_rewards))

    def select_arm(self):
        """Return the index of the arm to pull at this timestep."""
        # Initialisation phase: try every arm once, in index order. Checking
        # the pull counts directly (rather than the timestep) guarantees the
        # divisions below never see a zero count.
        unexplored = np.flatnonzero(self.arm_pulls == 0)
        if unexplored.size > 0:
            return int(unexplored[0])

        average_rewards = self.arm_rewards / self.arm_pulls
        confidence_bounds = np.sqrt((2 * np.log(self.timestep)) / self.arm_pulls)
        return int(np.argmax(average_rewards + confidence_bounds))

    def run_algorithm(self, bandit):
        """Play ``horizon`` pulls against ``bandit``, recording regret per pull.

        ``bandit`` must provide ``pull(arm_index)`` returning the reward and
        ``regret()`` returning its running total regret.
        """
        for t in range(self.horizon):
            arm_to_pull = self.select_arm()
            reward = bandit.pull(arm_to_pull)
            self.arm_pulls[arm_to_pull] += 1
            self.arm_rewards[arm_to_pull] += reward
            self.timestep += 1
            self.regrets[t] = bandit.regret()

    def plot(self):
        """Plot the recorded total regret against the timestep."""
        plt.figure(figsize=(8,4))
        plt.plot(self.regrets, label="UCB")
        plt.xlabel("Timestep")
        plt.ylabel("Total Regret")
        plt.title("UCB: Total Regret vs Timestep")
        plt.grid(True)
        plt.legend()
        plt.show()


In [None]:
class ThompsonSamplingAlgorithm:
    """Thompson sampling for a bandit with 0/1 rewards.

    Each arm's success rate is modelled as ``Beta(successes + 1, failures + 1)``.
    At every step one value is sampled per arm and the arm with the largest
    sample is pulled.
    """

    def __init__(self, num_arms, horizon):
        """Initialise the bookkeeping for a run of ``horizon`` pulls."""
        self.num_arms = num_arms                    # Number of arms in the bandit
        self.horizon = horizon                      # Number of pulls in one run
        self.timestep = 0                           # Number of pulls performed so far
        self.successes = np.zeros(num_arms)         # Count of reward == 1 per arm
        self.failures = np.zeros(num_arms)          # Count of every other reward per arm
        self.regrets = np.zeros(horizon)            # Bandit's total regret after each pull

    def give_best_arm(self):
        """Return the index of the arm with the highest posterior mean.

        The mean of ``Beta(s + 1, f + 1)`` is ``(s + 1) / (s + f + 2)``. Using
        it keeps this estimate consistent with the distribution sampled in
        ``select_arm`` and stops a single lucky pull from outranking an arm
        with a long, good track record.
        """
        beta_means = (self.successes + 1) / (self.successes + self.failures + 2)
        return int(np.argmax(beta_means))

    def select_arm(self):
        """Sample each arm's Beta posterior and return the index of the largest draw."""
        samples = np.random.beta(self.successes + 1, self.failures + 1)
        return int(np.argmax(samples))

    def run_algorithm(self, bandit):
        """Play ``horizon`` pulls against ``bandit``, recording regret per pull.

        ``bandit`` must provide ``pull(arm_index)`` returning the reward and
        ``regret()`` returning its running total regret.
        """
        for t in range(self.horizon):
            arm_to_pull = self.select_arm()
            reward = bandit.pull(arm_to_pull)
            # Only a reward of exactly 1 counts as a success.
            if reward == 1:
                self.successes[arm_to_pull] += 1
            else:
                self.failures[arm_to_pull] += 1
            self.timestep += 1
            self.regrets[t] = bandit.regret()

    def plot(self):
        """Plot the recorded total regret against the timestep."""
        plt.figure(figsize=(8,4))
        plt.plot(self.regrets, label="Thompson Sampling")
        plt.xlabel("Timestep")
        plt.ylabel("Total Regret")
        plt.title("Thompson Sampling: Total Regret vs Timestep")
        plt.grid(True)
        plt.legend()
        plt.show()

In [None]:
def evaluate_algorithms(custom_probs, horizon, epsilon=0.1):
    """Run all three strategies on identical bandits and compare their regret.

    Each algorithm gets its own ``MultiBandit`` built from ``custom_probs`` and
    plays ``horizon`` pulls. The function prints each algorithm's total regret
    and chosen best arm, shows a combined regret plot, and finally prints the
    algorithm with the highest total regret. Nothing is returned.

    Args:
        custom_probs: Arm probabilities handed to every ``MultiBandit``.
        horizon: Number of pulls each algorithm performs.
        epsilon: Exploration rate passed to ``EpsilonGreedyAlgorithm``.
    """
    print(f"\n Evaluating on custom bandit: {custom_probs}")

    # One independent bandit per algorithm so regret totals do not mix.
    eg_bandit, ucb_bandit, ts_bandit = (MultiBandit(custom_probs) for _ in range(3))

    arm_count = len(custom_probs)
    eg_algo = EpsilonGreedyAlgorithm(num_arms=arm_count, horizon=horizon, epsilon=epsilon)
    ucb_algo = UCBAlgorithm(num_arms=arm_count, horizon=horizon)
    ts_algo = ThompsonSamplingAlgorithm(num_arms=arm_count, horizon=horizon)

    contenders = [(eg_algo, eg_bandit), (ucb_algo, ucb_bandit), (ts_algo, ts_bandit)]
    for algorithm, bandit in contenders:
        algorithm.run_algorithm(bandit)

    # Per-algorithm summary lines.
    print(f"\n Epsilon-Greedy (e={epsilon}): Total Regret = {eg_bandit.regret():.2f}, Best Arm = {eg_algo.give_best_arm()}")
    print(f"UCB : Total Regret = {ucb_bandit.regret():.2f}, Best Arm = {ucb_algo.give_best_arm()}")
    print(f"Thompson Sampling: Total Regret = {ts_bandit.regret():.2f}, Best Arm = {ts_algo.give_best_arm()}")

    # The same labels serve as plot legend entries and as ranking keys.
    labels = [f"Epsilon-Greedy (e={epsilon})", "UCB", "Thompson Sampling"]

    plt.figure(figsize=(10,5))
    for label, (algorithm, _) in zip(labels, contenders):
        plt.plot(algorithm.regrets, label=label)
    plt.xlabel("Timestep")
    plt.ylabel("Total Regret")
    plt.title("Total Regret vs Timestep for all Algorithms")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Highest final regret marks the weakest algorithm; ties go to the earliest entry.
    regrets = {label: bandit.regret() for label, (_, bandit) in zip(labels, contenders)}
    worst_algo = max(regrets, key=regrets.get)
    print(f"Based on the current run, the least effective algorithm is: {worst_algo} (Regret = {regrets[worst_algo]:.2f})")

# Experiment settings -- edit these to try a different bandit or run length.
custom_probs = [0.1, 0.4, 0.8, 0.3]  # one success probability per arm
horizon = 100  # number of pulls each algorithm gets
evaluate_algorithms(custom_probs=custom_probs, horizon=horizon, epsilon=0.2)
