In [1]:
import numpy as np
import random


In [2]:

class KArmedBandit:
    """Epsilon-greedy agent for a k-armed bandit problem.

    The agent keeps a running sample-average estimate of each arm's reward.
    On every step it explores (uniformly random arm) with probability
    ``epsilon`` and otherwise exploits the arm with the highest estimate.
    """

    def __init__(self, k_arms, epsilon=0.1):
        """Create an agent with all estimates and counts set to zero.

        Args:
            k_arms: Number of arms (recommendation options). Must be >= 1.
            epsilon: Exploration rate, i.e. the probability of picking a
                random arm instead of the current best. Must be in [0, 1].

        Raises:
            ValueError: If ``k_arms`` is below 1 or ``epsilon`` is outside
                the closed interval [0, 1] (NaN included).
        """
        if k_arms < 1:
            raise ValueError(f"k_arms must be at least 1, got {k_arms!r}")
        # Written as `not (0 <= e <= 1)` so that NaN is rejected as well.
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(f"epsilon must be within [0, 1], got {epsilon!r}")

        self.k_arms = k_arms    # Number of arms (recommendation options).
        self.epsilon = epsilon  # Exploration rate (probability of exploring).

        self.counts = np.zeros(k_arms)  # How many times each arm was selected
        self.values = np.zeros(k_arms)  # Estimated value (reward) of each arm

    def select_action(self):
        """Pick an arm index using the epsilon-greedy rule.

        Returns:
            int: Index of the chosen arm, in ``range(k_arms)``.
        """
        # Exploration: uniformly random arm (randint is inclusive on both ends).
        if random.random() < self.epsilon:
            return random.randint(0, self.k_arms - 1)

        # Exploitation: arm with the highest estimate. np.argmax resolves ties
        # to the lowest index; cast so both branches return a plain int.
        return int(np.argmax(self.values))

    def update_estimates(self, arm, reward):
        """Fold one observed reward into the estimate of ``arm``.

        Args:
            arm: Index of the arm that was selected.
            reward: Reward received after selecting that arm.
        """
        self.counts[arm] += 1
        # Incremental sample mean: new = old + (reward - old) / n
        self.values[arm] += (reward - self.values[arm]) / self.counts[arm]

    def simulate(self, true_rewards, num_steps):
        """Run the agent for ``num_steps`` pulls against noisy arms.

        Each pull draws its reward from a normal distribution centred on the
        chosen arm's entry in ``true_rewards`` (NumPy's default scale of 1.0).

        Args:
            true_rewards: Sequence holding the mean reward of each arm; needs
                at least ``k_arms`` entries.
            num_steps: Number of pulls to simulate.

        Returns:
            list: The reward observed at each step, in order.

        Raises:
            ValueError: If ``true_rewards`` has fewer entries than ``k_arms``.
        """
        # Fail up front rather than with an IndexError at whichever later
        # step happens to explore an arm that has no reward entry.
        if len(true_rewards) < self.k_arms:
            raise ValueError(
                f"true_rewards has {len(true_rewards)} entries "
                f"but the bandit has {self.k_arms} arms"
            )

        rewards = []
        for _ in range(num_steps):
            # Select an action (explore or exploit)
            arm = self.select_action()

            # Noisy reward around the arm's true mean
            reward = np.random.normal(true_rewards[arm])

            # Update reward estimates
            self.update_estimates(arm, reward)

            rewards.append(reward)

        return rewards

In [3]:
# Seed both random sources the bandit draws from -- `random` (action
# selection) and `np.random` (reward noise) -- so that Restart & Run All
# reproduces the same simulation.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Initialize bandit with k arms and an exploration rate (epsilon)
k = 5
epsilon = 0.1  # Exploration rate (10% of the time we explore new recommendations)
bandit = KArmedBandit(k_arms=k, epsilon=epsilon)

In [4]:

# True mean reward of each of the 5 recommendations (index = arm)
true_rewards = [
    0.1,  # arm 0
    0.5,  # arm 1
    0.7,  # arm 2
    0.3,  # arm 3
    0.9,  # arm 4
]



In [5]:
# Number of simulation steps (arm pulls) to run
num_steps = 1000

In [6]:

# Run the epsilon-greedy simulation and keep the per-step rewards
rewards = bandit.simulate(
    true_rewards=true_rewards,
    num_steps=num_steps,
)

In [7]:

# Show the learned estimates next to the ground truth for comparison
print(
    f"Estimated values of each arm: {bandit.values}",
    f"True rewards: {true_rewards}",
    sep="\n",
)


Estimated values of each arm: [0.40734469 0.54529859 0.61524108 0.13973754 0.89970458]
True rewards: [0.1, 0.5, 0.7, 0.3, 0.9]
