In [None]:
import numpy as np

class NewsValueMaximiser:
    """Epsilon-greedy multi-armed bandit where each arm is an article.

    With probability ``epsilon`` a uniformly random arm is explored;
    otherwise the arm with the highest running-average reward is exploited.

    Attributes:
        n_arms: Number of arms (articles).
        epsilon: Probability of exploring on each selection.
        counts: Per-arm number of times the arm has been updated.
        values: Per-arm running average of the rewards observed so far.
    """

    def __init__(self, n_arms, epsilon=0.1, seed=None):
        """Create a bandit with all counts and value estimates at zero.

        Args:
            n_arms: Number of arms; must be at least 1.
            epsilon: Exploration probability; must lie in [0, 1].
            seed: Optional seed for a private random generator. When
                ``None`` (the default) the global ``np.random`` state is
                used, so ``np.random.seed`` keeps controlling the draws.

        Raises:
            ValueError: If ``n_arms`` is below 1 or ``epsilon`` is outside
                [0, 1] (NaN included).
        """
        if n_arms < 1:
            raise ValueError(
                "n_arms must be at least 1, got {!r}".format(n_arms)
            )
        # Written as a negated chained comparison so NaN is rejected too.
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(
                "epsilon must be within [0, 1], got {!r}".format(epsilon)
            )

        self.n_arms = n_arms
        self.epsilon = epsilon
        self.counts = np.zeros(n_arms)  # Number of times each article is selected
        self.values = np.zeros(n_arms)  # Average reward (clicks) for each article
        # The np.random module and RandomState both expose rand()/randint(),
        # so either can back the draws made in select_arm().
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def select_arm(self):
        """Return the index of the arm to play next as a plain ``int``.

        Ties between equally valued arms resolve to the lowest index,
        because ``np.argmax`` returns the first maximum.
        """
        if self._rng.rand() < self.epsilon:
            return int(self._rng.randint(0, self.n_arms))  # Exploration
        return int(np.argmax(self.values))  # Exploitation

    def update(self, chosen_arm, reward):
        """Fold ``reward`` into the running average of ``chosen_arm``.

        Args:
            chosen_arm: Index of the arm that was played.
            reward: Reward observed for that play.
        """
        self.counts[chosen_arm] += 1
        pulls = self.counts[chosen_arm]
        # Incremental mean: new = old + (reward - old) / n, which avoids
        # storing the full reward history.
        self.values[chosen_arm] += (reward - self.values[chosen_arm]) / pulls

    def run(self, rounds, reward_function):
        """Play ``rounds`` select/observe/update cycles.

        Args:
            rounds: Number of rounds to play.
            reward_function: Callable taking the chosen arm index and
                returning the reward for that play.

        Returns:
            The ``values`` array of per-arm average rewards. This is the
            bandit's own array, not a copy.
        """
        for _ in range(rounds):
            chosen_arm = self.select_arm()
            reward = reward_function(chosen_arm)
            self.update(chosen_arm, reward)
        return self.values

# Example reward function that simulates the reward based on chosen arm (article).
def reward_function(chosen_arm, article_popularity=(0.3, 0.7, 0.5, 0.9, 0.4)):
    """Simulate whether showing an article produced a click.

    Args:
        chosen_arm: Index of the article that was shown.
        article_popularity: Sequence of per-article click probabilities,
            indexed by arm. Defaults to an example table for five articles.

    Returns:
        1 if the simulated user clicked, otherwise 0.

    Raises:
        IndexError: If ``chosen_arm`` is outside ``article_popularity``.
    """
    click_probability = article_popularity[chosen_arm]
    # np.random.rand() is uniform on [0, 1), so a click is drawn with
    # probability ``click_probability``.
    return 1 if np.random.rand() < click_probability else 0



Final click values (average reward) for each article: [0.33333333 0.65       0.60714286 0.87303371 0.25      ]


In [None]:
# Simulation configuration: 1000 rounds over a pool of 5 articles.
rounds = 1000
n_arms = 5
bandit = NewsValueMaximiser(n_arms=n_arms, epsilon=0.1)

# Play every round, then report the average reward recorded per article.
final_values = bandit.run(rounds=rounds, reward_function=reward_function)
print("Final click values (average reward) for each article:", final_values)