In [1]:
import numpy as np

class NonStationaryBandit:
    """Bernoulli multi-armed bandit whose success probabilities are redrawn periodically.

    Each arm pays a reward of 1 with its current success probability and 0
    otherwise. The probabilities are drawn uniformly from [0, 1) at
    construction and redrawn every ``prob_changes`` calls to :meth:`step`.

    Attributes:
        n_arms: Number of arms.
        n_periods: Stored as given; not used by this class itself.
        prob_changes: Number of steps between probability redraws.
        current_period: Count of calls to :meth:`step` so far (despite the
            name, it advances once per step, not once per regime).
        rewards: Zero-initialised array of length ``n_arms``; this class
            never updates it.
        probabilities: Current per-arm success probabilities.
    """

    def __init__(self, n_arms, n_periods, prob_changes):
        """Create the bandit and draw the first set of arm probabilities.

        Args:
            n_arms: Number of arms.
            n_periods: Intended number of probability regimes (stored only).
            prob_changes: Positive number of steps between redraws.

        Raises:
            ValueError: If ``prob_changes`` is not positive.
        """
        # Fail fast: a zero period would otherwise surface later as a
        # ZeroDivisionError inside step().
        if prob_changes <= 0:
            raise ValueError("prob_changes must be a positive number of steps")
        self.n_arms = n_arms
        self.n_periods = n_periods
        self.prob_changes = prob_changes
        self.current_period = 0
        self.rewards = np.zeros(n_arms)
        self.update_probabilities()

    def update_probabilities(self):
        """Redraw every arm's success probability uniformly from [0, 1)."""
        self.probabilities = np.random.rand(self.n_arms)

    def step(self, action):
        """Pull arm ``action`` and return a Bernoulli reward (0 or 1).

        Args:
            action: Index of the arm to pull.

        Returns:
            1 with the arm's current success probability, otherwise 0.
        """
        # Redraw only at regime boundaries *after* the first step. Step 0 is
        # skipped because __init__ has already drawn the initial
        # probabilities; redrawing here would silently discard them.
        if self.current_period > 0 and self.current_period % self.prob_changes == 0:
            self.update_probabilities()
        reward = 1 if np.random.rand() < self.probabilities[action] else 0
        self.current_period += 1
        return reward

def epsilon_greedy(bandit, n_steps, epsilon, alpha=None):
    """Run an epsilon-greedy agent against ``bandit`` and return its rewards.

    At every step the agent explores (uniformly random arm) with probability
    ``epsilon`` and otherwise exploits the arm with the highest current value
    estimate. ``np.argmax`` breaks ties in favour of the lowest index.

    Args:
        bandit: Object exposing ``n_arms`` and ``step(action) -> reward``.
        n_steps: Number of pulls to perform.
        epsilon: Exploration probability.
        alpha: Optional constant step size in (0, 1]. When ``None`` (the
            default) the estimate is the running sample average, i.e. the
            step size is ``1 / N[action]``. A constant step size weights
            recent rewards more heavily, which lets the estimates track a
            bandit whose reward probabilities change over time.

    Returns:
        ``np.ndarray`` holding the reward received at each step, in order.

    Raises:
        ValueError: If ``alpha`` is given but lies outside (0, 1].
    """
    if alpha is not None and not 0 < alpha <= 1:
        raise ValueError("alpha must be in the interval (0, 1]")

    n_arms = bandit.n_arms
    Q = np.zeros(n_arms)  # value estimate per arm
    N = np.zeros(n_arms)  # pull count per arm
    rewards = []

    for _ in range(n_steps):
        if np.random.rand() < epsilon:
            action = np.random.choice(n_arms)
        else:
            action = np.argmax(Q)

        reward = bandit.step(action)
        rewards.append(reward)

        N[action] += 1
        error = reward - Q[action]
        if alpha is None:
            # Sample average: every past reward of this arm counts equally.
            Q[action] += error / N[action]
        else:
            # Exponential recency weighting: old rewards decay geometrically.
            Q[action] += alpha * error

    return np.array(rewards)

# Experiment configuration
n_arms, n_steps = 10, 10000
epsilon = 0.1        # exploration probability handed to the agent
prob_changes = 500   # steps between probability changes, handed to the bandit
# Number of whole prob_changes-sized windows that fit in the run.
n_periods = n_steps // prob_changes

# Build the non-stationary environment
bandit = NonStationaryBandit(
    n_arms=n_arms,
    n_periods=n_periods,
    prob_changes=prob_changes,
)

# Run the epsilon-greedy agent and collect the per-step rewards
rewards = epsilon_greedy(bandit=bandit, n_steps=n_steps, epsilon=epsilon)

# Performance metric: mean reward per step over the whole run
average_reward = rewards.mean()
print(f'Average Reward: {average_reward}')


Average Reward: 0.6255
