In [1]:
import numpy as np
import plotly.graph_objects as go

In [2]:
class NonstationaryBandit:
    """Epsilon-greedy agent on a k-armed bandit whose true arm values drift.

    On every step each arm's true value takes an independent Gaussian
    random-walk increment (std 0.01), the agent picks an arm epsilon-greedily,
    receives a reward drawn from N(q_star[arm], 1), and updates its estimate
    for that arm.

    Parameters
    ----------
    k : int
        Number of arms; must be at least 1.
    n_steps : int
        Number of steps simulated by each call to ``run``.
    epsilon : float
        Probability of pulling a uniformly random arm instead of the greedy one.
    step_size : float, optional
        Constant step size in (0, 1] for the value update, giving an
        exponential recency-weighted average that keeps tracking drifting
        values. ``None`` (default) keeps the sample-average update
        (step 1/N(a)), which weights all past rewards equally.
    seed : int, optional
        Seed for a private random stream, making a simulation reproducible.
        ``None`` (default) draws from NumPy's global ``np.random`` state.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``step_size`` is outside (0, 1].
    """

    def __init__(self, k, n_steps, epsilon, step_size=None, seed=None):
        if k < 1:
            raise ValueError("k must be at least 1")
        if step_size is not None and not 0 < step_size <= 1:
            raise ValueError("step_size must be in the interval (0, 1]")

        self.k = k  # Number of arms
        self.n_steps = n_steps  # Total number of steps to simulate
        self.epsilon = epsilon  # Probability for exploration
        self.step_size = step_size  # None -> sample average, else constant step size

        # The np.random module and RandomState expose the same rand/randint/normal
        # functions, so either can serve as the random source; without a seed the
        # global stream is used and draws happen in the same order as before.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

        self.q_star = self._rng.rand(k)  # True value of each arm (uniform in [0, 1))
        self.q_values = np.zeros(k)  # Estimated value of each arm (initialized to zero)
        self.action_counts = np.zeros(k)  # Counts how many times each arm has been pulled
        self.rewards = []  # Reward obtained at each step (not cumulative)

    def update_rewards(self):
        """Drift every arm's true value by an independent N(0, 0.01) increment."""
        self.q_star += self._rng.normal(0, 0.01, self.k)

    def select_action(self):
        """Return the index of the arm to pull, chosen epsilon-greedily."""
        if self._rng.rand() < self.epsilon:
            # Explore: choose a random arm
            return self._rng.randint(self.k)
        # Exploit: highest estimated value (np.argmax returns the first on ties)
        return np.argmax(self.q_values)

    def run(self):
        """Simulate ``n_steps`` steps, appending each obtained reward to ``self.rewards``."""
        for _ in range(self.n_steps):
            self.update_rewards()  # True values drift before the agent acts
            action = self.select_action()
            reward = self._rng.normal(self.q_star[action], 1)  # Noisy reward around the true value
            self.rewards.append(reward)
            self.action_counts[action] += 1

            # Move the estimate toward the observed reward
            error = reward - self.q_values[action]
            if self.step_size is None:
                # Sample average: step shrinks as 1/N(a)
                self.q_values[action] += error / self.action_counts[action]
            else:
                # Constant step size: recent rewards weigh more, so drift is tracked
                self.q_values[action] += self.step_size * error

In [3]:
def plot_results(rewards):
    """Show a Plotly line chart of the running total of ``rewards``.

    ``rewards`` is a sequence of per-step rewards; the y-axis is its
    cumulative sum and the x-axis is the implicit step index.
    """
    running_total = np.cumsum(rewards)
    curve = go.Scatter(y=running_total, mode='lines', name='Cumulative Rewards')

    figure = go.Figure()
    figure.add_trace(curve)
    figure.update_layout(
        title='Cumulative Rewards over Time',
        xaxis_title='Steps',
        yaxis_title='Cumulative Rewards',
    )
    figure.show()

In [4]:
def main():
    """Simulate a 10-armed bandit for 1000 steps with epsilon 0.1 and plot the rewards."""
    # Experiment settings
    num_arms = 10
    horizon = 1000
    explore_prob = 0.1

    # Build the agent, run the simulation, then chart the collected rewards
    agent = NonstationaryBandit(k=num_arms, n_steps=horizon, epsilon=explore_prob)
    agent.run()
    plot_results(agent.rewards)

In [5]:
# Entry point: a notebook kernel executes cells with __name__ set to "__main__",
# so running this cell starts the simulation; the guard skips it if this code
# is imported as a module instead.
if __name__ == "__main__":
    main()