In [None]:
from typing import Callable, Iterator, List

import matplotlib.pyplot as plt
import numpy as np

import Bandit

In [None]:
def estimate_reward(bandit: Bandit, arms: int, eps: float) -> Iterator[float]:
    '''
    Endless generator of rewards obtained by playing `bandit` eps-greedily.

    On every step `bandit.play_random()` is called with probability `eps`
    and `bandit.play_best()` otherwise; the value returned by that call is
    yielded unchanged.

    bandit: object exposing `play_random()` and `play_best()`.
    arms: not used by this function; kept so existing callers keep working.
    eps: probability of taking the random action, must lie in [0, 1].

    Raises ValueError on the first `next()` if `eps` is outside [0, 1]
    (generator bodies only start running on first iteration).
    '''
    if not 0.0 <= eps <= 1.0:
        raise ValueError("eps must be in [0, 1], got %r" % (eps,))
    while True:
        # A single uniform draw from [0, 1) decides explore vs. exploit:
        # the draw is below eps with probability eps.
        if np.random.random() < eps:
            yield bandit.play_random()
        else:
            yield bandit.play_best()


def run_bandits_common(n_bandits: int, steps: int, arms: int, mean: float, stdev: float,
                       all_eps: List[float], stationary: bool, plot_title: str,
                       fn_step_size: Callable = None) -> None:
    '''
    Play a set of bandits eps-greedily and plot the per-step mean reward.

    n_bandits bandits are created once and reused for every value in
    all_eps. For each eps, every bandit is reset and played for `steps`
    steps via estimate_reward(); the rewards are averaged across bandits
    for each step and drawn as one curve. A horizontal "Optimal reward"
    line shows the mean over bandits of the largest true Q value.

    arms, mean, stdev, stationary and fn_step_size are forwarded to the
    Bandit constructor as-is. Nothing is returned; the figure is shown
    with plt.show().
    '''
    # NOTE(review): the file does `import Bandit` (a module) yet calls
    # Bandit(...) like a class here -- confirm the import is what is intended.
    testbed = [
        Bandit(arms, mean, stdev, stationary=stationary, fn_step_size=fn_step_size)
        for _ in range(n_bandits)
    ]

    # Reference level: average of each bandit's highest true Q.
    # NOTE(review): this is read once, before any plays; if true Q drifts in
    # the non-stationary case the line may not match the real optimum -- verify.
    best_reward = np.mean([np.max(b.get_true_Q()) for b in testbed])

    # One curve per exploration rate (eps-greedy)
    for eps in all_eps:
        rewards_per_bandit = []
        for b in testbed:
            # Start from a clean slate (original comment: reset reward estimations Q)
            b.reset()
            plays = estimate_reward(b, arms, eps)
            rewards_per_bandit.append([next(plays) for _ in range(steps)])
        # Average over the bandit axis so one value remains per step
        mean_per_step = np.mean(rewards_per_bandit, axis=0)
        plt.plot(np.arange(steps), mean_per_step, '-', label=("eps=%.2f" % eps))

    plt.plot(np.full(steps, best_reward), label="Optimal reward")
    plt.title(plot_title)
    plt.xlabel("Steps")
    plt.ylabel("Reward mean")
    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
def const_step(alpha):
    '''
    Build a step-size function that always returns `alpha`.

    The returned callable accepts and ignores any arguments, so it can be
    called however the consumer passes its step information, and it always
    yields the constant captured here.
    '''
    def step(*_args, **_kwargs):
        # Return the captured constant; the previous inner parameter was also
        # named `alpha`, which shadowed it and turned this into an identity.
        return alpha
    return step


def run_non_stationary_const_step():
    '''
    Run non-stationary eps-greedy bandits (eps = 0.1) using the step-size
    function built by const_step(0.1): 1000 bandits, 1000 steps, 10 arms.
    '''
    n_bandits = 1000
    steps = 1000
    arms = 10
    mean = 0
    stdev = 1
    all_eps = [0.1]
    # Title typo fixed: "constans" -> "constant"
    plot_title = "Non-stationary with constant step size"
    stationary = False
    fn_step_size = const_step(0.1)

    run_bandits_common(n_bandits, steps, arms, mean, stdev,
                       all_eps, stationary, plot_title, fn_step_size)


def run_non_stationary_sample_average_step():
    '''
    Run non-stationary eps-greedy bandits (eps = 0.1) without a custom
    step-size function: 2000 bandits, 10000 steps, 10 arms.

    fn_step_size is passed as None; per the plot title this corresponds to
    the sample-average (1/n) step size -- presumably the Bandit default.
    '''
    run_bandits_common(
        n_bandits=2000,
        steps=10000,
        arms=10,
        mean=0,
        stdev=1,
        all_eps=[0.1],
        stationary=False,
        plot_title="Non-stationary with average sample step size (1/n)",
        fn_step_size=None,
    )


def run_stationary():
    '''
    Stationary eps-greedy experiment with the 1/n step size, comparing
    eps = 0, 0.1 and 0.01 over 1000 bandits, 1000 steps and 10 arms.
    '''
    run_bandits_common(
        n_bandits=1000,
        steps=1000,
        arms=10,
        mean=0,
        stdev=1,
        all_eps=[0., 0.1, 0.01],
        stationary=True,
        plot_title="Stationary with average sample step size (1/n)",
        fn_step_size=None,
    )