# Preference Update Bandit

In [None]:
# Import modules.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.gridspec as gridspec
%matplotlib inline
import import_ipynb
from abstract_bandit import Bandit

In [None]:
class PrefBandit(Bandit):
    """Preference-based (gradient) bandit with a softmax action policy.

    Every arm carries a numerical preference. Actions are sampled from the
    softmax of the preferences, and after each reward the preferences are
    moved relative to a running average-reward baseline.

    NOTE(review): this class relies on attributes that are presumably created
    by ``Bandit`` (``arms``, ``Ns``, ``avg_rewards``, ``opt_actions``,
    ``means``, ``means_rng``, ``stationary``, ``qs``) -- confirm against
    ``abstract_bandit``.
    """

    def __init__(self, means, sigmas, arms=10, iters=10000, alpha=None, beta=None, deterministic=False, stationary=True):
        """Initializes the preference update bandit.

        Args:
            means: Forwarded to ``Bandit.__init__``.
            sigmas: Forwarded to ``Bandit.__init__``.
            arms: Number of arms.
            iters: Number of iterations; sizes the history matrices.
            alpha: Step size for the preference update. ``None`` selects
                ``1 / N(arm)``, with ``N(arm)`` the pull count of the arm.
            beta: Step size for the average-reward baseline. ``None`` selects
                ``1 / (iteration + 1)``.
            deterministic: Forwarded to ``Bandit.__init__``.
            stationary: Forwarded to ``Bandit.__init__``.
        """
        super().__init__(means, sigmas, arms=arms, iters=iters, deterministic=deterministic, stationary=stationary)
        # Step size for preferences.
        self.alpha = alpha
        # Step size for average reward.
        self.beta = beta
        # Preference values of actions (all equal at the start).
        self.prefs = np.zeros(arms)
        # Probability of choosing each action (uniform at the start).
        self.probs = np.full(arms, 1.0 / float(arms))
        # Matrix to store the trend of preferences (row 0 stays at zero).
        self.Hs = np.zeros((iters + 1, arms))
        # Matrix to store the trend of probabilities.
        self.Ps = np.zeros((iters + 1, arms))
        self.Ps[0, :] = self.probs

    def choose_action(self):
        """Samples an arm index according to the current action probabilities."""
        return np.random.choice(self.arms, p=self.probs)

    def update_model(self, arm, reward, iteration):
        """Updates baseline, preferences and probabilities after an episode.

        Args:
            arm: Index of the arm that was pulled.
            reward: Reward obtained from that arm.
            iteration: Zero-based index of the episode that just finished.
        """
        # Update choice count.
        self.Ns[arm] += 1
        # Step size for preferences (falls back to 1 / N(arm)).
        if self.alpha is None:
            step_size_pref = 1.0 / float(self.Ns[arm])
        else:
            step_size_pref = self.alpha
        # Step size for average rewards (falls back to 1 / (iteration + 1)).
        if self.beta is None:
            step_size_rew = 1.0 / float(iteration + 1)
        else:
            step_size_rew = self.beta
        # Update average reward (note that now this has to be done here!).
        previous_avg = self.avg_rewards[iteration]
        self.avg_rewards[iteration + 1] = previous_avg + step_size_rew * (reward - previous_avg)
        # Update preferences: the pulled arm moves by (1 - p), the others by -p.
        # The instance's own arm count is used, never a module-level global.
        scaled_error = step_size_pref * (reward - self.avg_rewards[iteration + 1])
        one_hot = np.zeros(self.arms)
        one_hot[arm] = 1.0
        self.prefs += scaled_error * (one_hot - self.probs)
        # Update actions probabilities. Shifting by the largest preference
        # leaves the softmax unchanged mathematically but keeps np.exp from
        # overflowing when preferences grow large.
        exp_prefs = np.exp(self.prefs - np.max(self.prefs))
        self.probs = exp_prefs / np.sum(exp_prefs)
        # Eventually increase optimal actions counter.
        if arm == np.argmax(self.means):
            self.opt_actions += 1.0
        # Update real action values (one random increment per arm).
        if not self.stationary:
            for i in range(self.arms):
                self.means[i] += self.means_rng.standard_normal()

    def update_sim_data(self, arm, reward, iteration):
        """Stores preferences, probabilities and true means for this episode."""
        self.Hs[iteration + 1, :] = self.prefs
        self.Ps[iteration + 1, :] = self.probs
        self.qs[iteration + 1, :] = self.means

    def get_prefs(self):
        """Returns a copy of the preferences history, shape (iters + 1, arms)."""
        return np.copy(self.Hs)

    def get_probs(self):
        """Returns a copy of the probabilities history, shape (iters + 1, arms)."""
        return np.copy(self.Ps)

def pref_plots(iters, arms, means, sigmas, alphas, betas, det, stat):
    """Runs one PrefBandit per (alpha, beta) pair and plots the outcome.

    For every alpha a figure is created holding the average-reward curves,
    a log-scale bar chart of the actions taken, and per-arm preference and
    probability histories (one line per beta). A final heatmap shows the
    value returned by ``get_opt_actions()`` for every (alpha, beta) pair.

    Args:
        iters: Number of iterations each bandit runs.
        arms: Number of arms.
        means: Initial arm means; each bandit receives its own copy.
        sigmas: Forwarded unchanged to ``PrefBandit``.
        alphas: Preference step sizes to test (rows of the heatmap).
        betas: Baseline step sizes to test (columns of the heatmap).
        det: Forwarded as ``deterministic``.
        stat: Forwarded as ``stationary``.
    """
    n_alphas, n_betas = len(alphas), len(betas)

    # Result buffers, indexed by (alpha index, beta index, ...).
    rews = np.zeros((n_alphas, n_betas, iters + 1))
    actions = np.zeros((n_alphas, n_betas, arms))
    optimals = np.zeros((n_alphas, n_betas))
    Hs_list = []
    Ps_list = []

    # Run every configuration and collect its results.
    for a_idx, alpha in enumerate(alphas):
        prefs_row = []
        probs_row = []
        for b_idx, beta in enumerate(betas):
            bandit = PrefBandit(
                np.copy(means),
                sigmas,
                arms=arms,
                iters=iters,
                alpha=alpha,
                beta=beta,
                deterministic=det,
                stationary=stat,
            )
            bandit.run()
            rews[a_idx, b_idx, :] = bandit.get_avg_rewards()
            actions[a_idx, b_idx, :] = bandit.get_actions()
            optimals[a_idx, b_idx] = bandit.get_opt_actions()
            prefs_row.append(bandit.get_prefs())
            probs_row.append(bandit.get_probs())
        Hs_list.append(prefs_row)
        Ps_list.append(probs_row)

    # One colour and one legend label per beta, shared by all figures.
    colors = cm.brg(np.linspace(0, 1, n_betas))
    beta_labels = [r'$\beta$ = ' + str(beta) for beta in betas]

    for a_idx, alpha in enumerate(alphas):
        fig = plt.figure(figsize=(20,40))
        plt.subplots_adjust(top=0.92)
        fig.suptitle(r'$\alpha$ = ' + str(alpha))
        gs = gridspec.GridSpec(1+arms, 2, figure=fig)

        # Average rewards plot.
        ax1 = plt.subplot(gs[0, 0])
        for curve, color, label in zip(rews[a_idx], colors, beta_labels):
            ax1.plot(curve, color=color, label=label)
        ax1.legend(loc='upper right')
        ax1.set(xlabel='Iterations', ylabel='Avg. rewards')
        ax1.set_title('Average rewards')

        # Actions taken plot: grouped bars, one group per arm.
        ax2 = plt.subplot(gs[0, 1])
        x = np.arange(arms)
        width = 1
        pos = list(range(1 - n_betas, n_betas, 2))
        for offset, counts, color, label in zip(pos, actions[a_idx], colors, beta_labels):
            ax2.bar(x * width + offset / (2.0 * n_betas), counts, width / n_betas, color=color, label=label)
        ax2.legend(loc='upper right')
        ax2.set_yscale('log')
        ax2.set_xticks(x)
        ax2.set_xticklabels(np.arange(1, arms+1))
        ax2.set(xlabel='Actions', ylabel='Number of actions taken')
        ax2.set_title('Actions taken')

        # Per-arm preference (left) and probability (right) histories.
        for k in range(arms):
            arm_label = str(k+1)

            ax = plt.subplot(gs[k+1, 0])
            ax.set_title("Pref Arm " + arm_label)
            for history, color, label in zip(Hs_list[a_idx], colors, beta_labels):
                ax.plot(history[:, k], color=color, label=label)
            ax.legend(loc='upper right')

            ax = plt.subplot(gs[k+1, 1])
            ax.set_title("Prob Arm " + arm_label)
            for history, color, label in zip(Ps_list[a_idx], colors, beta_labels):
                ax.plot(history[:, k], color=color, label=label)
            ax.legend(loc='upper right')

    # Optimal actions frequency heatmap.
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(optimals, cmap='Blues')
    ax.set_xticks(np.arange(n_betas))
    ax.set_yticks(np.arange(n_alphas))
    ax.set_xticklabels([str(beta) for beta in betas])
    ax.set_yticklabels([str(alpha) for alpha in alphas])
    ax.set_title("Percentages of optimal actions taken")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    # Annotate every cell with its value (x is the beta column, y the alpha row).
    for (row, col), value in np.ndenumerate(optimals):
        ax.text(col, row, value, ha="center", va="center", color='r')

## Deterministic, stationary case

In [None]:
# Simulation parameters. The means must be a float array: in the
# non-stationary setting PrefBandit adds float increments to them in place.
iters, arms = 5000, 10
means = np.arange(1, arms + 1, dtype=float)
sigmas = None
# A None entry makes PrefBandit fall back to its 1/N style step size.
alphas = [None, 0.0025, 0.005, 0.0075, 0.01]
betas = [None, 0.025, 0.05, 0.075, 0.1]
det, stat = True, True

pref_plots(
    iters=iters,
    arms=arms,
    means=means,
    sigmas=sigmas,
    alphas=alphas,
    betas=betas,
    det=det,
    stat=stat,
)

## Deterministic, non-stationary case

In [None]:
# Simulation parameters. The means must be a float array: with stat=False
# PrefBandit adds float increments to them in place after every episode.
iters, arms = 5000, 10
means = np.zeros(arms)
sigmas = None
# A None entry makes PrefBandit fall back to its 1/N style step size.
alphas = [None, 0.0025, 0.005, 0.0075, 0.01]
betas = [None, 0.025, 0.05, 0.075, 0.1]
det, stat = True, False

pref_plots(
    iters=iters,
    arms=arms,
    means=means,
    sigmas=sigmas,
    alphas=alphas,
    betas=betas,
    det=det,
    stat=stat,
)

## Stochastic, stationary case

In [None]:
# Simulation parameters. The means must be a float array: in the
# non-stationary setting PrefBandit adds float increments to them in place.
iters, arms = 5000, 10
means = np.arange(1, arms + 1, dtype=float)
sigmas = np.ones(arms)
# A None entry makes PrefBandit fall back to its 1/N style step size.
alphas = [None, 0.0025, 0.005, 0.0075, 0.01]
betas = [None, 0.025, 0.05, 0.075, 0.1]
det, stat = False, True

pref_plots(
    iters=iters,
    arms=arms,
    means=means,
    sigmas=sigmas,
    alphas=alphas,
    betas=betas,
    det=det,
    stat=stat,
)

## Stochastic, non-stationary case

In [None]:
# Simulation parameters. The means must be a float array: with stat=False
# PrefBandit adds float increments to them in place after every episode.
iters, arms = 5000, 10
means = np.zeros(arms)
sigmas = np.ones(arms)
# A None entry makes PrefBandit fall back to its 1/N style step size.
alphas = [None, 0.0025, 0.005, 0.0075, 0.01]
betas = [None, 0.025, 0.05, 0.075, 0.1]
det, stat = False, False

pref_plots(
    iters=iters,
    arms=arms,
    means=means,
    sigmas=sigmas,
    alphas=alphas,
    betas=betas,
    det=det,
    stat=stat,
)