# Multi Armed Bandit

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os

# Create the output folder; exist_ok keeps the cell re-runnable, whereas
# os.mkdir raises FileExistsError once the directory already exists.
os.makedirs('./Images', exist_ok=True)

For k-armed bandit problems, each of the k actions has an associated value. $A_t$ is the action selected at time step $t$, and $R_t$ is the corresponding reward at that time step. The value of an arbitrary action $a$ is denoted $q(a)$; it is the expected reward given that $a$ is selected:

$$
q(a) = E[R_t | A_t = a]
$$

In [None]:
def one_armed_bandit(data=False, save=False):
    """Draw a violin plot of the reward distribution of a single arm.

    Args:
        data: Reward samples as a NumPy array. Any non-array value (such as
            the default ``False``) makes the function generate 100 standard
            normal samples shifted by one random normal offset.
        save: When truthy, also write the figure to ``One_Arm.png``.
    """
    if isinstance(data, np.ndarray):
        dataset = data
    else:
        dataset = np.random.randn(100, 1) + np.random.randn(1)

    plt.violinplot(dataset)
    plt.xlabel("Action")
    plt.ylabel("Potential Reward Distribution")
    # Save BEFORE show(): once the figure has been shown (and closed, as the
    # notebook inline backend does) a later savefig() writes a blank image.
    if save:
        plt.savefig("One_Arm.png")
    plt.show()


one_armed_bandit()

In [None]:
def multi_armed_bandit(data=False, save=False, arms=10):
    """Draw one violin per arm showing each arm's reward distribution.

    Args:
        data: Reward samples as a NumPy array. Any non-array value (such as
            the default ``False``) makes the function generate 100 standard
            normal samples per arm, each arm shifted by its own random
            normal offset.
        save: When truthy, also write the figure to ``Multi_Arms.png``.
        arms: Number of arms to simulate when ``data`` is not an array.
    """
    if isinstance(data, np.ndarray):
        dataset = data
    else:
        dataset = np.random.randn(100, arms) + np.random.randn(arms)

    plt.violinplot(dataset)
    plt.xlabel("Action")
    plt.ylabel("Potential Reward Distribution")
    # Save BEFORE show(): once the figure has been shown (and closed, as the
    # notebook inline backend does) a later savefig() writes a blank image.
    if save:
        plt.savefig("Multi_Arms.png")
    plt.show()


multi_armed_bandit()

For a one-armed bandit, we can see how much reward we would receive on each trial if we pull the arm at random:

In [None]:
# Reward samples for a single arm: 100 standard-normal draws, all shifted by
# one shared random offset.
base = np.random.randn(100, 1) + np.random.randn(1)

one_armed_bandit(data=base)

# Each of the 100 trials draws one reward at random from the flattened samples.
x = [np.random.choice(base.squeeze()) for _ in range(100)]

plt.plot(range(100), x)
plt.xlabel("Trial #")
plt.ylabel("Reward Output")
plt.show()

Next, we tackle the k-armed bandit problem using the epsilon-greedy action-value method.

In [None]:
class GreedyBandit():
    """Epsilon-greedy agent for a k-armed bandit with a constant step size.

    Args:
        epsilon: Exploration threshold; a random action is taken whenever a
            uniform draw from [0, 1) does not exceed this value.
        alpha: Step size used when moving an action-value estimate towards
            the observed reward.
        n_arms: Number of arms (actions) of the bandit.
        n_steps: Number of pulls simulated by ``multi_armed_bandit``.
    """

    def __init__(self, epsilon, alpha, n_arms, n_steps):
        self.epsilon = epsilon
        self.alpha = alpha
        self.n_arms = n_arms
        self.n_steps = n_steps

    def greedy_epsilon(self, action_values):
        """Choose an action index from the current value estimates.

        Returns the index of the largest estimate when a uniform draw exceeds
        ``epsilon``; otherwise returns an index chosen uniformly at random.
        """
        if np.random.random() > self.epsilon:
            return np.argmax(action_values)
        return np.random.choice(len(action_values))

    def update_action(self, action_values, action, reward):
        """Return the new estimate for ``action`` after observing ``reward``.

        Applies the constant-step-size rule ``Q + alpha * (reward - Q)``.
        ``action_values`` itself is not modified.
        """
        estimate = action_values[action]
        return estimate + self.alpha * (reward - estimate)

    def multi_armed_bandit(self):
        """Simulate ``n_steps`` epsilon-greedy pulls on a random bandit.

        Every arm pays a fixed reward that is sampled once at the start of
        the run.

        Returns:
            dict with keys
            ``'qs'``      - (n_steps, n_arms) estimates after each step,
            ``'actions'`` - index of the arm pulled at each step,
            ``'rewards'`` - reward received at each step,
            ``'optimal'`` - 1.0 where the pulled arm was the best arm, else 0.0.
        """
        all_rewards = np.random.normal(np.random.normal(size=self.n_arms))
        # The best arm never changes during a run, so look it up once.
        best_action = np.argmax(all_rewards)

        q = np.zeros(self.n_arms)
        qs = np.zeros((self.n_steps, self.n_arms))
        # Three separate buffers: a chained `a = b = c = np.zeros(...)` would
        # bind all names to ONE array and the histories would overwrite
        # each other.
        rewards = np.zeros(self.n_steps)
        actions = np.zeros(self.n_steps)
        best = np.zeros(self.n_steps)

        for step in range(self.n_steps):
            action = self.greedy_epsilon(q)
            reward = all_rewards[action]

            actions[step] = action
            rewards[step] = reward
            best[step] = action == best_action

            q[action] = self.update_action(q, action, reward)
            qs[step] = q  # row assignment copies the current estimates

        return {'qs': qs, 'actions': actions, 'rewards': rewards, 'optimal': best}


In [None]:
# Experiment settings: bandit size and run length, then the agent's
# exploration threshold (epsilon) and step size (alpha).
n_arms, n_steps = 10, 1000
epsilon, alpha = 0.1, 0.01

# `result` is the agent itself; `results` holds the histories of one run.
result = GreedyBandit(epsilon=epsilon, alpha=alpha, n_arms=n_arms, n_steps=n_steps)
results = result.multi_armed_bandit()

Let's visualize the results:

In [None]:
# Reward received at every step of the run.
plt.plot(results['rewards'])
plt.xlabel('step')
plt.ylabel('reward')
# Raw f-string: "\e" is not a valid escape sequence in a normal string literal
# and triggers a warning on newer Python versions; the rendered title is unchanged.
plt.title(rf'Observed Reward ($\epsilon$={epsilon}, $\alpha$={alpha})')
plt.show()

In [None]:
# One line per arm: how each action-value estimate evolves over the run.
plt.plot(results['qs'])
plt.xlabel('step')
plt.ylabel('value')
# Raw f-string: "\e" is not a valid escape sequence in a normal string literal
# and triggers a warning on newer Python versions; the rendered title is unchanged.
plt.title(rf'Action Values ($\epsilon$={epsilon}, $\alpha$={alpha})')
plt.legend(range(n_arms))
plt.show()