In [19]:
import numpy as np
from matplotlib import pyplot as plt
import time
%matplotlib inline

# 10-armed bandit

The bandit:
- Mean and variance for each arm. The means represent $Q^*(a)$
- Ability to interact with the bandit - specifically simulate a play on one of its arms
- Bandit represented as a 2D array - rows are arms, columns are (mean, variance)
- Bandit array built by generating 10 means (from a Gaussian with mean 0 and variance 1); all variances are 1

The agent:
- Maintains:
  - list of estimates of $Q_t(a)$ (Estimated value of each action $a$)
  - Current "play" number $k$
- Determines action to take, given $Q_t(a)$ and $k$

Running plays:
- Ask agent for an action
- Get response from bandit for action
- Give response to agent

In [26]:
class Bandit(object):
    """A multi-armed bandit whose arms pay out normally distributed rewards.

    Attributes:
        mean_variances: indexable collection holding one ``(mean, variance)``
            pair per arm, addressed by arm index.
    """

    def __init__(self, mean_variances):
        self.mean_variances = mean_variances

    def get_response(self, arm_index):
        """Simulate a single play of one arm and return the sampled reward.

        Args:
            arm_index: index of the arm to play.

        Returns:
            A reward drawn from a normal distribution with the arm's mean
            and variance.

        Raises:
            ValueError: if the arm's variance is negative.
        """
        mean, variance = self.mean_variances[arm_index]
        if variance < 0:
            raise ValueError(f"variance must be non-negative, got {variance}")
        # np.random.normal expects the standard deviation (scale), not the
        # variance, so convert before sampling.
        return np.random.normal(mean, np.sqrt(variance))

def make_bandit(n_bandits, mean_reward_mean=0, var_reward_mean=1, mean_reward_var=1, var_reward_var=0):
    """Build a Bandit with randomly generated ``(mean, variance)`` pairs.

    For each arm, the reward mean is drawn from a normal distribution with
    mean ``mean_reward_mean`` and variance ``var_reward_mean``, and the reward
    variance is drawn from a normal distribution with mean ``mean_reward_var``
    and variance ``var_reward_var``. With the defaults every arm gets a
    variance of exactly 1.

    Args:
        n_bandits: number of arms to generate.
        mean_reward_mean: mean of the distribution of arm means.
        var_reward_mean: variance of the distribution of arm means.
        mean_reward_var: mean of the distribution of arm variances.
        var_reward_var: variance of the distribution of arm variances. A
            positive value can yield negative arm variances, because they
            are drawn from an unbounded normal distribution.

    Returns:
        A Bandit holding ``n_bandits`` ``(mean, variance)`` pairs.

    Raises:
        ValueError: if ``var_reward_mean`` or ``var_reward_var`` is negative.
    """
    if var_reward_mean < 0 or var_reward_var < 0:
        raise ValueError("var_reward_mean and var_reward_var must be non-negative")
    # np.random.normal takes a standard deviation as its scale argument, so
    # the variance parameters are converted once up front.
    std_reward_mean = np.sqrt(var_reward_mean)
    std_reward_var = np.sqrt(var_reward_var)
    mean_vars = [(np.random.normal(mean_reward_mean, std_reward_mean),
                  np.random.normal(mean_reward_var, std_reward_var)) for _ in range(n_bandits)]
    return Bandit(mean_vars)

In [30]:
class Agent(object):
    """Epsilon-greedy agent that keeps sample-average action-value estimates.

    Instances are treated as immutable: ``update`` returns a new Agent
    instead of modifying the current one.

    Attributes:
        estimated_action_values: current estimate of each action's value,
            indexed by action.
        explore_chance: probability of taking a non-greedy action on a play.
        action_counts: number of rewards folded into each action's estimate.
    """

    def __init__(self, estimated_action_values, explore_chance=0, action_counts=None):
        self.estimated_action_values = estimated_action_values
        self.explore_chance = explore_chance
        if action_counts is None:
            # A freshly built agent has not observed any rewards yet.
            action_counts = [0] * len(estimated_action_values)
        self.action_counts = action_counts

    def get_action(self):
        """Return the greedy action, or a different random one when exploring."""
        best = np.argmax(self.estimated_action_values)
        if self.should_explore():
            return self.exploratory_action(not_=best)
        return best

    def should_explore(self):
        """Return True with probability ``explore_chance``."""
        return np.random.random() < self.explore_chance

    def exploratory_action(self, not_=-1):
        """Pick an action uniformly at random, excluding ``not_``.

        Args:
            not_: index of the action to exclude. A value outside the valid
                index range (such as the default -1) excludes nothing.

        Returns:
            A random action index different from ``not_``. If ``not_`` is the
            only available action it is returned, since there is no
            alternative to explore.
        """
        n_actions = len(self.estimated_action_values)
        if not 0 <= not_ < n_actions:
            return np.random.randint(0, n_actions)
        if n_actions == 1:
            # Rejection sampling could never succeed here; fall back to the
            # single available action instead of recursing forever.
            return not_
        # Draw from the n-1 allowed slots and skip over the excluded index.
        choice = np.random.randint(0, n_actions - 1)
        return choice + 1 if choice >= not_ else choice

    def update(self, action, reward, play_number):
        """Return a new Agent whose estimate for ``action`` includes ``reward``.

        The estimate is an incremental sample average, so the step size is
        one over the number of times this particular action has been
        rewarded, not one over the overall play number.

        Args:
            action: index of the action that was taken.
            reward: reward received for that action.
            play_number: overall play counter. Accepted for backward
                compatibility; the step size is derived from the per-action
                count instead.

        Returns:
            A new Agent with the updated estimate and action count.
        """
        action_values = self.estimated_action_values.copy()
        action_counts = list(self.action_counts)
        action_counts[action] += 1
        action_value = action_values[action]
        step_size = 1 / action_counts[action]
        action_values[action] = action_value + step_size * (reward - action_value)
        return Agent(action_values, self.explore_chance, action_counts)
    
def make_agent(bandit_num_arms, initial_estimated_action_value, explore_chance) -> Agent:
    """Create an Agent whose estimates all start at the same value.

    Args:
        bandit_num_arms: number of arms, i.e. how many estimates to create.
        initial_estimated_action_value: starting estimate used for every arm.
        explore_chance: exploration probability handed to the Agent.

    Returns:
        An Agent with ``bandit_num_arms`` identical initial estimates.
    """
    initial_estimates = []
    for _ in range(bandit_num_arms):
        initial_estimates.append(initial_estimated_action_value)
    return Agent(initial_estimates, explore_chance)

In [37]:
def play(bandit_arms, num_plays=1000, print_=False, initial_estimate=0, explore_chance=0):
    """Run one experiment: a fresh agent playing a freshly generated bandit.

    Each play asks the agent for an action, samples the bandit's reward for
    that action, and replaces the agent with the updated one it returns.

    Args:
        bandit_arms: number of arms on the bandit.
        num_plays: number of plays to simulate.
        print_: when truthy, print each action and reward and pause for one
            second after every play.
        initial_estimate: starting action-value estimate for every arm.
        explore_chance: probability that the agent explores on a play.

    Returns:
        None.
    """
    bandit = make_bandit(bandit_arms)
    agent = make_agent(bandit_arms, initial_estimate, explore_chance)
    for play_number in range(1, num_plays + 1):
        action = agent.get_action()
        if print_:
            print(f"Agent chose {action}")
        response = bandit.get_response(action)
        if print_:
            print(f"Reward: {response}")
            print("\n##################\n")
        agent = agent.update(action, response, play_number)
        if print_:
            # Slow the loop down so the printed trace can be followed live.
            time.sleep(1)

In [39]:
# Run a single experiment on a 10-armed bandit with play's default settings.
play(bandit_arms=10)