In [28]:
import numpy as np
from enum import Enum
from typing import Tuple
from collections import namedtuple
from abc import ABC, abstractmethod

In [26]:
# Outcome of one round of pulls: `value` carries the sampled rewards,
# `optimal` carries the indices of the best arms.
Rewards = namedtuple('Rewards', 'value optimal')
class TestbedType(Enum):
    """Selects how the true action values of a testbed behave over time."""

    # True action values are drawn once and never change.
    STATIONARY = 0
    # True action values are perturbed after every round of rewards.
    NON_STATIONARY = 1

class Testbed():
    """Batch of independent multi-armed bandit problems.

    Every problem owns a vector of true action values drawn once from a
    normal distribution with mean ``offset`` and standard deviation 1.
    Pulling an arm yields its true value plus unit-variance Gaussian noise.
    For ``TestbedType.NON_STATIONARY`` every true value additionally takes an
    independent Gaussian step (standard deviation 0.01) after each call to
    ``get_rewards``.
    """

    def __init__(self, testbedType: TestbedType = TestbedType.STATIONARY, offset: int = 0, problems: int = 2000, arms: int = 10):
        """Draw the true action values for all problems.

        Args:
            testbedType: Whether the true values stay fixed or drift.
            offset: Mean of the distribution the true values are drawn from.
            problems: Number of independent bandit problems.
            arms: Number of arms per problem.
        """
        self._type = testbedType
        self._offset = offset
        # True action values, shape (problems, arms).
        self._q = np.random.normal(offset, 1, (problems, arms))
        # Index of the currently best arm of every problem, shape (problems,).
        self._optimals = np.argmax(self._q, axis=1)

    def get_rewards(self, action: np.ndarray) -> Rewards:
        """Pull one arm per problem and return the sampled rewards.

        Args:
            action: 1-D integer array (or array-like) holding one arm index
                per problem.

        Returns:
            ``Rewards`` whose ``value`` is the sampled reward of every problem
            and whose ``optimal`` is the index of the best arm of every
            problem at the time of the pull, i.e. before any non-stationary
            drift is applied.
        """
        action = np.asarray(action)

        # use property of normal distribution here: N(a, 1) = a + N(0, 1)
        reward_constant = np.take_along_axis(self._q, action[:, None], axis=1).flatten()
        reward_noise = np.random.normal(0, 1, action.shape[0])

        # Rewards has exactly the two fields (value, optimal); nothing else
        # may be passed positionally.
        ret = Rewards(value=reward_constant + reward_noise, optimal=self._optimals)

        if self._type == TestbedType.NON_STATIONARY:
            # Random-walk step on every true value. The optimal arms are
            # rebound to a fresh array, so the array already stored in `ret`
            # keeps describing the state the rewards were drawn from.
            self._q += np.random.normal(0, 0.01, self._q.shape)
            self._optimals = np.argmax(self._q, axis=1)

        return ret


In [30]:
class Agent(ABC):
    """Abstract bandit agent that carries two statistics arrays of length ``time``.

    Subclasses decide how actions are chosen and how rewards are consumed;
    this base class only allocates the arrays and exposes them.
    """

    def __init__(self, time: int):
        """Allocate the zero-filled statistics arrays, one slot per time step."""
        zeros_per_step = (np.zeros(time), np.zeros(time))
        self._average_reward, self._percent_optimal = zeros_per_step

    @abstractmethod
    def get_next_actions(self) -> np.array:
        """Return the actions to take next; must be provided by subclasses."""

    @abstractmethod
    def process_rewards(self, rewards: Rewards) -> None:
        """Consume the given rewards; must be provided by subclasses."""

    def get_average_rewards(self) -> np.array:
        """Return the stored average-reward array."""
        return self._average_reward

    def get_percent_optimal_actions(self) -> np.array:
        """Return the stored optimal-action array."""
        return self._percent_optimal
    

In [44]:
class EGreedySampleAverage(Agent):
    """Epsilon-greedy agent using sample-average action-value estimates.

    The agent plays ``problems`` independent bandit problems in lock-step.
    For every problem it picks a uniformly random arm with probability ``e``
    and the arm with the highest current estimate otherwise.
    """

    def __init__(self, time: int, e: np.float64, problems: int = 2000, arms: int = 10):
        """Set up zeroed estimates and pull counts.

        Args:
            time: Number of time steps statistics are recorded for.
            e: Exploration probability, must lie in [0, 1].
            problems: Number of independent bandit problems.
            arms: Number of arms per problem.
        """
        super().__init__(time)
        self._e: np.float64 = e
        self._problems: int = problems
        self._arms: int = arms
        # Number of times each arm of each problem has been pulled.
        self._action_count: np.ndarray = np.zeros((problems, arms))
        # Running sample-average reward estimate per arm of each problem.
        self._estimates: np.ndarray = np.zeros((problems, arms))
        # Actions handed out by the latest get_next_actions() call; needed to
        # attribute the rewards passed to process_rewards().
        self._last_actions = None
        # Index of the next time step to record statistics for.
        self._step: int = 0

    def get_next_actions(self) -> np.ndarray:
        """Choose one arm per problem with the epsilon-greedy rule.

        Returns:
            1-D integer array with one arm index per problem.
        """
        greedy = np.argmax(self._estimates, axis=1)
        explore = np.random.randint(0, self._arms, self._problems)

        # flip a biased coin for each problem, to either pick greedy (0) or random (1);
        # a single-trial binomial draw is a Bernoulli draw with P(1) = e
        coinflip = np.random.binomial(1, self._e, self._problems)

        # np.where takes the first choice where the condition is truthy, so
        # the exploratory arm has to come first
        actions = np.where(coinflip, explore, greedy)
        self._last_actions = actions
        return actions

    def process_rewards(self, rewards: Rewards) -> None:
        """Update the estimates with the rewards of the latest actions.

        Applies the incremental sample-average rule Q += (R - Q) / n to the
        arm each problem just pulled, then records for the current time step
        the mean reward over all problems and the percentage (0-100) of
        problems whose chosen arm equals ``rewards.optimal``.

        Args:
            rewards: Rewards obtained for the actions returned by the latest
                ``get_next_actions`` call.

        Raises:
            RuntimeError: If no actions have been requested yet.
            IndexError: If more steps are processed than ``time`` allows.
        """
        if self._last_actions is None:
            raise RuntimeError("get_next_actions() must be called before process_rewards()")
        if self._step >= self._average_reward.shape[0]:
            raise IndexError("process_rewards() called more often than the configured number of time steps")

        actions = self._last_actions
        rows = np.arange(self._problems)
        values = np.asarray(rewards.value)

        # Each row index appears exactly once, so fancy-indexed in-place
        # updates touch every (problem, arm) pair at most once.
        self._action_count[rows, actions] += 1
        pulled_estimates = self._estimates[rows, actions]
        self._estimates[rows, actions] = (
            pulled_estimates + (values - pulled_estimates) / self._action_count[rows, actions]
        )

        self._average_reward[self._step] = np.mean(values)
        self._percent_optimal[self._step] = 100.0 * np.mean(actions == np.asarray(rewards.optimal))
        self._step += 1
    
        

In [34]:
# Small 2x3 matrix to try per-row element selection on
arr = np.array([
    [10, 20, 30],
    [40, 50, 60],
])

# One column index per row: column 0 for the first row, column 2 for the second
indices = np.array([0, 2])

# take_along_axis wants the indices as a column, hence the extra axis;
# flatten() collapses the one-column result back into a 1-D array
result = np.take_along_axis(arr, indices[:, np.newaxis], axis=1).flatten()

print(result)

[10 60]


In [35]:
# Indexing with None appends an axis: the 1-D index vector becomes a single column
indices[:, None]

array([[0],
       [2]])

In [36]:
# axis=0 reduces over rows: one result per column, giving the row index of its maximum
np.argmax(arr, axis=0)

array([1, 1, 1])

In [37]:
# axis=1 reduces over columns: one result per row, giving the column index of its maximum
np.argmax(arr, axis=1)

array([2, 2])

In [43]:
# Ten single-trial binomial draws with success probability 0.1 (i.e. biased coin flips)
np.random.binomial(1, 0.1, 10)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [42]:
# np.where(cond, x, y) takes from x where cond is truthy and from y elsewhere
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
coin = np.array([0, 1, 0])

np.where(coin, a, b)

array([4, 2, 6])