# LinTS and LinUCB Dry run

## Setup

In [None]:
from abc import ABCMeta, abstractmethod

import numpy as np
from numpy.linalg import inv
from scipy.optimize import minimize

## Utilities

### Base Agent

In [None]:
class BaseAgent(metaclass=ABCMeta):
    """Abstract interface for an agent driven by an RL-Glue style experiment.

    Note:
        agent_init, agent_start, agent_step, agent_end, agent_cleanup, and
        agent_message are required methods; a subclass that omits any of
        them cannot be instantiated.
    """

    def __init__(self):
        pass

    @abstractmethod
    def agent_init(self, agent_info=None):
        """Setup for the agent called when the experiment first starts.

        Args:
            agent_info (dict or None): configuration for the agent. None is
                used as the default instead of a shared mutable dict.
        """

    @abstractmethod
    def agent_start(self, observation):
        """The first method called when the experiment starts, called after
        the environment starts.

        Args:
            observation (Numpy array): the state observation from the
                environment's env_start function.
        Returns:
            The first action the agent takes.
        """

    @abstractmethod
    def agent_step(self, reward, observation):
        """A step taken by the agent.

        Args:
            reward (float): the reward received for taking the last action taken
            observation (Numpy array): the state observation from the
                environment's step, i.e. where the agent ended up after the
                last step
        Returns:
            The action the agent is taking.
        """

    @abstractmethod
    def agent_end(self, reward):
        """Run when the agent terminates.

        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """

    @abstractmethod
    def agent_cleanup(self):
        """Cleanup done after the agent ends."""

    @abstractmethod
    def agent_message(self, message):
        """A function used to pass information from the agent to the experiment.

        Args:
            message: The message passed to the agent.
        Returns:
            The response (or answer) to the message.
        """

### Replay Buffer

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward) triples."""

    def __init__(self, size, seed):
        """
        Args:
            size (integer): The maximum number of entries kept in the buffer.
            seed (integer): The seed for the random number generator.
        """
        self.buffer = []
        # Kept as a public attribute; nothing in this class draws from it.
        self.rand_generator = np.random.RandomState(seed)
        self.max_size = size

    def append(self, state, action, reward):
        """Store one experience, evicting the oldest entry when full.

        Args:
            state (Numpy array): The state.
            action (integer): The action.
            reward (float): The reward.
        """
        if len(self.buffer) == self.max_size:
            del self.buffer[0]
        self.buffer.append([state, action, reward])

    def sample(self, last_action):
        """Return every stored experience whose action equals last_action.

        Args:
            last_action (integer): The action to filter on.
        Returns:
            A tuple (X, y): the list of states and the list of rewards of the
            matching entries, in insertion order. Both lists are empty when
            nothing matches, including when the buffer itself is empty.
        """
        X = []
        y = []
        # Iterating the buffer directly (rather than unpacking zip(*buffer))
        # keeps an empty buffer from raising.
        for state, action, reward in self.buffer:
            if action == last_action:
                X.append(state)
                y.append(reward)

        return X, y

    def size(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

### Softmax

In [None]:
def softmax(action_values, tau=1.0):
    """Turn a batch of action-values into softmax action probabilities.

    Args:
        action_values (Numpy array): A 2D array of shape (batch_size, num_actions).
        tau (float): The temperature; action-values are divided by it first.
    Returns:
        The probabilities, normalised along the actions axis (axis 1), with
        singleton dimensions removed: a batch of one comes back as a 1D array
        of length num_actions, larger batches keep shape
        (batch_size, num_actions).
    """
    scaled = action_values / tau

    # Subtracting the per-row maximum before exponentiating leaves the result
    # unchanged and keeps np.exp from overflowing. keepdims=True keeps the
    # [Batch, 1] shape needed for broadcasting.
    row_max = np.max(scaled, axis=1, keepdims=True)
    numerators = np.exp(scaled - row_max)
    denominators = np.sum(numerators, axis=1, keepdims=True)

    # squeeze() drops the batch axis when it has length 1, so the result can
    # be passed to np.random.choice, which expects a 1D array.
    return (numerators / denominators).squeeze()

### LinUCB

In [None]:
class LinUCBAgent(BaseAgent):
    """LinUCB contextual bandit with one ridge-regression model per arm.

    Two copies of the per-arm statistics are kept: ``A_oracle``/``b_oracle``
    absorb every observed reward immediately, while ``A``/``b`` (the ones the
    policy reads) are refreshed from them only when ``num_round`` is a
    multiple of ``batch_size``.
    """

    def __init__(self):
        super().__init__()
        self.name = "LinUCB"

    def agent_init(self, agent_info=None):
        """Read the configuration and reset the per-run bookkeeping.

        Args:
            agent_info (dict or None): optional keys 'num_actions' (default 3),
                'alpha' (default 1), 'batch_size' (default 1) and 'seed'
                (default None).
        """
        if agent_info is None:
            agent_info = {}

        self.num_actions = agent_info.get('num_actions', 3)
        self.alpha = agent_info.get('alpha', 1)
        self.batch_size = agent_info.get('batch_size', 1)
        # Set random seed for policy for each run
        self.policy_rand_generator = np.random.RandomState(agent_info.get("seed", None))

        self.last_action = None
        self.last_state = None
        self.num_round = None

    def agent_policy(self, observation):
        """Pick the arm with the highest upper confidence bound.

        Args:
            observation (Numpy array): 1D context vector shared by all arms.
        Returns:
            The index of the chosen arm; ties are broken uniformly at random
            with the agent's seeded generator.
        """
        p_t = np.zeros(self.num_actions)
        cntx = observation

        for i in range(self.num_actions):
            # Invert once per arm and reuse it for both the estimate and the
            # confidence width.
            A_inv = inv(self.A[i])
            self.theta = A_inv.dot(self.b[i])
            # theta has shape (ndims, 1); ravel() makes the product a true
            # scalar rather than a length-1 array stored into p_t[i].
            estimate = self.theta.ravel().dot(cntx)
            # Clamp at 0 so rounding noise cannot put a negative number under
            # the square root.
            width = np.sqrt(np.maximum(cntx.dot(A_inv.dot(cntx)), 0))
            p_t[i] = estimate + self.alpha * width

        action = self.policy_rand_generator.choice(np.where(p_t == max(p_t))[0])

        return action

    def agent_start(self, observation):
        """Initialise the per-arm statistics and choose the first action.

        Args:
            observation (Numpy array): the first context; its length fixes
                the feature dimension.
        Returns:
            The first action the agent takes.
        """
        # Specify feature dimension
        self.ndims = len(observation)

        # Each arm starts with A = identity and b = 0.
        self.A = np.zeros((self.num_actions, self.ndims, self.ndims))
        self.b = np.zeros((self.num_actions, self.ndims, 1))
        for arm in range(self.num_actions):
            self.A[arm] = np.eye(self.ndims)

        self.A_oracle = self.A.copy()
        self.b_oracle = self.b.copy()

        self.last_state = observation
        self.last_action = self.agent_policy(self.last_state)
        self.num_round = 0

        return self.last_action

    def agent_update(self, reward):
        """Fold the reward for (last_state, last_action) into the oracle copies.

        Args:
            reward (float): the reward received for the last action.
        """
        arm = self.last_action
        self.A_oracle[arm] = self.A_oracle[arm] + np.outer(self.last_state, self.last_state)
        self.b_oracle[arm] = self.b_oracle[arm] + (self.last_state * reward).reshape(self.ndims, 1)

    def _absorb_reward(self, reward):
        """Apply one reward (if any) and refresh A/b on a batch boundary."""
        if reward is not None:
            self.agent_update(reward)
            # it is a good question whether num_round should be incremented
            # outside this condition (the theoretical result doesn't clarify it)
            self.num_round += 1

        if self.num_round % self.batch_size == 0:
            self.A = self.A_oracle.copy()
            self.b = self.b_oracle.copy()

    def agent_step(self, reward, observation):
        """Process the last reward and choose the next action.

        Args:
            reward (float or None): reward for the last action; None means no
                feedback was observed, so nothing is updated or counted.
            observation (Numpy array): the new context.
        Returns:
            The action the agent is taking.
        """
        self._absorb_reward(reward)

        self.last_state = observation
        self.last_action = self.agent_policy(self.last_state)

        return self.last_action

    def agent_end(self, reward):
        """Process the final reward; no further action is chosen.

        Args:
            reward (float or None): reward for the last action, or None.
        """
        self._absorb_reward(reward)

    def agent_message(self, message):
        pass

    def agent_cleanup(self):
        pass

### LinTS

In [None]:
class LinTSAgent(BaseAgent):
    """Thompson-sampling contextual bandit with one logistic model per arm.

    Every arm keeps a mean vector ``m[arm]`` and a per-dimension vector
    ``q[arm]`` (initialised to ``lambda``). Weights are sampled around ``m``
    with scale ``alpha / q``. Rewards are stored in a replay buffer and the
    model of the last played arm is refit whenever ``num_round`` is a
    multiple of ``batch_size``.
    """

    def __init__(self):
        super().__init__()
        self.name = "LinTS"

    def agent_init(self, agent_info=None):
        """Read the configuration and reset the per-run bookkeeping.

        Args:
            agent_info (dict or None): must contain 'replay_buffer_size';
                optional keys are 'num_actions' (default 3), 'alpha'
                (default 1), 'lambda' (default 1), 'batch_size' (default 1)
                and 'seed' (default None).
        Raises:
            KeyError: if 'replay_buffer_size' is missing.
        """
        if agent_info is None:
            agent_info = {}

        self.num_actions = agent_info.get('num_actions', 3)
        self.alpha = agent_info.get('alpha', 1)
        self.lambda_ = agent_info.get('lambda', 1)
        self.batch_size = agent_info.get('batch_size', 1)
        # Set random seed for policy for each run
        self.policy_rand_generator = np.random.RandomState(agent_info.get("seed", None))

        self.replay_buffer = ReplayBuffer(agent_info['replay_buffer_size'],
                                          agent_info.get("seed"))

        self.last_action = None
        self.last_state = None
        self.num_round = None

    @staticmethod
    def _sigmoid(z):
        """Return 1 / (1 + exp(-z)) without overflowing for large |z|."""
        return np.exp(-np.logaddexp(0, -z))

    def agent_policy(self, observation, mode='sample'):
        """Score every arm and pick the best one.

        Args:
            observation (Numpy array): 1D context vector shared by all arms.
            mode (str): 'sample' scores each arm with freshly sampled weights
                (also stored in ``self.w``); 'expected' scores it with the
                mean ``m[arm]`` and draws no samples.
        Returns:
            The index of the chosen arm; ties are broken uniformly at random
            with the agent's seeded generator.
        Raises:
            ValueError: if mode is neither 'sample' nor 'expected'.
        """
        if mode not in ('sample', 'expected'):
            raise ValueError('mode not recognized!')

        p_t = np.zeros(self.num_actions)
        cntx = observation

        for i in range(self.num_actions):
            if mode == 'sample':
                self.w = self.get_weights(i)  # weights are samples of posteriors
                w = self.w
            else:
                w = self.m[i]  # weights are expected values of posteriors

            p_t[i] = self._sigmoid(cntx.dot(w))

        # Select only after every arm has been scored, so exactly one
        # tie-break draw is made per decision.
        # probs = softmax(p_t.reshape(1, -1))
        # action = self.policy_rand_generator.choice(a=range(self.num_actions), p=probs)
        action = self.policy_rand_generator.choice(np.where(p_t == max(p_t))[0])

        return action

    def get_weights(self, arm):
        """Sample a weight vector for one arm.

        Uses the agent's own seeded generator so that a run is reproducible
        from 'seed' alone, independent of the global NumPy RNG state.

        Args:
            arm (integer): index of the arm.
        Returns:
            A 1D array with the same length as ``self.w``.
        """
        # NOTE(review): the scale is alpha * q**-1; if q is meant to be a
        # precision, the standard deviation would be q**-0.5 -- confirm intent.
        return self.policy_rand_generator.normal(
            self.m[arm], self.alpha * self.q[arm] ** (-1.0), size=len(self.w))

    def loss(self, w, *args):
        """Regularised logistic loss for the last played arm.

        Args:
            w (Numpy array): candidate weight vector.
            *args: (X, y), a 2D array of contexts and a 1D array of rewards.
        Returns:
            0.5 * sum(q * (w - m)**2) + sum_j log(1 + exp(-y_j * w.x_j)).
        """
        X, y = args
        offset = w - self.m[self.last_action]
        penalty = 0.5 * (self.q[self.last_action] * offset).dot(offset)
        # NOTE(review): with this form a reward of 0 contributes a constant
        # log(2) and no gradient -- confirm whether labels should be in
        # {-1, +1} rather than {0, 1}.
        # logaddexp(0, t) == log(1 + exp(t)) but does not overflow.
        return penalty + np.logaddexp(0, -y * X.dot(w)).sum()

    def grad(self, w, *args):
        """Gradient of ``loss`` with respect to w.

        Args:
            w (Numpy array): candidate weight vector.
            *args: (X, y), as for ``loss``.
        Returns:
            A 1D array with the same shape as w.
        """
        X, y = args
        # y_j / (1 + exp(y_j * w.x_j)), computed in an overflow-safe form.
        coeff = y * np.exp(-np.logaddexp(0, y * X.dot(w)))
        return self.q[self.last_action] * (w - self.m[self.last_action]) - X.T.dot(coeff)

    def agent_update(self, X, y):
        """Refit the model of the last played arm.

        Args:
            X (Numpy array): 2D array of contexts observed for that arm.
            y (Numpy array): 1D array of the matching rewards.
        """
        arm = self.last_action

        # step 1, find w (the optimiser is warm-started from the current self.w)
        self.w = minimize(self.loss, self.w, args=(X, y), jac=self.grad, method="L-BFGS-B",
                          options={'maxiter': 20, 'disp': False}).x
        self.m[arm] = self.w

        # step 2, update q with the curvature p * (1 - p) * x**2, where
        # p = sigmoid(x . m)
        # NOTE(review): X holds every buffered sample for this arm, so samples
        # seen in earlier updates are added to q again -- confirm intended.
        P = self._sigmoid(X.dot(self.m[arm]))
        self.q[arm] = self.q[arm] + (P * (1 - P)).dot(X ** 2)

    def agent_start(self, observation):
        """Initialise the per-arm models and choose the first action.

        Args:
            observation (Numpy array): the first context; its length fixes
                the feature dimension.
        Returns:
            The first action the agent takes.
        """
        # Specify feature dimension
        self.ndims = len(observation)

        # initializing parameters of the model
        self.m = np.zeros((self.num_actions, self.ndims))
        self.q = np.ones((self.num_actions, self.ndims)) * self.lambda_
        # initial weights are all zero, identical for every arm
        self.w = np.zeros(self.ndims, dtype=np.float64)

        self.last_state = observation
        self.last_action = self.agent_policy(self.last_state)
        self.num_round = 0

        return self.last_action

    def _record_reward(self, reward):
        """Buffer one reward (if any) and refit on a batch boundary."""
        if reward is None:
            return

        # Append new experience to replay buffer
        self.replay_buffer.append(self.last_state, self.last_action, reward)
        # it is a good question whether num_round should be incremented
        # outside the reward check (the theoretical result doesn't clarify it)
        self.num_round += 1

        if self.num_round % self.batch_size == 0:
            X, y = self.replay_buffer.sample(self.last_action)
            self.agent_update(np.array(X), np.array(y))

    def agent_step(self, reward, observation):
        """Process the last reward and choose the next action.

        Args:
            reward (float or None): reward for the last action; None means no
                feedback was observed, so nothing is stored or counted.
            observation (Numpy array): the new context.
        Returns:
            The action the agent is taking.
        """
        self._record_reward(reward)

        self.last_state = observation
        self.last_action = self.agent_policy(self.last_state)

        return self.last_action

    def agent_end(self, reward):
        """Process the final reward; no further action is chosen.

        Args:
            reward (float or None): reward for the last action, or None.
        """
        self._record_reward(reward)

    def agent_message(self, message):
        pass

    def agent_cleanup(self):
        pass

## Jobs

In [None]:
print('Replay Buffer Dry run')

buffer = ReplayBuffer(size=100000, seed=1)

# (state, action, reward) triples; the first and last belong to action 0.
for transition in (
    ([1, 2, 3], 0, 1),
    ([4, 21, 3], 1, 1),
    ([0, 1, 1], 0, 0),
):
    buffer.append(*transition)

# Only the states and rewards recorded for action 0 are expected back.
print(buffer.sample(0))

Replay Buffer Dry run
([[1, 2, 3], [0, 1, 1]], [1, 0])


In [None]:
print('Softmax Dry run')

# Case 1: a batch of two rows, temperature 0.5.
rand_generator = np.random.RandomState(0)
action_values = rand_generator.normal(0, 1, (2, 4))
tau = 0.5

action_probs = softmax(action_values, tau)
print("action_probs", action_probs)

expected_batch = np.array([
    [0.25849645, 0.01689625, 0.05374514, 0.67086216],
    [0.84699852, 0.00286345, 0.13520063, 0.01493741],
])
assert np.allclose(action_probs, expected_batch)

# Case 2: a batch of one row, temperature 1; the result comes back as 1D.
action_values = np.array([[0.0327, 0.0127, 0.0688]])
tau = 1.
action_probs = softmax(action_values, tau)
print("action_probs", action_probs)

expected_single = np.array([0.3315, 0.3249, 0.3436])
assert np.allclose(action_probs, expected_single, atol=1e-04)

print("Passed the asserts! (Note: These are however limited in scope, additional testing is encouraged.)")

Softmax Dry run
action_probs [[0.25849645 0.01689625 0.05374514 0.67086216]
 [0.84699852 0.00286345 0.13520063 0.01493741]]
action_probs [0.33145968 0.32489634 0.34364398]
Passed the asserts! (Note: These are however limited in scope, additional testing is encouraged.)


In [None]:
print('LinUCB Dry run')

agent_info = {'alpha': 2,
              'num_actions': 4,
              'seed': 1}

# check initialization
linucb = LinUCBAgent()
linucb.agent_init(agent_info)
print(linucb.num_actions, linucb.alpha)

assert linucb.num_actions == 4
assert linucb.alpha == 2

# check policy
observation = np.array([1, 2, 5, 0])
linucb.A = np.zeros((linucb.num_actions, len(observation), len(observation)))
# Instantiate b as a 0 vector of length ndims.
linucb.b = np.zeros((linucb.num_actions, len(observation), 1))
# set each A per arm as identity matrix of size ndims
for arm in range(linucb.num_actions):
    linucb.A[arm] = np.eye(len(observation))

# All arms are identical here, so the action is decided by the seeded tie-break.
action = linucb.agent_policy(observation)
print(action)

assert action == 1

# check start
observation = np.array([1, 2, 5, 0])
linucb.agent_start(observation)
print(linucb.ndims)
print(linucb.last_state, linucb.last_action)

assert linucb.ndims == len(observation)
assert np.allclose(linucb.last_state, observation)
assert np.allclose(linucb.b, np.zeros((linucb.num_actions, len(observation), 1)))
assert np.allclose(linucb.A, np.array([np.eye(len(observation)), np.eye(len(observation)),
                                       np.eye(len(observation)), np.eye(len(observation))]))
assert linucb.last_action == 3

# check step
observation = np.array([5, 3, 1, 2])
reward = 1

action = linucb.agent_step(reward, observation)
print(linucb.A)
print(linucb.b)
print(action)

# Expected statistics of the played arm after one reward of 1 on context
# x = [1, 2, 5, 0]: identity + outer(x, x), and x as a column vector.
true_A = np.array([[2., 2., 5., 0.],
                   [2., 5., 10., 0.],
                   [5., 10., 26., 0.],
                   [0., 0., 0., 1.]])

true_b = np.array([[1.],
                   [2.],
                   [5.],
                   [0.]])

# Arms that were not played keep their initial identity / zero statistics.
# Each b[i] is a single (4, 1) column, so it is compared against that shape.
for i in range(3):
    assert np.allclose(linucb.A[i], np.eye(4))
    assert np.allclose(linucb.b[i], np.zeros((4, 1)))
assert np.allclose(linucb.A[3], true_A)
assert np.allclose(linucb.b[3], true_b)
assert linucb.last_action == 0

observation = np.array([3, 1, 3, 5])
reward = None

action = linucb.agent_step(reward, observation)
print(linucb.A)
print(linucb.b)
print(action)

# A None reward must leave the statistics untouched.
assert np.allclose(linucb.A[3], true_A)
assert np.allclose(linucb.b[3], true_b)
assert action == 0

# check batch size
agent_info = {'alpha': 2,
              'num_actions': 4,
              'seed': 1,
              'batch_size': 2}
linucb = LinUCBAgent()
linucb.agent_init(agent_info)
observation = np.array([1, 2, 5, 0])
linucb.agent_start(observation)
assert linucb.num_round == 0
assert linucb.last_action == 1

observation = np.array([5, 3, 1, 2])
reward = 1

action = linucb.agent_step(reward, observation)
# After one round of a batch of two, only the oracle copies have changed.
assert linucb.num_round == 1
assert np.allclose(linucb.b, np.zeros((linucb.num_actions, len(observation), 1)))
assert np.allclose(linucb.A, np.array([np.eye(len(observation)), np.eye(len(observation)),
                                       np.eye(len(observation)), np.eye(len(observation))]))

for i in [0, 2, 3]:
    assert np.allclose(linucb.A_oracle[i], np.eye(4))
    assert np.allclose(linucb.b_oracle[i], np.zeros((4, 1)))
assert np.allclose(linucb.A_oracle[1], true_A)
assert np.allclose(linucb.b_oracle[1], true_b)

observation = np.array([3, 1, 3, 5])
reward = None
action = linucb.agent_step(reward, observation)
# since reward is None, nothing should happen
assert linucb.num_round == 1
assert np.allclose(linucb.b, np.zeros((linucb.num_actions, len(observation), 1)))
assert np.allclose(linucb.A, np.array([np.eye(len(observation)), np.eye(len(observation)),
                                       np.eye(len(observation)), np.eye(len(observation))]))

for i in [0, 2, 3]:
    assert np.allclose(linucb.A_oracle[i], np.eye(4))
    assert np.allclose(linucb.b_oracle[i], np.zeros((4, 1)))
assert np.allclose(linucb.A_oracle[1], true_A)
assert np.allclose(linucb.b_oracle[1], true_b)

observation = np.array([3, 0, 2, 5])
reward = 0
action = linucb.agent_step(reward, observation)

# The second counted round completes the batch, so A/b now match the oracle copies.
assert linucb.num_round == 2
assert np.allclose(linucb.b, linucb.b_oracle)
assert np.allclose(linucb.A, linucb.A_oracle)

LinUCB Dry run
4 2
1
4
[1 2 5 0] 3
[[[ 1.  0.  0.  0.]
  [ 0.  1.  0.  0.]
  [ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]]

 [[ 1.  0.  0.  0.]
  [ 0.  1.  0.  0.]
  [ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]]

 [[ 1.  0.  0.  0.]
  [ 0.  1.  0.  0.]
  [ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]]

 [[ 2.  2.  5.  0.]
  [ 2.  5. 10.  0.]
  [ 5. 10. 26.  0.]
  [ 0.  0.  0.  1.]]]
[[[0.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]]

 [[1.]
  [2.]
  [5.]
  [0.]]]
0
[[[ 1.  0.  0.  0.]
  [ 0.  1.  0.  0.]
  [ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]]

 [[ 1.  0.  0.  0.]
  [ 0.  1.  0.  0.]
  [ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]]

 [[ 1.  0.  0.  0.]
  [ 0.  1.  0.  0.]
  [ 0.  0.  1.  0.]
  [ 0.  0.  0.  1.]]

 [[ 2.  2.  5.  0.]
  [ 2.  5. 10.  0.]
  [ 5. 10. 26.  0.]
  [ 0.  0.  0.  1.]]]
[[[0.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]]

 [[1.]
  [2.]
  [5.]
  [0.]]]
0


In [None]:
# Smoke test for LinTSAgent: checks configuration parsing and the state set up
# by agent_start, then runs two steps and prints the chosen actions.
print('LinTS Dry run')

agent_info = {'alpha': 2,
            'num_actions': 3,
            'seed': 1,
            'lambda': 2,
            'replay_buffer_size': 100000}

# Seed the global NumPy RNG so the np.random.normal draw below is repeatable.
np.random.seed(1)
# check initialization
lints = LinTSAgent()
lints.agent_init(agent_info)
print(lints.num_actions, lints.alpha, lints.lambda_)

assert lints.num_actions == 3
assert lints.alpha == 2
assert lints.lambda_ == 2

# check agent policy
# m, q and w are set by hand here because agent_start has not been called yet.
observation = np.array([1, 2, 5, 0])
lints.m = np.zeros((lints.num_actions, len(observation)))
lints.q = np.ones((lints.num_actions, len(observation))) * lints.lambda_
lints.w = np.random.normal(lints.m[0], lints.alpha * lints.q[0] ** (-1.0), size=len(observation))
print(lints.w)
action = lints.agent_policy(observation)
print(action)

# check agent start
observation = np.array([1, 2, 5, 0])
lints.agent_start(observation)
# Re-seed the global RNG and redraw w by hand so that the assertion on
# lints.w below compares against a known, reproducible vector.
np.random.seed(1)
lints.w = np.random.normal(lints.m[0], lints.alpha * lints.q[0] ** (-1.0), size=len(observation))
print(lints.ndims)
print(lints.last_state, lints.last_action)
print(lints.last_action)
assert lints.ndims == len(observation)
assert np.allclose(lints.last_state, observation)
assert np.allclose(lints.m, np.zeros((lints.num_actions, lints.ndims)))
assert np.allclose(lints.q, np.ones((lints.num_actions, lints.ndims)) * lints.lambda_)
assert np.allclose(lints.w, np.array([ 1.62434536, -0.61175641, -0.52817175, -1.07296862]))
# The chosen action is printed above but deliberately not asserted:
# assert lints.last_action == 1

# check step
# Two steps with rewards 1 and 0; only the returned actions are printed,
# nothing is asserted about them.
observation = np.array([5, 3, 1, 2])
reward = 1
action = lints.agent_step(reward, observation)
print(action)

observation = np.array([1, 3, 2, 1])
reward = 0
action = lints.agent_step(reward, observation)
print(action)

LinTS Dry run
3 2 2
[ 1.62434536 -0.61175641 -0.52817175 -1.07296862]
1
4
[1 2 5 0] 1
1
1
1
