# Contextual Bandits for Booking.com


### Goal of the project
* Implement multiple Agents, such as LinUCB, Eps-Greedy and Deep Regression Model
* Apply what we learned in class to our project, but **add something new**
* Something new for booking.com could be, e.g. that the margin for different hotels is different. We will consider the margin to be fixed, hence stationary. The reward of the agent is hence not only a rating or whether the user booked or not.
* At a later point the margin *could* also change over time turning the problem into a non-stationary problem!
* We could also add information about availability, so that the hotel will be booked out after a while.

### Setup
#### Environment
* Gets a list of hotels and then returns the margin that the booking platform made. This depends on:
    1. Did the user book the hotel / Did he like it
    2. What margin can I make
* The margin is hence *did_book* * *margin*
#### Agent
* Gets a list of hotels and selects one that he will propose to the user

#### ToDos
* Fix LinUCB
* Add constraints to the hotel capacity 

### Tips for working with Tensorflow
Use Tensorflow 2. This provides more / better options for debugging
```
tf.config.experimental_run_functions_eagerly(true)
```

This allows debugging via
```
dpb.set_trace()
```

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from copy import copy, deepcopy
import random
from scipy.stats import norm
from scipy.stats import logistic

In [2]:
def random_argmax(rng, list_):
    """ similar to np.argmax but return a random element among max
        when multiple max exists."""
    return rng.choice(np.argwhere(list_ == list_.max()).flatten())

In [3]:
np.set_printoptions(precision=2)

## Environment

In [9]:
class ExplicitFeedback:
    """ A rating environment with explicit feedback.
        User and hotels are represented by points in R^k
        User interest in a hotel is modeled by a parametric function
        R_{u,h} = f(u,h) = f(W_u, W_h)
        Example of function include dot product (cosine similarity), but then take margin into account
        f(W_u, W_h) = sigmoid ( \sum_k w_{u,k} . w_{h,k} ) 
        R_{u,h} = m_{h} * (f(W_u, W_h) > threshold)
        action: Recommend one hotel, which maximizes the margin
    """

    def __init__(self, nb_users=30, nb_hotels=10, 
                 internal_embedding_size=5,
                 displayed_users_embedding_size=5,
                 displayed_hotels_embedding_size=5,
                 noise_size=3,
                 threshold=.2,
                 max_avail=400,
                 seed=None):
        self.nb_users = nb_users
        self.nb_hotels = nb_hotels
        self.internal_embedding_size = internal_embedding_size
        self.displayed_users_embedding_size = displayed_users_embedding_size
        self.displayed_hotels_embedding_size = displayed_hotels_embedding_size
        self.noise_size = noise_size
        self._rng = np.random.RandomState(seed)
        self.threshold = threshold
        self.max_avail = max_avail
        
        self.action_size = self.nb_hotels
        self.sampling_limit = self.nb_users * self.nb_hotels
        self.user_mean = np.ones(self.internal_embedding_size)
        self.user_var = np.ones(self.internal_embedding_size)
        self.hotels_mean = np.ones(self.internal_embedding_size)
        self.hotels_var = np.ones(self.internal_embedding_size)
        self.users_embedding = None
        self.hotels_embedding = None
        self.user_hotels_history = np.zeros((self.nb_users, self.nb_hotels))
        self.z_cut_points = None
        self.done = False
        ###############################
        self.hotel_rooms = np.random.randint(low=self.max_avail/10, high=self.max_avail, size=self.nb_hotels)

    def step(self, action):
        # check if behind done
        if self.done: #self.user_hotels_history.sum() >= self.sampling_limit:
            print("You are calling step after it return done=True.\n"
                  "You should reset the environment.")

        assert action < self.action_size
        self.action = action

        #update rooms available
        self.hotel_rooms[action] = self.hotel_rooms[action] - 1

        # see if there are still rooms available
        available = np.argwhere(self.hotel_rooms > 0)
        if not available.any():
          print('All rooms are already booked')
          print('Moving to the day after')
          self.hotel_rooms = np.random.randint(low=4, high=40, size=self.nb_hotels)
        
        # compute potential rewards
        potential_rewards = [self._get_user_hotels_rating(self.current_user, i) 
                      for i in np.argwhere(self.hotel_rooms > 0).flatten()]
        optimal_return = np.max(potential_rewards)

        # map action to hotels
        #self.recommended_item = np.argwhere(self.user_item_history[self.current_user, :] == 0)[action][0]       
        self.recommended_hotel = np.argwhere(self.hotel_rooms > 0)[action][0]

        # mark hotels as rated
        self.user_hotels_history[self.current_user, self.recommended_hotel] += 1

        # compute reward R_t
        self.current_rating = self._get_user_hotels_rating(self.current_user, self.recommended_hotel)
        self.reward = self.current_rating
        
        # check if done
        if self.user_hotels_history.sum() == self.sampling_limit:
            self.done = True

        # compute next state S_{t+1}
        self._next_state()

        # update action space t+1
        self.action_size = len(self.available_hotels)

        return self.reward, self.state, self.done, optimal_return

    def reset(self, seed=None):
        self._rng = np.random.RandomState(seed)
        self.action_size = self.nb_hotels
        self.hotel_rooms = np.random.randint(low=self.max_avail/10, high=self.max_avail, size=self.nb_hotels)

        
        # create users and hotels embedding matrix
        self.users_embedding = self._rng.normal(loc=self.user_mean,
                                                scale=self.user_var,
                                                size=(self.nb_users, self.internal_embedding_size))
        self.hotels_embedding = self._rng.normal(loc=self.hotels_mean,
                                                scale=self.hotels_var,
                                                size=(self.nb_hotels, self.internal_embedding_size))
        self.margin = self._rng.rand(self.nb_hotels)


        # Let X = users_embedding and Y = hotels_embedding
        # In order to properly map float into integers, we need to know the distribution of
        # Z = \sum_k X_k.Y_k
        # E[Z] = \sum_k E[X_k.Y_k] = \sum_k E[X_k]E[Y_k]
        # Var[Z] = \sum_k Var[X_k.Y_k] = \sum_k Var[X_k]Var[Y_k] + Var[X_k]E[Y_k]^2 + Var[Y_k]E[X_k]^2
        z_mean = self.user_mean.dot(self.hotels_mean)
        z_var = self.user_var.dot(self.hotels_var) + self.user_var.dot(np.square(self.hotels_mean)) + \
                self.hotels_var.dot(np.square(self.user_mean))
        z = norm(z_mean, np.sqrt(z_var))
        # to get 5 values, we need 4 cut points
        self.z_cut_points = z.ppf([self.threshold]) # 0.2, 0.4, 0.6, 0.8 # you can control the distribution of ratings here.
        self.user_hotels_history = np.zeros((self.nb_users, self.nb_hotels))
        self.done = False

        self._next_state()
        return self.state

    def _get_user_hotels_rating(self, user, hotel):
        real_score = (self.users_embedding[user].dot(self.hotels_embedding[hotel]))   
        booking_score = np.searchsorted(self.z_cut_points, real_score)
        return int((booking_score * self.margin[hotel] * 100) / 10)

    def _get_variables(self, user, hotel):
        user_embedding = self.users_embedding[user]
        hotel_embedding = self.hotels_embedding[hotel]
        if self.displayed_users_embedding_size + self.displayed_hotels_embedding_size > 0:
            variables = np.array([user_embedding[:self.displayed_users_embedding_size],
                                  hotel_embedding[:self.displayed_hotels_embedding_size]])

            if self.noise_size > 0:
                noise = self._rng.normal(loc=np.ones(self.noise_size),
                                         scale=np.ones(self.noise_size),
                                         size=self.noise_size)
                variables = np.append(variables, noise)

            return variables

    def _get_new_user(self):
        for i in range(10):
            user = self._rng.randint(0, self.nb_users)
            # check it remain at least one hotel
            if np.sum(self.user_hotels_history[user, :]) < self.nb_hotels:
                return user
        return self._rng.choice(np.argwhere(self.user_hotels_history <= self.max_vail))[0]

    def _next_state(self):
        # Pick a user
        if self.user_hotels_history.sum() < self.sampling_limit:
            self.current_user = self._get_new_user()
        else:
            self.current_user = None

        # List available hotels
        self.available_hotels = np.argwhere(self.user_hotels_history[self.current_user, :] <= self.max_avail)
        #print(self.available_hotels)
        #

        self.state = list()
        for i in self.available_hotels:
            hotel = i[0]
            # Compute variables
            variables = self._get_variables(self.current_user, hotel)
            self.state.append([self.current_user, hotel, variables])

In [10]:
env = ExplicitFeedback()

In [12]:
env.reset(seed=2020)

[[0, 0, array([-0.77,  1.08, -0.13,  0.35,  0.11,  1.56,  0.81, -0.43,  1.46,
          1.93,  2.38,  0.61,  1.08])],
 [0, 1, array([-0.77,  1.08, -0.13,  0.35,  0.11,  2.02,  1.49,  0.2 ,  0.56,
          3.11,  1.66,  0.27,  1.16])],
 [0, 2, array([-0.77,  1.08, -0.13,  0.35,  0.11,  0.28,  2.7 ,  0.31,  0.14,
          3.2 ,  1.2 ,  2.09, -0.03])],
 [0, 3, array([-0.77,  1.08, -0.13,  0.35,  0.11,  0.28,  1.25,  1.3 ,  1.18,
          2.07, -0.62,  1.64,  3.12])],
 [0, 4, array([-0.77,  1.08, -0.13,  0.35,  0.11, -0.86,  1.45,  0.84, -0.53,
          0.75,  0.42, -0.1 ,  2.06])],
 [0, 5, array([-0.77,  1.08, -0.13,  0.35,  0.11,  1.23,  3.47,  1.63,  0.58,
          2.01,  0.72,  1.51,  1.44])],
 [0, 6, array([-0.77,  1.08, -0.13,  0.35,  0.11, -0.83,  0.64, -0.09,  1.04,
          1.32,  2.76,  1.63,  0.98])],
 [0, 7, array([-0.77,  1.08, -0.13,  0.35,  0.11, -0.33,  0.81,  0.56,  1.53,
          1.3 , -1.26,  1.12,  1.79])],
 [0, 8, array([-0.77,  1.08, -0.13,  0.35,  0.11,  1.56,

user 11 is connecting to your platform and we shoud recommend him one item among the ten availables.
We also observe a vector of features that could depend on the user, the item and/or some context(like time, weather, etc).

In [13]:
reward, next_state, done, optimal_return = env.step(3)
print('reward: ', reward)

reward:  0


We recommend the first hotel of the list (hotel 0) to the user and the margin as reward. If the margin is 0 he did not book it!

We also get the next state, that is the next user connect to our application, the list of available hotels for recommendations and a list of features.

In [14]:
next_state

[[25, 0, array([ 0.66,  2.24,  1.22, -0.11,  0.39,  1.56,  0.81, -0.43,  1.46,
          1.93,  2.54, -0.59,  0.74])],
 [25, 1, array([ 0.66,  2.24,  1.22, -0.11,  0.39,  2.02,  1.49,  0.2 ,  0.56,
          3.11, -0.  ,  0.9 , -0.38])],
 [25, 2, array([ 0.66,  2.24,  1.22, -0.11,  0.39,  0.28,  2.7 ,  0.31,  0.14,
          3.2 ,  1.45,  1.84,  1.81])],
 [25, 3, array([ 0.66,  2.24,  1.22, -0.11,  0.39,  0.28,  1.25,  1.3 ,  1.18,
          2.07,  1.3 ,  2.02,  0.4 ])],
 [25, 4, array([ 0.66,  2.24,  1.22, -0.11,  0.39, -0.86,  1.45,  0.84, -0.53,
          0.75,  2.77, -0.63, -0.52])],
 [25, 5, array([ 0.66,  2.24,  1.22, -0.11,  0.39,  1.23,  3.47,  1.63,  0.58,
          2.01,  0.6 ,  1.49,  1.01])],
 [25, 6, array([ 0.66,  2.24,  1.22, -0.11,  0.39, -0.83,  0.64, -0.09,  1.04,
          1.32,  0.44,  2.52,  0.06])],
 [25, 7, array([ 0.66,  2.24,  1.22, -0.11,  0.39, -0.33,  0.81,  0.56,  1.53,
          1.3 ,  0.77,  0.72,  2.2 ])],
 [25, 8, array([ 0.66,  2.24,  1.22, -0.11,  0.3

### User-Hotel recommender system

Let start with a recommender system that use only user_id and hotel_id.

But before let generate some historical data from a random agent.

## Run experiment
In order to make Agent and Environment interract, we can create an experiment, parametrized by the number of step we will be running.

In [15]:
def run_exp(agent, env, nb_steps, env_seed):
    rewards = np.zeros(nb_steps)
    regrets = np.zeros(nb_steps)
    actions = np.zeros(nb_steps)  #hotels
    users   = np.zeros(nb_steps) 
    context = env.reset(env_seed)
    rating_matrix = np.zeros((env.nb_users, env.nb_hotels))
    for i in range(nb_steps):
        # Select action from agent policy.
        action = agent.act(context)
        
        # Play action in the environment and get reward.
        reward, next_context, done, optimal_return = env.step(action)
        
        # Update history
        user = context[0][0]
        item = context[action][1]
        rating = reward
        rating_matrix[user, item] = rating
        
        # Update agent.
        agent.update(context, action, reward)
        context = next_context
        
        # Save history.
        #context[i] = context
        rewards[i] = reward
        actions[i] = action
        users[i]   = user
        regrets[i] = optimal_return - reward

    reward = rewards.sum()
    regret = np.sum(regrets)
    return {'reward': reward, 
            'regret': regret,
            'rewards': rewards,
            'users' : users,            
            'regrets': regrets,
            'actions': actions,
            'cum_rewards': np.cumsum(rewards), 
            'cum_regrets': np.cumsum(regrets),
            'rating_matrix': rating_matrix
            }


You can see that on this experiment the total reward is 285 (401) and the regret is 112 (495). You can also have a look at individual rewards, actions or regrets.

Thanks to the historical data, we can train a matrix factorization algorithm to try to predict the non observed rating values.

## Agent
Here we create a very basic agent that will pull arm, i.e.play action, at random.

In [16]:
class Random:
    """ Random agent. """
    def __init__(self, nb_arms, seed=None):
        self._nb_arms = nb_arms
        self._rng = np.random.RandomState(seed)
        
    def act(self, context):
        action = self._rng.randint(len(context)) # note that action size is changing
        return action
        
    def update(self, context, action, reward):
        pass


In [17]:
agent = Random(None, seed=2020)

In [18]:
nb_exp = 100
nb_steps = 100
regret = np.zeros(nb_exp)
regrets = np.zeros((nb_exp, nb_steps))
for i in range(nb_exp):
    env = ExplicitFeedback()
    agent = Random(None, seed=i)
    exp = run_exp(agent, env, nb_steps, env_seed=i)
    regret[i] = exp['regret'] 
    regrets[i] = exp['cum_regrets']

In [19]:
plt.plot(regrets.mean(axis=0), color='blue')
plt.plot(np.quantile(regrets, 0.05,axis=0), color='grey', alpha=0.5)
plt.plot(np.quantile(regrets, 0.95,axis=0), color='grey', alpha=0.5)
plt.title('Mean regret: {:.2f}'.format(regret.mean()))
plt.xlabel('steps')
plt.ylabel('regret')
plt.show()

## Epsilon Greedy

In [None]:
class EpsilonGreedy:
    """ Epsilon greedy agent. """
    def __init__(self, nb_arms, context_size, lr=.1, epsilon=0, seed=None):
        self._nb_arms = nb_arms
        self._p = context_size
        self._lr = lr
        self._epsilon = epsilon
        self._rng = np.random.RandomState(seed)
        self._beta = np.zeros((nb_arms, self._p)) 
        self._n = np.zeros(nb_arms)
        self._pred_reward = np.zeros(nb_arms)
        
    def act(self, context):
        ###############
        context_v = np.asarray(context)[:,2]
        context_f = []
        for i in context_v:
            context_f.append(np.asarray(i))
        context_f = np.asarray(context_f)
        ###############
        #print('c', context_f.shape)
        #print('b', self._beta.shape)

        if self._rng.rand() < self._epsilon:
            action = self._rng.randint(self._nb_arms)
        else:
            pred_reward = np.einsum('ij,ij->i', context_f, self._beta)
            action = random_argmax(self._rng, pred_reward)
        return action
        
    def update(self, context, action, reward):
        """ Simple gradient descent. """
        ################
        context_v = np.asarray(context)[:,2]
        context_f = []
        for i in context_v:
            context_f.append(np.asarray(i))
        context_f = np.asarray(context_f)
        ################
        
        self._n[action] += 1
        grad = - context_f[action] * (reward - context_f[action].dot(self._beta[action]))
        self._beta[action] = self._beta[action] - self._lr/self._n[action] * grad


In [None]:
eps_agent = EpsilonGreedy(10, 13)
run_exp(eps_agent, env, nb_steps=100, env_seed=2020)

<font color='red'> For the rest of this notebook, we will assume that when the Agent start it has access to some historical data generated by a random policy. </font>

In [None]:
nb_exp = 500
nb_steps = 100
regret = np.zeros(nb_exp)
regrets = np.zeros((nb_exp, nb_steps))
for i in range(nb_exp):
    env.reset(seed=2020)
    eps_agent = EpsilonGreedy(10, 13)
    res = run_exp(eps_agent, env, nb_steps, env_seed=2020)
    regret[i] = res['regret'] 
    regrets[i] = res['cum_regrets']

In [None]:
plt.plot(regrets.mean(axis=0), color='blue')
plt.plot(np.quantile(regrets, 0.05, axis=0), color='grey', alpha=0.5)
plt.plot(np.quantile(regrets, 0.95, axis=0), color='grey', alpha=0.5)
plt.title('Mean regret: {:.2f}'.format(regret.mean()))
plt.xlabel('steps')
plt.ylabel('regret')
plt.show()

## LinUCB (to completE)

In [None]:
class LinUCB():

  def __init__(self, T, d, delta=0.1):
    self.T = T
    self.sigma = 1                    # subgaussianity
    self.d = d                        # dimension

    self.lambd =  0.1               # regularization  < 1
    self.L = 1                       # upperbound for actions
    self.S = 1                        # upperbound for parameters 0.1
    self.alpha = 0.8                    # weight [default = 1] 1
    self.t = 0                         # iteration number
    self.delta = delta
     


  def start(self):

    self.t = 0
    self.V = np.identity(self.d) * self.lambd          # covariance or design matrix
    self.b = np.zeros(self.d)
    self.theta = np.zeros(self.d)
    self.V_inv = np.linalg.inv(self.V)          
    

  def act(self, context): #(self, arms)
    ###############
    context_v = np.asarray(context)[:,2]
    context_f = []
    for i in context_v:
        context_f.append(np.asarray(i))
    arms = np.asarray(context_f)
    ###############

    ucb = []

    part1 = np.sqrt(self.lambd) * self.S
    part2 = np.sqrt(2*np.log(1/self.delta) + self.d * np.log( (self.lambd * self.d + self.t * self.L**2)/ (self.lambd * self.d) ))
    #part3 = np.sqrt(2*np.log(1/delta) + self.d * np.log( np.linalg.det(self.V)/ self.lambd**self.d) )
    self.beta = part1 + part2
    self.t = self.t + 1
    
    for act in arms:
      print(act.shape, self.V_inv.shape)
      elliptic_norm = np.matmul(np.matmul(act.T,  self.V_inv), act)
      ucb_value = np.matmul(act.T, self.theta) + self.alpha * np.sqrt( self.beta ) * np.sqrt( elliptic_norm )
      ucb.append(ucb_value)
    
    ucb = np.asarray(ucb)
    #ucb[np.random.choice(len(ucb))] += 0.000001  # per valori uguali
    action = np.argmax(ucb)
    print('act', action)
    return action


  def update(self, context, action, reward):
    chosen_arm = context
    self.V = self.V + np.matmul(chosen_arm, chosen_arm.T)
    self.V_inv = self.V_inv - ((self.V_inv * chosen_arm * chosen_arm.T * self.V_inv) / (1 + chosen_arm.T * self.V_inv * chosen_arm))
    self.b = self.b + reward * chosen_arm
    self.theta = np.matmul(self.V_inv, self.b)
    return

In [None]:
linucb = LinUCB(10, 13)
linucb.start()
run_exp(linucb, env, nb_steps=100, env_seed=2020)

## Thompson agent

In [None]:
class BetaBernoulli(object):
    """ Beta Bernoulli Bayesian distribution. """
    def __init__(self, a=1, b=1, prior=np.ones(10)):
        self.a = a
        self.b = b
        self.n = copy(prior)  # number of {0,1} rewards

    def update(self, reward):
        self.n[int(reward)] += 1

    def sample(self, np_random):
        return np_random.beta(self.a + self.n[1], self.b + self.n[0])

from scipy.stats import t as student
class NormalGamma(object):
    """ NG(m, λ|µ = ·, κ = 0, α = −1/2, β = 0) """
    def __init__(self):
        self.mean = 0
        self.ss = 0
        self.n = 0

    def update(self, reward):
        self.n += 1
        # update mean and sum of square
        old_mean = self.mean
        self.mean +=  (reward - self.mean)/self.n
        self.ss += (reward - old_mean) * (reward - self.mean) # Welford's algorithm

    def sample(self, np_random):
        if self.n <= 2:
            return np.Inf
        else:
            return student.rvs(df=self.n-1, loc=self.mean, 
                               scale=self.ss/(self.n*self.n-1),
                               random_state=np_random)

class Thompson_agent():
    def __init__(self, nb_arms, a=1, b=1, seed=None):
        self._nb_arms = nb_arms
        self._rng = np.random.RandomState(seed)
        #self.dist = BetaBernoulli(a,b, prior =np.ones(nb_arms))
        dist=NormalGamma
        self._posterior = [dist() for i in range(self._nb_arms)]
        
    def act(self, context):
        sample = np.array([self._posterior[i].sample(self._rng)
                           for i in range(self._nb_arms)])
        action = np.argmax(sample)
        return action
        
    def update(self, context, action, reward):
        self._posterior[action].update(reward)
        return

In [None]:
thompson = Thompson_agent(10)
run_exp(thompson, env, nb_steps=100, env_seed=2020)

In [None]:
nb_exp = 1000
nb_steps = 100
regret = np.zeros(nb_exp)
regrets = np.zeros((nb_exp, nb_steps))
for i in range(nb_exp):
    env.reset(seed=2020)
    thompson = Thompson_agent(10)
    res = run_exp(thompson, env, nb_steps, env_seed=2020)
    regret[i] = res['regret'] 
    regrets[i] = res['cum_regrets']

In [None]:
plt.plot(regrets.mean(axis=0), color='blue')
plt.plot(np.quantile(regrets, 0.05, axis=0), color='grey', alpha=0.5)
plt.plot(np.quantile(regrets, 0.95, axis=0), color='grey', alpha=0.5)
plt.title('Mean regret: {:.2f}'.format(regret.mean()))
plt.xlabel('steps')
plt.ylabel('regret')
plt.show()

## Recommender System with historical data: Embedding Agent

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Dot, Concatenate
import tensorflow as tf

class RegressionModel(Model):
    def __init__(self, embedding_size, nb_users, nb_hotels):
        super().__init__()
        
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=nb_users,
                                        input_length=1,
                                        name='user_embedding')
        self.hotel_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=nb_hotels,
                                        input_length=1,
                                        name='hotel_embedding')
        self.flatten = Flatten()
        self.dot = Dot(axes=1)
        
    def call(self, inputs):
        user_inputs = inputs[0]
        hotel_inputs = inputs[1]
        
        user_vecs = self.flatten(self.user_embedding(user_inputs))
        hotel_vecs = self.flatten(self.hotel_embedding(hotel_inputs))
        
        y = self.dot([user_vecs, hotel_vecs])

        # Multiplying it by 10 making it closer to our rewards helps to speed up training
        a = tf.constant([10.])
        y = tf.multiply(a,y)

        return y

In [None]:
class EmbeddingAgent:
    def __init__(self, X, Y, nb_users, nb_hotels):
        self._model = RegressionModel(64, nb_users, nb_hotels)
        self._model.compile(optimizer="adam", loss='mae')
        self._model.fit(X, Y,batch_size=64, epochs=120, validation_split=0.1,shuffle=True)
        self._user_embeddings = self._model.get_weights()[0]
        self._hotels_embeddings = self._model.get_weights()[1]
    
    def act(self, context):
        user = context[0][0]
        user_embedding = self._user_embeddings[user]
        dot_products = np.dot(self._hotels_embeddings, user_embedding)
        best_hotels = np.argsort(dot_products)[::-1] #[::-1] reverses order so now the element with the highest expected reward is at first position
        hotel_selected = best_hotels[0]
        return hotel_selected

    def update(self, context, action, reward):
      pass


In [None]:
## General way to train the Embedding Agent by first creating Historical data and then training with it
steps = 250

eps_agent = EpsilonGreedy(10, 13)
rand_agent = Random(10)

#to create historical data
res = run_exp(rand_agent, env, nb_steps=steps, env_seed=2020)
users_values = res['users']
hotels_values = res['actions']
rewards_values = res['rewards']
############

Y = rewards_values
X = [users_values, hotels_values]

embagent = EmbeddingAgent(X, Y, env.nb_users, env.nb_hotels)

In [None]:
nb_exp = 10
nb_steps = 100
regret = np.zeros(nb_exp)
regrets = np.zeros((nb_exp, nb_steps))
for i in range(nb_exp):
   
    # Generate historical data and train model
    steps = 30

    eps_agent = EpsilonGreedy(10, 13)
    rand_agent = Random(10)

    seed = np.random.randint(1000)

    res = run_exp(rand_agent, env, nb_steps=steps, env_seed=seed)
    users_values = res['users']
    hotels_values = res['actions']
    rewards_values = res['rewards']
    

    Y = rewards_values
    X = [users_values, hotels_values]

    embagent = EmbeddingAgent(X, Y, env.nb_users, env.nb_hotels)
    ############

    # Run experiment
    res = run_exp(embagent, env, nb_steps, env_seed=seed)
    regret[i] = res['regret'] 
    regrets[i] = res['cum_regrets']


plt.plot(regrets.mean(axis=0), color='blue')
plt.plot(np.quantile(regrets, 0.05,axis=0), color='grey', alpha=0.5)
plt.plot(np.quantile(regrets, 0.95,axis=0), color='grey', alpha=0.5)
plt.title('Mean regret: {:.2f}'.format(regret.mean()))
plt.xlabel('steps')
plt.ylabel('regret')
plt.show()