In [1]:
import numpy as np
from matplotlib import pyplot as plt
import plotly.graph_objects as go

In [2]:
class EpsilonGreedyAgent():
    """
    Epsilon-greedy agent for an N-armed bandit machine.

    With probability ``eps`` the agent pulls a uniformly random arm; otherwise it
    pulls the arm with the highest current value estimate (ties broken at random).

    Parameters
    ----------
    eps : float
        Probability of taking a random (exploratory) action.
    update_type : str
        ``'sample-average'`` or ``'constant-step-size'``.
    machine : object
        The bandit machine. It must expose ``nArms``, ``get_reward(arm)`` and
        ``reset()``.
    initial_value : float
        Initial value estimate of every arm. For the sample-average update it is
        counted as one pseudo-observation per arm.
    alpha : float, optional
        Step size, required when ``update_type`` is ``'constant-step-size'``.

    Raises
    ------
    ValueError
        If ``update_type`` is unknown, or ``alpha`` is missing for the
        constant-step-size update.
    """

    UPDATE_TYPES = ('sample-average', 'constant-step-size')

    def __init__(self, eps , update_type , machine,  initial_value, alpha=None):
        # Fail early: an unknown update type would otherwise never update q,
        # and a missing alpha would only blow up on the first update.
        if update_type not in self.UPDATE_TYPES:
            raise ValueError(
                f'update_type must be one of {self.UPDATE_TYPES}, got {update_type!r}')
        if update_type == 'constant-step-size' and alpha is None:
            raise ValueError("alpha is required for the 'constant-step-size' update")

        self.eps = eps
        self.initial_value = initial_value
        self.update_type = update_type
        self.machine = machine
        self.nArms = machine.nArms
        self.alpha = alpha

        self._init_estimates()

    def _init_estimates(self):
        """(Re)create the per-arm counters, reward sums and value estimates."""
        # The initial value acts as one pseudo-sample: count 1, reward sum = initial_value.
        self.n_action = np.ones(self.nArms)
        self.reward_action = np.ones(self.nArms) * self.initial_value
        self.q = np.ones(self.nArms) * self.initial_value

    def __str__(self):
        return (f'eps = {self.eps}'
                f' Initial Value = {self.initial_value}, Update: {self.update_type}')

    def action(self):
        '''
        Select an arm, pull it on the machine and remember the outcome.

        Returns
        -------
        (action, reward)
            The chosen arm index and the reward exactly as returned by
            ``machine.get_reward``.
        '''
        if np.random.random() < self.eps:
            # explore: uniformly random arm
            self.last_action = np.random.choice(range(self.nArms))
        else:
            # exploit: pick among the arms that share the maximal estimate
            greedy_actions = np.where(self.q == np.max(self.q))[0]
            self.last_action = np.random.choice(greedy_actions, 1)[0]

        self.last_reward = self.machine.get_reward(self.last_action)
        return self.last_action, self.last_reward

    def update(self):
        '''
        Update the value estimate of the last selected arm with the last reward.

        Returns
        -------
        numpy.ndarray
            The current value estimates ``q`` (one entry per arm).
        '''
        arm = self.last_action
        # The machine may return a size-1 array; storing that into an array
        # element is deprecated in NumPy, so reduce it to a plain float first.
        reward = np.asarray(self.last_reward, dtype=float).item()

        # update total reward and pull counter for the selected action
        self.reward_action[arm] += reward
        self.n_action[arm] += 1

        if self.update_type == 'sample-average':
            self.q[arm] = self.reward_action[arm] / self.n_action[arm]
        elif self.update_type == 'constant-step-size':
            self.q[arm] += self.alpha * (reward - self.q[arm])

        return self.q

    def reset(self):
        '''Restore the agent to its freshly constructed state and reset the machine.'''
        self._init_estimates()
        self.machine.reset()

In [3]:
class SupervisedAgent():
    """
    Explore-then-commit agent for an N-armed bandit machine.

    During the first ``n_train`` steps the arms are pulled in consecutive
    blocks (arm index ``(t * nArms) // n_train``) and their sample-average
    values are learned. Afterwards the agent always pulls the arm with the
    highest learned estimate (ties broken at random) and stops learning.

    Parameters
    ----------
    n_train : int
        Number of initial training steps.
    machine : object
        The bandit machine. It must expose ``nArms``, ``get_reward(arm)`` and
        ``reset()``.
    initial_value : float
        Initial value estimate of every arm; counted as one pseudo-observation.
    """

    def __init__(self, n_train, machine, initial_value ):
        self.n_train = n_train
        self.machine = machine
        self.nArms = machine.nArms
        self.initial_value = initial_value
        self._init_state()

    def _init_state(self):
        """(Re)create the step counter, per-arm counters, reward sums and estimates."""
        self.t = 0
        self.n_action = np.ones(self.nArms)
        self.reward_action = np.ones(self.nArms) * self.initial_value
        self.q = np.ones(self.nArms) * self.initial_value

    def __str__(self):
        return 'Supervised Agent'

    def action(self):
        '''
        Select an arm, pull it on the machine and remember the outcome.

        Returns
        -------
        (action, reward)
            The chosen arm index and the reward exactly as returned by
            ``machine.get_reward``.
        '''
        if self.t < self.n_train:
            # training phase: sweep through the arms in consecutive blocks
            self.last_action = (self.t * self.nArms) // self.n_train
        else:
            # committed phase: pick among the arms that share the maximal estimate
            greedy_actions = np.where(self.q == np.max(self.q))[0]
            self.last_action = np.random.choice(greedy_actions, 1)[0]

        self.last_reward = self.machine.get_reward(self.last_action)
        return self.last_action, self.last_reward

    def update(self):
        '''
        Advance the step counter and, while training, update the last arm's estimate.

        Returns
        -------
        numpy.ndarray
            The current value estimates ``q`` (one entry per arm).
        '''
        if self.t < self.n_train:
            arm = self.last_action
            # The machine may return a size-1 array; storing that into an array
            # element is deprecated in NumPy, so reduce it to a plain float first.
            reward = np.asarray(self.last_reward, dtype=float).item()
            self.reward_action[arm] += reward
            self.n_action[arm] += 1
            self.q[arm] = self.reward_action[arm] / self.n_action[arm]

        self.t += 1
        return self.q

    def reset(self):
        '''Restore the agent to its freshly constructed state and reset the machine.'''
        self._init_state()
        # Same contract as EpsilonGreedyAgent.reset(): without this, a machine
        # whose means were changed during a run would stay changed in the next run.
        self.machine.reset()

In [4]:
class N_arms_machine():
    # This class represent a N-Arm Machine.

    def __init__(self,nArms,means , stds):
        """
        An N-armed bandit machine with normally distributed rewards.

        Parameters
        ----------
        nArms : int
            Number of arms or handles.
        means : array-like
            ``nArms`` values giving the mean reward of each arm.
        stds : array-like
            ``nArms`` values giving the reward standard deviation of each arm.

        Raises
        ------
        ValueError
            If ``means`` or ``stds`` does not contain exactly ``nArms`` values.
        """
        # Private flat copies: later changes to the caller's lists (or to
        # self.means / self.stds) must not alter what reset() restores.
        initial_means = np.array(means, dtype=float).reshape(-1)
        initial_stds = np.array(stds, dtype=float).reshape(-1)
        if initial_means.size != nArms or initial_stds.size != nArms:
            raise ValueError(
                f'means and stds must each contain nArms={nArms} values, '
                f'got {initial_means.size} and {initial_stds.size}')

        self.nArms = nArms

        self.initial_means = initial_means
        self.initial_stds = initial_stds
        # Older aliases of the same initial parameters, kept for compatibility.
        self.first_means = self.initial_means
        self.first_stds = self.initial_stds

        # Current parameters; experiments may replace these while running.
        self.means = self.initial_means.copy()
        self.stds = self.initial_stds.copy()

    def get_reward(self,arm_number, many = 1):
        '''
        Generate random rewards for one arm.

        Parameters
        ----------
        arm_number : int
            Index of the arm to pull.
        many : int
            Number of rewards to generate. 1 is the default value.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(many,)`` drawn from a normal distribution with
            the arm's current mean and standard deviation.
        '''
        reward = np.random.randn(many)*self.stds[arm_number] + self.means[arm_number]
        return reward

    def plot(self):
        '''
        Plot the distribution of reward for each arm as violin plots,
        using 100000 sampled rewards per arm.
        '''
        fig = go.Figure()
        for iarm in range(self.nArms):
            fig.add_trace(go.Violin(y = self.get_reward(iarm, 100000),
                                    name=f'Action {iarm+1}',
                                    box_visible=True,
                                    meanline_visible=True))

        # Title spelling corrected (was 'Reward Discibution').
        fig.update_layout(title='Reward Distribution',
                   xaxis_title='Action',
                   yaxis_title='Reward',
                   legend=dict(
                       orientation="h",
                       yanchor="bottom",
                       y=1.02,
                       xanchor="auto",
                       x=1))
        fig.show()

    def reset(self):
        '''Restore the current means and stds to the values given at construction.'''
        self.means = self.initial_means.copy()
        self.stds = self.initial_stds.copy()

In [5]:
class Bandit_Problem():

    def __init__(self, agents ,  max_iter=1000 , rounds=1000):
        '''
        A stationary bandit experiment. Every agent plays on its own
        ``agent.machine`` for ``max_iter`` independent runs of ``rounds`` steps,
        and per-step statistics are accumulated over the runs.

        Parameters
        ----------
        agents: list
            a list of agents with different behavior. Each agent must expose
            ``reset()``, ``action()`` (returning ``(action, reward)``),
            ``update()`` and ``machine.means``.

        max_iter: int
            the number of independent runs

        rounds : int
            the number of steps in each run
        '''
        self.agents = agents
        self.max_iter = max_iter
        self.rounds = rounds

        # Per step and agent: reward summed over runs / number of runs in
        # which the arm with the highest mean was selected.
        self.total_rewards = np.zeros((self.rounds, len(self.agents)))
        self.optimum_selection = np.zeros((self.rounds, len(self.agents)))

    def run(self,vis_count):
        '''
        Run the experiment and fill ``total_rewards`` and ``optimum_selection``.

        Parameters
        ----------
        vis_count : int
            Print a progress line every ``vis_count`` completed runs.
            ``0`` or ``None`` disables the progress output.
        '''
        # Start from clean accumulators so that calling run() again (e.g.
        # re-executing the notebook cell) does not double-count: the plot
        # methods always normalise by max_iter.
        self.total_rewards.fill(0)
        self.optimum_selection.fill(0)

        for i in range(self.max_iter):
            # reset all agents
            for agent in self.agents:
                agent.reset()

            for j in range(self.rounds):
                for agent_cnt, agent in enumerate(self.agents):
                    action, reward = agent.action()
                    if action == np.argmax(agent.machine.means):
                        self.optimum_selection[j, agent_cnt] += 1
                    agent.update()
                    # Rewards may arrive as size-1 arrays; accumulate a plain float.
                    self.total_rewards[j, agent_cnt] += np.asarray(reward, dtype=float).item()

            # Report only after the run has actually finished.
            if vis_count and (i + 1) % vis_count == 0:
                print(f'Iteration {i+1} is completed')

    def _plot_lines(self, values, title, yaxis_title):
        '''Draw one line per agent from the columns of ``values`` against the step number.'''
        # Steps are numbered 1..rounds so that x has the same length as each column.
        steps = np.arange(1, self.rounds + 1)

        fig = go.Figure()
        for idx, agent in enumerate(self.agents):
            fig.add_trace(go.Scatter(x=steps, y=values[:, idx],
                                     mode='lines', name=str(agent)))

        fig.update_layout(title=title,
                   xaxis_title='Step',
                   yaxis_title=yaxis_title,
                   legend=dict(
                       orientation="h",
                       yanchor="bottom",
                       y=1.02,
                       xanchor="auto",
                       x=1))
        fig.show()

    def plot_average_reward(self):
        '''Plot the running mean reward up to each step, averaged over all runs.'''
        cumulative_rewards = np.cumsum(self.total_rewards, 0)
        t = np.arange(1, self.rounds + 1).reshape(self.rounds, 1)
        outcome = cumulative_rewards / (t * self.max_iter)
        self._plot_lines(outcome, 'Average Reward', 'Reward')

    def plot_instance_reward(self):
        '''Plot the reward obtained at each step, averaged over all runs.'''
        rewards = self.total_rewards / self.max_iter
        self._plot_lines(rewards, 'Immediate Reward', 'Reward')

    def plot_optimum_selections(self):
        '''Plot, per step, the fraction of runs in which the best arm was selected.'''
        opt = self.optimum_selection / self.max_iter
        self._plot_lines(opt, 'Optimum Selection', '#Optimum Selection')
    
    
        

In [6]:
class Dynamic_Bandit_Problem():

    def __init__(self, agents , new_means , new_stds, change_time ,  max_iter=1000 , rounds=1000, ):
        '''
        A non-stationary bandit experiment. Every agent plays on its own
        ``agent.machine`` for ``max_iter`` independent runs of ``rounds`` steps;
        at step ``change_time`` of every run the reward parameters of all
        machines are replaced by ``new_means`` / ``new_stds``.

        Parameters
        ----------
        agents: list
            a list of agents with different behavior. Each agent must expose
            ``reset()``, ``action()`` (returning ``(action, reward)``),
            ``update()`` and a ``machine`` with ``means`` and ``stds``.

        new_means : array-like
            mean reward of each arm after the change

        new_stds : array-like
            reward standard deviation of each arm after the change

        change_time : int
            zero-based step index at which the new parameters take effect

        max_iter: int
            the number of independent runs

        rounds : int
            the number of steps in each run
        '''
        self.agents = agents
        self.max_iter = max_iter
        self.rounds = rounds
        self.new_means = new_means
        self.new_stds = new_stds
        self.change_time = change_time

        # Per step and agent: reward summed over runs / number of runs in
        # which the arm with the highest current mean was selected.
        self.total_rewards = np.zeros((self.rounds, len(self.agents)))
        self.optimum_selection = np.zeros((self.rounds, len(self.agents)))

    def run(self,vis_count):
        '''
        Run the experiment and fill ``total_rewards`` and ``optimum_selection``.

        Parameters
        ----------
        vis_count : int
            Print a progress line every ``vis_count`` completed runs.
            ``0`` or ``None`` disables the progress output.
        '''
        # Start from clean accumulators so that calling run() again (e.g.
        # re-executing the notebook cell) does not double-count: the plot
        # methods always normalise by max_iter.
        self.total_rewards.fill(0)
        self.optimum_selection.fill(0)

        for i in range(self.max_iter):
            # reset all agents; each agent's reset() is relied upon to put its
            # machine back to the initial reward parameters
            for agent in self.agents:
                agent.reset()

            for j in range(self.rounds):

                # if this is the time for changing the dynamics
                if j == self.change_time:
                    for agent in self.agents:
                        agent.machine.means = self.new_means
                        agent.machine.stds = self.new_stds

                for agent_cnt, agent in enumerate(self.agents):
                    action, reward = agent.action()
                    # "optimal" is judged against the means in force at this step
                    if action == np.argmax(agent.machine.means):
                        self.optimum_selection[j, agent_cnt] += 1
                    agent.update()
                    # Rewards may arrive as size-1 arrays; accumulate a plain float.
                    self.total_rewards[j, agent_cnt] += np.asarray(reward, dtype=float).item()

            # Report only after the run has actually finished.
            if vis_count and (i + 1) % vis_count == 0:
                print(f'Iteration {i+1} is completed')

    def _plot_lines(self, values, title, yaxis_title):
        '''Draw one line per agent from the columns of ``values`` against the step number.'''
        # Steps are numbered 1..rounds so that x has the same length as each column.
        steps = np.arange(1, self.rounds + 1)

        fig = go.Figure()
        for idx, agent in enumerate(self.agents):
            fig.add_trace(go.Scatter(x=steps, y=values[:, idx],
                                     mode='lines', name=str(agent)))

        fig.update_layout(title=title,
                   xaxis_title='Step',
                   yaxis_title=yaxis_title,
                   legend=dict(
                       orientation="h",
                       yanchor="bottom",
                       y=1.02,
                       xanchor="auto",
                       x=1))
        fig.show()

    def plot_average_reward(self):
        '''Plot the running mean reward up to each step, averaged over all runs.'''
        cumulative_rewards = np.cumsum(self.total_rewards, 0)
        t = np.arange(1, self.rounds + 1).reshape(self.rounds, 1)
        outcome = cumulative_rewards / (t * self.max_iter)
        self._plot_lines(outcome, 'Average Reward', 'Reward')

    def plot_instance_reward(self):
        '''Plot the reward obtained at each step, averaged over all runs.'''
        rewards = self.total_rewards / self.max_iter
        self._plot_lines(rewards, 'Immediate Reward', 'Reward')

    def plot_optimum_selections(self):
        '''Plot, per step, the fraction of runs in which the currently best arm was selected.'''
        opt = self.optimum_selection / self.max_iter
        self._plot_lines(opt, 'Optimum Selection', '#Optimum Selection')
    

In [53]:
# Stationary experiment, zero initial estimates: three sample-average
# epsilon-greedy agents (eps = 0, 0.05, 0.1) and one supervised agent,
# each on its own identically parameterised 5-armed machine.
agents = [
    EpsilonGreedyAgent(
        eps=eps,
        update_type='sample-average',
        machine=N_arms_machine(5, [0.1, -0.7, 0.8, 0.3, 0.5], [1, 1, 1, 1, 1]),
        initial_value=0,
    )
    for eps in (0, 0.05, 0.1)
]
agents.append(
    SupervisedAgent(
        n_train=100,
        machine=N_arms_machine(5, [0.1, -0.7, 0.8, 0.3, 0.5], [1, 1, 1, 1, 1]),
        initial_value=0,
    )
)

bandit_problem = Bandit_Problem(agents, max_iter=1000, rounds=1000)

In [28]:
# Show the reward distribution of every arm of the 5-armed machine.
machine = N_arms_machine(
    nArms=5,
    means=[0.1, -0.7, 0.8, 0.3, 0.5],
    stds=[1, 1, 1, 1, 1],
)
machine.plot()

Output hidden; open in https://colab.research.google.com to view.

In [54]:
# Run the stationary experiment, printing a progress line every 100 runs.
# run() has no return statement, so `out` is None; results live on bandit_problem.
out= bandit_problem.run(vis_count=100)

Iteration 100 is completed
Iteration 200 is completed
Iteration 300 is completed
Iteration 400 is completed
Iteration 500 is completed
Iteration 600 is completed
Iteration 700 is completed
Iteration 800 is completed
Iteration 900 is completed
Iteration 1000 is completed


In [55]:
# Running mean reward up to each step, averaged over all runs (one line per agent).
bandit_problem.plot_average_reward()

In [56]:
# Reward received at each individual step, averaged over all runs.
bandit_problem.plot_instance_reward()

In [57]:
# Per step, the fraction of runs in which the arm with the highest mean was chosen.
bandit_problem.plot_optimum_selections()

In [36]:
# Non-stationary experiment, zero initial estimates: all three agents share
# one machine whose arm means are swapped for new ones at step 500.
machine = N_arms_machine(5, [0.1, -0.7, 0.8, 0.3, 0.5], [1, 1, 1, 1, 1])

agents = [
    EpsilonGreedyAgent(
        eps=eps,
        update_type=update_type,
        machine=machine,
        initial_value=0,
        alpha=alpha,
    )
    for eps, update_type, alpha in (
        (0, 'sample-average', None),
        (0.1, 'sample-average', None),
        (0.1, 'constant-step-size', 0.2),
    )
]

dynamic_bandit_problem = Dynamic_Bandit_Problem(
    agents,
    new_means=[0.1, 0.7, -0.4, 0.3, 0.5],
    new_stds=[1, 1, 1, 1, 1],
    change_time=500,
    max_iter=1000,
    rounds=1000,
)

In [37]:
# Run the non-stationary experiment, printing a progress line every 100 runs.
# run() has no return statement, so `out` is None; results live on dynamic_bandit_problem.
out= dynamic_bandit_problem.run(vis_count=100)

Iteration 100 is completed
Iteration 200 is completed
Iteration 300 is completed
Iteration 400 is completed
Iteration 500 is completed
Iteration 600 is completed
Iteration 700 is completed
Iteration 800 is completed
Iteration 900 is completed
Iteration 1000 is completed


In [38]:
# Running mean reward up to each step, averaged over all runs (one line per agent).
dynamic_bandit_problem.plot_average_reward()

In [39]:
# Reward received at each individual step, averaged over all runs.
dynamic_bandit_problem.plot_instance_reward()

In [40]:
# Per step, the fraction of runs in which the arm with the highest current mean was chosen.
dynamic_bandit_problem.plot_optimum_selections()

In [41]:
# Stationary experiment with optimistic initial estimates (initial_value=5):
# three sample-average epsilon-greedy agents (eps = 0, 0.05, 0.1), each on
# its own identically parameterised 5-armed machine.
agents = [
    EpsilonGreedyAgent(
        eps=eps,
        update_type='sample-average',
        machine=N_arms_machine(5, [0.1, -0.7, 0.8, 0.3, 0.5], [1, 1, 1, 1, 1]),
        initial_value=5,
    )
    for eps in (0, 0.05, 0.1)
]

bandit_problem = Bandit_Problem(agents, max_iter=1000, rounds=1000)

In [42]:
# Run the optimistic-initial-value experiment, printing a progress line every 100 runs.
# run() has no return statement, so `out` is None; results live on bandit_problem.
out= bandit_problem.run(vis_count=100)

Iteration 100 is completed
Iteration 200 is completed
Iteration 300 is completed
Iteration 400 is completed
Iteration 500 is completed
Iteration 600 is completed
Iteration 700 is completed
Iteration 800 is completed
Iteration 900 is completed
Iteration 1000 is completed


In [43]:
# Running mean reward up to each step, averaged over all runs (one line per agent).
bandit_problem.plot_average_reward()

In [44]:
# Reward received at each individual step, averaged over all runs.
bandit_problem.plot_instance_reward()

In [45]:
# Per step, the fraction of runs in which the arm with the highest mean was chosen.
bandit_problem.plot_optimum_selections()

In [46]:
# Non-stationary experiment with optimistic initial estimates (initial_value=5):
# all three agents share one machine whose arm means are swapped at step 500.
machine = N_arms_machine(5, [0.1, -0.7, 0.8, 0.3, 0.5], [1, 1, 1, 1, 1])

agents = [
    EpsilonGreedyAgent(
        eps=eps,
        update_type=update_type,
        machine=machine,
        initial_value=5,
        alpha=alpha,
    )
    for eps, update_type, alpha in (
        (0, 'sample-average', None),
        (0.1, 'sample-average', None),
        (0.1, 'constant-step-size', 0.2),
    )
]

dynamic_bandit_problem = Dynamic_Bandit_Problem(
    agents,
    new_means=[0.1, 0.7, -0.4, 0.3, 0.5],
    new_stds=[1, 1, 1, 1, 1],
    change_time=500,
    max_iter=1000,
    rounds=1000,
)

In [47]:
# Run the non-stationary, optimistic-initial-value experiment (progress line every 100 runs).
# run() has no return statement, so `out` is None; results live on dynamic_bandit_problem.
out= dynamic_bandit_problem.run(vis_count=100)

Iteration 100 is completed
Iteration 200 is completed
Iteration 300 is completed
Iteration 400 is completed
Iteration 500 is completed
Iteration 600 is completed
Iteration 700 is completed
Iteration 800 is completed
Iteration 900 is completed
Iteration 1000 is completed


In [48]:
# Running mean reward up to each step, averaged over all runs (one line per agent).
dynamic_bandit_problem.plot_average_reward()

In [49]:
# Reward received at each individual step, averaged over all runs.
dynamic_bandit_problem.plot_instance_reward()

In [50]:
# Per step, the fraction of runs in which the arm with the highest current mean was chosen.
dynamic_bandit_problem.plot_optimum_selections()