In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
class Armed_Bandits(object):
    def __init__(self, bandits, Q):
        self.bandits = bandits
        self.Q = Q
        self.arms = len(bandits[:,0])

    def sample_bandit(self, bandit_index):
        expected_value = np.random.uniform(self.bandits[bandit_index,0],self.bandits[bandit_index,1])
        return expected_value
    
    def average_reward(self,iterations):
        rewards = []
        for i in range(iterations):
            bandit_index = np.random.randint(0,self.arms)
            sample = [bandit_index, self.sample_bandit(bandit_index)]
            rewards.append(sample)
        average_rewards = np.mean(np.array(rewards)[:,1])
        print('Average reward = ' + str(average_rewards))
    
    def print_Q(self):
        for i in range(self.arms):
            print('Arm ' + str(i+1) + ' = ' + str(100*self.Q[i,1]/np.sum(self.Q[:,1])) + '%')
        
    def e_greedy(self, iterations, frequency, e):
        rewards = []
        for i in range(iterations):
            if np.random.random() > e:
                bandit_index = np.max(Q[:,0])
            else:
                bandit_index = np.random.randint(0,self.arms)
            average_reward = self.sample_bandit(bandit_index)
            sample = [bandit_index, average_reward]
            rewards.append(sample)
            self.Q[bandit_index,1] = self.Q[bandit_index,1] + 1
            self.Q[bandit_index,0] = self.Q[bandit_index,0] + (average_reward - self.Q[bandit_index,0])/self.Q[bandit_index,1]
            if ((i + 1) % frequency) == 0:
                self.print_Q()
                print('Average reward = ' + str(np.mean(np.array(rewards)[:,1])))
        plt.figure()
        plt.plot(range(0,iterations),np.array(rewards)[:,0])
        plt.show()

In [8]:
reward_distribution = np.array([[2,3],
                                [-1,5],
                                [1,5],
                                [-2,4],
                                [0,3],
                                [2,6]])
Q = np.array([[0.0, 0],
             [0.0, 0],
             [0.0, 0],
             [0.0, 0],
             [0.0, 0],
             [0.0, 0]])
bandits = Armed_Bandits(reward_distribution, Q)
bandits.average_reward(100000)

Average reward = 2.33549202361


In [162]:
bandits.e_greedy(1000,100,0.1)



Arm 1 = 2.0%
Arm 2 = 2.0%
Arm 3 = 2.0%
Arm 4 = 91.0%
Arm 5 = 3.0%
Arm 6 = 0.0%
Average reward = 0.856314222866
Arm 1 = 1.0%
Arm 2 = 2.5%
Arm 3 = 3.0%
Arm 4 = 91.0%
Arm 5 = 2.0%
Arm 6 = 0.5%
Average reward = 0.843407711096
Arm 1 = 1.0%
Arm 2 = 3.0%
Arm 3 = 2.66666666667%
Arm 4 = 91.0%
Arm 5 = 2.0%
Arm 6 = 0.333333333333%
Average reward = 0.868098295585
Arm 1 = 1.25%
Arm 2 = 3.25%
Arm 3 = 2.25%
Arm 4 = 90.25%
Arm 5 = 2.0%
Arm 6 = 1.0%
Average reward = 0.967071942506
Arm 1 = 1.0%
Arm 2 = 2.8%
Arm 3 = 2.2%
Arm 4 = 90.2%
Arm 5 = 2.4%
Arm 6 = 1.4%
Average reward = 1.02666364449
Arm 1 = 0.833333333333%
Arm 2 = 2.66666666667%
Arm 3 = 2.16666666667%
Arm 4 = 90.6666666667%
Arm 5 = 2.33333333333%
Arm 6 = 1.33333333333%
Average reward = 1.06947047722
Arm 1 = 1.0%
Arm 2 = 2.42857142857%
Arm 3 = 2.0%
Arm 4 = 90.8571428571%
Arm 5 = 2.0%
Arm 6 = 1.71428571429%
Average reward = 1.08469917202
Arm 1 = 1.125%
Arm 2 = 2.5%
Arm 3 = 2.125%
Arm 4 = 90.75%
Arm 5 = 1.75%
Arm 6 = 1.75%
Average reward = 1.0818450