In [1]:
import math

class UCB1:
    """Toy, interactive example of the UCB1 multi-armed bandit strategy.

    Every round the gambler plays the bandit whose upper confidence bound is
    currently the highest and records the reward of that play.

    :param money: How much money the gambler has to spend; one round is
        played per unit.
    :type money: int
    :param n_bandits: How many bandits there are to play.
    :type n_bandits: int
    :param reward_fn: Optional callable that receives the index of the bandit
        just chosen and returns the reward of that play. When ``None`` (the
        default) the reward is read from standard input as an integer.
    :type reward_fn: callable or None
    """

    # Bound assigned to a bandit that has never been played. Infinity means
    # every bandit is tried once before the confidence bounds are compared,
    # whatever the scale of the rewards (a finite constant silently starves
    # unplayed bandits as soon as a played bandit's bound exceeds it).
    UNPLAYED_BOUND = math.inf

    def __init__(self, money, n_bandits, reward_fn=None):
        self.money = money
        self.n_bandits = n_bandits
        self.reward_fn = reward_fn
        # Index of the bandit chosen in each round, in playing order.
        self.bandits_played = []
        # Cumulative reward collected from each bandit.
        self.bandits_returns = {x: 0 for x in range(self.n_bandits)}
        # Number of times each bandit has been played.
        self.n_selections = {x: 0 for x in range(self.n_bandits)}
        # Cumulative reward over all bandits.
        self.total_return = 0
        # History of the bound computed for each bandit, one entry per round.
        self.bounds = {x: [] for x in range(self.n_bandits)}

    def calc_ucb(self, i, t):
        """Compute, record and return the upper confidence bound of a bandit.

        :param i: Index of the bandit.
        :type i: int
        :param t: Index of the current round (``run`` passes a zero-based one).
        :type t: int
        :returns: Single-entry dict mapping ``i`` to its bound.
        :rtype: dict
        """
        pulls = self.n_selections[i]
        if pulls > 0:
            mean_return = self.bandits_returns[i] / pulls
            # Adding 1 keeps the argument of the logarithm at least 1 for the
            # zero-based round index.
            exploration = math.sqrt(2 * math.log(t + 1) / pulls)
            ub = mean_return + exploration
            print('Upper bound: ', ub)
        else:
            ub = self.UNPLAYED_BOUND
        self.bounds[i].append(ub)
        return {i: ub}

    def play_round(self, t):
        """Play one round: choose the bandit with the highest bound and record its reward.

        :param t: Index of the current round.
        :type t: int
        :raises ValueError: If the reward is read from standard input and is
            not a valid integer, or if there are no bandits to choose from.
        """
        res = {}
        for x in range(self.n_bandits):
            res.update(self.calc_ucb(x, t))
        # Ties go to the lowest bandit index (first maximum in insertion order).
        bandit = max(res, key=res.get)
        print('Bandit: ', bandit)

        # Obtain the reward before touching the counters so that a failed read
        # does not leave a play recorded without its reward.
        if self.reward_fn is None:
            cash_return = int(input())
        else:
            cash_return = self.reward_fn(bandit)

        self.bandits_played.append(bandit)
        self.n_selections[bandit] += 1
        self.bandits_returns[bandit] += cash_return
        self.total_return += cash_return

    def run(self):
        """Play ``money`` rounds, then print the bandit with the highest cumulative return."""
        for t in range(self.money):
            self.play_round(t)
        best = max(self.bandits_returns, key=self.bandits_returns.get)
        print(f'Bandit {best} was the optimal machine.')

In [2]:
# Gambler with 10 units of money to spend across 3 bandits.
ucb = UCB1(money=10, n_bandits=3)

In [3]:
# Play every round. After each "Bandit:" line the reward of that play is read
# from standard input as an integer, so this cell needs manual input.
ucb.run()

Bandit:  0
1
Upper bound:  2.177410022515475
Bandit:  1
0
Upper bound:  2.4823038073675114
Upper bound:  1.4823038073675112
Bandit:  2
0
Upper bound:  2.6651092223153956
Upper bound:  1.6651092223153954
Upper bound:  1.6651092223153954
Bandit:  0
1
Upper bound:  2.26863624117952
Upper bound:  1.7941225779941015
Upper bound:  1.7941225779941015
Bandit:  0
1
Upper bound:  2.0929347248663586
Upper bound:  1.8930184728248454
Upper bound:  1.8930184728248454
Bandit:  0
1
Upper bound:  1.9863848511243756
Upper bound:  1.9727697022487511
Upper bound:  1.9727697022487511
Bandit:  0
1
Upper bound:  1.9120178817720266
Upper bound:  2.039333980337618
Upper bound:  2.039333980337618
Bandit:  1
0
Upper bound:  1.9374912431241627
Upper bound:  1.4823038073675112
Upper bound:  2.09629414793641
Bandit:  2
0
Upper bound:  1.9597051824376162
Upper bound:  1.5174271293851465
Upper bound:  1.5174271293851465
Bandit:  0
1
Bandit 0 was the optimal machine.
