In [2]:
import numpy as np


class BanditArm:

    def __init__(self, p):
        self.p = p                          # true win probability
        self.p_estimate = 0.                # estimate win probability
        self.N = 0.                         # total collected

    def pull(self):
        return np.random.random() < self.p  # success if random number (0, 1) < p

    def update(self, x):
        self.N += 1.
        self.p_estimate = ((self.N - 1) * self.p_estimate + x) / self.N

In [5]:
num_trials = 1000

performance_avg = 0

total_performance_eps = []

EPSILON = 0.1

bandit_probabilities = [0.25, 0.5, 0.75]

bandits = [BanditArm(p) for p in bandit_probabilities]

investment_avg = 0

for _ in range(100):
    
    investment = 1000
    
    rewards = np.zeros(num_trials)

    for i in range(num_trials):

        if np.random.random() < EPSILON:

            j = np.random.randint(len(bandits))

        else:

            j = np.argmax([b.p_estimate for b in bandits])

        x = bandits[j].pull()

        rewards[i] = x
        
        if x == 1:
            
            investment += i + 1
            
        else:
            
            investment -= i + 1

        bandits[j].update(x)
        
    performance = np.cumsum(rewards) / (np.arange(num_trials) + 1)
    
    total_performance_eps.append(performance[-1])
        
    if np.sum(performance) == 0:
        
        performance_avg = performance
        
    else:
        
        performance_avg = np.mean([performance_avg, performance], axis=0)
        
    if investment_avg == 0:
        
        investment_avg = investment
        
    else:
        
        investment_avg = np.mean([investment_avg, investment], axis=0)
        
print(investment_avg)

233452.0828621453


In [14]:
import plotly.graph_objs as go

d = {'algo': ['Greedy', 'Epsilon Greedy', 'Optimistic Initial Values', 'UCB1', 'Thompson Sampling'], 'investment': [-57881.7734375, 55070.74609375, 64009.96484375, 51362.109375, 61576.33203125]}

colors = ['lightslategray', ] * len(d['investment'])
most_profitable_index = np.argmax(d['investment'])
colors[most_profitable_index] = 'crimson'

fig = go.Figure(data=[go.Bar(
    x=d['algo'],
    y=d['investment'],
    marker_color=colors # marker color can be a single color value or an iterable
)])
fig.update_layout(title_text='Most Profitable Algo')

In [16]:
# pull is given as a fraction of the pie radius

pull = np.zeros(len(d['investment']))
pull[most_profitable_index] = 0.2

fig = go.Figure(data=[go.Pie(labels=d['algo'], values=d['investment'], pull=pull)])
fig.show()