<a href="https://colab.research.google.com/github/sanikamal/reinforcement-learning-atoz/blob/master/notebook/thompson_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The multi-armed bandit problem

## The Thompson Sampling Model

### Problem: Finding the slot machine with the highest winning chance out of many.

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
# Seed the global NumPy generator so that "Restart Kernel -> Run All" reproduces
# the same simulated data and the same Thompson Sampling decisions every time.
SEED = 42
np.random.seed(SEED)

# True payout probability of each slot machine, the number of simulated rounds,
# and the number of machines (derived from the rates so the two cannot drift apart).
conversion_rates = [0.15,0.04,0.13,0.11,0.05]
N = 10000
d = len(conversion_rates)

In [2]:
# creating the dataset
# X[i][j] is 1.0 when machine j pays out in round i and 0.0 otherwise.
# One uniform draw is made per (round, machine) cell and compared against that
# machine's conversion rate; broadcasting the rates across the rows replaces
# the explicit Python double loop.
X = (np.random.rand(N, d) < np.asarray(conversion_rates)).astype(float)

In [3]:
# per-machine tallies, one slot per machine, all starting at zero:
# wins (reward == 1) go to the first array, losses to the second
no_of_pos_reward, no_of_neg_reward = (np.zeros(d) for _ in range(2))

In [4]:
# Thompson Sampling: in every round draw one Beta(wins + 1, losses + 1) sample
# per machine, play the machine with the largest draw, then tally its win or loss.
for round_idx in range(N):
    draws = [
        np.random.beta(no_of_pos_reward[arm] + 1, no_of_neg_reward[arm] + 1)
        for arm in range(d)
    ]
    # first machine holding the largest draw (ties resolve to the lowest index)
    selected = max(range(d), key=draws.__getitem__, default=0)
    # pick the counter array that matches the observed reward and bump it in place
    tally = no_of_pos_reward if X[round_idx][selected] == 1 else no_of_neg_reward
    tally[selected] += 1

In [5]:
# Report how often each machine was played; the machine played most often is
# the one Thompson Sampling settled on as the best.
n_selected = no_of_pos_reward + no_of_neg_reward
for machine_no in range(1, d + 1):
    times_played = n_selected[machine_no - 1]
    print('Machine number ', machine_no, ' was selected ', times_played, ' times', sep='')
best_machine_no = np.argmax(n_selected) + 1
print('Conclusion: Best machine is machine number ', best_machine_no, sep='')

Machine number 1 was selected 5486.0 times
Machine number 2 was selected 61.0 times
Machine number 3 was selected 3752.0 times
Machine number 4 was selected 560.0 times
Machine number 5 was selected 141.0 times
Conclusion: Best machine is machine number 1


In [None]:
# Models comparison: Thompson Sampling vs. a uniform ("standard") split test,
# scored by how often each one identifies the machine with the highest true
# conversion rate.
#
# NOTE: N is rebound here from the single sample count used by the cells above
# to a list of sample sizes; re-run the set-up cell before re-running those cells.
N = [200, 1000, 5000]                            # sample sizes to compare
D = 20                                           # largest number of machines tried
convRanges = [(0., 0.1), (0., 0.3), (0., 0.5)]   # (low, high) bounds of the true rates


def make_rewards(n, conversionRates):
    """Simulate the payouts of every machine over ``n`` rounds.

    Args:
        n: number of rounds (rows of the result).
        conversionRates: per-machine probability of a payout.

    Returns:
        Float array of shape ``(n, len(conversionRates))`` holding 1.0 where
        the machine pays out in that round and 0.0 otherwise.
    """
    rates = np.asarray(conversionRates)
    return (np.random.rand(n, rates.size) < rates).astype(float)


def thompson_selection_counts(X):
    """Run Thompson Sampling over the reward matrix ``X``.

    Args:
        X: 2-D 0/1 reward array, one row per round and one column per machine.

    Returns:
        Array with the number of rounds in which each machine was selected.
    """
    n, d = X.shape
    nPosReward = np.zeros(d)
    nNegReward = np.zeros(d)
    for i in range(n):
        # One Beta(wins + 1, losses + 1) draw per machine; play the largest one.
        selected = np.argmax(np.random.beta(nPosReward + 1, nNegReward + 1))
        if X[i, selected] == 1:
            nPosReward[selected] += 1
        else:
            nNegReward[selected] += 1
    return nPosReward + nNegReward


def standard_win_counts(X, left):
    """Count the wins of each machine in a uniform split test.

    The ``left`` rounds are divided evenly between the machines, so the first
    ``int(left / d)`` rows of ``X`` are counted for every machine.

    Args:
        X: 2-D 0/1 reward array, one row per round and one column per machine.
        left: number of rounds available to the split test.

    Returns:
        Array with the number of wins observed for each machine.
    """
    rounds_per_machine = int(left / X.shape[1])
    return X[:rounds_per_machine].sum(axis=0)


def compare_once(n, d, ranges):
    """Simulate one experiment and check both strategies against the truth.

    Args:
        n: number of rounds in the experiment.
        d: number of machines.
        ranges: ``(low, high)`` bounds from which the true conversion rates
            are drawn uniformly.

    Returns:
        ``(ts_correct, standard_correct)``: whether Thompson Sampling and the
        standard split test each picked the machine with the highest true rate.
    """
    conversionRates = np.random.uniform(low = ranges[0], high = ranges[1], size = d)
    X = make_rewards(n, conversionRates)

    nSelected = thompson_selection_counts(X)
    # The split test is given only the rounds that Thompson Sampling did not
    # spend on its most-played machine.
    left = n - nSelected.max()
    countStandard = standard_win_counts(X, left)

    bestReal = np.argmax(conversionRates)
    bestTS = np.argmax(nSelected)
    bestStandard = np.argmax(countStandard)
    return bool(bestTS == bestReal), bool(bestStandard == bestReal)


def compare_models(sample_sizes, conv_ranges, max_machines, rounds = 1000):
    """Compare both strategies over a grid of experiment settings.

    For every sample size, conversion-rate range and machine count from 3 to
    ``max_machines`` inclusive, ``rounds`` independent experiments are run and
    the number of correct picks of each strategy is printed and recorded.

    Args:
        sample_sizes: iterable of round counts ``n``.
        conv_ranges: iterable of ``(low, high)`` conversion-rate bounds.
        max_machines: largest number of machines to try.
        rounds: experiments per setting.

    Returns:
        List with one ``[n, ranges, d, p1, p2]`` entry per setting, where
        ``p1`` / ``p2`` count the correct picks of Thompson Sampling / the
        standard solution. Each ``(n, ranges)`` group is preceded by an empty
        list, as in the original layout.
    """
    results = list()
    for n in sample_sizes:
        for ranges in conv_ranges:
            # Empty entry kept from the original layout; presumably a blank
            # separator row ahead of each (n, range) group in the sheet.
            results.append([])
            for d in range(3, max_machines + 1):
                p1 = 0
                p2 = 0
                for _ in range(rounds):
                    ts_correct, standard_correct = compare_once(n, d, ranges)
                    p1 += int(ts_correct)
                    p2 += int(standard_correct)

                print('N = ' + str(n) + ' d = ' + str(d) + ' range = ' + str(ranges) + ' | result Thompson Sampling = ' + str(p1) + ' result Standard solution = ' + str(p2))
                results.append([n, ranges, d, p1, p2])
    return results


# Notebook kernels execute cells with __name__ == '__main__', so the (slow) full
# comparison still runs on "Run All"; the guard only skips it when this code is
# imported as a module.
if __name__ == '__main__':
    results = compare_models(N, convRanges, D)

    df = pd.DataFrame(results)
    df.to_excel('results.xlsx', sheet_name = 'Result', index = False)

N = 200 d = 3 range = (0.0, 0.1) | result Thompson Sampling = 652 result Standard solution = 591
N = 200 d = 4 range = (0.0, 0.1) | result Thompson Sampling = 571 result Standard solution = 497
N = 200 d = 5 range = (0.0, 0.1) | result Thompson Sampling = 478 result Standard solution = 400
N = 200 d = 6 range = (0.0, 0.1) | result Thompson Sampling = 401 result Standard solution = 352
N = 200 d = 7 range = (0.0, 0.1) | result Thompson Sampling = 378 result Standard solution = 322
N = 200 d = 8 range = (0.0, 0.1) | result Thompson Sampling = 325 result Standard solution = 305
N = 200 d = 9 range = (0.0, 0.1) | result Thompson Sampling = 305 result Standard solution = 269
N = 200 d = 10 range = (0.0, 0.1) | result Thompson Sampling = 271 result Standard solution = 210
N = 200 d = 11 range = (0.0, 0.1) | result Thompson Sampling = 226 result Standard solution = 234
N = 200 d = 12 range = (0.0, 0.1) | result Thompson Sampling = 244 result Standard solution = 192
N = 200 d = 13 range = (0.0