### Бейзлайн

In [3]:
import numpy as np
import pandas as pd
import time

from functools import partial
from scipy.stats import randint, uniform
# from task_2_multiarmed_bandit.sim_lib import simulation
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [4]:
def eps_greedy(history: pd.DataFrame, eps: float):
    """Epsilon-greedy choice of the next row (banner) to show.

    With probability ``eps`` a uniformly random row of ``history`` is chosen
    (exploration); otherwise the row with the highest smoothed click-through
    rate ``clicks / (impressions + 10)`` is chosen (exploitation).

    Parameters
    ----------
    history : pd.DataFrame
        Must contain the ``clicks`` and ``impressions`` columns. The index
        labels are what gets returned (presumably banner ids).
    eps : float
        Exploration probability; compared against a single uniform [0, 1) draw.

    Returns
    -------
    The index label of the selected row.

    Notes
    -----
    Draws come from scipy's default random state, i.e. the global NumPy RNG:
    always one ``uniform`` draw, plus one ``randint`` draw when exploring.
    Other policies in this notebook replay exactly this sequence, so the
    order and number of draws must not change.
    """
    if uniform.rvs() < eps:
        # Explore: uniformly random position in [0, number of rows).
        return history.index[randint.rvs(0, history.shape[0])]

    # The +10 in the denominator damps the CTR estimate of rarely shown rows.
    smoothed_ctr = history['clicks'] / (history['impressions'] + 10)

    # Take the argmax over a plain ndarray so the result is always a
    # *position*. np.argmax on a Series delegates to Series.argmax, which
    # returned the index *label* on old pandas releases and would then pick
    # the wrong row (or raise) for non-contiguous indexes.
    best_pos = int(np.argmax(np.asarray(smoothed_ctr)))
    return history.index[best_pos]

policy = partial(eps_greedy, eps=0.08)

In [5]:
# Baseline run: epsilon-greedy `policy`, n=200000 (the notebook's "200k iterations").
# seed for homework
# The global NumPy RNG is re-seeded before every run so that all runs start
# from the same random state.
np.random.seed(seed=384758917)

# Wall-clock time of the whole simulation, in seconds; the trailing
# expression is what the cell displays.
start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
CPU times: user 13min 4s, sys: 872 ms, total: 13min 5s
Wall time: 13min 7s


787.5589392185211

In [6]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(2792.237649427154, 0.01396118824713577, 174)

## Regret для 200к итераций у бейзлайна равен 2792.237649427154

In [7]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,19843.0,1332.0,10870.812904,0.064972
162,154.0,2.0,18594.827945,0.017514
163,86.0,0.0,5153.010937,0.000849
164,68.0,2.0,5092.571727,0.041281
166,48.0,0.0,5340.55207,0.007253
167,33.0,0.0,1474.181162,0.033849
168,23.0,0.0,4900.260295,0.027273
169,14.0,0.0,5007.022458,0.030857
170,18.0,0.0,8920.324215,0.034653
171,14.0,0.0,1080.025985,0.012549


### Запускаем бейзлайн для 4к итераций, чтобы можно было быстро сравнить с другими политиками

In [22]:
# Short baseline run (n=4000) used as a quick reference for the other policies.
# NOTE: this overwrites `output` from the 200k baseline run.
# seed for homework
# Same seed as every other run so the runs start from the same random state.
np.random.seed(seed=384758917)

# Wall-clock time of the simulation, in seconds (displayed as the cell output).
start = time.time()
output = simulation(policy, n=4000)
end = time.time()
end - start

1 impressions have been simulated


16.897266387939453

In [23]:
output['regret'], output['regret']/output['rounds'], output['total_banners']

(87.21295201233723, 0.021803238003084306, 12)

## Regret для 4к итераций у бейзлайна равен 87.21295201233723

In [24]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
0,1074.0,60.0,13599.571491,0.059342
1,34.0,1.0,4595.305296,0.047485
2,30.0,0.0,4427.377986,0.016141
3,2647.0,318.0,6730.054398,0.123087
4,38.0,2.0,162.019874,0.025441
5,67.0,5.0,15826.394049,0.111589
6,24.0,0.0,5467.54237,0.020404
8,21.0,0.0,8658.518663,0.000177
9,29.0,3.0,2314.029538,0.120562
10,17.0,0.0,4590.172737,0.029191


### Алгоритм UCB

Так как в бейзлайновой политике используется глобальный генератор случайных чисел, который используется и в симуляции, то в реализованных политиках делаю такие же вызовы глобального генератора случайных чисел, как и в бейзлайновой политике.

А для самих политик использую отдельный генератор случайных чисел, если необходимо.

In [25]:
def ucb(history: pd.DataFrame, c: float):
    """Upper-confidence-bound choice of the next row (banner) to show.

    Each row gets the score::

        clicks / (impressions + 1) + c * sqrt(log(t) / (impressions + 1))

    where ``t`` is the total number of impressions over all rows plus one.
    The row with the highest score is selected.

    Parameters
    ----------
    history : pd.DataFrame
        Must contain the ``clicks`` and ``impressions`` columns. The index
        labels are what gets returned (presumably banner ids).
    c : float
        Weight of the exploration bonus; ``c=0`` degenerates to a purely
        greedy choice on ``clicks / (impressions + 1)``.

    Returns
    -------
    The index label of the selected row.
    """
    # Replay the global-RNG draws of the epsilon-greedy baseline (eps=0.08)
    # so that the simulation consumes the same random stream as in the
    # baseline run. The drawn values are deliberately ignored.
    eps = 0.08
    if uniform.rvs() < eps:
        randint.rvs(0, history.shape[0])

    # +1 avoids division by zero for rows that have not been shown yet and
    # keeps log(t) defined (log(1) == 0) before the first impression.
    shows = history['impressions'] + 1
    t = np.sum(history['impressions']) + 1

    ctr = history['clicks'] / shows
    scores = ctr + c * np.sqrt(np.log(t) / shows)

    # Take the argmax over a plain ndarray so the result is always a
    # *position*; np.argmax on a Series returned the index *label* on old
    # pandas releases, which breaks for non-contiguous indexes.
    best_pos = int(np.argmax(np.asarray(scores)))
    return history.index[best_pos]

ucb_policy = partial(ucb, c=np.sqrt(2))

In [26]:
# UCB with c = sqrt(2) (`ucb_policy`), short run with n=4000.
# seed for homework
# Same seed as the baseline run so both start from the same random state.
np.random.seed(seed=384758917)

# Wall-clock time of the simulation, in seconds (displayed as the cell output).
start = time.time()
output_ucb_1 = simulation(ucb_policy, n=4000)
end = time.time()
end - start

1 impressions have been simulated


23.580195665359497

In [27]:
output_ucb_1['regret'], output_ucb_1['regret']/output_ucb_1['rounds'],  output_ucb_1['total_banners']

(210.82386888906808, 0.05270596722226702, 12)

## Regret для 4к итераций у UCB с $c=\sqrt{2}$ равен 210.82386888906808, это хуже бейзлайна (87.21295201233723)

In [28]:
output_ucb_1['history']

Unnamed: 0,impressions,clicks,lifetime,p
0,342.0,19.0,13599.571491,0.059342
1,318.0,15.0,4595.305296,0.047485
2,225.0,1.0,4427.377986,0.016141
3,613.0,68.0,6730.054398,0.123087
4,273.0,8.0,162.019874,0.025441
5,480.0,43.0,15826.394049,0.111589
6,246.0,4.0,5467.54237,0.020404
8,218.0,0.0,8658.518663,0.000177
9,714.0,88.0,2314.029538,0.120562
10,311.0,14.0,4590.172737,0.029191


#### Обращаем внимание, что сгенерированные баннеры получились такими же, как и в бейзлайне, так что вызовы к глобальному генератору случайных чисел правильные.

In [29]:
ucb_policy_2 = partial(ucb, c=1.1)

In [34]:
# UCB with c = 1.1 (`ucb_policy_2`), short run with n=4000.
# seed for homework
# Same seed as the baseline run so both start from the same random state.
np.random.seed(seed=384758917)

# Wall-clock time of the simulation, in seconds (displayed as the cell output).
start = time.time()
output_ucb_2 = simulation(ucb_policy_2, n=4000)
end = time.time()
end - start

1 impressions have been simulated


19.58591103553772

In [35]:
output_ucb_2['regret'], output_ucb_2['regret']/output_ucb_2['rounds'],  output_ucb_2['total_banners']

(183.0095686378254, 0.04575239215945635, 12)

## Regret для 4к итераций у UCB с $c=1.1$ равен 183.0095686378254, это чуть лучше, но тоже хуже бейзлайна (87.21295201233723)

In [36]:
output_ucb_2['history']

Unnamed: 0,impressions,clicks,lifetime,p
0,354.0,24.0,13599.571491,0.059342
1,242.0,8.0,4595.305296,0.047485
2,196.0,2.0,4427.377986,0.016141
3,680.0,78.0,6730.054398,0.123087
4,242.0,8.0,162.019874,0.025441
5,534.0,53.0,15826.394049,0.111589
6,220.0,5.0,5467.54237,0.020404
8,179.0,0.0,8658.518663,0.000177
9,886.0,115.0,2314.029538,0.120562
10,250.0,9.0,4590.172737,0.029191


### Thompson sampling

Подсматриваем в скрипте sim_lib.py априорное распределение на вероятность клика, используем такое же априорное распределение в thompson sampling'е

И опять же дублируем использование глобального генератора случайных чисел как в бейзлайне.

In [83]:
def thompson(history: pd.DataFrame, rng: np.random.RandomState):
    """Thompson-sampling choice of the next row (banner) to show.

    For every row one value is sampled from
    ``Beta(1 + clicks, 20 + impressions - clicks)`` using ``rng``; the row
    with the largest sample is selected.

    Parameters
    ----------
    history : pd.DataFrame
        Must contain the ``clicks`` and ``impressions`` columns. The index
        labels are what gets returned (presumably banner ids).
    rng : np.random.RandomState
        Private generator for the Beta draws, kept separate from the global
        NumPy RNG.

    Returns
    -------
    The index label of the selected row.
    """
    # Replay the global-RNG draws of the epsilon-greedy baseline (eps=0.08)
    # so that the simulation consumes the same random stream as in the
    # baseline run; the looked-up label is intentionally thrown away.
    explore_prob = 0.08
    if uniform.rvs() < explore_prob:
        _ = history.index[randint.rvs(0, history.shape[0])]

    clicks = history['clicks']
    # Beta parameters: the constants 1 and 20 act as the prior pseudo-counts.
    successes = 1 + clicks
    failures = 20 + history['impressions'] - clicks

    draws = rng.beta(successes, failures)
    return history.index[np.argmax(draws)]

In [84]:
# Thompson sampling, short run with n=4000.
# The policy gets its own freshly seeded generator for the Beta draws.
rng = np.random.RandomState(seed=42)
thompson_policy = partial(thompson, rng=rng)

# seed for homework
# Same seed as the baseline run so both start from the same random state.
np.random.seed(seed=384758917)

# Wall-clock time of the simulation, in seconds (displayed as the cell output).
start = time.time()
output_thompson_1 = simulation(thompson_policy, n=4000)
end = time.time()
end - start

1 impressions have been simulated


15.850241899490356

In [85]:
output_thompson_1['regret'], output_thompson_1['regret']/output_thompson_1['rounds'],  output_thompson_1['total_banners']

(28.249565731336613, 0.007062391432834153, 12)

### Regret для 4к итераций у Thompson Sampling равен 28.249565731336613, это лучше, чем у бейзлайна (87.21295201233723)

In [86]:
output_thompson_1['history']

Unnamed: 0,impressions,clicks,lifetime,p
0,60.0,3.0,13599.571491,0.059342
1,27.0,0.0,4595.305296,0.047485
2,32.0,1.0,4427.377986,0.016141
3,2959.0,371.0,6730.054398,0.123087
4,40.0,1.0,162.019874,0.025441
5,205.0,21.0,15826.394049,0.111589
6,32.0,1.0,5467.54237,0.020404
8,26.0,0.0,8658.518663,0.000177
9,527.0,57.0,2314.029538,0.120562
10,21.0,0.0,4590.172737,0.029191


#### Обращаем внимание, что сгенерированные баннеры получились такими же, как и в бейзлайне, так что вызовы к глобальному генератору случайных чисел правильные.

In [87]:
# Thompson sampling, full run with n=200000.
# Re-create the policy's private generator so this run does not continue
# from the state left by the previous (4k) run.
rng = np.random.RandomState(seed=42)
thompson_policy = partial(thompson, rng=rng)

# seed for homework
# Same seed as the baseline run so both start from the same random state.
np.random.seed(seed=384758917)

# Wall-clock time of the simulation, in seconds (displayed as the cell output).
start = time.time()
output_thompson_2 = simulation(thompson_policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


855.947244644165

In [88]:
output_thompson_2['regret'], output_thompson_2['regret']/output_thompson_2['rounds'],  output_thompson_2['total_banners']

(1003.6264206811237, 0.005018132103405619, 174)

## Regret для 200к итераций у Thompson Sampling равен 1003.6264206811237, это лучше бейзлайна (2792.237649427154)

In [89]:
output_thompson_2['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,15743.0,1048.0,10870.812904,0.064972
162,126.0,2.0,18594.827945,0.017514
163,66.0,0.0,5153.010937,0.000849
164,171.0,5.0,5092.571727,0.041281
166,93.0,1.0,5340.55207,0.007253
167,456.0,21.0,1474.181162,0.033849
168,81.0,2.0,4900.260295,0.027273
169,71.0,1.0,5007.022458,0.030857
170,78.0,1.0,8920.324215,0.034653
171,104.0,2.0,1080.025985,0.012549


#### Обращаем внимание, что сгенерированные баннеры получились такими же, как и в бейзлайне, так что вызовы к глобальному генератору случайных чисел правильные.

### Запускаем UCB для 200к итераций, может быть на этой дистанции всё-таки получится побить бейзлайн

In [94]:
# coefficient from slides
ucb_policy_3 = partial(ucb, c=np.sqrt(2))

In [95]:
# UCB with c = sqrt(2) (`ucb_policy_3`), full run with n=200000.
# seed for homework
# Same seed as the baseline run so both start from the same random state.
np.random.seed(seed=384758917)

# Wall-clock time of the simulation, in seconds (displayed as the cell output).
start = time.time()
output_ucb_3 = simulation(ucb_policy_3, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


1040.8300166130066

In [96]:
output_ucb_3['regret'], output_ucb_3['regret']/output_ucb_3['rounds'],  output_ucb_3['total_banners']

(8983.02485414273, 0.04491512427071365, 174)

## Regret для 200к итераций у UCB с коэффициентом из лекции ($\sqrt{2}$) равен 8983.02485414273, это хуже бейзлайна (2792.237649427154)

In [97]:
output_ucb_3['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,3758.0,252.0,10870.812904,0.064972
162,1247.0,21.0,18594.827945,0.017514
163,562.0,0.0,5153.010937,0.000849
164,838.0,27.0,5092.571727,0.041281
166,617.0,5.0,5340.55207,0.007253
167,864.0,34.0,1474.181162,0.033849
168,745.0,21.0,4900.260295,0.027273
169,745.0,21.0,5007.022458,0.030857
170,773.0,24.0,8920.324215,0.034653
171,610.0,7.0,1080.025985,0.012549


### Пробуем запустить UCB для разных коэффициентов

In [98]:
coefs = np.logspace(np.log10(0.1), np.log10(4.0), num=15)
coefs

array([0.1       , 0.13014661, 0.1693814 , 0.22044415, 0.28690058,
       0.37339138, 0.48595621, 0.63245553, 0.82311943, 1.07126202,
       1.39421119, 1.81451859, 2.36153441, 3.07345695, 4.        ])

In [99]:
%%time

# Sweep over the UCB exploration coefficients in `coefs`.
# Each entry of `ucb_outputs` is a tuple (coef, elapsed seconds, simulation output).
ucb_outputs = []
for coef in coefs:
    # seed for homework
    # Re-seed inside the loop so every coefficient starts from the same
    # global random state.
    np.random.seed(seed=384758917)

    # Only the policy construction and the simulation itself are timed.
    start = time.time()
    current_policy = partial(ucb, c=coef)
    current_output = simulation(current_policy, n=200000)
    end = time.time()
    ucb_outputs.append((coef, end - start, current_output))
    
    print(coef, " done")

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
0.1  done
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated


80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
1.39421119370234  done
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions

### Табличка результатов для UCB

Получается, что чем меньше коэффициент, тем лучше regret.

В районе нуля, скорее всего, результаты ухудшатся (вообще думал, что и в районе 0.1 ухудшится), но это долго работает, запускать для других коэффициентов уже не буду.

In [107]:
# One row per swept coefficient: run time, total regret, regret per round
# and the `total_banners` value reported by the simulation.
summary_columns = ['coef', 'time', 'regret', 'regret_ratio', 'total_banners']
summary_rows = [
    [
        coef_value,
        elapsed,
        result['regret'],
        result['regret'] / result['rounds'],
        result['total_banners'],
    ]
    for coef_value, elapsed, result in ucb_outputs
]
# Chained set_index instead of inplace=True: same resulting frame.
ucb_outputs_df = pd.DataFrame(summary_rows, columns=summary_columns).set_index('coef')
ucb_outputs_df

Unnamed: 0_level_0,time,regret,regret_ratio,total_banners
coef,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.1,1047.006599,409.318048,0.002047,174
0.130147,1042.430779,540.050068,0.0027,174
0.169381,1046.064792,654.696245,0.003273,174
0.220444,1042.927195,856.172128,0.004281,174
0.286901,1050.652262,1308.619232,0.006543,174
0.373391,1042.091846,2037.603967,0.010188,174
0.485956,1034.204015,2751.127047,0.013756,174
0.632456,1048.138402,3828.955779,0.019145,174
0.823119,1053.066488,5249.750977,0.026249,174
1.071262,1028.188408,6994.293282,0.034971,174


In [108]:
ucb_outputs[0][0], ucb_outputs[0][1]

(0.1, 1047.006599187851)

In [109]:
ucb_outputs[0][2]['regret'], ucb_outputs[0][2]['regret']/ucb_outputs[0][2]['rounds'],  ucb_outputs[0][2]['total_banners']

(409.31804750937135, 0.002046590237546857, 174)

### Regret для 200к итераций у UCB с коэффициентом 0.1 равен 409.31804750937135, это лучше бейзлайна (2792.237649427154) и даже лучше, чем Thompson Sampling (1003.6264206811237)

In [110]:
ucb_outputs[0][2]['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,16576.0,1109.0,10870.812904,0.064972
162,20.0,0.0,18594.827945,0.017514
163,20.0,0.0,5153.010937,0.000849
164,65.0,2.0,5092.571727,0.041281
166,20.0,0.0,5340.55207,0.007253
167,121.0,5.0,1474.181162,0.033849
168,44.0,1.0,4900.260295,0.027273
169,192.0,9.0,5007.022458,0.030857
170,44.0,1.0,8920.324215,0.034653
171,65.0,2.0,1080.025985,0.012549


### Итоговая табличка для 200к итераций:

| method | regret   |
|------|------|
|   UCB (coeff = $\sqrt{2}$) | 8983.02 | 
|   epsilon-greedy (**baseline**)  | 2792.24 |
|   Thompson Sampling | 1003.63 |
|   UCB (coeff = $0.1$) | 409.32 | 