In [3]:
import numpy as np
import pandas as pd
import time

from functools import partial
from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None


In [4]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.08)

In [5]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


746.4046931266785

In [6]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(2792.237649427154, 0.01396118824713577, 174)

In [7]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,19843.0,1332.0,10870.812904,0.064972
162,154.0,2.0,18594.827945,0.017514
163,86.0,0.0,5153.010937,0.000849
164,68.0,2.0,5092.571727,0.041281
166,48.0,0.0,5340.55207,0.007253
167,33.0,0.0,1474.181162,0.033849
168,23.0,0.0,4900.260295,0.027273
169,14.0,0.0,5007.022458,0.030857
170,18.0,0.0,8920.324215,0.034653
171,14.0,0.0,1080.025985,0.012549


## Попробуем полиси на основе Upper Bounds

In [51]:
def upper_confidence_bound(history: pd.DataFrame, c: float):
    eps = 1e-7
    Q_t = history['clicks']  / (history['impressions'] + 10)
    A_t = Q_t + (c * np.sqrt(2 * (np.log(len(history) + 1)) / (history['impressions'] + eps)))
    n = np.argmax(A_t)
    return history.index[n]

In [52]:
policy = partial(upper_confidence_bound, c=0.1)
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=10000)
end = time.time()
end - start

1 impressions have been simulated


41.608248233795166

In [55]:
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(8.714946351058789, 0.0008714946351058788, 18)

In [56]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
0,5.0,0.0,7599.571491,0.059342
3,1470.0,190.0,730.054398,0.123087
5,56.0,5.0,9826.394049,0.111589
8,4.0,0.0,2658.518663,0.000177
11,3.0,0.0,1840.920703,0.006756
12,3.0,0.0,3399.839917,0.003828
13,3.0,0.0,11307.15983,0.030773
17,3.0,0.0,918.927311,0.007524


**Подберем параметр C**

In [57]:
Cs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]

In [58]:
for c in Cs:
    print(f'Upper confidence with C: {c}')
    policy = partial(upper_confidence_bound, c=c)
    np.random.seed(seed=384758917)
    output = simulation(policy, n=10000)
    print(output['regret'], output['regret']/output['rounds'],  output['total_banners'])
    print("--------------------")
    

Upper confidence with C: 0.001
1 impressions have been simulated
545.4960488746067 0.05454960488746067 18
--------------------
Upper confidence with C: 0.005
1 impressions have been simulated
546.6945453664349 0.05466945453664349 18
--------------------
Upper confidence with C: 0.01
1 impressions have been simulated
545.5003445609573 0.05455003445609573 18
--------------------
Upper confidence with C: 0.05
1 impressions have been simulated
547.2239978853239 0.05472239978853239 18
--------------------
Upper confidence with C: 0.1
1 impressions have been simulated
8.714946351058789 0.0008714946351058788 18
--------------------
Upper confidence with C: 0.5
1 impressions have been simulated
94.425870144572 0.0094425870144572 18
--------------------


Лучший найденный параметр c - 0.1
Теперь смоделируем 200000 запусков с данным параметром. 

In [59]:
policy = partial(upper_confidence_bound, c=0.1)
output = simulation(policy, n=200000)
print(output['regret'], output['regret']/output['rounds'],  output['total_banners'])
print(output['history'])

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
1074.4735844261265 0.0053723679221306326 188
     impressions  clicks     lifetime         p
161        487.0    11.0  2548.660923  0.014629
170        435.0    16.0  8515.925424  0.055495
180       6186.0   862.0  4158.056921  0.1385

## Попробуем полиси на основе Thompson Sampling

In [40]:
from numpy.random import beta

def thompson_sampling(history: pd.DataFrame):
    total = history['impressions'].values
    positive = history['clicks'].values
    n = beta(history['clicks'] +1, history['impressions'] - history['clicks'] + 1).argmax()
    return history.index[n]


In [41]:
policy = partial(thompson_sampling)
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=10000)
end = time.time()
end - start

1 impressions have been simulated


37.09839701652527

In [42]:
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(94.47049026535649, 0.00944704902653565, 14)

**Теперь смоделируем 200000 запусков Thompson sampling.**

In [60]:
policy = partial(thompson_sampling)
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


737.3813881874084

In [61]:
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1471.230717361168, 0.00735615358680584, 179)

**Краткие выводы**
С помощью Upper Confidence Bound и Thompson Sampling удалось побить бэйслайн эпсилон-жадного алгоритма. 
При этом, Upper Confidence Bound показал regret ~ 1070, 
a Thompson Sampling ~ 1470