In [1]:
import sys

In [2]:
sys.path.append('D:/Machine Learning/Recsys-course-homework/')

In [3]:
import numpy as np
import pandas as pd
import time

from functools import partial
from scipy.stats import randint, uniform
from task_2_multiarmed_bandit.sim_lib import simulation

pd.options.mode.chained_assignment = None

Baseline выданный в домашней работе.

In [4]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.08)

In [5]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


448.2382836341858

In [6]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(2792.237649427154, 0.01396118824713577, 174)

In [7]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,19843.0,1332.0,10870.812904,0.064972
162,154.0,2.0,18594.827945,0.017514
163,86.0,0.0,5153.010937,0.000849
164,68.0,2.0,5092.571727,0.041281
166,48.0,0.0,5340.55207,0.007253
167,33.0,0.0,1474.181162,0.033849
168,23.0,0.0,4900.260295,0.027273
169,14.0,0.0,5007.022458,0.030857
170,18.0,0.0,8920.324215,0.034653
171,14.0,0.0,1080.025985,0.012549


Давайте посмотрим насколько будет хуже если каждый раз выбирать баннер случайно.

In [8]:
def random(history: pd.DataFrame):

    n = history.shape[0]
    return history.index[randint.rvs(0, n)]

policy = random

In [9]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


368.2881557941437

In [10]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(15835.566358110453, 0.07917783179055227, 175)

In [11]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
163,1929.0,1.0,5510.399303,0.001248
165,1935.0,433.0,2517.806009,0.215128
166,800.0,23.0,3838.509219,0.040386
167,665.0,3.0,2837.780103,0.012479
168,439.0,18.0,12398.343903,0.039489
169,411.0,22.0,13317.457722,0.043771
170,355.0,15.0,5292.910363,0.022217
171,291.0,26.0,7599.212206,0.075305
172,236.0,2.0,6214.746865,0.009648
173,117.0,16.0,2905.281546,0.104239


Разница весьма ощутима.

Реализуем UCB. Установим дефолтное значение CTR = 1 для каждого баннера. Это поможет нам не зациклиться на показе одного баннера в случае если у него будет клик и будут такие баннеры, которые еще не были показаны. Новые баннеры будут появляться с CTR = 1, что обеспечит им несколько гарантированных показов.

In [12]:
def ucb(history: pd.DataFrame, critical_coeff: float):
    
    # установим дефолтное значение ctr равное 1.
    ctr = (history['clicks'] + 1) / (history['impressions'] + 1)
    upper_interval = ctr + critical_coeff*np.sqrt(ctr*(1-ctr)/(history['impressions'] + 1))
    return history.index[np.argmax(upper_interval)]

policy = partial(ucb, critical_coeff=1.96)

In [13]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


587.1272361278534

In [14]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(772.4228763804388, 0.0038621143819021937, 185)

In [15]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
169,43.0,1.0,12570.660923,0.014629
173,27.0,0.0,2590.948318,0.07785
174,112.0,6.0,1630.594595,0.05994
177,3173.0,336.0,1959.841208,0.109108
178,27.0,0.0,18537.925424,0.055495
181,27.0,0.0,660.380946,0.005876
182,26.0,0.0,3253.772281,0.032968
184,25.0,0.0,3025.997858,0.00822


Вполне неплохой результат, но можно ли сделать лучше? Однозначно да, а именно можно попробовать изменять ширину доверительного интервала. Уменьшая коээфициент перед доверительным интервалом мы уменьшаем вероятность показа баннера с низким CTR.

In [16]:
policy = partial(ucb, critical_coeff=1.5)

In [17]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


587.4005143642426

In [18]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(578.86214219008, 0.0028943107109504003, 185)

In [19]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
169,24.0,0.0,12570.660923,0.014629
173,39.0,1.0,2590.948318,0.07785
174,105.0,6.0,1630.594595,0.05994
177,3151.0,340.0,1959.841208,0.109108
178,80.0,4.0,18537.925424,0.055495
181,24.0,0.0,660.380946,0.005876
182,23.0,0.0,3253.772281,0.032968
184,21.0,0.0,3025.997858,0.00822


Стало еще лучше! До какой степени можно наглеть и уменьшать коэффициент? Очевидно, что уменьшать до 0 его нельзя. Если так сделать, то мы полностью потеряем exploration. Есть какой-то оптимальный коэффициент и его нужно подбирать для конкретной задачи.

Попробуем подобрать коэффициент для нашей задачи.

In [20]:
start = time.time()

outputs = []
for coeff in np.linspace(0, 2, 10):
    policy = partial(ucb, critical_coeff=coeff)
    outputs.append(simulation(policy, n=200000))

end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impr

5917.216552734375

In [21]:
regrets = [output['regret'] for output in outputs]
regrets

[707.592228528264,
 1460.7532537053837,
 576.131908401853,
 509.45558439623176,
 569.228033567672,
 624.286446536965,
 798.6905611374374,
 832.8615707097406,
 742.818071696186,
 911.4934588067567]

Явной монотонной зависимости от коэффициента не наблюдается. Возможно это вызвано тем, что порой ctr оптимального баннера не сильно отличается от ctr баннера, который показывается в данный момент. Если ctr нового баннера чуть лучше и выборка выпала так, что нам не повезло и ucb существенно ниже ucb оптимального баннера, то у нас уменьшается вероятность показа более оптимального баннера.

Реализуем формулу UCB из лекции.

In [22]:
def ucb(history: pd.DataFrame):

    ctr = (history['clicks'] + 1) / (history['impressions'] + 1)
    upper_interval = ctr + np.sqrt(2*np.log(history['impressions'].sum()+1)/(history['impressions'] + 1))
    
    return history.index[np.argmax(upper_interval)]

policy = ucb

In [23]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


558.5961089134216

In [24]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(8679.388286689758, 0.04339694143344879, 185)

Получилось лучше чем просто случайно показывать баннеры, но все же хуже чем e-greedy. UCB увеличивает вероятность показа баннеров, которые показываются реже относительно других и из-за этого мы в холостую тратим показы на банеры с низким CTR. CTR принимает значение от 0 до 1 в то время как поправочный коэффициет для верней границы принимает значение от 0 до бесконечности. Чем меньше основание логарифма тем чаще будут показываться баннеры с низким CTR. Отсуда следует, что можно попробовать менять основание логарфма и смотреть как поменяется регрет. Предположительно с увеличением основания будет уменьшаться регрет. Фактически основие логарифма может отвечать за exploration. Чем выше основание тем ниже exploration.

Посчитаем регрет на основании 10

In [25]:
def ucb(history: pd.DataFrame):

    ctr = (history['clicks'] + 1) / (history['impressions'] + 1)
    upper_interval = ctr + np.sqrt(2*np.log10(history['impressions'].sum()+1)/(history['impressions'] + 1))
    
    return history.index[np.argmax(upper_interval)]

policy = ucb

In [26]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


556.4097170829773

In [27]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(6083.475915072266, 0.03041737957536133, 185)

In [28]:
def ucb(history: pd.DataFrame):

    ctr = (history['clicks']) / (history['impressions'] + 1)
    upper_interval = ctr + np.sqrt(2*np.log10(history['impressions'].sum()+1)/(history['impressions'] + 1))
    
    return history.index[np.argmax(upper_interval)]

policy = ucb

In [29]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


536.2837431430817

In [30]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(5968.386699993273, 0.029841933499966368, 185)

Сделаем бандита Томпсона

In [31]:
def ts(history: pd.DataFrame):

    probabilities = np.random.beta(history['clicks'] + 1, history['impressions'] - history['clicks'] + 1)
    
    return history.index[np.argmax(probabilities)]

policy = ts

In [32]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


449.0408968925476

In [33]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1471.230717361168, 0.00735615358680584, 179)

In [34]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
154,156.0,0.0,16595.715595,0.000442
169,3736.0,144.0,11415.767481,0.039232
175,4496.0,596.0,11381.152627,0.137817
178,42.0,1.0,4959.290645,0.030834


Итого:

1) Реализован UCB по самой базовой формуле для доверительного интервала, выставленна оптимальная инициализация, произведен подбор параметра. Подбор не увенчался серьезными успехами.

2) Реализован UBC по формуле из лекции. Данная реализация оказалось хуже чем UBC из п1. в конкретной задаче.

3) Реализован TS бандит. Данная реализация оказалось хуже чем UBC из п1. в конкретной задаче.