In [1]:
import numpy as np
import pandas as pd
import time

from functools import partial
from scipy.stats import randint, uniform, beta
from sim_lib import simulation

pd.options.mode.chained_assignment = None

Epsilon-greedy бандит

In [2]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.08)

In [3]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


518.4103791713715

Регрет для epsilon-greedy бандита, принятый бейзлайном

In [4]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(2792.237649427154, 0.01396118824713577, 174)

In [5]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,19843.0,1332.0,10870.812904,0.064972
162,154.0,2.0,18594.827945,0.017514
163,86.0,0.0,5153.010937,0.000849
164,68.0,2.0,5092.571727,0.041281
166,48.0,0.0,5340.55207,0.007253
167,33.0,0.0,1474.181162,0.033849
168,23.0,0.0,4900.260295,0.027273
169,14.0,0.0,5007.022458,0.030857
170,18.0,0.0,8920.324215,0.034653
171,14.0,0.0,1080.025985,0.012549


Сначала я решил реализовать UCB-бандита, правда с константой 2 под корнем он давал результат намного хуже бейзлайна, поэтому я попробовал сделать её параметром, и, уменьшив его, побил бейзлайн.

In [6]:
def ucb(history: pd.DataFrame, c: int) -> int:
    ctr = history['clicks'] / (history['impressions'] + 10) + np.sqrt(c * np.log(np.sum(history['impressions']) + 10) / (history['impressions'] + 10))
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(ucb, c=0.1)

In [7]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


665.0214273929596

In [8]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1390.3899944198477, 0.006951949972099239, 185)

In [9]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
169,74.0,2.0,12570.660923,0.014629
173,309.0,23.0,2590.948318,0.07785
174,74.0,2.0,1630.594595,0.05994
177,2539.0,267.0,1959.841208,0.109108
178,168.0,10.0,18537.925424,0.055495
181,45.0,0.0,660.380946,0.005876
182,148.0,8.0,3253.772281,0.032968
184,60.0,1.0,3025.997858,0.00822


Регрет получился ниже бейзлайна, при этом в доживших до конца баннерах видно, что перевес impressions в пользу баннеров с большим p не такой радикальный, как в бейзлайновом решении, где мы почти всегда показывали один и тот же баннер.

Из-за сильного изменения константы под корнем, а значит, значительного уменьшения эксплорации в пользу эксплуатации, я решил реализовать также Thompson sampling. За начальное распределение я принял Beta(1, 1) у каждой ручки

In [10]:
def thompson(history: pd.DataFrame) -> int:
    ctr = []
    for i, c in history[['impressions', 'clicks']].values:
        distr = beta(c + 1, i - c + 1)
        ctr.append(distr.rvs())
    n = np.argmax(ctr)
    return history.index[n]

In [11]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(thompson, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


1841.9364120960236

In [12]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1471.230717361168, 0.00735615358680584, 179)

In [13]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
154,156.0,0.0,16595.715595,0.000442
169,3736.0,144.0,11415.767481,0.039232
175,4496.0,596.0,11381.152627,0.137817
178,42.0,1.0,4959.290645,0.030834


Регрет получился ниже бейзлайна, при этом в оставшихся на конец симуляции баннерах видно, что показ 169 и 175 баннеров был практически одинаково часто, хотя у 175-го р гораздо выше.