In [1]:
import numpy as np
import pandas as pd
import time

from functools import partial
from scipy.stats import randint, uniform
from task_2_multiarmed_bandit.sim_lib import simulation

# Turn off pandas' chained-assignment check globally, so SettingWithCopyWarning is
# neither raised nor printed for the rest of the session.
# NOTE(review): presumably needed because of assignments made inside sim_lib — confirm.
pd.options.mode.chained_assignment = None



# Baseline

In [3]:
def eps_greedy(history: pd.DataFrame, eps: float, prior_impressions: float = 10):
    """Choose a banner with the epsilon-greedy rule.

    With probability ``eps`` a banner is drawn uniformly at random
    (exploration). Otherwise the banner with the highest smoothed
    click-through rate ``clicks / (impressions + prior_impressions)`` is
    returned (exploitation).

    Parameters
    ----------
    history : pd.DataFrame
        One row per banner, with ``'clicks'`` and ``'impressions'`` columns.
    eps : float
        Exploration probability.
    prior_impressions : float, default 10
        Pseudo-impressions added to every denominator. The default keeps the
        behaviour of the original hard-coded ``+ 10``. It should be positive
        whenever a banner may have zero impressions, otherwise the rate is 0/0.

    Returns
    -------
    The index label of the selected banner.
    """
    if uniform.rvs() < eps:
        # Exploration: uniform draw over row positions 0 .. len(history) - 1.
        return history.index[randint.rvs(0, history.shape[0])]

    smoothed_ctr = history['clicks'] / (history['impressions'] + prior_impressions)
    # Take argmax on the raw array so the result is always a row position,
    # independent of how the installed pandas implements Series.argmax.
    best_position = np.argmax(smoothed_ctr.values)
    return history.index[best_position]


# Baseline policy: explore on 8% of the rounds.
policy = partial(eps_greedy, eps=0.08)

In [4]:
# seed for homework: fixes the state of NumPy's global RNG before the run
np.random.seed(seed=384758917)

# Time the baseline (eps-greedy) simulation with wall-clock timestamps.
start = time.time()
# NOTE(review): n is presumably the number of impressions to simulate — confirm in sim_lib.
output = simulation(policy, n=200000)
end = time.time()
# Elapsed wall-clock time in seconds (displayed as the cell result).
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


468.60773515701294

In [5]:
# Baseline (eps-greedy) run: total regret, regret per round, and the
# 'total_banners' value reported by the simulation.
(
    output['regret'],
    output['regret'] / output['rounds'],
    output['total_banners'],
)

(2792.237649427154, 0.01396118824713577, 174)

In [6]:
# Show the per-banner table stored under 'history' in the baseline run's result.
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,19843.0,1332.0,10870.812904,0.064972
162,154.0,2.0,18594.827945,0.017514
163,86.0,0.0,5153.010937,0.000849
164,68.0,2.0,5092.571727,0.041281
166,48.0,0.0,5340.55207,0.007253
167,33.0,0.0,1474.181162,0.033849
168,23.0,0.0,4900.260295,0.027273
169,14.0,0.0,5007.022458,0.030857
170,18.0,0.0,8920.324215,0.034653
171,14.0,0.0,1080.025985,0.012549


# Upper confidence bound

In [7]:
def UPC(history: pd.DataFrame, prior_impressions: float = 10, exploration: float = 2):
    """Choose a banner with the UCB1 (upper confidence bound) rule.

    The score of each banner is::

        clicks / pulls + sqrt(exploration * ln(t) / pulls)

    where ``pulls = impressions + prior_impressions`` and ``t`` is a round
    counter kept on the function itself as ``UPC.invocations``.

    The counter starts from the prior: on the first call (counter equal to 0
    or not set yet) it becomes ``len(history) * prior_impressions``, and every
    later call adds one. Set ``UPC.invocations = 0`` before starting a new
    simulation so the counter does not carry over from a previous run.

    Parameters
    ----------
    history : pd.DataFrame
        One row per banner, with ``'clicks'`` and ``'impressions'`` columns.
    prior_impressions : float, default 10
        Pseudo-impressions assumed for every banner (with 0 clicks).
        Must be positive.
    exploration : float, default 2
        Multiplier of ``ln(t)`` inside the confidence radius.

    Returns
    -------
    The index label of the banner with the highest upper bound.

    Raises
    ------
    ValueError
        If ``prior_impressions`` is not positive.
    """
    if prior_impressions <= 0:
        raise ValueError("prior_impressions must be positive")

    # Fall back to 0 when the attribute was never initialised, so the
    # function also works without an external "invocations = 0" cell.
    rounds = getattr(UPC, 'invocations', 0)
    if rounds == 0:
        # Every banner starts with `prior_impressions` impressions and 0 clicks;
        # account for these initial conditions in t.
        rounds = len(history) * prior_impressions
    else:
        rounds += 1
    UPC.invocations = rounds

    pulls = history['impressions'] + prior_impressions
    upper_bound = history['clicks'] / pulls + np.sqrt(exploration * np.log(rounds) / pulls)
    # Take argmax on the raw array so the result is always a row position,
    # independent of how the installed pandas implements Series.argmax.
    return history.index[np.argmax(upper_bound.values)]


policy_upc = UPC
policy_upc.invocations = 0

In [8]:
# Same seed as the baseline run: fixes the state of NumPy's global RNG.
np.random.seed(seed=384758917)
# Reset the round counter kept on the policy function so this run starts
# from its first-call branch.
policy_upc.invocations = 0

# Time the UCB simulation with wall-clock timestamps.
start = time.time()
# NOTE(review): n is presumably the number of impressions to simulate — confirm in sim_lib.
output_upc = simulation(policy_upc, n=200000)
end = time.time()
# Elapsed wall-clock time in seconds (displayed as the cell result).
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


568.4171450138092

In [9]:
# UCB run: total regret, regret per round, and the 'total_banners' value
# reported by the simulation.
(
    output_upc['regret'],
    output_upc['regret'] / output_upc['rounds'],
    output_upc['total_banners'],
)

(9074.17827233135, 0.04537089136165675, 185)

In [10]:
# Show the per-banner table stored under 'history' in the UCB run's result.
output_upc['history']

Unnamed: 0,impressions,clicks,lifetime,p
169,446.0,7.0,12570.660923,0.014629
173,976.0,88.0,2590.948318,0.07785
174,590.0,27.0,1630.594595,0.05994
177,1432.0,168.0,1959.841208,0.109108
178,570.0,24.0,18537.925424,0.055495
181,415.0,3.0,660.380946,0.005876
182,469.0,10.0,3253.772281,0.032968
184,415.0,3.0,3025.997858,0.00822


Regret для UCB получился выше, чем для eps-greedy бандита (≈9074 против ≈2792).

# Thompson sampling с помощью распределения Бернулли

Зададим для каждого баннера одинаковое априорное распределение Beta(1, 10): 1 условный клик и 10 условных показов без клика.

In [12]:
def Bernoulli_TS(history: pd.DataFrame, prior_clicks: float = 1, prior_misses: float = 10):
    """Choose a banner by Thompson sampling with a Beta posterior.

    For every banner a click-through rate is sampled from::

        Beta(clicks + prior_clicks, impressions - clicks + prior_misses)

    and the banner with the largest sample is returned. Samples come from
    NumPy's global RNG, so ``np.random.seed`` makes the choice reproducible.

    Parameters
    ----------
    history : pd.DataFrame
        One row per banner, with ``'clicks'`` and ``'impressions'`` columns.
    prior_clicks : float or array-like, default 1
        Pseudo-clicks added to the first Beta parameter.
    prior_misses : float or array-like, default 10
        Pseudo-impressions without a click added to the second Beta parameter.
        The defaults keep the original hard-coded Beta(1, 10) prior.

    Returns
    -------
    The index label of the banner with the highest sampled rate.
    """
    successes = history['clicks'] + prior_clicks
    failures = history['impressions'] - history['clicks'] + prior_misses
    sampled_ctr = np.random.beta(successes, failures)
    return history.index[np.argmax(sampled_ctr)]


policy_bts = partial(Bernoulli_TS)

In [13]:
# Same seed as the other runs: fixes the state of NumPy's global RNG.
np.random.seed(seed=384758917)

# Time the Thompson sampling simulation with wall-clock timestamps.
start = time.time()
# NOTE(review): n is presumably the number of impressions to simulate — confirm in sim_lib.
output_bts = simulation(policy_bts, n=200000)
end = time.time()
# Elapsed wall-clock time in seconds (displayed as the cell result).
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


441.36949920654297

In [14]:
# Thompson sampling run: total regret, regret per round, and the
# 'total_banners' value reported by the simulation.
(
    output_bts['regret'],
    output_bts['regret'] / output_bts['rounds'],
    output_bts['total_banners'],
)

(1480.3777724072131, 0.0074018888620360655, 206)

In [15]:
# Show the per-banner table stored under 'history' in the Thompson sampling run's result.
output_bts['history']

Unnamed: 0,impressions,clicks,lifetime,p
192,518.0,30.0,6451.363381,0.060716
194,3043.0,254.0,525.284023,0.076855
199,467.0,34.0,7527.644217,0.070508
201,6617.0,601.0,15834.134874,0.096081
203,57.0,0.0,2089.791836,0.011017
204,117.0,5.0,2114.332979,0.045239


Regret получился ниже, чем для UCB и eps-greedy бандитов.