In [1]:
import numpy as np
import pandas as pd
import time

from functools import partial
from scipy.stats import randint, uniform
from task_2_multiarmed_bandit.sim_lib import simulation

pd.options.mode.chained_assignment = None


In [2]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.08)

In [3]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


319.86749291419983

In [4]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(2792.237649427154, 0.01396118824713577, 174)

In [5]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,19843.0,1332.0,10870.812904,0.064972
162,154.0,2.0,18594.827945,0.017514
163,86.0,0.0,5153.010937,0.000849
164,68.0,2.0,5092.571727,0.041281
166,48.0,0.0,5340.55207,0.007253
167,33.0,0.0,1474.181162,0.033849
168,23.0,0.0,4900.260295,0.027273
169,14.0,0.0,5007.022458,0.030857
170,18.0,0.0,8920.324215,0.034653
171,14.0,0.0,1080.025985,0.012549


In [9]:
def my_thompson_policy(history: pd.DataFrame):
    def random_beta(arg1, arg2):
        return np.random.beta(arg1 + 1, arg2 - arg1 + 1)

    return history.index[np.argmax(random_beta(history.clicks, history.impressions))]

policy = my_thompson_policy

In [10]:
start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


359.4777030944824

In [11]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(1491.955086882202, 0.007459775434411011, 185)

In [12]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
174,784.0,47.0,40510.78316,0.049383
176,73.0,0.0,13436.226969,0.007151
177,101.0,1.0,18777.102075,0.016482
178,2589.0,193.0,46712.715488,0.070027
179,91.0,1.0,15108.962948,0.01454
181,61.0,7.0,9344.706171,0.041464
182,1100.0,306.0,6646.235163,0.270187
183,62.0,9.0,18373.862192,0.155934
184,33.0,4.0,27434.001243,0.099512


Реализована Thompson sampling policy, прогнана через симуляцию 200000 событий, сразу же без дополнительного тюнинга получилось побить бейзлайн – .