In [1]:
import numpy as np
import pandas as pd
import time

from functools import partial
from scipy.stats import randint, uniform
from sim_lib import simulation

pd.options.mode.chained_assignment = None

In [2]:
def eps_greedy(history: pd.DataFrame, eps: float):
    if uniform.rvs() < eps:
        n = history.shape[0]
        return history.index[randint.rvs(0, n)]

    ctr = history['clicks'] / (history['impressions'] + 10)
    n = np.argmax(ctr)
    return history.index[n]

policy = partial(eps_greedy, eps=0.08)

In [3]:
# seed for homework
np.random.seed(seed=384758917)

start = time.time()
output = simulation(policy, n=200000)
end = time.time()
end - start

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated


556.1680109500885

In [4]:
# baseline regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(2792.237649427154, 0.01396118824713577, 174)

In [5]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
132,19843.0,1332.0,10870.812904,0.064972
162,154.0,2.0,18594.827945,0.017514
163,86.0,0.0,5153.010937,0.000849
164,68.0,2.0,5092.571727,0.041281
166,48.0,0.0,5340.55207,0.007253
167,33.0,0.0,1474.181162,0.033849
168,23.0,0.0,4900.260295,0.027273
169,14.0,0.0,5007.022458,0.030857
170,18.0,0.0,8920.324215,0.034653
171,14.0,0.0,1080.025985,0.012549


# UCB policy exploration

In [6]:
def ucb1(history: pd.DataFrame) -> int:
    N = history.shape[0]
    ctr_quantile = history['clicks']/(history['impressions']+10) + \
    np.sqrt(
        2*np.log(N) / (history['impressions'] + 10)
    )
    n = np.argmax(ctr_quantile)
    return history.index[n]

In [7]:
policy = ucb1

In [8]:
%%time

output = simulation(policy, n=200000)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
CPU times: user 10min 39s, sys: 1.12 s, total: 10min 40s
Wall time: 10min 38s


In [9]:
# UCB1 regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(4892.203243302422, 0.024461016216512112, 208)

In [10]:
output['history']

Unnamed: 0,impressions,clicks,lifetime,p
152,963.0,56.0,3542.628628,0.060567
170,1659.0,120.0,8646.852102,0.074247
193,325.0,7.0,22108.414021,0.014121
200,207.0,2.0,2271.421589,0.013661
201,195.0,1.0,16738.531244,0.006106
202,6780.0,884.0,3972.106269,0.128287
203,174.0,0.0,14699.823825,0.002851
204,569.0,39.0,17806.294214,0.064651
207,362.0,17.0,1155.749592,0.047443


## Tune simple UCB

In [11]:
def ucb_tuned(history: pd.DataFrame, alpha: float) -> int:
    N = history.shape[0]
    ctr_quantile = history['clicks'] / (history['impressions']+10) + \
    alpha*np.sqrt(
        2*np.log(N) / (history['impressions']+10)
    )
    n = np.argmax(ctr_quantile)
    return history.index[n]

In [12]:
%%time

N = 1000*10
for alpha in np.logspace(-2, 2, 5):
    policy = partial(ucb_tuned, alpha=alpha)
    output = simulation(policy, n=N)
    print(f'alpha={alpha}, regret={output["regret"]}')

1 impressions have been simulated
alpha=0.01, regret=419.9766236552938
1 impressions have been simulated
alpha=0.1, regret=126.81829984154231
1 impressions have been simulated
alpha=1.0, regret=183.8343509475723
1 impressions have been simulated
alpha=10.0, regret=204.71019120767733
1 impressions have been simulated
alpha=100.0, regret=507.3740699718395
CPU times: user 2min 58s, sys: 281 ms, total: 2min 58s
Wall time: 2min 57s


In [13]:
%%time

# compute the regret for the long run
policy = partial(ucb_tuned, alpha=0.1)
output = simulation(policy, n=200000)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated
140001 impressions have been simulated
150001 impressions have been simulated
160001 impressions have been simulated
170001 impressions have been simulated
180001 impressions have been simulated
190001 impressions have been simulated
CPU times: user 11min 24s, sys: 1.45 s, total: 11min 26s
Wall time: 11min 23s


In [14]:
# tuned UCB1 regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

(5270.415355859229, 0.026352076779296144, 182)

## Tune Bernoulli UCB

In [16]:
def ucb_tuned_bernoulli(history: pd.DataFrame, alpha: float) -> int:
    N = history.shape[0]
    theta = history['clicks'] / (history['impressions'] + 10)
    sigma = (history['clicks'] - theta)**2 / (history['impressions'] + 10)
    a = theta + np.sqrt(
        np.log(N) / history['impressions'] * np.min(
            np.array(
                [
                    np.ones(history.shape[0])*0.25,
                    sigma + 2*np.log(N) / (history['impressions'] + 10),
                ],
            ),
            axis=0
    ))
    n = np.argmax(a)
    return history.index[n]

In [17]:
N = 1000*5
for alpha in np.logspace(-2, 2, 5):
    policy = partial(ucb_tuned_bernoulli, alpha=alpha)
    output = simulation(policy, n=N)
    print(f'alpha={alpha}, regret={output["regret"]}')

1 impressions have been simulated
alpha=0.01, regret=305.80696211189957
1 impressions have been simulated
alpha=0.1, regret=47.51375230365192
1 impressions have been simulated
alpha=1.0, regret=41.664524623376984
1 impressions have been simulated
alpha=10.0, regret=62.04486736080505
1 impressions have been simulated
alpha=100.0, regret=38.76944451081966


In [None]:
%%time

# compute the regret for the long run
policy = partial(ucb_tuned, alpha=10.)
output = simulation(policy, n=200000)

1 impressions have been simulated
10001 impressions have been simulated
20001 impressions have been simulated
30001 impressions have been simulated
40001 impressions have been simulated
50001 impressions have been simulated
60001 impressions have been simulated
70001 impressions have been simulated
80001 impressions have been simulated
90001 impressions have been simulated
100001 impressions have been simulated
110001 impressions have been simulated
120001 impressions have been simulated
130001 impressions have been simulated


In [None]:
# tuned UCB1 bernoulli regret
output['regret'], output['regret']/output['rounds'],  output['total_banners']

tuned UCB1 regret: 5270

e-greedy baseline: 2792

:(