# Netflixも使っている！Contextual Banditアルゴリズムを徹底解説！(Part 2)

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge as RR, LogisticRegression as LR
from sklearn.ensemble import GradientBoostingRegressor as GBR, GradientBoostingClassifier as GBC

from pandas import DataFrame
from plotly.offline import iplot, plot

In [2]:
from pymab.bandit import BernoulliBandit, GaussianBandit
from pymab.evaluation import DataGenerator, ReplayMethod, DirectMethod, IPSEstimator, DREstimator, MRDREstimator
from pymab.policy import EpsilonGreedy, UCB1, ThompsonSampling, GaussianThompsonSampling, LinUCB, LinTS
from pymab.sim import BanditSimulator

## Offline Evaluation用のデータ生成

In [34]:
# Problem dimensions reused by the bandit and by every policy in this notebook.
n_arms = 10
n_features = 15

# Contextual Gaussian bandit that produces the rewards for the logged data.
gb = GaussianBandit(
    n_arms=n_arms,
    n_features=n_features,
    noise=0.1,
    contextual=True,
)

# Generator for the offline-evaluation data set: an epsilon-greedy policy
# interacts with `gb` for 50000 rounds.
# NOTE(review): the exact meaning of `randomized=True` is defined by pymab —
# confirm against the library before relying on it.
dg = DataGenerator(
    policy=EpsilonGreedy(n_arms=n_arms, epsilon=0.2),
    bandit=gb,
    n_rounds=50000,
    randomized=True,
)

In [35]:
# Display the bandit's parameter array (left as the cell's last expression so it is rendered).
gb.params

array([[-0.39244344,  2.06788533,  0.3809838 ,  0.51071165, -0.31969997,
         2.8494498 , -0.63480366, -0.07718706,  0.57521326,  0.59200947],
       [ 1.38578514,  0.34390889,  0.57038567, -1.50438295,  1.67235906,
         1.12670131,  0.65444776, -0.66996064,  0.03958925,  0.19293391],
       [ 0.27837618, -0.5350718 ,  0.03919948,  0.66230424, -0.03983438,
        -0.56784731, -0.24027589, -1.24024157, -1.17741276, -0.80745904],
       [-0.80026478,  0.0618781 , -0.32618562, -0.67094959, -0.53313986,
         0.70677535,  0.65771951,  0.00840465, -0.65296488, -0.97961696],
       [ 0.11209069, -0.67819596,  0.15414815,  0.80996131, -0.5280649 ,
        -0.64160139, -1.04060848, -0.37328818, -1.18196713,  0.03658343],
       [ 1.02071406,  1.29538814,  0.37439648,  0.6095346 , -0.18668084,
         0.80895926, -2.1393028 , -1.40652994,  1.14244055,  1.54986767],
       [-1.03378877,  0.15389153,  0.97722771,  3.25691291,  0.25013692,
        -0.83088654,  0.33189278,  0.78766855

In [36]:
# Generate the data set used by every offline estimator below.
# The three values are later passed as X=data, a=a, r=r to the estimators'
# fit(); presumably contexts, chosen arms and observed rewards — TODO confirm.
data, a, r = dg.generate_data()

## Policy Candidates

In [37]:
# Candidate policies to be compared. The list order is kept stable because the
# result tables list the policies in this same order.
pols = [
    # Policies constructed without n_features.
    EpsilonGreedy(n_arms=n_arms, epsilon=0.2, batch_size=500),
    UCB1(n_arms=n_arms, batch_size=500),
    # NOTE(review): this is the only policy without batch_size, and it is the
    # plain ThompsonSampling even though the bandit is Gaussian and
    # GaussianThompsonSampling is imported — confirm this is intentional.
    ThompsonSampling(n_arms=n_arms),
    # Linear policies constructed with n_features.
    LinUCB(n_arms=n_arms, n_features=n_features, alpha=1, batch_size=500),
    LinTS(
        n_arms=n_arms,
        n_features=n_features,
        sigma=1,
        sample_batch=20,
        batch_size=500,
    ),
]

## Replay Method

In [38]:
# Replay Method: estimate each candidate policy's performance from the generated data.
# n_iter=3, and the recorded describe() table shows count=3 per policy column.
rm = ReplayMethod(policy_list=pols, n_iter=3)
result_rm = rm.estimate(data, a, r)
# Summary statistics of the estimates, one column per policy.
result_rm.describe()

Unnamed: 0,EpsilonGreedy(ε=0.2),UCB1,ThompsonSampling,LinUCB(α=1),LinTS(σ=1)
count,3.0,3.0,3.0,3.0,3.0
mean,1.675525,1.039165,-0.057573,3.428132,3.594647
std,0.071208,0.203444,0.043122,0.02315,0.022957
min,1.622532,0.857191,-0.103672,3.403071,3.57294
25%,1.635053,0.929341,-0.077247,3.417838,3.582632
50%,1.647575,1.001492,-0.050822,3.432606,3.592324
75%,1.702021,1.130152,-0.034523,3.440662,3.605501
max,1.756468,1.258813,-0.018225,3.448718,3.618678


## Direct Method

In [39]:
# Direct Method: fit a gradient-boosting regressor on the generated data, then
# estimate each candidate policy with it (n_iter=3 repetitions).
# `pom` is presumably the outcome/reward model — TODO confirm against pymab.
dm = DirectMethod(policy_list=pols, n_iter=3, regression=True)
dm.fit(
    pom=GBR(n_estimators=100, max_depth=5, learning_rate=0.3),
    X=data,
    a=a,
    r=r,
)
result_dm = dm.estimate(data, a, r)

# NOTE(review): the recorded means for this cell are close to zero for every
# policy, while the Replay and DR cells report about 3.4-3.6 for LinUCB/LinTS —
# verify the reward-model fit before trusting these estimates.
result_dm.describe()

Unnamed: 0,EpsilonGreedy(ε=0.2),UCB1,ThompsonSampling,LinUCB(α=1),LinTS(σ=1)
count,3.0,3.0,3.0,3.0,3.0
mean,-0.194336,0.235216,-0.099462,-0.134459,0.224009
std,0.183403,0.304989,0.25137,0.584115,0.397452
min,-0.33042,-0.046851,-0.331062,-0.802894,-0.045822
25%,-0.29862,0.073393,-0.233122,-0.340575,-0.004199
50%,-0.26682,0.193637,-0.135182,0.121744,0.037423
75%,-0.126294,0.376249,0.016337,0.199759,0.358924
max,0.014232,0.558861,0.167856,0.277773,0.680424


## IPS Estimator

In [40]:
# IPS estimator: fit a gradient-boosting classifier on (X, a), then estimate
# each candidate policy (n_iter=3 repetitions).
# `pse` is presumably the propensity-score estimator — TODO confirm against pymab.
ips = IPSEstimator(policy_list=pols, n_iter=3)
ips.fit(pse=GBC(n_estimators=100, max_depth=5, learning_rate=0.3), 
        X=data, a=a)
result_ips = ips.estimate(data, a, r)
# NOTE(review): the recorded means for LinUCB/LinTS are about 32-34 here,
# roughly 10x the Replay and DR estimates (about 3.4-3.6) — check the
# weighting/normalization before comparing across estimators.
result_ips.describe()

Unnamed: 0,EpsilonGreedy(ε=0.2),UCB1,ThompsonSampling,LinUCB(α=1),LinTS(σ=1)
count,3.0,3.0,3.0,3.0,3.0
mean,12.375572,11.541715,-0.338223,32.181894,34.177062
std,7.474416,1.487769,0.191259,0.274219,0.246163
min,3.749416,10.51032,-0.510668,31.880733,33.93812
25%,10.097608,10.688966,-0.441077,32.064257,34.050662
50%,16.445801,10.867612,-0.371487,32.247781,34.163205
75%,16.68865,12.057412,-0.252,32.332474,34.296533
max,16.931498,13.247213,-0.132513,32.417167,34.429861


## DR Estimator

In [None]:
# DR estimator: fitted with both a gradient-boosting regressor (`pom`) and a
# gradient-boosting classifier (`pse`), then used to estimate each candidate
# policy (n_iter=3 repetitions).
# `pom`/`pse` are presumably the reward model and the propensity model — TODO confirm.
dr = DREstimator(policy_list=pols, n_iter=3, regression=True)
dr.fit(pom=GBR(n_estimators=100, max_depth=5, learning_rate=0.3), 
       pse=GBC(n_estimators=100, max_depth=5, learning_rate=0.3), 
       X=data, a=a, r=r)
result_dr = dr.estimate(data, a, r)
# Summary statistics of the estimates, one column per policy.
result_dr.describe()

Unnamed: 0,EpsilonGreedy(ε=0.2),UCB1,ThompsonSampling,LinUCB(α=1),LinTS(σ=1)
count,3.0,3.0,3.0,3.0,3.0
mean,1.444595,1.207335,-0.028024,3.342227,3.586722
std,0.512807,0.185961,0.003531,0.114393,0.009849
min,0.852663,1.045373,-0.030172,3.21871,3.57913
25%,1.289843,1.105796,-0.030061,3.291078,3.581158
50%,1.727024,1.166219,-0.029951,3.363447,3.583185
75%,1.740561,1.288316,-0.02695,3.403985,3.590518
max,1.754099,1.410413,-0.023949,3.444524,3.597851


## MRDR Estimator

In [None]:
# MRDR estimator: same inputs as the DR cell (regressor as `pom`, classifier as
# `pse`), then used to estimate each candidate policy (n_iter=3 repetitions).
# NOTE(review): no output is recorded for this cell in the saved notebook —
# re-run it before drawing conclusions from the MRDR estimator.
mrdr = MRDREstimator(policy_list=pols, n_iter=3, regression=True)
mrdr.fit(pom=GBR(n_estimators=100, max_depth=5, learning_rate=0.3), 
         pse=GBC(n_estimators=100, max_depth=5, learning_rate=0.3), 
         X=data, a=a, r=r)
result_mrdr = mrdr.estimate(data, a, r)
# Summary statistics of the estimates, one column per policy.
result_mrdr.describe()

## Online

In [28]:
# Second bandit for the online section, constructed with the same arguments as `gb`.
gb_online = GaussianBandit(n_arms=n_arms, n_features=n_features, noise=0.1, contextual=True)
# Give it the parameters of `gb`, presumably so the online run faces the same
# environment as the offline data — TODO confirm.
# NOTE(review): this assigns the same object rather than a copy, so both
# bandits share one parameter array.
gb_online.params = gb.params

In [29]:
# Online simulation of the candidate policies: 5 simulations of 10000 rounds each.
# The simulator is given `gb_online` — the bandit built for this section with
# the parameters of `gb` — rather than `gb` itself, which was already used to
# generate the offline data set.
# NOTE(review): the same `pols` instances were passed to the offline
# estimators above; whether they are reset between uses depends on pymab —
# confirm.
bs = BanditSimulator(
    policy_list=pols,
    bandit=gb_online,
    num_sims=5,
    n_rounds=10000,
    contextual=True,
)

In [30]:
# Run the online simulation; the recorded output prints the average elapsed time per policy.
bs.run_sim()

Avg Elapsed Time(10000 iter) EpsilonGreedy(ε=0.2) : 0.124s
Avg Elapsed Time(10000 iter) UCB1 : 0.23s
Avg Elapsed Time(10000 iter) ThompsonSampling : 0.22s
Avg Elapsed Time(10000 iter) LinUCB(α=1) : 0.524s
Avg Elapsed Time(10000 iter) LinTS(σ=1) : 0.471s


In [31]:
# Collect the three figures returned by the simulator; they are rendered with iplot afterwards.
rewards_plot, regret_plot, bingo_plot = bs.plots()

In [32]:
# Render the three simulation figures inline, in the same order as before:
# rewards, regret, then the "bingo" plot.
for _figure in (rewards_plot, regret_plot, bingo_plot):
    iplot(_figure)