In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to local library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
from optuna.exceptions import ExperimentalWarning
# Silence noisy warning categories so the notebook output stays readable.
for ignored_category in (UserWarning, ExperimentalWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

In [None]:
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from replay.session_handler import get_spark_session, State

from replay.models import RandomRec
from replay.obp_evaluation.replay_offline import RePlayOfflinePolicyLearner
from replay.obp_evaluation.utils import bandit_subset, get_est_rewards_by_reg

import obp
from obp.dataset import OpenBanditDataset
from obp.policy import IPWLearner
from obp.ope import (
    OffPolicyEvaluation,
    DirectMethod,
    InverseProbabilityWeighting,
    DoublyRobust
)

In [None]:
# Create a Spark session and hand it to RePlay's State wrapper
# (presumably State keeps it as the session shared across RePlay -- confirm in RePlay docs).
spark_session = get_spark_session()
spark = State(spark_session).session

# Only show Spark log messages at ERROR level.
spark.sparkContext.setLogLevel("ERROR")

23/08/18 16:37:06 WARN Utils: Your hostname, hdilab01-X299-UD4-Pro resolves to a loopback address: 127.0.1.1; using 172.21.136.90 instead (on interface enp0s31f6)
23/08/18 16:37:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/08/18 16:37:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
23/08/18 16:37:07 WARN DependencyUtils: Local jar /home/hdilab01/hdiRecSys/obp_connector/notebooks/jars/replay_2.12-0.1_spark_3.1.jar does not exist, skipping.
23/08/18 16:37:07 INFO SparkContext: Running Spark version 3.1.3
23/08/18 16:37:07 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/08/18 16:37:07 INFO ResourceUtils: No custom resources configured for spark.driver.
23/08/18 16:37:07

Let's define an `OpenBanditDataset` collected under the random behavior policy. For the purpose of demonstration we won't use the whole dataset, but only a subset of size 10000.

In [None]:
# Path to the Open Bandit Dataset; when left as None, OBP downloads its small-sized version.
data_path = None
behavior_policy = "random"

# Build the OBP dataset, then split it into train and test parts
dataset = OpenBanditDataset(
    behavior_policy=behavior_policy,
    data_path=data_path,
    campaign="all",
)
bandit_feedback_train, bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
    test_size=0.3,
    is_timeseries_split=True,
)

# Report how many rounds ended up in each split (train first, then test)
for feedback_split in (bandit_feedback_train, bandit_feedback_test):
    print(feedback_split["n_rounds"])

INFO:obp.dataset.real:When `data_path` is not given, this class downloads the small-sized version of Open Bandit Dataset.


7000
3000


In [None]:
# Show which fields the logged bandit feedback dictionary provides.
bandit_feedback_train.keys()

dict_keys(['n_rounds', 'n_actions', 'action', 'position', 'reward', 'pscore', 'context', 'action_context'])

The keys of the dictionary are as follows.
- n_rounds: number of rounds, data size of the logged bandit data;
- n_actions: number of actions $|\mathcal{A}|$;
- action: action variables sampled by the behavior policy;
- position: positions where actions are recommended, there are three positions in the ZOZOTOWN rec interface;
- reward: binary reward variables, click indicators;
- pscore: action choice probabilities by the behavior policy, propensity scores;
- context: context vectors such as user-related features and user-item affinity scores;
- action_context: item-related context vectors

In [None]:
# RePlay recommender to be evaluated (fixed seed)
model = RandomRec(seed=42)

# Learner that connects the OBP data format with the RePlay model
learner = RePlayOfflinePolicyLearner(
    replay_model=model,
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,  # len_list is the number of predicted items per user
)

**RePlayOfflinePolicyLearner** has the following methods
- *fit(action, reward, timestamp, context, action_context)*;
- *predict(n_rounds, context)* (`context` can be None, so `n_rounds` is **required**);
- *optimize(bandit_feedback, val_size, param_borders, criterion, budget, new_study)*

In [None]:
# Search borders for the parameters that will be optimized
param_borders = dict(
    distribution=["popular_based", "relevance"],
    alpha=[-0.5, 1000],
)

# Take a subset of the train data to validate the model with OBP;
# the first argument of bandit_subset is the slice [a, b] of the subset.
subset_bounds = [0, 7000]
bandit_feedback_subset = bandit_subset(subset_bounds, bandit_feedback_train)

# Run the parameter search and show whatever optimize() returns
optimize_result = learner.optimize(
    bandit_feedback_subset,
    val_size=0.3,
    param_borders=param_borders,
    budget=15,
)
print(optimize_result)

[32m[I 2023-08-18 16:37:08,919][0m A new study created in memory with name: no-name-1018a8ca-3204-4d3e-a432-dabd14af6f56[0m
[32m[I 2023-08-18 16:37:15,599][0m Trial 0 finished with value: 0.004010039096860206 and parameters: {'distribution': 'relevance', 'alpha': 169.64919642896555}. Best is trial 0 with value: 0.004010039096860206.[0m
[32m[I 2023-08-18 16:37:17,655][0m Trial 1 finished with value: 0.004010039096860206 and parameters: {'distribution': 'relevance', 'alpha': 583.812879578577}. Best is trial 0 with value: 0.004010039096860206.[0m
[32m[I 2023-08-18 16:37:19,417][0m Trial 2 finished with value: 0.004010039096860206 and parameters: {'distribution': 'relevance', 'alpha': 681.236753135904}. Best is trial 0 with value: 0.004010039096860206.[0m
[32m[I 2023-08-18 16:37:21,216][0m Trial 3 finished with value: 0.004010039096860206 and parameters: {'distribution': 'relevance', 'alpha': 786.2407660894739}. Best is trial 0 with value: 0.004010039096860206.[0m
[32m[I 20

{'distribution': 'popular_based', 'alpha': 993.1865302583232}


In [None]:
# Train the RePlay model wrapped by the learner on the logged training feedback
fit_inputs = {
    "action": bandit_feedback_train["action"],
    "reward": bandit_feedback_train["reward"],
    # the round index 0..n_rounds-1 is passed as the timestamp
    "timestamp": np.arange(bandit_feedback_train["n_rounds"]),
    "context": bandit_feedback_train["context"],
    "action_context": bandit_feedback_train["action_context"],
}
learner.fit(**fit_inputs)

# Distribution over actions for the test rounds: shape (n_rounds, n_actions, len_list)
n_test_rounds = bandit_feedback_test["n_rounds"]
action_dist = learner.predict(n_test_rounds, bandit_feedback_test["context"])

print(action_dist.shape)

(3000, 80, 3)


Once we have the distribution over actions, we can run any evaluation procedure from OBP. Here we use three estimators:
- *IPW*: Average rewards with importance weights
- *DM*: Average predicted rewards using the classifier
- *DR*: Combination of the above methods; it stays consistent if either the propensity scores or the reward model is accurate, and typically has lower variance than IPW

Also, we can construct confidence intervals for each of these methods.

In [None]:
# Evaluate on the held-out feedback with the IPW, DM and DR estimators
ope_estimators = [InverseProbabilityWeighting(), DirectMethod(), DoublyRobust()]
ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback_test,
    ope_estimators=ope_estimators,
)

# Reward estimates passed to the estimators below
# (presumably produced by a regression model, judging by the helper's name -- confirm).
estimated_rewards_by_reg_model = get_est_rewards_by_reg(
    dataset.n_actions,
    dataset.len_list,
    bandit_feedback_train,
    bandit_feedback_test,
)

# Both estimation calls receive the same policy output and reward estimates
estimator_inputs = {
    "action_dist": action_dist,
    "estimated_rewards_by_reg_model": estimated_rewards_by_reg_model,
}

estimated_policy_value = ope.estimate_policy_values(**estimator_inputs)

# Bootstrap confidence intervals with a fixed random state
estimated_ci = ope.estimate_intervals(
    **estimator_inputs,
    n_bootstrap_samples=10000,
    random_state=12345,
)

In [None]:
# Label the report with the class name of the model that was actually evaluated,
# so the heading cannot drift from the model defined earlier in the notebook.
model_name = type(model).__name__

# Estimated policy values are multiplied by 1e3 purely to make the small numbers readable.
score_parts = [
    f"{key} : {(1e3 * val):.3f}"
    for key, val in estimated_policy_value.items()
]

# Join with ", " instead of appending commas and slicing off the last character:
# slicing would truncate the label itself if no estimator values were present.
out_str = f"Scores for {model_name}"
if score_parts:
    out_str += " " + ", ".join(score_parts)

print(out_str)
print("Estimated confidence intervals:")
print(pd.DataFrame(estimated_ci).to_string())

Scores for LinUCB ipw : 3.345, dm : 3.952, dr : 3.370
Estimated confidence intervals:
                       ipw        dm        dr
mean              0.003338  0.003952  0.003363
95.0% CI (lower)  0.001343  0.003896  0.001398
95.0% CI (upper)  0.005661  0.004008  0.005622
