# Dataset Loader

The first part of the Open Bandit Pipeline (OBP) is the dataset loader. For the Open Bandit Dataset (OBD), the loader is `obp.dataset.OpenBanditDataset` ([docs](https://zr-obp.readthedocs.io/en/latest/_autosummary/obp.dataset.real.html#obp.dataset.real.OpenBanditDataset)).

As with many classes in the OBP, the dataset modules are implemented with [dataclasses](https://docs.python.org/3.7/library/dataclasses.html).

The dataset module inherits from `obp.dataset.base.BaseRealBanditDataset` ([docs](https://zr-obp.readthedocs.io/en/latest/_autosummary/obp.dataset.base.html#module-obp.dataset.base)) and should implement three methods:
- `load_raw_data()`: Load an on-disk representation of the dataset into the module. Used during initialization.
- `pre_process()`: Perform any preprocessing needed to transform the raw data representation into a final representation.
- `obtain_batch_bandit_feedback()`: Return a dictionary containing (at least) keys: `["action","position","reward","pscore","context","n_rounds"]`

It is also helpful if the dataset module exposes a property `len_list`, which is how many items the bandit shows the user at a time. Often the answer is 1, though in the case of OBD it's 3.

In [None]:
import sys 
sys.path.append('..')

import os
import obp
from src.data.obp_dataset import MovieLensDataset

In [None]:
# Locations of the raw data directory and of the embedding-network weights.
data_dir = os.path.join(os.getcwd(), "../data/")
embedding_weights_file = "../model/pmf/emb_50_ratio_0.800000_bs_1000_e_258_wd_0.100000_lr_0.000100_trained_pmf.pt"

# Build the dataset wrapper used by every later cell.
# NOTE(review): 943 / 1682 presumably match the user and item counts of the
# dataset on disk, and embedding_dim presumably matches the "emb_50" weights
# file -- confirm against MovieLensDataset.
dataset = MovieLensDataset(
    data_path=data_dir,
    embedding_network_weights_path=embedding_weights_file,
    embedding_dim=50,
    users_num=943,
    items_num=1682,
    state_size=5,
)

In [None]:
# Fetch the logged bandit feedback and list the Python type stored under
# each of its keys.
bandit_feedback = dataset.obtain_batch_bandit_feedback()
print("feedback dict:")
for field in bandit_feedback:
    field_type = type(bandit_feedback[field])
    print(f"  {field}: {field_type}")

In [None]:
# Mean of the logged rewards, rounded to 4 decimal places. Later cells draw
# this value as a constant reference line in the plots.
mean_logged_reward = bandit_feedback["reward"].mean()
exp_rand_reward = round(mean_logged_reward, 4)
print(f"Expected reward for uniform random actions: {exp_rand_reward}")

# Off-Policy Evaluation (OPE)

The next step is OPE, which attempts to estimate the performance of online bandit algorithms using the logged bandit feedback and the Replay Method (RM).

In [None]:
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression

import obp
from obp.ope import (
    OffPolicyEvaluation, 
    RegressionModel,
    InverseProbabilityWeighting as IPS,
    SelfNormalizedInverseProbabilityWeighting as SNIPS,
    DirectMethod as DM,
    DoublyRobust as DR,
    DoublyRobustWithShrinkage as DRos,
)

from src.model.simulator import run_bandit_simulation
from src.model.bandit import EpsilonGreedy, LinUCB, WFairLinUCB, FairLinUCB

In [None]:
# Number of item groups; groups are keyed 1..n_groups and each one starts
# with the same fairness weight of 1.0.
n_groups = 10
fairness_weight = dict.fromkeys(range(1, n_groups + 1), 1.0)

# Load the pickled object that is later passed to the policies as
# `item_group`.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# trusted, locally generated data such as this one.
with open("../data/ml-100k/movies_groups.pkl", "rb") as pkl_file:
    movies_groups = pickle.load(pkl_file)

In [None]:
# Configure the epsilon-greedy policy with the shared group settings.
eg_policy_kwargs = dict(
    n_actions=dataset.n_actions,
    epsilon=0.1,
    n_group=n_groups,
    item_group=movies_groups,
    fairness_weight=fairness_weight,
)
epsilon_greedy = EpsilonGreedy(**eg_policy_kwargs)

# Replay the logged feedback through the policy for 5 epochs, then split the
# six returned values into individually named results.
eg_simulation = run_bandit_simulation(
    bandit_feedback=bandit_feedback,
    policy=epsilon_greedy,
    epochs=5,
)
(
    eg_action_dist,
    eg_aligned_cvr,
    eg_cvr,
    eg_propfair,
    eg_ufg,
    eg_group_count,
) = eg_simulation

In [None]:
# Configure the FairLinUCB policy; besides the shared group settings it needs
# the context dimensionality and a batch size.
linucb_policy_kwargs = dict(
    dim=dataset.dim_context,
    n_actions=dataset.n_actions,
    epsilon=0.1,
    n_group=n_groups,
    item_group=movies_groups,
    fairness_weight=fairness_weight,
    batch_size=1,
)
lin_ucb = FairLinUCB(**linucb_policy_kwargs)

# Replay the logged feedback through the policy for 5 epochs, then split the
# six returned values into individually named results.
linucb_simulation = run_bandit_simulation(
    bandit_feedback=bandit_feedback,
    policy=lin_ucb,
    epochs=5,
)
(
    linucb_action_dist,
    linucb_aligned_cvr,
    linucb_cvr,
    linucb_propfair,
    linucb_ufg,
    linucb_group_count,
) = linucb_simulation

In [None]:
# # estimate the policy value of the online bandit algorithms using RM
# ope = OffPolicyEvaluation(
#     bandit_feedback=bandit_feedback,
#     ope_estimators=[
#         IPS(estimator_name="IPS"), 
#         SNIPS(estimator_name="SNIPS"),
#         DM(estimator_name="DM"), 
#         DR(estimator_name="DR"),
#         DRos(estimator_name="DRos")
#     ]
# )

In [None]:
# # obp.ope.RegressionModel
# regression_model = RegressionModel(
#     n_actions=dataset.n_actions, # number of actions; |A|
#     len_list=dataset.len_list, # number of items in a recommendation list; K
#     base_model=LogisticRegression(C=100, max_iter=100000), 
# )

In [None]:
# estimated_rewards = regression_model.fit_predict(
#     context=bandit_feedback["context"],
#     action=bandit_feedback["action"],
#     reward=bandit_feedback["reward"],
#     position=bandit_feedback["position"],
# )

In [None]:
# eg_estimated_policy_value = ope.estimate_policy_values(
#     action_dist=eg_action_dist, # \pi_e(a|x)
#     estimated_rewards_by_reg_model=estimated_rewards, # \hat{q}
# )
# eg_estimated_policy_value

In [None]:
# linucb_estimated_policy_value = ope.estimate_policy_values(
#     action_dist=linucb_action_dist, # \pi_e(a|x)
#     estimated_rewards_by_reg_model=estimated_rewards, # \hat{q}
# )
# linucb_estimated_policy_value

In [None]:
import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Plot the epsilon-greedy CVR series against a flat line at the mean logged
# reward. The x axis is a 1-based index over the CVR series.
eg_steps = list(range(1, len(eg_aligned_cvr) + 1))
eg_cvr_trace = go.Scatter(
    x=eg_steps,
    y=eg_aligned_cvr,
    name="CVR",
)
eg_reference_trace = go.Scatter(
    x=eg_steps,
    y=[exp_rand_reward] * len(eg_steps),
    name="Mean Reward",
)

fig = go.Figure([eg_cvr_trace, eg_reference_trace])
fig.update_layout(title="EGreedy")
# Fix the y axis to [0, 1] so that plots of different policies are comparable.
fig.update_yaxes(range=[0, 1])
fig.show()

In [None]:
# Plot the CVR series produced by the `lin_ucb` policy against a flat line at
# the mean logged reward. The x axis is a 1-based index over the CVR series.
linucb_steps = list(range(1, len(linucb_aligned_cvr) + 1))
linucb_cvr_trace = go.Scatter(
    x=linucb_steps,
    y=linucb_aligned_cvr,
    name="CVR",
)
linucb_reference_trace = go.Scatter(
    x=linucb_steps,
    y=[exp_rand_reward] * len(linucb_steps),
    name="Mean Reward",
)

fig = go.Figure([linucb_cvr_trace, linucb_reference_trace])
# The plotted series comes from a FairLinUCB policy, so the title names that
# policy (it previously read "WFair LinUCB", which is a different class).
fig.update_layout(title="Fair LinUCB")
# Fix the y axis to [0, 1] so that plots of different policies are comparable.
fig.update_yaxes(range=[0, 1])
fig.show()