In [None]:
import sys 
sys.path.append('..')

#Dependencies
import os
from tqdm import tqdm
import numpy as np
import torch

import json 
import pickle

from src.model.pmf import PMF
from src.model.recommender import DRRAgent, FairRecAgent

from src.data.obp_dataset import MovieLensDataset
from obp.ope import (
    OffPolicyEvaluation, 
    InverseProbabilityWeighting as IPS,
    SelfNormalizedInverseProbabilityWeighting as SNIPS,
    DirectMethod as DM,
    DoublyRobust as DR,
    DoublyRobustWithShrinkage as DRos,
)

In [None]:
# JSON manifest that maps artifact names to pickle paths; those paths are
# resolved against the parent directory ("..") below.
dataset_path = "../data/movie_lens_100k_output_path.json"
with open(dataset_path) as json_file:
    _dataset_path = json.load(json_file)

# Load every pickled artifact listed here into `dataset`, keyed by its
# manifest name. The order matches the manifest lookups of the original cell.
dataset = {}
for artifact_key in (
    "eval_users_dict",
    "eval_users_dict_positive_items",
    "eval_users_history_lens",
    "users_history_lens",
    "movies_groups",
):
    artifact_file = os.path.join("..", _dataset_path[artifact_key])
    with open(artifact_file, "rb") as pkl_file:
        dataset[artifact_key] = pickle.load(pkl_file)

# Build the logged-feedback dataset, filtered by the keys of the
# evaluation users dictionary.
eval_user_ids = list(dataset["eval_users_dict"].keys())
obp_dataset = MovieLensDataset(
    data_path=os.path.join(os.getcwd(), "../data/"),
    embedding_network_weights_path="../model/pmf/emb_50_ratio_0.800000_bs_1000_e_258_wd_0.100000_lr_0.000100_trained_pmf.pt",
    embedding_dim=50,
    users_num=943,
    items_num=1682,
    state_size=5,
    filter_ids=eval_user_ids,
)

In [None]:
# File names of the pickled bandit policies that can be evaluated. One entry
# is selected by index further down and loaded from ../model/<train_version>/.
train_ids = [
    "egreedy_0.1_2021-10-29_23-50-32.pkl",
    "linear_ucb_0.1_2021-11-04_15-01-07.pkl",
    "wfair_linear_ucb_0.1_2021-11-04_15-01-15.pkl"
]

In [None]:
# --- Which trained policy to evaluate -------------------------------------
algorithm = "drr"
train_version = "bandits"
train_id = train_ids[1]  # index 1 is the linear_ucb checkpoint in train_ids
output_path = f"../model/{train_version}/{train_id}"

# --- Dataset / embedding dimensions ---------------------------------------
users_num, items_num = 943, 1682
state_size = 5
embedding_dim = 50
emb_model = "user_movie"
embedding_network_weights = "../model/pmf/emb_50_ratio_0.800000_bs_1000_e_258_wd_0.100000_lr_0.000100_trained_pmf.pt"

# --- Fairness settings: one constraint weight per item group --------------
n_groups = 10
fairness_constraints = [1.0] * 10

# --- Remaining evaluation settings ----------------------------------------
top_k = None
done_count = 10

In [None]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the PMF network and restore its weights, remapping the stored
# tensors onto the device chosen above.
reward_model = PMF(users_num, items_num, embedding_dim).to(device)
pmf_state = torch.load(embedding_network_weights, map_location=device)
reward_model.load_state_dict(pmf_state)

# Raw embedding tables, indexed later by user id and by item id.
item_embeddings = reward_model.item_embeddings.weight.data
user_embeddings = reward_model.user_embeddings.weight.data

In [None]:
import torch
from tqdm import tqdm
from obp.utils import convert_to_action_dist
from obp.policy.policy_type import PolicyType
# Length of the context vector handed to contextual policies. The context is
# the concatenation of three vectors built from the PMF embeddings below
# (presumably 3 * embedding_dim = 150 -- confirm if embedding_dim changes).
dim_context = 150

# NOTE(security): pickle.load can execute arbitrary code. Only load policy
# checkpoints that were produced by this project.
with open(output_path, "rb") as pkl_file:
    bandit = pickle.load(pkl_file)

# The policy type cannot change during the replay, so resolve it once and
# fail early with a clear message instead of hitting an undefined
# `selected_actions` inside the loop.
if bandit.policy_type == PolicyType.CONTEXTUAL:
    is_contextual = True
elif bandit.policy_type == PolicyType.CONTEXT_FREE:
    is_contextual = False
else:
    raise ValueError(
        "Unsupported bandit policy type: {!r}".format(bandit.policy_type)
    )

n_actions = obp_dataset.action.max() + 1

selected_actions_list = list()
# One row of per-action reward estimates per logged round. The evaluation
# cell reshapes this to (rounds, actions, 1) for the model-based estimators,
# so it must not stay empty.
estimated_rewards = list()
# The reward estimates depend only on the user, so compute them once per
# user id and reuse them for every logged round of that user.
rewards_by_user = {}

for index, row in tqdm(obp_dataset.data.iterrows(), total=obp_dataset.data.shape[0]):

    action_ = row["movie_id"]
    # Binarise the rating: 4 and above counts as a positive reward.
    reward_ = 0 if row["rating"] < 4 else 1

    # Context = [user, user * mean(history items), mean(history items)].
    user_eb = user_embeddings[row["user_id"]]
    items_eb = item_embeddings[row["item_id_history"]]
    item_ave = torch.mean(items_eb, 0)
    context_ = torch.cat((user_eb, user_eb * item_ave, item_ave), 0).cpu().numpy()

    # Select a list of actions.
    if is_contextual:
        selected_actions = bandit.select_action(
            context_.reshape(1, dim_context)
        )
    else:
        selected_actions = bandit.select_action()

    # Update the policy only when its top selected action equals the logged
    # action, because the logged reward is only known for that action.
    action_match_ = action_ == selected_actions[0]
    if action_match_:
        if is_contextual:
            bandit.update_params(
                action=action_,
                reward=reward_,
                context=context_.reshape(1, dim_context),
            )
        else:
            bandit.update_params(action=action_, reward=reward_)

    selected_actions_list.append(selected_actions)

    # Reward-model estimate for every action of this round's user.
    user_id = row["user_id"]
    if user_id not in rewards_by_user:
        user_index = torch.tensor([user_id]).long().to(device)
        rewards_by_user[user_id] = [
            reward_model.predict(
                user_index,
                torch.tensor([item]).long().to(device),
            )
            .detach()
            .cpu()
            .numpy()[0]
            for item in range(n_actions)
        ]
    estimated_rewards.append(rewards_by_user[user_id])

action_dist = convert_to_action_dist(
    n_actions=n_actions,
    selected_actions=np.array(selected_actions_list),
)

In [None]:
# Off-policy evaluation of the replayed bandit on the logged feedback, using
# importance-weighting estimators (IPS, clipped IPS, SNIPS) and model-based
# estimators (DM, DR, DRos).
ope = OffPolicyEvaluation(
    bandit_feedback=obp_dataset.obtain_batch_bandit_feedback(),
    ope_estimators=[
        IPS(estimator_name="IPS"),
        DM(estimator_name="DM"),
        IPS(lambda_=5, estimator_name="CIPS"),
        SNIPS(estimator_name="SNIPS"),
        DR(estimator_name="DR"),
        DRos(lambda_=500, estimator_name="DRos"),
    ]
)

# The model-based estimators need one reward estimate per (round, action).
# Validate the shape here so a missing or ragged `estimated_rewards` produces
# an actionable message rather than an opaque numpy axis error.
reward_estimates = np.asarray(estimated_rewards)
if reward_estimates.ndim != 2:
    raise ValueError(
        "estimated_rewards must be a (n_rounds, n_actions) table of reward "
        "estimates, got an array of shape {}; fill it in the replay cell "
        "before running the evaluation.".format(reward_estimates.shape)
    )

estimated_policy_value = ope.estimate_policy_values(
    action_dist=action_dist,
    # Add the trailing axis so the table becomes (rounds, actions, 1).
    estimated_rewards_by_reg_model=np.expand_dims(reward_estimates, axis=2),
)
estimated_policy_value

In [None]:
# Print each estimator's name next to its policy value, rounded to four
# decimal places.
for estimator_name in estimated_policy_value:
    print(estimator_name, round(estimated_policy_value[estimator_name], 4))