In [None]:
import sys 
sys.path.append('..')

#Dependencies
import os
from tqdm import tqdm
import numpy as np
import torch

import json 
import pickle

from src.model.pmf import PMF
from src.model.recommender import DRRAgent, FairRecAgent

from src.data.obp_dataset import MovieLensDataset
from obp.ope import (
    OffPolicyEvaluation, 
    InverseProbabilityWeighting as IPS,
    SelfNormalizedInverseProbabilityWeighting as SNIPS,
    DirectMethod as DM,
    DoublyRobust as DR,
    DoublyRobustWithShrinkage as DRos,
)

In [None]:
# Read the JSON manifest; the loop below looks up one pickle path per key in it.
dataset_path = "../data/movie_lens_100k_output_path.json"
with open(dataset_path) as manifest_file:
    _dataset_path = json.load(manifest_file)

# Unpickle every artifact into `dataset` under the same key it has in the
# manifest. Manifest paths are resolved relative to the parent directory.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at
# trusted, locally produced files.
dataset = {}
for artifact_key in (
    "eval_users_dict",
    "eval_users_dict_positive_items",
    "eval_users_history_lens",
    "users_history_lens",
    "movies_groups",
):
    artifact_path = os.path.join("..", _dataset_path[artifact_key])
    with open(artifact_path, "rb") as artifact_file:
        dataset[artifact_key] = pickle.load(artifact_file)

# Build the logged dataset wrapper; `filter_ids` receives the ids of the
# evaluation users loaded above.
eval_user_ids = list(dataset["eval_users_dict"].keys())
obp_dataset = MovieLensDataset(
    data_path=os.path.join(os.getcwd(), "../data/"),
    embedding_network_weights_path="../model/pmf/emb_50_ratio_0.800000_bs_1000_e_258_wd_0.100000_lr_0.000100_trained_pmf.pt",
    embedding_dim=50,
    users_num=943,
    items_num=1682,
    state_size=5,
    filter_ids=eval_user_ids,
)

In [None]:
# Pair each evaluation user id with the entry at the same position in
# `eval_users_history_lens`.
# NOTE(review): assumes that sequence is ordered like the dict's keys -- confirm.
_done_count = dict(
    zip(dataset["eval_users_dict"].keys(), dataset["eval_users_history_lens"])
)

In [None]:
# Identifiers of the available training runs. A later cell selects one entry
# by index and uses it to build the directory holding the saved checkpoints.
train_ids = [
    "movie_lens_100k_2021-10-26_11-45-48",
    "movie_lens_100k_2021-10-24_01-42-57", # long training
]

In [None]:
# --- Run selection -----------------------------------------------------------
algorithm = "drr"
train_version = "movie_lens_100k"
train_id = train_ids[1]
# Directory that holds the actor/critic checkpoints of the selected run.
output_path = "../model/{}/{}".format(train_version, train_id)

# --- Dataset dimensions ------------------------------------------------------
users_num = 943
items_num = 1682

# --- State representation ----------------------------------------------------
state_size = 5
srm_size = 3
embedding_dim = 50

# --- Actor / critic hyper-parameters -----------------------------------------
actor_hidden_dim = 512
actor_learning_rate = 1e-4
critic_hidden_dim = 512
critic_learning_rate = 1e-3
discount_factor = 0.9
tau = 0.01

# --- Replay buffer / optimisation schedule -----------------------------------
learning_starts = 100
replay_memory_size = 10 ** 6
batch_size = 32

# --- Pre-trained embedding network -------------------------------------------
emb_model = "user_movie"
embedding_network_weights = "../model/pmf/emb_50_ratio_0.800000_bs_1000_e_258_wd_0.100000_lr_0.000100_trained_pmf.pt"

# --- Fairness settings -------------------------------------------------------
n_groups = 10
# One constraint weight per group, all equal to 1.0.
fairness_constraints = [1.0] * n_groups
done_count = 10

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# PMF model used later in the notebook to predict a reward for every
# (user, item) pair.
reward_model = PMF(users_num, items_num, embedding_dim).to(device)

# `map_location` remaps the stored tensors onto `device`. Without it, weights
# that were saved from a CUDA machine fail to deserialize on a CPU-only host,
# even though the device selection above explicitly supports that fallback.
reward_model.load_state_dict(
    torch.load(embedding_network_weights, map_location=device)
)

In [None]:
# Lookup table from the `algorithm` setting to the agent class to instantiate.
AGENT = {"drr": DRRAgent, "fairrec": FairRecAgent}

In [None]:
def _latest_checkpoint(directory, prefix, suffix=".h5"):
    """Return the largest checkpoint number found in ``directory``.

    Only files named ``<prefix><digits><suffix>`` (for example
    ``actor_1500.h5``) are considered, so unrelated files that merely share
    the prefix can no longer crash the integer conversion.

    Args:
        directory: Path of the folder to scan.
        prefix: Leading part of the file name, e.g. ``"actor_"``.
        suffix: Trailing part of the file name; defaults to ``".h5"``, the
            extension the weight-loading cell uses.

    Returns:
        The highest checkpoint number as an ``int``.

    Raises:
        FileNotFoundError: If no matching checkpoint file exists.
    """
    steps = []
    for file_name in os.listdir(directory):
        if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
            continue
        stem = file_name[len(prefix):-len(suffix)]
        if stem.isdecimal():
            steps.append(int(stem))

    if not steps:
        # Fail with an actionable message instead of an IndexError on [-1].
        raise FileNotFoundError(
            "No '{}<number>{}' checkpoint found in {}".format(prefix, suffix, directory)
        )
    return max(steps)


# Pick the most recent actor and critic checkpoints of the selected run.
actor_checkpoint = _latest_checkpoint(output_path, "actor_")
critic_checkpoint = _latest_checkpoint(output_path, "critic_")

print(actor_checkpoint, critic_checkpoint)

In [None]:
# Collect the constructor arguments first so they can be inspected before the
# agent is built. The agent is created without an environment and with
# is_test=True.
agent_kwargs = dict(
    env=None,
    users_num=users_num,
    items_num=items_num,
    genres_num=0,
    movies_genres_id={},
    srm_size=srm_size,
    state_size=state_size,
    train_version=train_version,
    is_test=True,
    embedding_dim=embedding_dim,
    actor_hidden_dim=actor_hidden_dim,
    actor_learning_rate=actor_learning_rate,
    critic_hidden_dim=critic_hidden_dim,
    critic_learning_rate=critic_learning_rate,
    discount_factor=discount_factor,
    tau=tau,
    learning_starts=learning_starts,
    replay_memory_size=replay_memory_size,
    batch_size=batch_size,
    model_path=output_path,
    emb_model=emb_model,
    embedding_network_weights_path=embedding_network_weights,
    n_groups=n_groups,
    fairness_constraints=fairness_constraints,
)

# Instantiate the agent class registered for the configured algorithm.
agent_cls = AGENT[algorithm]
recommender = agent_cls(**agent_kwargs)

# Restore the actor and critic weights from the checkpoints selected earlier.
actor_weights_file = os.path.join(output_path, "actor_{}.h5".format(actor_checkpoint))
critic_weights_file = os.path.join(output_path, "critic_{}.h5".format(critic_checkpoint))
recommender.load_model(actor_weights_file, critic_weights_file)

In [None]:
import torch
from tqdm import tqdm
from obp.utils import convert_to_action_dist

# Replay every logged row through the recommender:
#   1. build the state from the user embedding and the row's item history,
#   2. let the actor score it and pick one item that is not yet in the
#      user's recommended set,
#   3. when the pick equals the logged item, push a transition into the
#      replay buffer and (once enough samples exist) update the agent,
#   4. record the pick and the reward model's per-item predictions for the
#      off-policy estimators.

# Number of actions is loop-invariant, so compute it once.
n_actions = obp_dataset.action.max() + 1

# Items already shown to / consumed by each user.
_recommended_items = {user: set() for user in np.unique(obp_dataset.data.user_id.values)}

# Reward-model predictions depend only on the user (nothing in this loop
# modifies `reward_model`), so they are computed once per user instead of once
# per logged row.
# NOTE(review): a single batched `predict` call over all items would be faster
# still, but PMF.predict's batching behaviour is not visible here -- confirm
# before switching.
_rewards_by_user = {}

selected_actions_list = list()
estimated_rewards = list()
for index, row in tqdm(obp_dataset.data.iterrows(), total=obp_dataset.data.shape[0]):
    user_id = row["user_id"]
    seen_items = _recommended_items[user_id]

    # Everything in the user's history counts as already recommended.
    for i in row["item_id_history"]:
        seen_items.add(i)

    # observe current state & Find action
    user_eb = recommender.user_embeddings[user_id]
    items_eb = recommender.get_items_emb(row["item_id_history"])

    with torch.no_grad():
        ## SRM state
        state = recommender.srm_ave(
            [
                user_eb.unsqueeze(0),
                items_eb.unsqueeze(0),
            ]
        )

        ## action(ranking score)
        action = recommender.actor.network(state)

    ## item
    recommended_item = recommender.recommend_item(action, seen_items)

    action_match_ = row["movie_id"] == recommended_item
    # update parameters of a bandit policy
    # only when selected actions&positions are equal to logged actions&positions
    if action_match_:
        # Logged rating mapped to a reward centred on a rating of 3.
        reward = (row["rating"] - 3) * 0.5

        # A positively rewarded, not-yet-present item replaces the oldest
        # entry of the history window; otherwise the history is unchanged.
        next_items = list(row["item_id_history"])
        if reward > 0 and (recommended_item not in next_items):
            next_items = next_items[1:] + [recommended_item]

        # observe next state
        next_items_eb = recommender.get_items_emb(next_items)
        with torch.no_grad():
            ## SRM state
            next_state = recommender.srm_ave(
                [
                    user_eb.unsqueeze(0),
                    next_items_eb.unsqueeze(0),
                ]
            )

        recommender.buffer.append(
            state.detach().cpu().numpy(),
            action.detach().cpu().numpy(),
            reward,
            next_state.detach().cpu().numpy(),
            # done flag: the user's item set reached the per-user threshold
            len(seen_items) >= _done_count[user_id],
        )

        if recommender.buffer.crt_idx > recommender.learning_starts or recommender.buffer.is_full:
            _actor_loss, _critic_loss = recommender.update_model()
            print("Update Model! ", _actor_loss, _critic_loss)

    selected_actions_list.append([recommended_item])
    seen_items.add(recommended_item)

    # Reward-model prediction for every item, cached per user.
    if user_id not in _rewards_by_user:
        user_tensor = torch.tensor([user_id]).long().to(device)
        with torch.no_grad():
            _rewards_by_user[user_id] = [
                reward_model.predict(
                    user_tensor,
                    torch.tensor([item]).long().to(device),
                )
                .detach()
                .cpu()
                .numpy()[0]
                for item in range(n_actions)
            ]
    estimated_rewards.append(_rewards_by_user[user_id])

# Convert the per-row selections into the action-distribution array consumed
# by the off-policy estimators.
action_dist = convert_to_action_dist(
    n_actions=n_actions,
    selected_actions=np.array(selected_actions_list),
)

In [None]:
# Logged bandit feedback that the estimators are evaluated against.
bandit_feedback = obp_dataset.obtain_batch_bandit_feedback()

# Estimators to compare; each is given an explicit display name.
ope_estimators = [
    IPS(estimator_name="IPS"),
    DM(estimator_name="DM"),
    IPS(lambda_=5, estimator_name="CIPS"),
    SNIPS(estimator_name="SNIPS"),
    DR(estimator_name="DR"),
    DRos(lambda_=500, estimator_name="DRos"),
]

ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    ope_estimators=ope_estimators,
)

# Append a trailing axis of length 1 to the per-row, per-item reward
# predictions before handing them to the estimators.
reward_predictions = np.expand_dims(np.array(estimated_rewards), axis=2)

estimated_policy_value = ope.estimate_policy_values(
    action_dist=action_dist,
    estimated_rewards_by_reg_model=reward_predictions,
)
estimated_policy_value

In [None]:
# Print each estimator's policy value rounded to four decimal places.
for estimator_name in estimated_policy_value:
    print(estimator_name, round(estimated_policy_value[estimator_name], 4))