In [None]:
import gymnasium as gym
from gymnasium import spaces, Space

import random
import pickle
import numpy as np
from tqdm import tqdm
import time
import d3rlpy
import pandas as pd

In [None]:
# Seed every random number generator the notebook relies on so that a
# Restart & Run All reproduces the same results.
np.random.seed(42)
random.seed(42)
# The d3rlpy algorithms trained further down are PyTorch models; seeding only
# numpy/random leaves their weight initialisation and batch sampling unseeded.
# d3rlpy.seed additionally seeds torch.
d3rlpy.seed(42)

### Discrete Actions

In [None]:
import random
import pickle
import numpy as np
import gym
from gym.spaces import Box, Space
from tqdm import tqdm
import time
from sklearn.metrics.pairwise import cosine_similarity


class CustomActionSpace(Space):
    """Space exposing a fixed table of half-step values from 0.5 up to 5.0.

    Attributes set on construction:
        actions_map: dict mapping an integer index (0, 1, 2, ...) to the
            corresponding value of ``np.arange(0.5, 5.5, 0.5)``.
        actions: list of the integer indices, in ascending order.
    """

    def __init__(self, shape=None, dtype=None):
        super().__init__(shape, dtype)
        # Index -> value lookup for the half-step grid.
        half_step_values = np.arange(0.5, 5.5, 0.5)
        self.actions_map = dict(enumerate(half_step_values))
        self.actions = list(self.actions_map)
    
class MovieLensEnv(gym.Env):
    """Replay-style recommendation environment built from logged trajectories.

    Each episode walks through one randomly sampled trajectory of ``test_data``.
    The agent's action is an embedding vector; it is snapped to the most similar
    row of ``action_vocab`` (cosine similarity), translated to a movie id, and
    rewarded with the user's own rating of that movie when one exists, otherwise
    with the mean of the movie's ``rating_global`` column.

    Args:
        test_data: indexable collection of trajectory dicts. The code reads the
            keys ``'observations'`` (2-D array, one row per step),
            ``'terminals'``, ``'user_id'`` and ``'actions'``.
        movie_embed_to_id: mapping from ``tuple(embedding)`` to a movie id.
        movies_ratings_and_tags: table with columns ``movieId``, ``userId``,
            ``rating`` and ``rating_global`` (presumably a pandas DataFrame —
            it is filtered with boolean masks).
        action_vocab: array of candidate action embeddings, one per row.

    NOTE(review): ``action_space`` is a ``CustomActionSpace`` of shape (1, 1)
    even though ``step`` consumes embedding vectors; it looks inherited from a
    different environment. Left unchanged because ``get_true_temperature``
    relies on its ``actions_map``.
    """

    def __init__(self, test_data, movie_embed_to_id, movies_ratings_and_tags, action_vocab):
        super(MovieLensEnv, self).__init__()
        self.dataset = test_data
        self.action_vocab = action_vocab
        self.current_step = 0
        # Total number of logged steps across all trajectories.
        self.max_steps = sum(len(traj['observations']) for traj in self.dataset)
        self.action_space = CustomActionSpace(shape=(1, 1))
        # Observation width is taken from the first trajectory.
        self.observation_space = Box(
            low=0,
            high=1,
            shape=(self.dataset[0]['observations'].shape[1],),
            dtype=np.float32,
        )
        self.sampled_idx = None
        self.action = None
        self.reward = None
        self.total_steps = 0
        self.movies_ratings_and_tags = movies_ratings_and_tags
        self.movie_embed_to_id = movie_embed_to_id

    @staticmethod
    def _round_to_nearest_half(number):
        """Round ``number`` to a multiple of 0.5 (built-in ``round`` semantics)."""
        return round(number * 2) / 2

    def step(self, action):
        """Score one recommended action and advance along the trajectory.

        Args:
            action: embedding vector predicted by the agent. Any array-like
                that flattens to one embedding is accepted.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` where ``reward`` is
            the half-step-rounded rating divided by 5.0, ``truncated`` is
            always ``False`` and ``info`` is an empty dict. When the trajectory
            ends the environment auto-resets, so ``obs`` is then the first
            observation of a freshly sampled trajectory.

        Raises:
            RuntimeError: if called before ``reset()``.
        """
        self.action = action
        if self.sampled_idx is None:
            raise RuntimeError("reset() must be called before step()")

        # Treat the action as a single query row regardless of whether it
        # arrives as shape (d,) or (1, d).
        query = np.asarray(action).reshape(1, -1)
        similarities = cosine_similarity(query, self.action_vocab)

        # Snap the prediction to its nearest vocabulary embedding.
        closest_indices = np.argmax(similarities, axis=1)
        action_pred = self.action_vocab[closest_indices.flatten()].reshape(-1)

        # Map the embedding back to a movie and look up who is being served.
        movie_id = self.movie_embed_to_id[tuple(action_pred)]
        user_id = self.dataset[self.sampled_idx]['user_id']

        ratings = self.movies_ratings_and_tags
        movie_rows = ratings[ratings['movieId'] == movie_id]
        rating_by_user = movie_rows.loc[movie_rows['userId'] == user_id, 'rating'].dropna()

        # Reward: the user's own rating if they rated the movie, otherwise the
        # movie's global average. Test for presence explicitly — a truthiness
        # test would treat a rating of 0 as "not rated".
        if not rating_by_user.empty:
            raw_reward = rating_by_user.values[0]
        else:
            raw_reward = movie_rows['rating_global'].mean()

        # Snap to the half-step rating grid, then scale by the maximum rating.
        self.reward = self._round_to_nearest_half(raw_reward) / 5.0

        self.current_step += 1
        self.total_steps += 1
        obs, done = self._next_observation()
        # `truncated` must be a bool and `info` a dict in the 5-tuple step API.
        return obs, self.reward, done, False, {}

    def reset(self, *, seed=None, options=None):
        """Start a new episode on a uniformly sampled trajectory.

        Args:
            seed: optional seed. Trajectory sampling uses the module-level
                ``random`` generator, so a non-``None`` seed re-seeds it.
            options: accepted for API compatibility; unused.

        Returns:
            ``(obs, info)`` — the first observation of the sampled trajectory
            and an empty info dict.
        """
        if seed is not None:
            random.seed(seed)
        self.sampled_idx = random.randint(0, len(self.dataset) - 1)
        self.current_step = 0
        obs = self.dataset[self.sampled_idx]['observations'][self.current_step]
        return obs, {}

    def _next_observation(self):
        """Return ``(obs, done)`` for the current step index.

        The episode is done when the trajectory's terminal flag is set at the
        current index, or when the index has run past the logged data (which
        would otherwise raise ``IndexError``). On done the environment resets
        itself and returns the new episode's first observation.
        """
        traj = self.dataset[self.sampled_idx]
        terminals = traj['terminals']
        observations = traj['observations']
        logged_steps = min(len(terminals), len(observations))

        if self.current_step >= logged_steps or terminals[self.current_step]:
            obs, _ = self.reset()
            return obs, True

        return observations[self.current_step], False

    def eval(self):
        """Flag the environment as being in evaluation mode."""
        self.training = False

    def get_true_temperature(self):
        """Look up the logged action at the current step in ``actions_map``.

        NOTE(review): the name and the ``actions_map`` lookup appear to be
        carried over from a different environment. ``actions_map`` is keyed by
        integer indices, so this only works if the logged actions are such
        indices — confirm against the dataset or remove.
        """
        target_temperature = self.dataset[self.sampled_idx]['actions'][self.current_step]
        target_temperature = self.action_space.actions_map[target_temperature]
        return target_temperature
        

In [None]:
# Size that appears in every artefact filename below ("reduced-from-768-to-N");
# presumably the dimensionality of the reduced movie embeddings used as actions.
action_embed_shape = 32

train_dataset_path = f"../data/dt-datasets/movielens/train-test-sets/mlens-train-trajectories-movies-as-actions-reduced-from-768-to-{action_embed_shape}.pkl"
test_set_path = f"../data/dt-datasets/movielens/train-test-sets/mlens-test-trajectories-movies-as-actions-reduced-from-768-to-{action_embed_shape}.pkl"
movie_embeds_to_id_map_path = f"../data/dt-datasets/movielens/processed-data/movie_embed_with_shape_{action_embed_shape}_to_id_mapping.pkl"
action_vocab_path = f"../data/dt-datasets/movielens/processed-data/action_vocab_of_shape_{action_embed_shape}.pkl"

# NOTE: pickle.load can execute arbitrary code; only point these paths at
# files produced by this project's own preprocessing.
with open(train_dataset_path, 'rb') as f:
    train_data = pickle.load(f)

with open(test_set_path, 'rb') as f:
    test_data = pickle.load(f)

with open(action_vocab_path, 'rb') as f:
    action_vocab = pickle.load(f)

with open(movie_embeds_to_id_map_path, 'rb') as f:
    movie_embed_to_id = pickle.load(f)

# Ratings table; the 'Unnamed: 0' column is discarded right after reading.
movies_ratings_and_tags = pd.read_csv(
    "../data/movies_ratings_and_tags_mlens_small.csv"
).drop(columns='Unnamed: 0')

# Per-movie mean rating, attached to every row as the 'rating_global' column.
overall_ratings = movies_ratings_and_tags.groupby('movieId', as_index=False)['rating'].mean()
movies_ratings_and_tags = pd.merge(
    movies_ratings_and_tags,
    overall_ratings,
    on='movieId',
    suffixes=('', '_global'),
)

In [None]:
# Sanity check: shape of the first training trajectory's observation array.
train_data[0]['observations'].shape

In [None]:
# Flatten the per-episode training trajectories into one array per field,
# ready to be handed to d3rlpy's episode generator.
import numpy as np

observations_mlens, actions_mlens, rewards_mlens, terminals_mlens = (
    np.concatenate([episode[field] for episode in train_data])
    for field in ('observations', 'actions', 'rewards', 'terminals')
)

# No timeout flags are supplied; episode boundaries come from `terminals` only.
timeouts = None

In [None]:
from d3rlpy.dataset import EpisodeGenerator

# Build a d3rlpy EpisodeGenerator from the flattened step arrays prepared
# above; `timeouts` is None, so only `terminals` is passed as boundary data.
episode_generator = EpisodeGenerator(
    observations=observations_mlens,
    actions=actions_mlens,
    rewards=rewards_mlens,
    terminals=terminals_mlens,
    timeouts=timeouts,
)

# Calling the generator produces the episodes consumed by the replay buffer.
episodes_generated_mlens = episode_generator()

In [None]:
from d3rlpy.dataset import ReplayBuffer, InfiniteBuffer

# Offline training dataset: the generated episodes stored in an InfiniteBuffer.
# The transition picker and trajectory slicer are passed as None
# (presumably selecting d3rlpy's defaults — confirm against the library docs).
dataset = ReplayBuffer(
    InfiniteBuffer(),
    episodes=episodes_generated_mlens,
    transition_picker=None,
    trajectory_slicer=None,
)

In [None]:
# Sanity check: observation shape of the first episode held by the replay buffer.
dataset.episodes[0].observations.shape

In [None]:
# Evaluation environment: replays the held-out test trajectories and scores
# predicted actions using the ratings table and the action vocabulary.
env = MovieLensEnv(test_data=test_data, movie_embed_to_id=movie_embed_to_id, movies_ratings_and_tags=movies_ratings_and_tags, action_vocab=action_vocab)

#### Continuous CQL

In [None]:
# # start training
# cql = d3rlpy.algos.CQLConfig().create(device='cuda')
# cql.fit(
#     dataset,
#     n_steps=10000,
#     n_steps_per_epoch=1000,
#     evaluators={
#         'environment': d3rlpy.metrics.EnvironmentEvaluator(env),
#     },
# )

# # evaluate
# rewards = []
# for _ in range(5):
#     reward = d3rlpy.metrics.evaluate_qlearning_with_environment(cql, env)
#     rewards.append(reward)
# # print(np.round(rewards, 2))
    
# for r in np.round(rewards, 2):
#     print(r)

#### DDPG

In [None]:
# Train DDPG on the offline replay buffer: 10000 steps, 1000 per epoch.
# No environment evaluator is attached during fitting; evaluation is done
# separately below.
ddpg = d3rlpy.algos.DDPGConfig().create(device='cuda')
ddpg.fit(dataset, n_steps=10000, n_steps_per_epoch=1000)

# Evaluate the trained policy against the environment five times.
rewards = [
    d3rlpy.metrics.evaluate_qlearning_with_environment(ddpg, env)
    for _ in range(5)
]

# One rounded score per line.
print(*np.round(rewards, 1), sep="\n")

#### BEAR

In [None]:
# Train BEAR on the offline replay buffer: 10000 steps, 1000 per epoch.
# No environment evaluator is attached during fitting; evaluation is done
# separately below.
bear = d3rlpy.algos.BEARConfig().create(device='cuda')
bear.fit(dataset, n_steps=10000, n_steps_per_epoch=1000)

# Evaluate the trained policy against the environment five times.
rewards = [
    d3rlpy.metrics.evaluate_qlearning_with_environment(bear, env)
    for _ in range(5)
]

# One rounded score per line.
print(*np.round(rewards, 1), sep="\n")

#### BC

In [None]:
from tqdm import tqdm

# Train behaviour cloning (BC) on the offline replay buffer: 10000 steps,
# 1000 per epoch. No environment evaluator is attached during fitting;
# evaluation is done separately below.
bc = d3rlpy.algos.BCConfig().create(device='cuda')
bc.fit(dataset, n_steps=10000, n_steps_per_epoch=1000)

# Evaluate the trained policy against the environment five times, with a
# progress bar over the runs.
rewards = [
    d3rlpy.metrics.evaluate_qlearning_with_environment(bc, env)
    for _ in tqdm(range(5))
]

# One rounded score per line.
print(*np.round(rewards, 1), sep="\n")