### Imports

In [None]:
import sys
import os
import base64
import pickle
import zlib
import gym
import numpy as np
import pandas as pd
import torch as th
from torch import nn, tensor
from collections import deque
from gym.spaces import Box, Discrete
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3.common import results_plotter
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
# from stable_baselines3.common.policies import BasePolicy, register_policy
import time
from datetime import date
from matplotlib import pyplot as plt
import torch
%matplotlib inline

#### Movielens Env

In [None]:
import gymnasium as gym
from gymnasium import spaces, Space

import random
import pickle
import numpy as np
from tqdm import tqdm
import time
from sklearn.metrics.pairwise import cosine_similarity

class CustomActionSpace(Space):
    """Space that carries a lookup table from action index to a half-step value.

    ``actions_map`` maps the indices 0..9 to the values 0.5, 1.0, ..., 5.0 and
    ``actions`` lists those indices.
    """

    def __init__(self, shape=None, dtype=None):
        super().__init__(shape, dtype)
        # np.arange excludes the stop value, so the grid ends at 5.0.
        half_step_values = np.arange(0.5, 5.5, 0.5)
        self.actions_map = dict(enumerate(half_step_values))
        self.actions = [index for index in self.actions_map]
    
class MovieLensEnv(gym.Env):
    """Gymnasium environment that replays logged MovieLens trajectories.

    Every element of ``data`` is indexed with the keys ``'observations'``,
    ``'actions'``, ``'terminals'`` and ``'user_id'``.  On each step the agent
    emits a continuous vector; the row of ``action_vocab`` with the highest
    cosine similarity is taken as the recommended movie, and the reward is
    the user's own rating of that movie when one exists, otherwise the
    ``rating_global`` mean of that movie, rounded to the nearest 0.5.

    Args:
        data: Sequence of trajectories (see above).
        action_vocab: 2-D array whose rows are the valid movie embeddings.
        movie_embed_to_id: Mapping from ``tuple(embedding)`` to a movie id.
        movies_ratings_and_tags: DataFrame with the columns ``movieId``,
            ``userId``, ``rating`` and ``rating_global``.
        use_prev_temp_as_feature: Stored on the instance; not read by any
            method of this class.
        van_specific_embeddings: Optional per-user feature lookup indexed by
            ``user_id``; when given it is appended to every observation.
        pbar: Optional progress bar exposing ``set_description``.
    """

    def __init__(self, data, action_vocab, movie_embed_to_id, movies_ratings_and_tags, use_prev_temp_as_feature=False, van_specific_embeddings=None, pbar=None):
        self.dataset = data

        super(MovieLensEnv, self).__init__()

        self.current_step = 0
        self.max_steps = sum(len(traj['observations']) for traj in self.dataset)
        self.act_dim = self.dataset[0]['actions'].shape[1]

        self.low_action_space = [-1.0] * 8  # Lower bounds for each dimension
        self.high_action_space = [1.0] * 8  # Upper bounds for each dimension

        # Bounds are built directly as float32 so they match the declared
        # dtype of the Box instead of being down-cast inside it.
        self.action_space = spaces.Box(
            low=np.array(self.low_action_space, dtype=np.float32),
            high=np.array(self.high_action_space, dtype=np.float32),
            dtype=np.float32,
        )
        # NOTE(review): the declared shape ignores any personalized features
        # appended to observations, and the [0, 1] bounds are not checked
        # against the data -- confirm both against the dataset.
        self.observation_space = spaces.Box(low=0, high=1, shape=(self.dataset[0]['observations'].shape[1],), dtype=np.float32)
        self.sampled_idx = None
        self.action = None
        self.reward = None
        self.pbar = pbar
        self.total_steps = 0
        self.use_prev_temp = use_prev_temp_as_feature
        self.personalized_features = van_specific_embeddings
        self.movies_ratings_and_tags = movies_ratings_and_tags
        self.movie_embed_to_id = movie_embed_to_id
        self.action_vocab = action_vocab

    @staticmethod
    def _round_to_nearest_half(number):
        """Round ``number`` to the nearest multiple of 0.5."""
        return round(number * 2) / 2

    def _current_observation(self):
        """Return the observation at ``current_step`` of the sampled trajectory.

        The user's personalized features are appended when they were supplied
        to the constructor.
        """
        traj = self.dataset[self.sampled_idx]
        obs = traj['observations'][self.current_step]
        if self.personalized_features is not None:
            obs = np.hstack((obs, self.personalized_features[traj['user_id']]))
        return obs

    def step(self, action):
        """Score the movie closest to ``action`` and advance the trajectory.

        Returns:
            ``(obs, reward, terminated, truncated, info)``.  ``truncated`` is
            always ``False`` and ``info`` is always empty.  When the
            trajectory terminates, ``obs`` is already the first observation
            of a freshly sampled trajectory.
        """
        raw_action = action

        # Snap the continuous action to the most similar vocabulary embedding.
        similarities = cosine_similarity(raw_action.reshape(1, -1), self.action_vocab)
        closest_indices = np.argmax(similarities, axis=1)
        final_action = self.action_vocab[closest_indices].reshape(-1)

        movie_id = self.movie_embed_to_id[tuple(final_action)]
        user_id = self.dataset[self.sampled_idx]['user_id']

        # Filter by movie once and reuse the rows for both reward sources.
        ratings = self.movies_ratings_and_tags
        movie_rows = ratings[ratings['movieId'] == movie_id]
        rating_by_user = movie_rows.loc[movie_rows['userId'] == user_id, 'rating'].dropna()

        # Test for the presence of a rating rather than the truthiness of its
        # value, so that a stored rating of 0 is not mistaken for "unrated".
        if not rating_by_user.empty:
            reward = rating_by_user.values[0]
        else:
            reward = movie_rows['rating_global'].mean()

        self.reward = self._round_to_nearest_half(reward)

        if self.pbar is not None:
            self.pbar.set_description(f"(idx, step): ({self.sampled_idx}, {self.current_step}) | predicted movie_id: {movie_id} | reward: {self.reward:.2f}")
        self.current_step += 1
        obs, done = self._next_observation()

        print(f"Step: {self.total_steps} | Predicted movie_id: {movie_id} | reward: {self.reward}")
        self.total_steps += 1

        # Gymnasium expects a boolean ``truncated`` flag; this environment
        # never truncates, so report False instead of None.
        return obs, self.reward, done, False, {}

    def reset(self, seed=None, options=None):
        """Sample a random trajectory and return its first observation.

        Args:
            seed: Forwarded to ``gym.Env.reset``.  The trajectory itself is
                drawn with the ``random`` module, so it is not controlled by
                this seed.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(obs, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.sampled_idx = random.randint(0, len(self.dataset) - 1)
        self.current_step = 0
        return self._current_observation(), {}

    def _next_observation(self):
        """Return ``(obs, done)`` for the current position in the trajectory.

        If the ``terminals`` flag at ``current_step`` is set, the environment
        resets itself and the returned observation belongs to the newly
        sampled trajectory.
        """
        if self.dataset[self.sampled_idx]['terminals'][self.current_step]:
            obs, _ = self.reset()
            return obs, True

        return self._current_observation(), False

    def eval(self):
        """Set the ``training`` attribute to ``False``."""
        self.training = False

    def get_true_temperature(self):
        """Return the logged action at the current step of the sampled trajectory.

        If the action space exposes an ``actions_map`` the logged action is
        translated through it; otherwise (as with the ``Box`` space built in
        ``__init__``) the logged action is returned unchanged instead of
        raising ``AttributeError``.
        """
        target_temperature = self.dataset[self.sampled_idx]['actions'][self.current_step]
        actions_map = getattr(self.action_space, 'actions_map', None)
        if actions_map is None:
            return target_temperature
        return actions_map[target_temperature]
        

### Different Versions of the data with different reward schemes

In [None]:
# from copy import deepcopy
# with open("../data/dt-datasets/movielens/processed-data/all_trajectories_with_concatenated_movname_genres_tags_userid_reward_of_scale_5.pkl", 'rb') as f:
#     all_trajectories = pickle.load(f)
# # all_trajs_copy = deepcopy(all_trajectories)

In [None]:
# # Calculate the size for the training set
# np.random.seed(42)
# trajectories = all_trajectories
# indices = {i for i in range(len(trajectories))}
# train_indices = list(np.random.choice(list(indices), size=round(0.7*len(indices)), replace=False))
# remaining_indices = indices.difference(train_indices)
# test_indices = remaining_indices

# print(f"total train users: {len(train_indices)}")
# print(f"total test users: {len(test_indices)}")

# train_trajectories = [trajectories[idx]for idx in train_indices]
# test_trajectories = [trajectories[idx]for idx in test_indices]

# print("Train set:", len(train_trajectories))
# print("Test set:", len(test_trajectories))

#### Reward scheme 1: Naive reward: all observations marked with 1

In [None]:
# # train_data_copy = deepcopy(train_data)
# train_trajectories_copy = deepcopy(train_trajectories)
# for traj in train_trajectories_copy:
#     name_and_genre_embeds = traj['observations'][:, 0:768] + traj['observations'][:, 768:2*768]
#     traj['observations'] = np.concatenate((name_and_genre_embeds, traj['observations'][:, 3*768:]), axis=1)
#     # print(traj['observations'].shape)
#     traj['rewards'] =  np.ones_like(traj['rewards'])



# test_trajectories_copy = deepcopy(test_trajectories)
# for traj in test_trajectories_copy:
#     name_and_genre_embeds = traj['observations'][:, 0:768] + traj['observations'][:, 768:2*768]
#     traj['observations'] = np.concatenate((name_and_genre_embeds, traj['observations'][:, 3*768:]), axis=1)
#     traj['rewards'] =  np.ones_like(traj['rewards'])

    
# train_data_with_naive_rewards = train_trajectories_copy
# test_data_with_naive_rewards = test_trajectories_copy

# with open('data/train_data_with_naive_rewards.pkl', 'wb') as f:
#     pickle.dump(train_data_with_naive_rewards, f)

# with open('data/test_data_with_naive_rewards.pkl', 'wb') as f:
#     pickle.dump(test_data_with_naive_rewards, f)

#### Reward scheme 2: Proximity based reward: scaled reward in the range of 0 and 1

In [None]:
# train_trajectories_copy = deepcopy(train_trajectories)
# highest_rating = 5.0
# for traj in train_trajectories_copy:
#     name_and_genre_embeds = traj['observations'][:, 0:768] + traj['observations'][:, 768:2*768]
#     traj['observations'] = np.concatenate((name_and_genre_embeds, traj['observations'][:, 3*768:]), axis=1)
#     # print(traj['observations'].shape)
#     errors = abs(highest_rating - traj['targets'])
#     traj['rewards'] = (1- (errors / 4.5)) ** 2

# test_trajectories_copy = deepcopy(test_trajectories)
# for traj in test_trajectories_copy:
#     name_and_genre_embeds = traj['observations'][:, 0:768] + traj['observations'][:, 768:2*768]
#     traj['observations'] = np.concatenate((name_and_genre_embeds, traj['observations'][:, 3*768:]), axis=1)
#     # print(traj['observations'].shape)
#     errors = abs(highest_rating - traj['targets'])
#     traj['rewards'] = (1- (errors / 4.5)) ** 2

# train_data_with_range_rewards = train_trajectories_copy
# test_data_with_range_rewards = test_trajectories_copy

# with open('data/train_data_with_range_rewards.pkl', 'wb') as f:
#     pickle.dump(train_data_with_range_rewards, f)

# with open('data/test_data_with_range_rewards.pkl', 'wb') as f:
#     pickle.dump(test_data_with_range_rewards, f)

#### Reward scheme 3: Binary reward: 1 or 0; 1 if liked 0 if not

In [None]:
# train_trajectories_copy = deepcopy(train_trajectories)
# threshold = 3.5
# for traj in train_trajectories_copy:
#     name_and_genre_embeds = traj['observations'][:, 0:768] + traj['observations'][:, 768:2*768]
#     traj['observations'] = np.concatenate((name_and_genre_embeds, traj['observations'][:, 3*768:]), axis=1)
#     # print(traj['observations'].shape)
#     errors = abs(highest_rating - traj['targets'])
#     traj['rewards'] = (traj['targets'] >= threshold).astype(int)


# test_trajectories_copy = deepcopy(test_trajectories)
# for traj in test_trajectories_copy:
#     name_and_genre_embeds = traj['observations'][:, 0:768] + traj['observations'][:, 768:2*768]
#     traj['observations'] = np.concatenate((name_and_genre_embeds, traj['observations'][:, 3*768:]), axis=1)
#     # print(traj['observations'].shape)
#     errors = abs(highest_rating - traj['targets'])
#     traj['rewards'] = (traj['targets'] >= threshold).astype(int)

# train_data_with_binary_rewards = train_trajectories_copy
# test_data_with_binary_rewards = test_trajectories_copy

# with open('data/train_data_with_binary_rewards.pkl', 'wb') as f:
#     pickle.dump(train_data_with_binary_rewards, f)

# with open('data/test_data_with_binary_rewards.pkl', 'wb') as f:
#     pickle.dump(test_data_with_binary_rewards, f)

### Experiment with RL methods

#### Set up the environment

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
import numpy as np

# Create a function to instantiate your custom environment
def create_custom_env(data, action_vocab, movie_embed_to_id, movies_ratings_and_tags):
    """Build a ``MovieLensEnv`` from the trajectories and lookup tables given."""
    env = MovieLensEnv(
        data=data,
        action_vocab=action_vocab,
        movie_embed_to_id=movie_embed_to_id,
        movies_ratings_and_tags=movies_ratings_and_tags,
    )
    return env

# Create a vectorized environment
# env = make_vec_env(create_custom_env, n_envs=4)

In [None]:
action_embed_shape = 8

# Locations of the pre-processed artefacts, keyed by the embedding width.
train_data_path = f'../data/dt-datasets/movielens/train-test-sets/mlens-train-trajectories-movies-as-actions-reduced-from-768-to-{action_embed_shape}_with_tanh.pkl'
test_data_path = f'../data/dt-datasets/movielens/train-test-sets/mlens-test-trajectories-movies-as-actions-reduced-from-768-to-{action_embed_shape}_with_tanh.pkl'
movie_embeds_to_id_map_path = f"../data/dt-datasets/movielens/processed-data/movie_embed_with_shape_{action_embed_shape}_to_id_mapping_with_tanh.pkl"
action_vocab_path = f"../data/dt-datasets/movielens/processed-data/action_vocab_of_shape_{action_embed_shape}_with_tanh.pkl"


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Ratings table: drop the saved index column, then attach each movie's mean
# rating as the extra column 'rating_global'.
movies_ratings_and_tags = pd.read_csv("../data/movies_ratings_and_tags_mlens_small.csv")
movies_ratings_and_tags = movies_ratings_and_tags.drop(columns='Unnamed: 0')
mean_rating_per_movie = movies_ratings_and_tags.groupby('movieId')['rating'].mean()
overall_ratings = mean_rating_per_movie.reset_index()
movies_ratings_and_tags = movies_ratings_and_tags.merge(
    overall_ratings,
    on='movieId',
    suffixes=('', '_global'),
)

train_data = _read_pickle(train_data_path)
test_data = _read_pickle(test_data_path)
# # Try normalizing the train_data between -1 and 1
# for traj in train_data:
#     traj['actions'] = 2 * (traj['actions'] - np.min(traj['actions'])) / (np.max(traj['actions']) - np.min(traj['actions'])) - 1
#     traj['observations'] = 2 * (traj['observations'] - np.min(traj['observations'])) / (np.max(traj['actions']) - np.min(traj['actions'])) - 1

movie_embed_to_id = _read_pickle(movie_embeds_to_id_map_path)
action_vocab = _read_pickle(action_vocab_path)

# Wrap the training environment in Monitor.
train_env = Monitor(
    create_custom_env(
        data=train_data,
        action_vocab=action_vocab,
        movie_embed_to_id=movie_embed_to_id,
        movies_ratings_and_tags=movies_ratings_and_tags,
    )
)

In [None]:
# all_obs = []
# all_actions = []
# for traj in train_data:
#     all_obs.append(traj['observations'])
#     all_actions.append(traj['actions'])

# obss = np.concatenate(all_obs)
# actions = np.concatenate(all_actions)
# obss.shape, actions.shape

#### Vanilla DDPG

In [None]:
# Display the size of the last axis of the training env's action space.
train_env.action_space.shape[-1]

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Exploration noise for DDPG: zero-mean Gaussian with sigma 0.1 on every
# action dimension.
n_actions = train_env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.1),
)

model = DDPG(
    "MlpPolicy",
    train_env,
    learning_rate=1e-4,
    action_noise=action_noise,
    seed=123,
    verbose=1,
)
# Last expression of the cell: shows the policy in the notebook.
model.policy


In [None]:
# Train for 10k environment steps with log_interval=1.
model.learn(
    total_timesteps=10_000,
    log_interval=1,
)

##### Evaluate DDPG model

In [None]:
max_ep_len = 100
episode_rewards = []

# Ten fixed-length rollouts on the test trajectories.  Each rollout lasts
# exactly max_ep_len steps; whenever the env signals `done` it is reset and
# the rollout carries on with a newly sampled trajectory.
for i in range(10):
    test_env = create_custom_env(
        test_data,
        action_vocab=action_vocab,
        movie_embed_to_id=movie_embed_to_id,
        movies_ratings_and_tags=movies_ratings_and_tags,
    )
    obs, _ = test_env.reset()
    step_rewards = []
    for t in range(max_ep_len):
        action, _ = model.predict(obs)
        obs, reward, done, _, info = test_env.step(action)
        step_rewards.append(reward)
        if not done:
            continue
        obs, _ = test_env.reset()
    # Total reward collected over this rollout.
    rewards = sum(step_rewards)
    episode_rewards.append(rewards)

In [None]:
# Multiply every rollout return by 100 and divide by 100 again, then display.
# NOTE(review): as written the two factors cancel; presumably this was meant
# to normalize by the rollout length (max_ep_len) -- confirm the intent.
scaled_ep_rewards = [r * 100/ 100 for r in episode_rewards]
scaled_ep_rewards

In [None]:
# Display the shape of the loaded action vocabulary.
action_vocab.shape

In [None]:
# Another evaluation policy

# from stable_baselines3.common.evaluation import evaluate_policy
# # # Evaluate the trained model
# with open('data/test_data_with_naive_rewards.pkl', 'rb') as f:
#     test_data = pickle.load(f)
# test_env = Monitor(create_custom_env(test_data))

# mean_reward, another = evaluate_policy(model, test_env, n_eval_episodes=5, return_episode_rewards=True)
# mean_reward, another
# # # Use the trained model to interact with the environment
# # # test_env = make_vec_env(create_custom_env, num_envs=1)
# # test_env = create_custom_env(test_data)
# # obs = test_env.reset()[0]
# # # print(f"obs: {obs}; shape: {obs.shape}")
# # rewards = []
# # for _ in range(1000):
# #     action, _ = model.predict(obs, deterministic=True)
# #     # print(f"action: {action}, type: {type(action)}, shape: {action.shape}")
# #     obs, reward, done, _, info = test_env.step(action.item())
# #     print(f"reward: {reward}")
# #     rewards.append(reward)
# #     if done:
# #         obs, _ = test_env.reset()