### A Toy Example

In [1]:
import sys
import os
import base64
import pickle
import zlib
import gym
import numpy as np
import pandas as pd
import torch as th
from torch import nn, tensor
from collections import deque
from gym.spaces import Box, Discrete
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3.common import results_plotter
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
# from stable_baselines3.common.policies import BasePolicy, register_policy
import time
from datetime import date
from matplotlib import pyplot as plt
from stable_baselines3 import DQN
import torch
%matplotlib inline

In [2]:
# Ratings 0.5, 1.0, ..., 5.0, keyed by their position in the array.
actions = np.arange(0.5, 5.5, 0.5)
actions_map = dict(enumerate(actions))
actions_map

{0: 0.5,
 1: 1.0,
 2: 1.5,
 3: 2.0,
 4: 2.5,
 5: 3.0,
 6: 3.5,
 7: 4.0,
 8: 4.5,
 9: 5.0}

In [3]:
# Spot check: action index 0 maps to the lowest rating (0.5).
actions_map[0]

0.5

In [4]:
import gymnasium as gym
from gymnasium import spaces, Space

import random
import pickle
import numpy as np
from tqdm import tqdm
import time


class CustomActionSpace(Space):
    """``Space`` subclass that carries the action-index -> rating table.

    Attributes set by the constructor:
        actions_map: dict mapping 0..9 to the ratings 0.5, 1.0, ..., 5.0.
        actions: list of the valid action indices (the keys of ``actions_map``).
    """

    def __init__(self, shape=None, dtype=None):
        super().__init__(shape, dtype)
        ratings = np.arange(0.5, 5.5, 0.5)
        self.actions_map = dict(enumerate(ratings))
        self.actions = list(range(len(ratings)))
    
class MovieLensEnv(gym.Env):
    """Rating-prediction environment replayed from logged trajectories.

    Each episode walks through one randomly sampled trajectory of the
    dataset. At every step the agent picks one of 10 discrete actions, each
    standing for a rating in 0.5, 1.0, ..., 5.0, and is rewarded according to
    how close that rating is to the logged target.

    The dataset is an indexable collection of trajectory dicts. The keys read
    by this class are:
        'observations': 2-D array, one row per step.
        'targets': per-step target rating.
        'terminals': per-step flag that ends the episode.
        'user_id': key into the optional per-user embeddings.
        'actions': only read by ``get_true_temperature``.
    """

    def __init__(self, data, use_prev_temp_as_feature=False, van_specific_embeddings=None, pbar=None):
        """Set up the spaces and bookkeeping.

        Args:
            data: non-empty indexable collection of trajectory dicts.
            use_prev_temp_as_feature: stored as ``use_prev_temp``; not used
                anywhere else in this class.
            van_specific_embeddings: optional mapping from ``user_id`` to a
                feature vector appended to every observation.
            pbar: optional progress-bar handle; stored but not used here.

        Raises:
            ValueError: if ``data`` is empty.
        """
        super().__init__()
        if len(data) == 0:
            raise ValueError("MovieLensEnv requires at least one trajectory")
        self.dataset = data

        ratings = np.arange(0.5, 5.5, 0.5)
        self.actions_map = dict(enumerate(ratings))
        # Largest possible |target - prediction| (5.0 - 0.5 = 4.5); used to
        # normalise the error into [0, 1] in ``step``.
        self._max_error = float(ratings[-1] - ratings[0])

        self.current_step = 0
        self.max_steps = sum(len(traj['observations']) for traj in self.dataset)
        self.personalized_features = van_specific_embeddings

        self.action_space = spaces.Discrete(len(self.actions_map))

        # The observation width must include the appended per-user features,
        # otherwise the declared space disagrees with what reset/step return.
        # Assumes embeddings are flat vectors of equal length for every user
        # -- TODO confirm against the embedding source.
        obs_dim = self.dataset[0]['observations'].shape[1]
        if self.personalized_features is not None:
            first_user = self.dataset[0]['user_id']
            obs_dim += int(np.size(self.personalized_features[first_user]))
        # NOTE(review): bounds [0, 1] are kept from the original code; verify
        # that the features (and embeddings) are actually scaled to that range.
        self.observation_space = spaces.Box(low=0, high=1, shape=(obs_dim,), dtype=np.float32)

        self.sampled_idx = None
        self.action = None
        self.reward = None
        self.pbar = pbar
        self.total_steps = 0
        self.use_prev_temp = use_prev_temp_as_feature
        # Flag flipped by ``eval``; initialised here so it always exists.
        self.training = True

    def _build_observation(self, traj, step):
        """Return the observation of ``traj`` at ``step`` as a float32 array.

        When per-user embeddings were supplied, the embedding of the
        trajectory's ``user_id`` is appended to the raw observation row.
        """
        obs = traj['observations'][step]
        if self.personalized_features is not None:
            obs = np.hstack((obs, self.personalized_features[traj['user_id']]))
        # Match the dtype declared in ``observation_space``.
        return np.asarray(obs, dtype=np.float32)

    def step(self, action):
        """Score ``action`` against the logged target and advance one step.

        Args:
            action: index into ``actions_map`` (0..9). Plain ints, NumPy
                integers and 0-d integer arrays are accepted.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``reward`` is
            ``(1 - |target - predicted| / 4.5) ** 2``. When the episode ends,
            ``obs`` is the first observation of a freshly sampled trajectory
            because the environment resets itself. ``truncated`` is always
            ``False``. ``info`` holds ``acc`` (1 on an exact match, else 0),
            ``target_rating`` and ``pred_rating``.
        """
        # A 0-d array is unhashable and cannot be used as a dict key, so
        # normalise to a plain int before the lookup.
        action = int(action)
        self.action = action

        target_rating = self.dataset[self.sampled_idx]['targets'][self.current_step]
        pred_rating = self.actions_map[action]
        acc = 1 if pred_rating == target_rating else 0

        # Squared closeness: 1.0 for an exact hit, 0.0 for the largest miss.
        error = abs(target_rating - pred_rating)
        self.reward = (1 - (error / self._max_error)) ** 2

        self.current_step += 1
        obs, done = self._next_observation()
        self.total_steps += 1

        info = {
            'acc': acc,
            'target_rating': target_rating,
            'pred_rating': pred_rating,
        }
        # The Gymnasium step contract expects a boolean ``truncated``.
        return obs, self.reward, done, False, info

    def reset(self, seed=None, options=None):
        """Sample a trajectory and return its first observation.

        Args:
            seed: optional seed for the environment's random generator, which
                drives the trajectory sampling.
            options: accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(obs, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        # Draw from the env-owned generator so that ``seed`` is honoured.
        self.sampled_idx = int(self.np_random.integers(len(self.dataset)))
        self.current_step = 0
        traj = self.dataset[self.sampled_idx]
        return self._build_observation(traj, self.current_step), {}

    def _next_observation(self):
        """Return ``(obs, done)`` for the current step.

        The episode is done when the trajectory's ``terminals`` flag is set
        at the current step, or when the step index has run past the end of
        the trajectory (which would otherwise raise ``IndexError``). In both
        cases the environment resets itself and the returned observation
        belongs to the new trajectory.
        """
        traj = self.dataset[self.sampled_idx]
        n_steps = min(len(traj['observations']), len(traj['terminals']))
        if self.current_step >= n_steps or traj['terminals'][self.current_step]:
            obs, _ = self.reset()
            return obs, True
        return self._build_observation(traj, self.current_step), False

    def eval(self):
        """Mark the environment as being in evaluation mode."""
        self.training = False

    def get_true_temperature(self):
        """Return the logged action at the current step, mapped to a rating.

        NOTE(review): the name and the 'actions' key look inherited from a
        different environment; this presumably expects 'actions' to hold
        action indices (keys of ``actions_map``) -- confirm against the data.
        """
        target_temperature = self.dataset[self.sampled_idx]['actions'][self.current_step]
        target_temperature = self.actions_map[target_temperature]
        return target_temperature
        


In [5]:
train_data_path = "../Data/mlens-train-trajectories-v1_with_0_to_1_reward_with_tags_and_summed.pkl"

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(train_data_path, 'rb') as train_file:
    train_data = pickle.load(train_file)

In [6]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
import numpy as np

# Factory for the custom environment
def create_custom_env(data):
    """Build a ``MovieLensEnv`` over ``data`` with default settings."""
    env = MovieLensEnv(data)
    return env

# Create a vectorized environment
# env = make_vec_env(create_custom_env, n_envs=4)

In [7]:
# Single (non-vectorized) environment over the training trajectories.
sample_env = create_custom_env(train_data)

In [8]:
# Initialize a PPO model with an MLP policy on the custom environment
# (training happens in the following cell).
model = PPO("MlpPolicy", sample_env, verbose=1)

  return torch._C._cuda_getDeviceCount() > 0


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [9]:
# Train the model for 10,000 environment steps.
model.learn(total_timesteps=10000, log_interval=8)

<stable_baselines3.ppo.ppo.PPO at 0x7fdcdc41d550>

In [10]:
test_data_path = "../Data/mlens-test-trajectories-v1_with_0_to_1_reward_with_tags_and_summed.pkl"
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(test_data_path, 'rb') as test_file:
    test_data = pickle.load(test_file)

In [11]:
# # # Evaluate the trained model
# # mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=1)

# # Use the trained model to interact with the environment
# # test_env = make_vec_env(create_custom_env, num_envs=1)
# test_env = create_custom_env(test_data)
# obs = test_env.reset()[0]
# # print(f"obs: {obs}; shape: {obs.shape}")
# rewards = []
# for _ in range(1000):
#     action, _ = model.predict(obs, deterministic=True)
#     # print(f"action: {action}, type: {type(action)}, shape: {action.shape}")
#     obs, reward, done, _, info = test_env.step(action.item())
#     print(f"reward: {reward}")
#     rewards.append(reward)
#     if done:
#         obs, _ = test_env.reset()

In [None]:
# Roll the trained policy out on the test trajectories: 10 independent runs
# of `max_ep_len` steps each, recording the summed reward of every run.
max_ep_len = 1000
episode_rewards = []

for run in range(10):
    test_env = create_custom_env(test_data)
    obs, _ = test_env.reset()
    total_reward = 0
    for step_idx in range(max_ep_len):
        action, _ = model.predict(obs, deterministic=True)
        # `.item()` turns the predicted array into a plain Python int.
        step_result = test_env.step(action.item())
        obs, reward, done, _, info = step_result
        print(f"reward: {reward}")
        total_reward += reward
        if done:
            # A finished trajectory does not end the run; keep going on a
            # newly sampled one until `max_ep_len` steps have been taken.
            obs, _ = test_env.reset()
    episode_rewards.append(total_reward)

In [None]:
# Highest summed reward across the evaluation runs.
max(episode_rewards)

In [None]:
# Summed reward of each evaluation run, in run order.
plt.plot(episode_rewards)


### DQN on a gym env

In [None]:
import gymnasium as gym

from stable_baselines3 import DQN

# Built-in CartPole-v1 environment.
env = gym.make("CartPole-v1")

# NOTE: this rebinds `model`, replacing the PPO model created earlier in the file.
model = DQN("MlpPolicy", env, verbose=0)
# Train for 10,000 environment steps.
model.learn(total_timesteps=10000, log_interval=4)
# model.save("dqn_cartpole")

# del model # remove to demonstrate saving and loading

# model = DQN.load("dqn_cartpole")

# obs, info = env.reset()
# while True:
#     action, _states = model.predict(obs, deterministic=True)
#     obs, reward, terminated, truncated, info = env.step(action)
#     if terminated or truncated:
#         obs, info = env.reset()