In [1]:
import gymnasium as gym
from gymnasium import spaces, Space

import random
import pickle
import numpy as np
from tqdm import tqdm
import time
import d3rlpy

In [2]:
# Pin both global random number generators so that trajectory sampling
# (Python's `random`) and the train/test split (NumPy) are repeatable.
random.seed(42)
np.random.seed(42)

### Discrete Actions

In [3]:
class CustomActionSpace(Space):
    """Space whose ten discrete actions stand for ratings 0.5, 1.0, ..., 5.0.

    Attributes:
        actions_map: dict mapping an action index (0-9) to its rating value.
        actions: list of the valid action indices, in ascending order.
    """

    def __init__(self, shape=None, dtype=None):
        super().__init__(shape, dtype)
        # Half-star grid from 0.5 up to and including 5.0 (the stop value 5.5 is exclusive).
        ratings = np.arange(0.5, 5.5, 0.5)
        self.actions_map = dict(enumerate(ratings))
        self.actions = list(range(len(ratings)))
    
class MovieLensEnv(gym.Env):
    """Gymnasium environment that replays logged MovieLens trajectories.

    Every episode walks through one trajectory drawn at random from ``data``.
    An action is an index 0-9 that maps to a predicted rating 0.5-5.0 (in steps
    of 0.5); it is scored against the logged target rating of the current step.

    Each trajectory is a mapping that is read through these keys:
    ``'observations'`` (2-D array, one row per step), ``'targets'``,
    ``'terminals'``, ``'actions'`` (only by ``get_true_temperature``) and
    ``'user_id'`` (only when per-user embeddings are supplied).

    Args:
        data: sequence of trajectories as described above.
        use_prev_temp_as_feature: stored as ``self.use_prev_temp``; not read by
            any method of this class.
        van_specific_embeddings: optional mapping indexed by ``user_id`` whose
            value is appended to every observation.
            NOTE(review): ``observation_space`` is sized from the raw
            observation width only and does not account for this extra
            vector -- confirm before enabling embeddings.
        pbar: optional progress bar handle; stored but not used by the active code.
    """

    def __init__(self, data, use_prev_temp_as_feature=False, van_specific_embeddings=None, pbar=None):
        super().__init__()
        self.dataset = data

        # Action index -> rating value (0 -> 0.5, ..., 9 -> 5.0).
        ratings = np.arange(0.5, 5.5, 0.5)
        self.actions_map = {idx: rating for idx, rating in enumerate(ratings)}

        self.current_step = 0
        self.max_steps = sum(len(traj['observations']) for traj in self.dataset)
        self.action_space = spaces.Discrete(10)
        self.observation_space = spaces.Box(
            low=0,
            high=1,
            shape=(self.dataset[0]['observations'].shape[1],),
            dtype=np.float32,
        )
        self.sampled_idx = None
        self.action = None
        self.reward = None
        self.pbar = pbar
        self.total_steps = 0
        self.use_prev_temp = use_prev_temp_as_feature
        self.personalized_features = van_specific_embeddings

    def _build_observation(self, traj, step_idx):
        """Return the observation of ``traj`` at ``step_idx``, with the user's embedding appended if configured."""
        obs = traj['observations'][step_idx]
        if self.personalized_features is not None:
            obs = np.hstack((obs, self.personalized_features[traj['user_id']]))
        return obs

    def step(self, action):
        """Score ``action`` against the logged rating and advance one step.

        The reward is binary: 1 when both the logged rating and the predicted
        rating are >= 3.5, otherwise 0. (A squared-error reward used to be
        computed first but was always overwritten by this binary scheme, so it
        has been removed.)

        Args:
            action: action index; must be a key of ``self.actions_map``.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``truncated`` is
            always ``False`` and ``info`` is always an empty dict. When
            ``terminated`` is ``True`` the environment has already been reset,
            so ``obs`` is the first observation of a newly sampled trajectory.

        Raises:
            KeyError: if ``action`` is not one of the ten action indices.
        """
        self.action = action
        target_rating = self.dataset[self.sampled_idx]['targets'][self.current_step]
        pred_rating = self.actions_map[action]

        both_liked = target_rating >= 3.5 and pred_rating >= 3.5
        self.reward = 1 if both_liked else 0

        self.current_step += 1
        obs, done = self._next_observation()
        self.total_steps += 1
        return obs, self.reward, done, False, {}

    def reset(self, seed=None, options=None):
        """Sample a trajectory uniformly at random and return its first observation.

        ``seed`` and ``options`` are accepted for Gymnasium API compatibility
        and are not used: sampling goes through the global ``random`` module,
        so seed that module for reproducible episodes.

        Returns:
            ``(obs, info)`` where ``info`` is an empty dict.
        """
        self.sampled_idx = random.randint(0, len(self.dataset) - 1)
        self.current_step = 0
        return self._build_observation(self.dataset[self.sampled_idx], self.current_step), {}

    def _next_observation(self):
        """Return ``(obs, done)`` for the already-advanced ``self.current_step``.

        The episode ends when the terminal flag at the new step is set, or when
        the step index has run past the end of the trajectory (which previously
        raised ``IndexError``). On episode end the environment resets itself
        and hands back the new trajectory's first observation.
        """
        traj = self.dataset[self.sampled_idx]
        past_end = self.current_step >= len(traj['observations'])
        if past_end or traj['terminals'][self.current_step]:
            obs, _ = self.reset()
            return obs, True

        # Previously this branch indexed the embeddings with the undefined name
        # `van_id`, which raised NameError whenever embeddings were supplied.
        return self._build_observation(traj, self.current_step), False

    def eval(self):
        """Set ``self.training`` to ``False``."""
        self.training = False

    def get_true_temperature(self):
        """Return the rating value of the logged action at the current step.

        The logged action index is read from the trajectory's ``'actions'``
        entry and translated through ``self.actions_map``.
        """
        target_temperature = self.dataset[self.sampled_idx]['actions'][self.current_step]
        target_temperature = self.actions_map[target_temperature]
        return target_temperature
        


In [4]:
from copy import deepcopy
# Load the pre-processed MovieLens trajectories from disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load this path if the file comes from a trusted source.
with open("../data/dt-datasets/movielens/processed-data/all_trajectories_with_concatenated_movname_genres_tags_userid_reward_of_scale_5.pkl", 'rb') as f:
    all_trajectories = pickle.load(f)
# all_trajs_copy = deepcopy(all_trajectories)

In [5]:
# Reproducible 70/30 split of the trajectories into train and test sets.
np.random.seed(42)
trajectories = all_trajectories
indices = set(range(len(trajectories)))

# Draw the training indices without replacement. Passing an int to
# np.random.choice samples from np.arange(n), i.e. from the same index range.
train_indices = list(np.random.choice(len(trajectories), size=round(0.7*len(indices)), replace=False))
remaining_indices = indices.difference(train_indices)
# Sort the held-out indices: iterating the raw set made the order of
# `test_trajectories` depend on set iteration order, which is not guaranteed.
test_indices = sorted(remaining_indices)

print(f"total train users: {len(train_indices)}")
print(f"total test users: {len(test_indices)}")

train_trajectories = [trajectories[idx] for idx in train_indices]
test_trajectories = [trajectories[idx] for idx in test_indices]

# The two subsets must partition the full set of trajectories.
assert len(train_trajectories) + len(test_trajectories) == len(trajectories)

print("Train set:", len(train_trajectories))
print("Test set:", len(test_trajectories))

total train users: 427
total test users: 183
Train set: 427
Test set: 183


#### Reward scheme 1: Naive reward — every transition is assigned a reward of 1

In [6]:
def _fuse_embeddings_and_flatten_rewards(source_trajectories):
    """Return a deep copy of the trajectories with compacted observations and all-ones rewards.

    The first two 768-wide column blocks of each observation matrix are summed
    element-wise, the third 768-wide block is dropped, and every column from
    3*768 onwards is kept unchanged. Rewards are replaced by ones of the same
    shape and dtype. The input trajectories are left untouched.
    """
    width = 768
    prepared = deepcopy(source_trajectories)
    for episode in prepared:
        features = episode['observations']
        fused = features[:, :width] + features[:, width:2 * width]
        episode['observations'] = np.concatenate((fused, features[:, 3 * width:]), axis=1)
        episode['rewards'] = np.ones_like(episode['rewards'])
    return prepared


def _save_pickle(payload, destination):
    """Serialise ``payload`` to ``destination`` with pickle."""
    with open(destination, 'wb') as handle:
        pickle.dump(payload, handle)


train_trajectories_copy = _fuse_embeddings_and_flatten_rewards(train_trajectories)
test_trajectories_copy = _fuse_embeddings_and_flatten_rewards(test_trajectories)

train_data_with_naive_rewards = train_trajectories_copy
test_data_with_naive_rewards = test_trajectories_copy

_save_pickle(train_data_with_naive_rewards, 'data/train_data_with_naive_rewards.pkl')
_save_pickle(test_data_with_naive_rewards, 'data/test_data_with_naive_rewards.pkl')

In [7]:
# Flatten the per-trajectory training arrays into one long array per field,
# which is the layout handed to the episode generator in the next cell.
import numpy as np

observations_mlens, actions_mlens, rewards_mlens, terminals_mlens = (
    np.concatenate([episode[field] for episode in train_data_with_naive_rewards])
    for field in ('observations', 'actions', 'rewards', 'terminals')
)

# No separate timeout flags are provided.
timeouts = None

In [8]:
from d3rlpy.dataset import EpisodeGenerator

# Split the flat, concatenated arrays back into episodes. `timeouts` is None
# here, so episode boundaries presumably come from `terminals_mlens` alone --
# TODO confirm against the d3rlpy EpisodeGenerator documentation.
episode_generator = EpisodeGenerator(
    observations=observations_mlens,
    actions=actions_mlens,
    rewards=rewards_mlens,
    terminals=terminals_mlens,
    timeouts=timeouts,
)

# Calling the generator yields the episodes that are passed to ReplayBuffer below.
episodes_generated_mlens = episode_generator()

In [9]:
from d3rlpy.dataset import ReplayBuffer, InfiniteBuffer

# Wrap the generated episodes in a replay buffer for the d3rlpy trainers below.
# No transition picker or trajectory slicer is supplied (None), presumably
# selecting the library defaults -- TODO confirm.
# Per the recorded log output, d3rlpy inferred a discrete action space of
# size 10 and float32 observations of shape (800,).
dataset = ReplayBuffer(
    InfiniteBuffer(),
    episodes=episodes_generated_mlens,
    transition_picker=None,
    trajectory_slicer=None,
)

[2m2024-02-11 13:15.08[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int64')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(800,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(1,)])[0m
[2m2024-02-11 13:15.08[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2024-02-11 13:15.08[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m10[0m


In [10]:
# Sanity check: shape of the first training episode's observation array
# (the recorded output is (227, 800)).
dataset.episodes[0].observations.shape

(227, 800)

In [11]:
# Evaluation environment that replays the held-out test trajectories.
env = MovieLensEnv(test_data_with_naive_rewards)

#### Discrete CQL

In [12]:
# # start training
# cql = d3rlpy.algos.DiscreteCQLConfig().create(device='cuda')
# cql.fit(
#     dataset,
#     n_steps=10000,
#     n_steps_per_epoch=1000,
#     evaluators={
#         'environment': d3rlpy.metrics.EnvironmentEvaluator(env),
#     },
# )

# # evaluate
# rewards = []
# for _ in range(10):
#     reward = d3rlpy.metrics.evaluate_qlearning_with_environment(cql, env)
#     rewards.append(reward)
# # print(np.round(rewards, 2))
    
# for r in np.round(rewards, 2):
#     print(r)

#### DQN

In [13]:
# # start training
# dqn = d3rlpy.algos.DQNConfig().create(device='cuda')
# dqn.fit(
#     dataset,
#     n_steps=10000,
#     n_steps_per_epoch=1000,
#     evaluators={
#         'environment': d3rlpy.metrics.EnvironmentEvaluator(env),
#     },
# )

# # evaluate
# rewards = []
# for _ in range(10):
#     reward = d3rlpy.metrics.evaluate_qlearning_with_environment(dqn, env)
#     rewards.append(reward)
# # print(np.round(rewards, 2))
    
# for r in np.round(rewards, 2):
#     print(r)

#### DDQN

In [14]:
# Fit Double DQN on the offline MovieLens buffer, scoring the policy on the
# replay environment after every epoch.
ddqn = d3rlpy.algos.DoubleDQNConfig().create(device='cuda')
ddqn.fit(
    dataset,
    n_steps=10000,
    n_steps_per_epoch=1000,
    evaluators={'environment': d3rlpy.metrics.EnvironmentEvaluator(env)},
)

# Ten independent evaluation calls against the environment; each returns one score.
rewards = [
    d3rlpy.metrics.evaluate_qlearning_with_environment(ddqn, env)
    for _ in range(10)
]

# One rounded score per line.
print(*np.round(rewards, 2), sep='\n')

[2m2024-02-11 13:15.47[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(800,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=10)[0m
[2m2024-02-11 13:15.47[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DoubleDQN_20240211131547[0m
[2m2024-02-11 13:15.47[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2024-02-11 13:15.47[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2024-02-11 13:15.47[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [800], 'action_size': 10, 'config': {'type': 'double_dqn', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'a

Epoch 1/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2024-02-11 13:15.53[0m [[32m[1minfo     [0m] [1mDoubleDQN_20240211131547: epoch=1 step=1000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0015747854709625244, 'time_algorithm_update': 0.0035011582374572755, 'loss': 0.49966634510457514, 'time_step': 0.0051364459991455075, 'environment': 18.91482362743111}[0m [36mstep[0m=[35m1000[0m
[2m2024-02-11 13:15.53[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DoubleDQN_20240211131547/model_1000.d3[0m


Epoch 2/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2024-02-11 13:15.59[0m [[32m[1minfo     [0m] [1mDoubleDQN_20240211131547: epoch=2 step=2000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0016096196174621583, 'time_algorithm_update': 0.0033745198249816896, 'loss': 0.42170781384408473, 'time_step': 0.005047095537185669, 'environment': 33.12005319034266}[0m [36mstep[0m=[35m2000[0m
[2m2024-02-11 13:15.59[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DoubleDQN_20240211131547/model_2000.d3[0m


Epoch 3/10:   0%|          | 0/1000 [00:00<?, ?it/s]

[2m2024-02-11 13:16.05[0m [[32m[1minfo     [0m] [1mDoubleDQN_20240211131547: epoch=3 step=3000[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0015847759246826172, 'time_algorithm_update': 0.0033999805450439452, 'loss': 0.4093866769969463, 'time_step': 0.0050436131954193116, 'environment': 25.744818758232036}[0m [36mstep[0m=[35m3000[0m
[2m2024-02-11 13:16.05[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DoubleDQN_20240211131547/model_3000.d3[0m


Epoch 4/10:   0%|          | 0/1000 [00:00<?, ?it/s]

#### Discrete SAC

In [None]:
# Fit discrete SAC on the offline MovieLens buffer, scoring the policy on the
# replay environment after every epoch.
sac = d3rlpy.algos.DiscreteSACConfig().create(device='cuda')
sac.fit(
    dataset,
    n_steps=10000,
    n_steps_per_epoch=1000,
    evaluators={'environment': d3rlpy.metrics.EnvironmentEvaluator(env)},
)

# Ten independent evaluation calls against the environment; each returns one score.
rewards = [
    d3rlpy.metrics.evaluate_qlearning_with_environment(sac, env)
    for _ in range(10)
]

# One rounded score per line.
print(*np.round(rewards, 2), sep='\n')

#### Continuous CQL

In [None]:
import d3rlpy

# prepare dataset
# NOTE(review): this rebinds `dataset` and `env`, which earlier cells bind to
# the MovieLens replay buffer and MovieLensEnv. Re-running the training cells
# after this one would silently use the hopper objects instead.
dataset, env = d3rlpy.datasets.get_d4rl('hopper-medium-v0')

# # prepare algorithm
# cql = d3rlpy.algos.CQLConfig().create(device='cuda:0')

# # train
# cql.fit(
#     dataset,
#     n_steps=100000,
#     evaluators={"environment": d3rlpy.metrics.EnvironmentEvaluator(env)},
# )

In [None]:
# NOTE(review): scratch call. `env` is whichever object the last executed cell
# bound. MovieLensEnv.step looks the action up in an integer-keyed dict, so
# 0.45 would raise KeyError there -- confirm which env is intended.
env.step(0.45)

In [None]:
# Shape of the first episode's observation array for the currently bound
# `dataset` (the D4RL cell above rebinds this name when it is run).
dataset.episodes[0].observations.shape

In [None]:
import d4rl

In [None]:
# Scratch list used by the slicing check in the next cell.
a = [1, 2, 3]


In [None]:
# Last two elements of `a`; evaluates to [2, 3] for a = [1, 2, 3].
a[-2:]