In [2]:
import numpy as np
import d3rlpy
from d3rlpy.datasets import get_d4rl
import gym
from d3rlpy.metrics.scorer import evaluate_on_environment
import argparse
import os
from tqdm import tqdm


os.environ['D4RL_SUPPRESS_IMPORT_ERROR'] = '1'


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, help="task/game to be played")
parser.add_argument("--algo", type=str, help="algorithm to be used for training")
args = parser.parse_args()

task = args.task #['HalfCheetah-v4', 'Walker2d-v4', 'Ant-v4']
algo = args.algo
print(task)


usage: ipykernel_launcher.py [-h] [--task TASK] [--algo ALGO]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9008 --control=9006 --hb=9005 --Session.signature_scheme="hmac-sha256" --Session.key=b"9e8c588e-7ee3-41ea-b19d-c84d6b03e25b" --shell=9007 --transport="tcp" --iopub=9009 --f=/home/raunakk/.local/share/jupyter/runtime/kernel-v2-23007p03Mm7VoJj4E.json


SystemExit: 2

In [3]:
class MODEL():
    def __init__(self, task, algo, gpu=True):
        self.mean_results = []
        self.task = task 
        self.algo = algo
        self.f_params = {"use_gpu": gpu}
        self.engine = None

    def set_engine(self):
        if self.algo == "IQL":
            self.engine = d3rlpy.algos.IQL(**self.f_params)

        elif self.algo == "CQL":
            self.f_params["actor_learning_rate"] = 3e-5
            self.engine = d3rlpy.algos.CQL(**self.f_params)

        elif self.algo == "MOPO":
            self.engine = d3rlpy.algos.MOPO(**self.f_params)
            
        elif self.algo == "COMBO":
            self.engine = d3rlpy.algos.COMBO(**self.f_params)

    def train(self, n=100, n_steps=1000000, n_steps_per_epoch=10000, save_interval=100, save_metrics=False, verbose=False):
        dataset, env = get_d4rl(self.task)
        online_env = gym.make(self.task)
        for i in range(n):
            d3rlpy.seed(i)
            env.seed(i)
            online_env.seed(i)

            self.set_engine()

            self.engine.fit(dataset, n_steps=n_steps, n_steps_per_epoch=n_steps_per_epoch, save_interval=save_interval, save_metrics=save_metrics, verbose=verbose)
            self.engine.save_model("./saved_models/iql_{}_{}_{}.pt".format(algo, task, i))
            for i in range(100):
                scorer = evaluate_on_environment(online_env, n_trials=1)
                self.mean_results.append(scorer(self.engine))
        return self.mean_results

In [4]:
task = "halfcheetah-medium-v2"
algo = "CQL"
model = MODEL(task, algo)
mean_results = model.train(n=1, n_steps=1, n_steps_per_epoch=1)

pybullet build time: Jun 13 2023 11:48:14
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile: 100%|██████████| 21/21 [00:03<00:00,  5.33it/s]
  deprecation(


[2m2023-06-24 18:20:14[0m [[32m[1mdebug    [0m] [1mRandomIterator is selected.[0m
[2m2023-06-24 18:20:14[0m [[32m[1mdebug    [0m] [1mBuilding models...[0m
[2m2023-06-24 18:20:57[0m [[32m[1mdebug    [0m] [1mModels have been built.[0m


Epoch 1/1: 100%|██████████| 1/1 [00:33<00:00, 33.63s/it, temp_loss=8.4, temp=1, alpha_loss=-27.5, alpha=1, critic_loss=71.5, actor_loss=-2.36]


In [5]:
mean_results

[-1.3035401301234066,
 -1.0729566577045104,
 -3.9790731379671676,
 -1.5766215749340042,
 -0.8668231847213048,
 -1.703900797168258,
 -2.890828548706894,
 -0.3297628187275022,
 -1.9078329191056413,
 -2.077963004021463]

In [1]:
import d3rlpy
from d3rlpy.datasets import get_d4rl
from d3rlpy.dynamics import ProbabilisticEnsembleDynamics
from d3rlpy.metrics.scorer import dynamics_observation_prediction_error_scorer
from d3rlpy.metrics.scorer import dynamics_reward_prediction_error_scorer
from d3rlpy.metrics.scorer import dynamics_prediction_variance_scorer
from sklearn.model_selection import train_test_split
import gym
from tqdm.auto import trange

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset, env = get_d4rl('hopper-medium-v2')

No module named 'flow'
No module named 'carla'
pybullet build time: Jun 13 2023 11:48:14
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile: 100%|██████████| 21/21 [00:09<00:00,  2.24it/s]


In [9]:
dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=True)

train_episodes, test_episodes = train_test_split(dataset)
# same as algorithms
dynamics.fit(train_episodes,
             eval_episodes=test_episodes,
             n_epochs=1,
             save_interval=1,
             with_timestamp=False
)

[2m2023-07-09 04:11:55[0m [[32m[1mdebug    [0m] [1mRoundIterator is selected.[0m
[2m2023-07-09 04:11:55[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/ProbabilisticEnsembleDynamics[0m
[2m2023-07-09 04:11:55[0m [[32m[1mdebug    [0m] [1mBuilding models...[0m
[2m2023-07-09 04:23:21[0m [[32m[1mdebug    [0m] [1mModels have been built.[0m
[2m2023-07-09 04:23:26[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'batch_size': 100, 'discrete_action': False, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 1.0, 'generated_maxlen': 100000, 'learning_rate': 0.0001, 'n_ensembles': 5, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0.0001, 'amsgrad': False}, 'real_ratio': 1.0, 'reward_scaler': N

Epoch 1/1: 100%|██████████| 7464/7464 [07:37<00:00, 16.32it/s, loss=-71.5] 


[2m2023-07-09 04:31:04[0m [[32m[1minfo     [0m] [1mProbabilisticEnsembleDynamics: epoch=1 step=7464[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.01169888025183642, 'time_algorithm_update': 0.0480932249964114, 'loss': -71.47799279192904, 'time_step': 0.06030370510659417}[0m [36mstep[0m=[35m7464[0m
[2m2023-07-09 04:31:05[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics/model_7464.pt[0m


[(1,
  {'time_sample_batch': 0.01169888025183642,
   'time_algorithm_update': 0.0480932249964114,
   'loss': -71.47799279192904,
   'time_step': 0.06030370510659417})]

In [11]:
# load trained dynamics model
dynamics = ProbabilisticEnsembleDynamics.from_json('./d3rlpy_logs/ProbabilisticEnsembleDynamics/params.json')
dynamics.load_model('./d3rlpy_logs/ProbabilisticEnsembleDynamics/model_7464.pt')

# give mopo as generator argument.
mopo = d3rlpy.algos.MOPO(dynamics=dynamics)

