In [1]:
'''This file trains using MOPO and COMBO algorithm'''

#This training script generates multiple trajectories and notes the score. So, we get 100 scores instead of noting just the mean score.

import d3rlpy
from d3rlpy.datasets import get_d4rl
import gym
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.dynamics import ProbabilisticEnsembleDynamics
from sklearn.model_selection import train_test_split
# import argparse
import os
from tqdm import tqdm
#import wandb

#wandb.login()


os.environ['D4RL_SUPPRESS_IMPORT_ERROR'] = '1'
# parser = argparse.ArgumentParser()
# parser.add_argument("--task", type=str, help="task/game to be played")
# parser.add_argument("--algo", type=str, help="algorithm to be used for training")
# args = parser.parse_args()

task = 'halfcheetah-medium-expert-v2' #args.task 
algo = 'COMBO' #args.algo

"""
suggested hypers for combo

halfcheetah-medium-v2: rollout-length=5, cql-weight=0.5
hopper-medium-v2: rollout-length=5, cql-weight=5.0
walker2d-medium-v2: rollout-length=1, cql-weight=5.0
halfcheetah-medium-replay-v2: rollout-length=5, cql-weight=0.5
hopper-medium-replay-v2: rollout-length=5, cql-weight=0.5
walker2d-medium-replay-v2: rollout-length=1, cql-weight=0.5
halfcheetah-medium-expert-v2: rollout-length=5, cql-weight=5.0
hopper-medium-expert-v2: rollout-length=5, cql-weight=5.0
walker2d-medium-expert-v2: rollout-length=1, cql-weight=5.0

suggested hypers for mopo

halfcheetah-medium-v2: rollout-length=5, penalty-coef=0.5
hopper-medium-v2: rollout-length=5, penalty-coef=5.0
walker2d-medium-v2: rollout-length=5, penalty-coef=0.5
halfcheetah-medium-replay-v2: rollout-length=5, penalty-coef=0.5
hopper-medium-replay-v2: rollout-length=5, penalty-coef=2.5
walker2d-medium-replay-v2: rollout-length=1, penalty-coef=2.5
halfcheetah-medium-expert-v2: rollout-length=5, penalty-coef=2.5
hopper-medium-expert-v2: rollout-length=5, penalty-coef=5.0
walker2d-medium-expert-v2: rollout-length=1, penalty-coef=2.5
"""

class Model():
    def __init__(self, task, algo, gpu=True):
        self.task = task 
        self.algo = algo
        self.dynamics = None
        self.f_params = {}
        self.experiment_name = f"{self.algo}_{self.task}"

    def set_engine(self):
        if self.algo == "MOPO":
            if self.algo == "halfcheetah-medium-v2" or self.algo == "halfcheetah-medium-replay-v2" or self.algo == "walker2d-medium-v2":
                self.f_params = {'rollout_length': 5, 'penalty_coef': 0.5}
            elif self.algo == "hopper-medium-v2" or self.algo == "hopper-medium-expert-v2":
                self.f_params = {'rollout_length': 5, 'penalty_coef': 5.0}
            elif self.algo == "hopper-medium-replay-v2" or self.algo == "halfcheetah-medium-expert-v2":
                self.f_params = {'rollout_length': 5, 'penalty_coef': 2.5}
            elif self.algo == "walker2d-medium-replay-v2" or self.algo == "walker2d-medium-expert-v2":
                self.f_params = {'rollout_length': 1, 'penalty_coef': 2.5}            

            self.engine = d3rlpy.algos.MOPO(dynamics=self.dynamics, **self.f_params)     

        elif self.algo == "COMBO":
            if self.algo == "halfcheetah-medium-v2" or self.algo == "halfcheetah-medium-replay-v2" or self.algo == "hopper-medium-replay-v2":
                self.f_params = {'rollout_length': 5, 'cql_weight': 0.5}
            elif self.algo == "hopper-medium-v2" or self.algo == "hopper-medium-expert-v2" or self.algo == "halfcheetah-medium-expert-v2":
                self.f_params = {'rollout_length': 5, 'cql_weight': 5.0}
            elif self.algo == "walker2d-medium-v2" or self.algo == "walker2d-medium-expert-v2":
                self.f_params = {'rollout_length': 1, 'cql_weight': 5.0}
            elif self.algo == "walker2d-medium-replay-v2":
                self.f_params = {'rollout_length': 1, 'cql_weight': 0.5}

            self.engine = d3rlpy.algos.COMBO(dynamics=self.dynamics, **self.f_params)
        

    def train_dynamics(self, n_epochs=100, save_interval=100, save_metrics=True, verbose=False, with_timestamp=False):
        self.dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=True)
        train_episodes, test_episodes = train_test_split(self.dataset)
        self.dynamics.fit(train_episodes, eval_episodes=test_episodes, n_epochs=n_epochs, 
        save_interval=save_interval, save_metrics=save_metrics, verbose=verbose, with_timestamp=with_timestamp, experiment_name=self.experiment_name)
            

    def train_engine(self, n_steps=1000000, save_interval=101, save_metrics=False, verbose=False):
        self.set_engine()
        self.engine.fit(self.dataset, n_steps=n_steps, save_interval=save_interval, save_metrics=save_metrics, verbose=verbose)
    
    def train(self, n=50, n_epochs=100, n_steps=1000000, save_engine_interval=100, save_dynamics_interval=100,
               save_dynamics_metrics=True, save_engine_metrics=False, verbose=False, with_timestamp=False):
        for i in range(n):
            self.dataset, self.env = get_d4rl(self.task)
            self.online_env = gym.make(self.task)
            d3rlpy.seed(i)
            self.env.reset(seed=i)
            self.online_env.reset(seed=i)

            self.train_dynamics(n_epochs=n_epochs, save_interval=save_dynamics_interval, save_metrics=save_dynamics_metrics, verbose=verbose, with_timestamp=with_timestamp)
            # load trained dynamics model
            json_path = f'./d3rlpy_logs/{self.experiment_name}/params.json'
            source_directory = f'./d3rlpy_logs/{self.experiment_name}'
            keyword = "model"

            # Iterate over each file in the directory
            for root, dirs, files in os.walk(source_directory):
                for filename in files:
                    # Check if the file name contains the keyword "model"
                    if keyword in filename:
                        # Get the full path of the file
                        model_path = os.path.join(root, filename)
		    		    
            self.dynamics = ProbabilisticEnsembleDynamics.from_json(json_path)
            self.dynamics.load_model(model_path)
            self.train_engine(n_steps=n_steps, save_interval=save_engine_interval, save_metrics=save_engine_metrics, verbose=verbose)
            scorer = evaluate_on_environment(self.online_env, n_trials=1)
            f = open(f'./txt_files/{algo}_{task}_rollout.txt', 'a+')
            f.write(f"n={i}\n")

            for i in range(1000):       
                normalized_score = self.online_env.get_normalized_score(scorer(self.engine))
                f = open(f'./txt_files/{algo}_{task}_rollout.txt', 'a+')
                f.write(f"{normalized_score}\n")

    

model = Model(task, algo)
model.train(n=1, n_epochs=1, save_dynamics_interval=1)


  from .autonotebook import tqdm as notebook_tqdm
pybullet build time: Jun 13 2023 11:48:14
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile: 100%|██████████| 9/9 [00:14<00:00,  1.63s/it]


[2m2023-07-14 18:05:09[0m [[32m[1mdebug    [0m] [1mRoundIterator is selected.[0m
[2m2023-07-14 18:05:09[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/COMBO_halfcheetah-medium-expert-v2[0m
[2m2023-07-14 18:05:09[0m [[32m[1mdebug    [0m] [1mBuilding models...[0m
[2m2023-07-14 18:06:46[0m [[32m[1mdebug    [0m] [1mModels have been built.[0m


Epoch 1/1: 100%|██████████| 14985/14985 [16:38<00:00, 15.01it/s, loss=-7.32]  


[2m2023-07-14 18:23:27[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/COMBO_halfcheetah-medium-expert-v2/model_14985.pt[0m


In [1]:
'''This file trains using MOPO and COMBO algorithm'''

#This training script generates multiple trajectories and notes the score. So, we get 100 scores instead of noting just the mean score.

import d3rlpy
from d3rlpy.datasets import get_d4rl
import gym
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.dynamics import ProbabilisticEnsembleDynamics
from sklearn.model_selection import train_test_split
# import argparse
import os
from tqdm import tqdm
#import wandb

#wandb.login()


os.environ['D4RL_SUPPRESS_IMPORT_ERROR'] = '1'
# parser = argparse.ArgumentParser()
# parser.add_argument("--task", type=str, help="task/game to be played")
# parser.add_argument("--algo", type=str, help="algorithm to be used for training")
# args = parser.parse_args()

task = 'halfcheetah-medium-expert-v2' #args.task 
algo = 'COMBO' #args.algo

"""
suggested hypers for combo

halfcheetah-medium-v2: rollout-length=5, cql-weight=0.5
hopper-medium-v2: rollout-length=5, cql-weight=5.0
walker2d-medium-v2: rollout-length=1, cql-weight=5.0
halfcheetah-medium-replay-v2: rollout-length=5, cql-weight=0.5
hopper-medium-replay-v2: rollout-length=5, cql-weight=0.5
walker2d-medium-replay-v2: rollout-length=1, cql-weight=0.5
halfcheetah-medium-expert-v2: rollout-length=5, cql-weight=5.0
hopper-medium-expert-v2: rollout-length=5, cql-weight=5.0
walker2d-medium-expert-v2: rollout-length=1, cql-weight=5.0

suggested hypers for mopo

halfcheetah-medium-v2: rollout-length=5, penalty-coef=0.5
hopper-medium-v2: rollout-length=5, penalty-coef=5.0
walker2d-medium-v2: rollout-length=5, penalty-coef=0.5
halfcheetah-medium-replay-v2: rollout-length=5, penalty-coef=0.5
hopper-medium-replay-v2: rollout-length=5, penalty-coef=2.5
walker2d-medium-replay-v2: rollout-length=1, penalty-coef=2.5
halfcheetah-medium-expert-v2: rollout-length=5, penalty-coef=2.5
hopper-medium-expert-v2: rollout-length=5, penalty-coef=5.0
walker2d-medium-expert-v2: rollout-length=1, penalty-coef=2.5
"""

class Model():
    def __init__(self, task, algo, gpu=True):
        self.task = task 
        self.algo = algo
        self.dynamics = None
        self.f_params = {}
        self.experiment_name = f"{self.algo}_{self.task}"

    def set_engine(self):
        if self.algo == "MOPO":
            if self.algo == "halfcheetah-medium-v2" or self.algo == "halfcheetah-medium-replay-v2" or self.algo == "walker2d-medium-v2":
                self.f_params = {'rollout_length': 5, 'penalty_coef': 0.5}
            elif self.algo == "hopper-medium-v2" or self.algo == "hopper-medium-expert-v2":
                self.f_params = {'rollout_length': 5, 'penalty_coef': 5.0}
            elif self.algo == "hopper-medium-replay-v2" or self.algo == "halfcheetah-medium-expert-v2":
                self.f_params = {'rollout_length': 5, 'penalty_coef': 2.5}
            elif self.algo == "walker2d-medium-replay-v2" or self.algo == "walker2d-medium-expert-v2":
                self.f_params = {'rollout_length': 1, 'penalty_coef': 2.5}            

            self.engine = d3rlpy.algos.MOPO(dynamics=self.dynamics, **self.f_params)     

        elif self.algo == "COMBO":
            if self.algo == "halfcheetah-medium-v2" or self.algo == "halfcheetah-medium-replay-v2" or self.algo == "hopper-medium-replay-v2":
                self.f_params = {'rollout_length': 5, 'cql_weight': 0.5}
            elif self.algo == "hopper-medium-v2" or self.algo == "hopper-medium-expert-v2" or self.algo == "halfcheetah-medium-expert-v2":
                self.f_params = {'rollout_length': 5, 'cql_weight': 5.0}
            elif self.algo == "walker2d-medium-v2" or self.algo == "walker2d-medium-expert-v2":
                self.f_params = {'rollout_length': 1, 'cql_weight': 5.0}
            elif self.algo == "walker2d-medium-replay-v2":
                self.f_params = {'rollout_length': 1, 'cql_weight': 0.5}

            self.engine = d3rlpy.algos.COMBO(dynamics=self.dynamics, **self.f_params)
        

    def train_dynamics(self, n_epochs=100, save_interval=100, save_metrics=True, verbose=False, with_timestamp=False):
        self.dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=True)
        train_episodes, test_episodes = train_test_split(self.dataset)
        self.dynamics.fit(train_episodes, eval_episodes=test_episodes, n_epochs=n_epochs, 
        save_interval=save_interval, save_metrics=save_metrics, verbose=verbose, with_timestamp=with_timestamp, experiment_name=self.experiment_name)
            

    def train_engine(self, n_steps=1000000, save_interval=101, save_metrics=False, verbose=False):
        self.set_engine()
        self.engine.fit(self.dataset, n_steps=n_steps, save_interval=save_interval, save_metrics=save_metrics, verbose=verbose)
    
    def train(self, n=50, n_epochs=100, n_steps=1000000, save_engine_interval=100, save_dynamics_interval=100,
               save_dynamics_metrics=True, save_engine_metrics=False, verbose=False, with_timestamp=False):
        for i in range(n):
            self.dataset, self.env = get_d4rl(self.task)
            self.online_env = gym.make(self.task)
            d3rlpy.seed(i)
            self.env.reset(seed=i)
            self.online_env.reset(seed=i)

            # self.train_dynamics(n_epochs=n_epochs, save_interval=save_dynamics_interval, save_metrics=save_dynamics_metrics, verbose=verbose, with_timestamp=with_timestamp)
            # load trained dynamics model
            json_path = f'./d3rlpy_logs/{self.experiment_name}/params.json'
            source_directory = f'./d3rlpy_logs/{self.experiment_name}'
            keyword = "model"
            print(1)

            # Iterate over each file in the directory
            for root, dirs, files in os.walk(source_directory):
                for filename in files:
                    # Check if the file name contains the keyword "model"
                    if keyword in filename:
                        # Get the full path of the file
                        model_path = os.path.join(root, filename)
		    		    
            self.dynamics = ProbabilisticEnsembleDynamics.from_json(json_path)
            self.dynamics.load_model(model_path)
            self.train_engine(n_steps=n_steps, save_interval=save_engine_interval, save_metrics=save_engine_metrics, verbose=verbose)
            scorer = evaluate_on_environment(self.online_env, n_trials=1)
            f = open(f'./txt_files/{algo}_{task}_rollout.txt', 'a+')
            f.write(f"n={i}\n")

            for i in range(1000):       
                normalized_score = self.online_env.get_normalized_score(scorer(self.engine))
                f = open(f'./txt_files/{algo}_{task}_rollout.txt', 'a+')
                f.write(f"{normalized_score}\n")

    

model = Model(task, algo)
model.train(n=1, n_epochs=1, n_steps=10000, save_dynamics_interval=1)


  from .autonotebook import tqdm as notebook_tqdm
pybullet build time: Jun 13 2023 11:48:14
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
load datafile: 100%|██████████| 9/9 [00:10<00:00,  1.21s/it]


1
[2m2023-07-14 18:46:04[0m [[32m[1mdebug    [0m] [1mRandomIterator is selected.[0m
[2m2023-07-14 18:46:04[0m [[32m[1mdebug    [0m] [1mBuilding models...[0m
[2m2023-07-14 18:46:05[0m [[32m[1mdebug    [0m] [1mModels have been built.[0m


Epoch 1/1:   0%|          | 0/10000 [00:00<?, ?it/s]

[2m2023-07-14 18:46:13[0m [[32m[1mdebug    [0m] [1m250000 transitions are generated.[0m [36mfake_transitions[0m=[35m250000[0m [36mreal_transitions[0m=[35m1998000[0m


Epoch 1/1:  10%|▉         | 999/10000 [01:18<09:37, 15.60it/s, critic_loss=15.6, actor_loss=-24, temp_loss=8.99, temp=0.953]   

[2m2023-07-14 18:47:29[0m [[32m[1mdebug    [0m] [1m250000 transitions are generated.[0m [36mfake_transitions[0m=[35m500000[0m [36mreal_transitions[0m=[35m1998000[0m


Epoch 1/1:  20%|█▉        | 1999/10000 [02:29<08:27, 15.76it/s, critic_loss=13.8, actor_loss=-38.5, temp_loss=7.87, temp=0.913]  

[2m2023-07-14 18:48:39[0m [[32m[1mdebug    [0m] [1m250000 transitions are generated.[0m [36mfake_transitions[0m=[35m750000[0m [36mreal_transitions[0m=[35m1998000[0m


Epoch 1/1:  30%|██▉       | 2999/10000 [03:45<08:00, 14.56it/s, critic_loss=13.9, actor_loss=-54.1, temp_loss=6.94, temp=0.876]  

[2m2023-07-14 18:49:58[0m [[32m[1mdebug    [0m] [1m250000 transitions are generated.[0m [36mfake_transitions[0m=[35m1000000[0m [36mreal_transitions[0m=[35m1998000[0m


Epoch 1/1:  40%|███▉      | 3999/10000 [05:07<07:00, 14.26it/s, critic_loss=14.8, actor_loss=-70.8, temp_loss=6.14, temp=0.844]  

[2m2023-07-14 18:51:20[0m [[32m[1mdebug    [0m] [1m250000 transitions are generated.[0m [36mfake_transitions[0m=[35m1250000[0m [36mreal_transitions[0m=[35m1998000[0m


Epoch 1/1:  50%|█████     | 5000/10000 [06:32<05:35, 14.92it/s, critic_loss=16.5, actor_loss=-88.3, temp_loss=5.46, temp=0.814]

: 

: 