In [1]:
!pip install dm_control
!pip install pink-noise-rl
!pip install wandb

Collecting dm_control

  Downloading dm_control-1.0.16-py3-none-any.whl.metadata (1.3 kB)


Collecting dm-env (from dm_control)

  Downloading dm_env-1.6-py3-none-any.whl (26 kB)


Collecting glfw (from dm_control)

  Downloading glfw-2.6.5-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl.metadata (5.4 kB)

Collecting labmaze (from dm_control)

  Downloading labmaze-1.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m


Collecting mujoco>=3.1.1 (from dm_control)

  Downloading mujoco-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m



Collecting pyopengl>=3.1.4 (from dm_control)

  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)

















In [2]:
import gym
from gym import spaces

from dm_control import suite
from dm_env import specs


def convert_dm_control_to_gym_space(dm_control_space):
    r"""Convert dm_control space to gym space. """
    if isinstance(dm_control_space, specs.BoundedArray):
        space = spaces.Box(low=dm_control_space.minimum, 
                           high=dm_control_space.maximum, 
                           dtype=dm_control_space.dtype)
        assert space.shape == dm_control_space.shape
        return space
    elif isinstance(dm_control_space, specs.Array) and not isinstance(dm_control_space, specs.BoundedArray):
        space = spaces.Box(low=-float('inf'), 
                           high=float('inf'), 
                           shape=dm_control_space.shape, 
                           dtype=dm_control_space.dtype)
        return space
    elif isinstance(dm_control_space, dict):
        space = spaces.Dict({key: convert_dm_control_to_gym_space(value)
                             for key, value in dm_control_space.items()})
        return space


class DMSuiteEnv(gym.Env):
    def __init__(self, domain_name, task_name, task_kwargs=None, environment_kwargs=None, visualize_reward=False):
        self.env = suite.load(domain_name, 
                              task_name, 
                              task_kwargs=task_kwargs, 
                              environment_kwargs=environment_kwargs, 
                              visualize_reward=visualize_reward)
        self.metadata = {'render.modes': ['human', 'rgb_array'],
                         'video.frames_per_second': round(1.0/self.env.control_timestep())}
        print(self.env.observation_spec())
        self.observation_space = convert_dm_control_to_gym_space(self.env.observation_spec())
        print(self.observation_space)
        print("________________________")
        print(self.env.action_spec())
        self.action_space = convert_dm_control_to_gym_space(self.env.action_spec())
        print(self.action_space)
        self.viewer = None
    
    def seed(self, seed):
        return self.env.task.random.seed(seed)
    
    def step(self, action):
        timestep = self.env.step(action)
        observation = timestep.observation
        reward = timestep.reward
        done = timestep.last()
        info = {}
        truncated = False
        return observation, reward, done, info
    
    def reset(self):
        timestep = self.env.reset()
        return timestep.observation
    
    def render(self, mode='human', **kwargs):
        if 'camera_id' not in kwargs:
            kwargs['camera_id'] = 0  # Tracking camera
        use_opencv_renderer = kwargs.pop('use_opencv_renderer', False)
        
        img = self.env.physics.render(**kwargs)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                if not use_opencv_renderer:
                    from gym.envs.classic_control import rendering
                    self.viewer = rendering.SimpleImageViewer(maxwidth=1024)
                else:
                    from . import OpenCVImageViewer
                    self.viewer = OpenCVImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen
        else:
            raise NotImplementedError

    def close(self):
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
        return self.env.close()

/opt/conda/lib/python3.10/site-packages/glfw/__init__.py:916: GLFWError: (65544) b'X11: The DISPLAY environment variable is missing'



In [None]:
env1 = ["cartpole", "cartpole", "ball_in_cup", "hopper", "cheetah", "reacher", "pendulum"]
env2 = ["balance_sparse","swingup_sparse","catch","hop","run","hard","swingup"]

In [5]:
corrected_script_path = "/kaggle/input/pinkie/r.py"
new_file_path = "/opt/conda/lib/python3.10/site-packages/pink/sb3.py"

with open(corrected_script_path, 'r') as corrected_script:
    corrected_code = corrected_script.read()
    
with open(new_file_path, "w") as new_file:
    new_file.write(corrected_code)

In [6]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
print(device)

cuda


In [8]:
import gymnasium as gym
import numpy as np
import torch
from pink import PinkNoiseDist
from pink import ColoredNoiseDist
from stable_baselines3 import SAC
import time
from tqdm import tqdm

# Define a function to evaluate an episode
def evaluate_episode(model, env):
    obs = env.reset()
    done = False
    total_reward = 0.0
    steps=0
    while steps<1000 and not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        steps+=1
    return total_reward

# Reproducibility
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
rng = np.random.default_rng(seed)

for i in range(7):
    env = DMSuiteEnv(env1[i],env2[i])
    action_dim = env.action_space.shape[-1]
    seq_len = 1000
    rng = np.random.default_rng(0)

    # Initialize agents
    model_default = SAC("MultiInputPolicy", env, seed=seed)
    model_pink = SAC("MultiInputPolicy", env, seed=seed)
    model_OU = SAC("MultiInputPolicy", env, seed=seed)

    # Set action noise
    model_pink.actor.action_dist = PinkNoiseDist(seq_len, action_dim, rng=rng)
    model_OU.actor.action_dist = ColoredNoiseDist(beta=2, seq_len=seq_len, action_dim=action_dim, rng=rng)

    # Training parameters
    total_timesteps = 1000000
    eval_frequency = 10000 # Evaluate every 104 interactions
    eval_rollouts = 5

    wandb.init(
        project="Pinkie",
        config = {
        "Total_timesteps": total_timesteps,
        "Eval_frequency": eval_frequency,
        "Eval_rollouts": eval_rollouts,
        "Environment": env1[i] + " " + env2[i]
        }
    )

    #Final average performances
    avg_default=0.0
    avg_pink=0.0
    avg_OU=0.0
    final_default=0.0
    final_pink=0.0
    final_OU=0.0

    # Train agents with evaluation
    timesteps_so_far = 0
    # for timesteps_so_far in tqdm(range(0,total_timesteps,eval_frequency)):
    while timesteps_so_far < total_timesteps:
        t1 = time.time()
        # Train the default noise model
        model_default.learn(total_timesteps=eval_frequency)
        t2 = time.time()

        # Evaluate the default noise model
        mean_return_default = 0.0
        for _ in range(eval_rollouts):
            mean_return_default += evaluate_episode(model_default, env)
        mean_return_default /= eval_rollouts
        avg_default+=mean_return_default
        if(timesteps_so_far>=0.95*total_timesteps):
            final_default+=mean_return_default

        print(f"Return (Default): {mean_return_default}")
        print(f"Time taken (Default Model): {t2 - t1:.2f} seconds")
        print(f"Timesteps: {timesteps_so_far}, Mean Return: {mean_return_default}")

        t1=time.time()
        # Train the pink noise model
        model_pink.learn(total_timesteps=eval_frequency)
        # timesteps_so_far += eval_frequency
        t2 = time.time()

        # Evaluate the pink noise model
        mean_return_pink = 0.0
        for _ in range(eval_rollouts):
            mean_return_pink += evaluate_episode(model_pink, env)
        mean_return_pink /= eval_rollouts
        avg_pink+=mean_return_pink
        if(timesteps_so_far>=0.95*total_timesteps):
            final_pink+=mean_return_pink

        print(f"Return (Pink): {mean_return_pink}")
        print(f"Time taken (Pink Noise Model): {t2 - t1:.2f} seconds")
        print(f"Timesteps: {timesteps_so_far}, Mean Return: {mean_return_pink}")

        t1=time.time()
        # Train the pink noise model
        model_OU.learn(total_timesteps=eval_frequency)
        # timesteps_so_far += eval_frequency
        t2 = time.time()

        # Evaluate the pink noise model
        mean_return_OU = 0.0
        for _ in range(eval_rollouts):
            mean_return_OU += evaluate_episode(model_OU, env)
        mean_return_OU/= eval_rollouts
        avg_OU+=mean_return_OU
        if(timesteps_so_far>=0.95*total_timesteps):
            final_OU+=mean_return_OU

        print(f"Return (OU): {mean_return_OU}")
        print(f"Time taken (OU Noise Model): {t2 - t1:.2f} seconds")
        print(f"Timesteps: {timesteps_so_far}, Mean Return: {mean_return_OU}")

        timesteps_so_far += eval_frequency

        wandb.log({
            "mean_return_OU": mean_return_OU,
            "mean_return_pink": mean_return_pink,
            "mean_return_default": mean_return_default,
            "timesteps_so_far": timesteps_so_far
        })

    avg_default/=(total_timesteps/eval_frequency)
    avg_pink/=(total_timesteps/eval_frequency)
    avg_OU/=(total_timesteps/eval_frequency)

    final_default/=(0.05*total_timesteps/eval_frequency)
    final_pink/=(0.05*total_timesteps/eval_frequency)
    final_OU/=(0.05*total_timesteps/eval_frequency)

    wandb.log({
        "final_default": final_default,
        "final_pink": final_pink,
        "final_OU": final_OU,
        "avg_default": avg_default,
        "avg_pink": avg_pink,
        "avg_OU": avg_OU
    })

    print("Mean:")
    print(f"White:{avg_default}           Pink:{avg_pink}             OU:{avg_OU}")
    print("Final:")
    print(f"White:{final_default}           Pink:{final_pink}             OU:{final_OU}")

2024-01-31 07:41:21.843381: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered

2024-01-31 07:41:21.843512: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered

2024-01-31 07:41:21.970989: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered





[34m[1mwandb[0m: Currently logged in as: [33mcathedr4l[0m ([33mrl_team_1[0m). Use [1m`wandb login --relogin`[0m to force relogin

[34m[1mwandb[0m: Tracking run with wandb version 0.16.2

[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240131_074134-fnkfuh97[0m

[34m[1mwandb[0m: Run [1m`wandb offline`[

Return (Default): 32.17231413569819

Time taken (Default Model): 127.98 seconds

Timesteps: 0, Mean Return: 32.17231413569819

Return (Pink): 11.697002003566668

Time taken (Pink Noise Model): 122.81 seconds

Timesteps: 0, Mean Return: 11.697002003566668

Return (OU): 14.235833955091982

Time taken (OU Noise Model): 125.84 seconds

Timesteps: 0, Mean Return: 14.235833955091982

Return (Default): 26.37072504599511

Time taken (Default Model): 121.47 seconds

Timesteps: 10000, Mean Return: 26.37072504599511

Return (Pink): 22.667363252729086

Time taken (Pink Noise Model): 126.14 seconds

Timesteps: 10000, Mean Return: 22.667363252729086

Return (OU): 36.1149053265344

Time taken (OU Noise Model): 121.68 seconds

Timesteps: 10000, Mean Return: 36.1149053265344

Return (Default): 64.72957887443718

Time taken (Default Model): 122.63 seconds

Timesteps: 20000, Mean Return: 64.72957887443718

Return (Pink): 24.51514785623199

Time taken (Pink Noise Model): 126.14 seconds

Timesteps: 20000, 

In [9]:
env = DMSuiteEnv("cartpole","balance_sparse")
env = DMSuiteEnv("cartpole","swingup_sparse")
env = DMSuiteEnv("ball_in_cup","catch")
env = DMSuiteEnv("hopper","hop")
env = DMSuiteEnv("walker","run")
env = DMSuiteEnv("reacher","hard")
env = DMSuiteEnv("pendulum","swingup")

OrderedDict([('position', Array(shape=(3,), dtype=dtype('float64'), name='position')), ('velocity', Array(shape=(2,), dtype=dtype('float64'), name='velocity'))])

Dict('position': Box(-inf, inf, (3,), float64), 'velocity': Box(-inf, inf, (2,), float64))

________________________

BoundedArray(shape=(1,), dtype=dtype('float64'), name=None, minimum=[-1.], maximum=[1.])

Box(-1.0, 1.0, (1,), float64)

OrderedDict([('position', Array(shape=(3,), dtype=dtype('float64'), name='position')), ('velocity', Array(shape=(2,), dtype=dtype('float64'), name='velocity'))])

Dict('position': Box(-inf, inf, (3,), float64), 'velocity': Box(-inf, inf, (2,), float64))

________________________

BoundedArray(shape=(1,), dtype=dtype('float64'), name=None, minimum=[-1.], maximum=[1.])

Box(-1.0, 1.0, (1,), float64)

OrderedDict([('position', Array(shape=(4,), dtype=dtype('float64'), name='position')), ('velocity', Array(shape=(4,), dtype=dtype('float64'), name='velocity'))])

Dict('position': Box(-inf, inf, (