In [None]:
!pip install dm_control
!pip install pink-noise-rl
!pip install wandb

In [None]:
import gym
from gym import spaces

from dm_control import suite
from dm_env import specs


def convert_dm_control_to_gym_space(dm_control_space):
    r"""Convert a dm_control spec (or a dict of specs) to a gym space.

    Parameters
    ----------
    dm_control_space : specs.BoundedArray, specs.Array or dict
        * ``specs.BoundedArray`` becomes a ``spaces.Box`` with the spec's
          minimum / maximum as bounds.
        * any other ``specs.Array`` becomes an unbounded ``spaces.Box`` of the
          spec's shape.
        * a ``dict`` is converted value by value into a ``spaces.Dict``.

    Returns
    -------
    gym.spaces.Space
        The converted space.

    Raises
    ------
    TypeError
        If ``dm_control_space`` is none of the supported types (previously
        this case silently returned ``None``).
    """
    # BoundedArray is checked first: it is a subclass of Array, so the plain
    # Array branch below only sees unbounded specs.
    if isinstance(dm_control_space, specs.BoundedArray):
        space = spaces.Box(low=dm_control_space.minimum,
                           high=dm_control_space.maximum,
                           dtype=dm_control_space.dtype)
        assert space.shape == dm_control_space.shape
        return space
    if isinstance(dm_control_space, specs.Array):
        return spaces.Box(low=-float('inf'),
                          high=float('inf'),
                          shape=dm_control_space.shape,
                          dtype=dm_control_space.dtype)
    if isinstance(dm_control_space, dict):
        return spaces.Dict({key: convert_dm_control_to_gym_space(value)
                            for key, value in dm_control_space.items()})
    raise TypeError(
        f"Cannot convert object of type {type(dm_control_space).__name__} to a gym space")


class DMSuiteEnv(gym.Env):
    """Expose a dm_control suite task through the classic gym ``Env`` interface.

    ``reset`` returns only the observation and ``step`` returns the 4-tuple
    ``(observation, reward, done, info)``.
    """

    def __init__(self, domain_name, task_name, task_kwargs=None, environment_kwargs=None, visualize_reward=False):
        """Load the suite task and derive the gym spaces from its specs.

        All arguments are forwarded unchanged to ``suite.load``. The raw specs
        and the converted spaces are printed for inspection.
        """
        self.env = suite.load(domain_name,
                              task_name,
                              task_kwargs=task_kwargs,
                              environment_kwargs=environment_kwargs,
                              visualize_reward=visualize_reward)
        # Frame rate is the inverse of the control timestep, rounded to an int.
        self.metadata = {'render.modes': ['human', 'rgb_array'],
                         'video.frames_per_second': round(1.0/self.env.control_timestep())}
        print(self.env.observation_spec())
        self.observation_space = convert_dm_control_to_gym_space(self.env.observation_spec())
        print(self.observation_space)
        print("________________________")
        print(self.env.action_spec())
        self.action_space = convert_dm_control_to_gym_space(self.env.action_spec())
        print(self.action_space)
        # Created lazily on the first 'human' render call.
        self.viewer = None

    def seed(self, seed):
        """Seed the random state of the wrapped task."""
        return self.env.task.random.seed(seed)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        ``done`` is True when the wrapped environment reports the last
        timestep of an episode; ``info`` is always an empty dict.
        """
        timestep = self.env.step(action)
        # dm_env reports ``reward=None`` on the first timestep of an episode
        # (which ``step`` can return when the wrapped environment restarts
        # itself). Callers add rewards up, so hand them a number instead.
        reward = 0.0 if timestep.reward is None else timestep.reward
        return timestep.observation, reward, timestep.last(), {}

    def reset(self):
        """Start a new episode and return its first observation."""
        timestep = self.env.reset()
        return timestep.observation

    def render(self, mode='human', **kwargs):
        """Render the current physics state.

        Parameters
        ----------
        mode : str
            ``'rgb_array'`` returns the rendered image; ``'human'`` shows it in
            a viewer window and returns whether that window is open.
        **kwargs
            Forwarded to ``self.env.physics.render``. ``camera_id`` defaults
            to 0. ``use_opencv_renderer`` (default False) is consumed here and
            selects the viewer implementation for ``'human'`` mode.

        Raises
        ------
        NotImplementedError
            For any other ``mode``.
        """
        if 'camera_id' not in kwargs:
            kwargs['camera_id'] = 0  # Tracking camera
        use_opencv_renderer = kwargs.pop('use_opencv_renderer', False)

        img = self.env.physics.render(**kwargs)
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            if self.viewer is None:
                if not use_opencv_renderer:
                    from gym.envs.classic_control import rendering
                    self.viewer = rendering.SimpleImageViewer(maxwidth=1024)
                else:
                    # NOTE(review): a relative import only resolves when this
                    # class lives inside a package; it cannot work from a
                    # notebook cell. Confirm where OpenCVImageViewer lives.
                    from . import OpenCVImageViewer
                    self.viewer = OpenCVImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen
        else:
            raise NotImplementedError

    def close(self):
        """Close the viewer (if one was opened) and the wrapped environment."""
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None
        return self.env.close()

In [None]:
import os

import wandb

# Never hardcode the API key in the notebook: it is read from the
# WANDB_API_KEY environment variable instead. When the variable is not set,
# `key=None` lets wandb fall back to its regular login flow.
# Any key that was previously committed in this cell should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Overwrite the installed `pink/sb3.py` module with the script shipped in the
# Kaggle input dataset.
# NOTE(review): the destination hardcodes a Python 3.10 conda site-packages
# layout - confirm it matches the kernel this notebook runs on.
corrected_script_path = "/kaggle/input/pinkie/r.py"
new_file_path = "/opt/conda/lib/python3.10/site-packages/pink/sb3.py"

# Read the replacement source completely before touching the destination.
with open(corrected_script_path, 'r') as corrected_script:
    corrected_code = corrected_script.read()
    
# Replace the contents of the installed module with the text read above.
with open(new_file_path, "w") as new_file:
    new_file.write(corrected_code)

In [None]:
import torch
# Use the GPU when torch reports that CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show which device was selected above.
print(device)

In [None]:
# Build the gym-style wrapper around the suite domain "cartpole", task "balance_sparse".
env = DMSuiteEnv("cartpole","balance_sparse")

In [None]:
# Make `device` the default device for tensors that torch creates from here on.
torch.set_default_device(device)

In [None]:
"""Colored noise generation script
Modified from colorednoise package: https://github.com/felixpatzelt/colorednoise
"""

import numpy as np
from numpy.fft import irfft, rfftfreq


def powerlaw_psd_gaussian(exponent, size, fmin=0, rng=None):
    """Draw Gaussian noise whose power spectrum follows (1 / f)**exponent.

    Implements the algorithm of
    Timmer, J. and Koenig, M.: On generating power law noise.
    Astron. Astrophys. 300, 707-710 (1995).

    The output is divided by the theoretical standard deviation implied by
    the spectral scaling, i.e. it is normalised to unit variance.

    Parameters
    ----------
    exponent : float
        Exponent beta of the power spectrum S(f) = (1 / f)**beta.
        beta = 1 gives flicker / pink noise, beta = 2 gives brown noise.
    size : int or iterable of int
        Shape of the output. The last axis is time and carries the requested
        spectrum; all other axes hold independent series.
    fmin : float, optional
        Low-frequency cutoff relative to a unit sampling rate, between 0 and
        0.5 (the Nyquist frequency). The spectrum is flat below the cutoff.
        Internally the value is raised to at least 1 / samples, the lowest
        finite frequency of the series. Default: 0.
    rng : np.random.Generator, optional
        Source of the normal draws. When omitted, a fresh generator from
        `np.random.default_rng()` is used.

    Returns
    -------
    out : array
        The samples, with shape `size`.

    Raises
    ------
    ValueError
        If `fmin` is not between 0 and 0.5.
    """
    # Work on a mutable list version of the shape; a bare int becomes [int].
    try:
        shape = list(size)
    except TypeError:
        shape = [size]

    # Length of every individual time series
    n_samples = shape[-1]

    # Frequencies of the real-input FFT at unit sample rate
    freqs = rfftfreq(n_samples)

    # Reject an invalid cutoff, then clamp it to the lowest finite frequency
    if not (0 <= fmin <= 0.5):
        raise ValueError("fmin must be chosen between 0 and 0.5.")
    cutoff = max(fmin, 1./n_samples)

    # Flatten the spectrum below the cutoff, then turn frequencies into
    # per-frequency amplitude scales
    n_below = np.sum(freqs < cutoff)
    if n_below and n_below < len(freqs):
        freqs[:n_below] = freqs[n_below]
    amplitude = freqs**(-exponent/2.)

    # Theoretical standard deviation of the output (DC term excluded)
    weights = amplitude[1:].copy()
    weights[-1] *= (1 + (n_samples % 2)) / 2.    # correct f = +-0.5
    sigma = 2 * np.sqrt(np.sum(weights**2)) / n_samples

    # One Fourier coefficient per frequency along the last axis
    shape[-1] = len(freqs)

    # Prepend singleton axes so the amplitudes broadcast over the leading axes
    leading_axes = (None,) * (len(shape) - 1)
    amplitude = amplitude[leading_axes + (Ellipsis,)]

    # Real part is drawn first, imaginary part second
    generator = np.random.default_rng() if rng is None else rng
    real_part = generator.normal(scale=amplitude, size=shape)
    imag_part = generator.normal(scale=amplitude, size=shape)

    # For an even length the +/- 0.5 frequencies coincide, so that
    # coefficient has to be real
    if n_samples % 2 == 0:
        imag_part[..., -1] = 0
        real_part[..., -1] *= np.sqrt(2)    # Fix magnitude

    # The DC coefficient is always real
    imag_part[..., 0] = 0
    real_part[..., 0] *= np.sqrt(2)    # Fix magnitude

    # Back to the time domain, scaled to unit variance
    spectrum = real_part + 1J * imag_part
    return irfft(spectrum, n=n_samples, axis=-1) / sigma

In [None]:
class ColoredNoiseProcess():
    """Endless stream of colored noise backed by a fixed-length buffer.

    A colored noise series of `size[-1]` time steps is generated up front and
    handed out piece by piece. Once it is used up, a fresh series replaces it.
    Because that cut affects the PSD of the concatenated signal, the maximum
    period (1 / low-frequency cutoff) can be specified.

    Methods
    -------
    sample(T=1)
        Return the next `T` time steps of the process.
    reset()
        Replace the buffer with a newly generated series.
    """
    def __init__(self, beta, size, scale=1, max_period=None, rng=None):
        """Set up the process and fill the buffer for the first time.

        Parameters
        ----------
        beta : float
            Exponent of the power-law spectrum of the noise.
        size : int or tuple of int
            Shape of one generated series. The last dimension (`size[-1]`) is
            the time axis, so it bounds the correlation length of the signal.
        scale : int, optional, by default 1
            Factor every returned sample is multiplied with.
        max_period : float, optional, by default None
            Maximum correlation length of the generated series
            (1 / low-frequency cutoff). With None a cutoff of 0 is passed to
            the generator.
        rng : np.random.Generator, optional
            Random number generator handed to the noise generator
            (for reproducibility).
        """
        self.beta = beta
        self.minimum_frequency = 0 if max_period is None else 1 / max_period
        self.scale = scale
        self.rng = rng

        # Keep the shape as a list; a bare int describes a single series
        try:
            shape = list(size)
        except TypeError:
            shape = [size]
        self.size = shape
        self.time_steps = shape[-1]

        # Generate the first series and point the read index at its start
        self.reset()

    def reset(self):
        """Generate a new series into the buffer and rewind the read index."""
        self.buffer = powerlaw_psd_gaussian(
            exponent=self.beta,
            size=self.size,
            fmin=self.minimum_frequency,
            rng=self.rng,
        )
        self.idx = 0

    def sample(self, T=1):
        """
        Return the next `T` time steps of the process.

        Whenever the buffer is exhausted it is regenerated before reading on.

        Parameters
        ----------
        T : int, optional, by default 1
            Number of time steps to return.

        Returns
        -------
        array_like
            Scaled samples of shape `(*size[:-1], T)`; for a single time step
            the time axis is dropped.
        """
        pieces = []
        drawn = 0
        while drawn < T:
            if self.idx >= self.time_steps:
                self.reset()
            take = min(T - drawn, self.time_steps - self.idx)
            stop = self.idx + take
            pieces.append(self.buffer[..., self.idx:stop])
            self.idx = stop
            drawn += take

        scaled = self.scale * np.concatenate(pieces, axis=-1)
        if drawn > 1:
            return scaled
        return scaled[..., 0]


class PinkNoiseProcess(ColoredNoiseProcess):
    """Endless pink noise stream: a `ColoredNoiseProcess` with exponent 1.

    Methods
    -------
    sample(T=1)
        Return the next `T` time steps of the pink noise process.
    reset()
        Replace the buffer with a newly generated series.
    """
    def __init__(self, size, scale=1, max_period=None, rng=None):
        """Set up a colored noise process whose exponent is fixed to 1.

        Parameters
        ----------
        size : int or tuple of int
            Shape of one generated series. The last dimension (`size[-1]`) is
            the time axis, so it bounds the correlation length of the signal.
        scale : int, optional, by default 1
            Factor every returned sample is multiplied with.
        max_period : float, optional, by default None
            Maximum correlation length of the generated series
            (1 / low-frequency cutoff).
        rng : np.random.Generator, optional
            Random number generator handed to the noise generator
            (for reproducibility).
        """
        super().__init__(beta=1, size=size, scale=scale, max_period=max_period, rng=rng)

In [None]:
import numpy as np
import torch as th
from stable_baselines3.common.distributions import SquashedDiagGaussianDistribution
from stable_baselines3.common.noise import ActionNoise

def sigmoid_like(x, a, b, c, d, e):
    """Evaluate ``a - b / (1 + exp(-(x - c) * (d / e)))``.

    This is a logistic curve in ``x`` centred at ``c`` with slope factor
    ``d / e``, scaled by ``b`` and subtracted from the offset ``a``.
    """
    exponent = -(x - c) * (d / e)
    logistic = b / (1 + np.exp(exponent))
    return a - logistic

class ColoredNoiseDist(SquashedDiagGaussianDistribution):
    """Squashed Gaussian whose samples use colored noise scaled by state novelty.

    Besides the colored noise generator(s), the distribution keeps a bounded
    memory of visited observations. The more often the neighbourhood of the
    current observation has been visited, the smaller the factor the noise is
    multiplied with when sampling.
    """

    def __init__(self, beta, seq_len, action_dim=None, rng=None, epsilon=1e-6):
        """
        Gaussian colored noise distribution for using colored action noise with stochastic policies.

        The colored noise is only used for sampling actions. In all other respects, this class acts like its parent
        class (`SquashedDiagGaussianDistribution`).

        Parameters
        ----------
        beta : float or array_like
            Exponent(s) of colored noise power-law spectra. If it is a single float, then `action_dim` has to be
            specified and the noise will be sampled in a vectorized manner for each action dimension. If it is
            array_like, then it specifies one beta for each action dimension. This allows different betas for different
            action dimensions, but sampling might be slower for high-dimensional action spaces.
        seq_len : int
            Length of sampled colored noise signals. If sampled for longer than `seq_len` steps, a new
            colored noise signal of the same length is sampled. Should usually be set to the episode length
            (horizon) of the RL task.
        action_dim : int, optional
            Dimensionality of the action space. If passed, `beta` has to be a single float and the noise will be
            sampled in a vectorized manner for each action dimension.
        rng : np.random.Generator, optional
            Random number generator (for reproducibility). If not passed, a new random number generator is created by
            calling `np.random.default_rng()`.
        epsilon : float, optional, by default 1e-6
            A small value to avoid NaN due to numerical imprecision.
        """
        assert (action_dim is not None) == np.isscalar(beta), \
            "`action_dim` has to be specified if and only if `beta` is a scalar."

        if np.isscalar(beta):
            super().__init__(action_dim, epsilon)
            self.beta = beta
            self.gen = ColoredNoiseProcess(beta=self.beta, size=(action_dim, seq_len), rng=rng)
        else:
            super().__init__(len(beta), epsilon)
            self.beta = np.asarray(beta)
            self.gen = [ColoredNoiseProcess(beta=b, size=seq_len, rng=rng) for b in self.beta]

        # Memory of visited observations used to scale the noise.
        self.vis_states = []             # stored observation vectors
        self._state_noise_weights = []   # number of observations merged into each stored vector
        self.exp_radius = 0.005          # Euclidean radius within which a stored state counts as a neighbour
        self.t_thresh = 100              # neighbour count at which the scale factor saturates
        self.max_bins = 1000             # maximum number of stored observation vectors

    def sample(self, obs) -> th.Tensor:
        """Sample squashed actions with novelty-scaled colored noise.

        Parameters
        ----------
        obs : mapping
            Anything `dict()` accepts whose values expose `.tolist()`. Only
            the first entry (`[0]`) of every value is used to build the
            observation vector for the visitation memory.

        Returns
        -------
        th.Tensor
            `tanh` of the Gaussian actions, which are also stored in
            `self.gaussian_actions`.
        """
        mean = self.distribution.mean
        if np.isscalar(self.beta):
            noise = self.gen.sample()
        else:
            noise = np.array([cnp.sample() for cnp in self.gen])
        # Put the noise on the device of the distribution itself, for both
        # beta variants, instead of relying on a notebook-level `device`.
        cn_sample = th.tensor(noise).float().to(mean.device)

        # Concatenate the first row of every observation entry into one vector
        first_row = []
        for arr in dict(obs).values():
            first_row += arr.tolist()[0]
        last_obs = np.array(first_row).flatten()

        # Reverse sigmoid on the clipped neighbour count: with the default
        # t_thresh the factor falls from about 1.88 (count 0) to about 1.12
        # (count >= t_thresh).
        t = np.clip(self.count_and_update_neighbors(last_obs), 0, self.t_thresh)
        ss_factor = sigmoid_like(t, 2, 1, self.t_thresh / 2, 4, self.t_thresh)

        self.gaussian_actions = mean + self.distribution.stddev*cn_sample*ss_factor
        return th.tanh(self.gaussian_actions)

    def __repr__(self) -> str:
        return f"ColoredNoiseDist(beta={self.beta})"

    def count_and_update_neighbors(self, last_obs):
        """Count stored neighbours of `last_obs` and merge it into the memory.

        While fewer than `max_bins` observations are stored, `last_obs` is
        appended with weight 1 and 1 is returned. Afterwards the return value
        is the summed weight of all stored states within `exp_radius`
        (Euclidean distance), and the nearest stored state is replaced by the
        weighted running mean of itself and `last_obs`.

        Parameters
        ----------
        last_obs : np.ndarray
            Flat observation vector.

        Returns
        -------
        int
            Neighbour count as described above.
        """
        if len(self.vis_states) < self.max_bins:
            self.vis_states.append(last_obs)
            self._state_noise_weights.append(1)
            return 1

        count = 0
        least_dist_ind = 0
        least_dist = np.linalg.norm(self.vis_states[0]-last_obs)

        for i, state in enumerate(self.vis_states):
            dist = np.linalg.norm(state-last_obs)
            if dist < least_dist:
                least_dist_ind = i
                least_dist = dist
            if dist <= self.exp_radius:
                count += self._state_noise_weights[i]

        # Running mean: (old_mean * weight + new) / (weight + 1)
        self.vis_states[least_dist_ind] = (self.vis_states[least_dist_ind] * self._state_noise_weights[least_dist_ind] + last_obs)
        self._state_noise_weights[least_dist_ind] += 1
        self.vis_states[least_dist_ind] /= self._state_noise_weights[least_dist_ind]

        return count


class PinkNoiseDist(ColoredNoiseDist):
    """`ColoredNoiseDist` with the noise exponent fixed to 1 (pink noise)."""

    def __init__(self, seq_len, action_dim, rng=None, epsilon=1e-6):
        """
        Set up a colored noise distribution that samples actions with pink noise.

        Parameters
        ----------
        seq_len : int
            Length of the sampled noise signals. When more than `seq_len` steps are sampled, a new signal of the
            same length is generated. Should usually equal the episode length (horizon) of the RL task.
        action_dim : int
            Dimensionality of the action space.
        rng : np.random.Generator, optional
            Random number generator (for reproducibility).
        epsilon : float, optional, by default 1e-6
            A small value to avoid NaN due to numerical imprecision.
        """
        super().__init__(beta=1, seq_len=seq_len, action_dim=action_dim, rng=rng, epsilon=epsilon)

In [None]:
import gymnasium as gym
import numpy as np
import torch
import time
from tqdm import tqdm
from stable_baselines3 import SAC

# Define a function to evaluate an episode
def evaluate_episode(model, env, max_steps=1000):
    """Run one deterministic evaluation episode and return its total reward.

    Parameters
    ----------
    model : object
        Provides `predict(obs, deterministic=True)` returning `(action, state)`.
    env : object
        Provides `reset()` returning an observation and `step(action)`
        returning `(obs, reward, done, info)`.
    max_steps : int, optional, by default 1000
        Upper bound on the number of environment steps taken.

    Returns
    -------
    float
        Sum of the rewards collected until `done` or `max_steps` is reached.
        Steps that report `reward=None` contribute nothing.
    """
    obs = env.reset()
    total_reward = 0.0
    for _step in range(max_steps):
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, _info = env.step(action)
        if reward is not None:
            total_reward += reward
        if done:
            break
    return total_reward

# Reproducibility
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
rng = np.random.default_rng(seed)

# Length of one noise signal and dimensionality of the action space
seq_len = 1000
action_dim = env.action_space.shape[-1]

# Initialize agent
model = SAC("MultiInputPolicy", env, seed=seed)

# Set action noise.
# To compare against another noise colour, build a second SAC model the same
# way and assign e.g. ColoredNoiseDist(beta=2, seq_len=seq_len,
# action_dim=action_dim, rng=rng) to its actor before training it.
model.actor.action_dist = PinkNoiseDist(seq_len=seq_len, action_dim=action_dim, rng=rng)

# Training parameters
total_timesteps = 1000000
eval_frequency = 10000  # Evaluate every 10^4 interactions
eval_rollouts = 5

wandb.init(
    project="Pinkie",
    config={
        "Total_timesteps": total_timesteps,
        "Eval_frequency": eval_frequency,
        "Eval_rollouts": eval_rollouts
    }
)

# Running sums of the evaluation returns and the number of evaluations that
# went into each of them. Dividing by the real counts keeps the averages
# correct when total_timesteps is not a multiple of eval_frequency.
avg_pink = 0.0
final_pink = 0.0
num_evals = 0
num_final_evals = 0

# Train the agent in chunks of `eval_frequency` steps, evaluating after each
timesteps_so_far = 0
while timesteps_so_far < total_timesteps:
    t1 = time.time()
    # Train the pink noise model.
    # NOTE(review): every call uses the default `reset_num_timesteps=True`, so
    # SB3 restarts its timestep counter for each chunk. Switching it off
    # would also stop SB3 from resetting the training env at the start of a
    # chunk, which is needed here because the evaluation below steps the same
    # `env`. Use a separate evaluation env before changing this.
    model.learn(total_timesteps=eval_frequency)
    t2 = time.time()

    # Evaluate the pink noise model
    mean_return_pink = 0.0
    for _ in range(eval_rollouts):
        mean_return_pink += evaluate_episode(model, env)
    mean_return_pink /= eval_rollouts
    avg_pink += mean_return_pink
    num_evals += 1
    # Evaluations that start in the last 5% of training form the "final" score
    if timesteps_so_far >= 0.95*total_timesteps:
        final_pink += mean_return_pink
        num_final_evals += 1

    print(f"Return (Pink): {mean_return_pink}")
    print(f"Time taken (Pink Noise Model): {t2 - t1:.2f} seconds")
    print(f"Timesteps: {timesteps_so_far}, Mean Return: {mean_return_pink}")

    timesteps_so_far += eval_frequency

    wandb.log({
        "mean_return_pink": mean_return_pink,
        "timesteps_so_far": timesteps_so_far
    })

# Turn the sums into means (max(..., 1) guards against an empty run)
avg_pink /= max(num_evals, 1)
final_pink /= max(num_final_evals, 1)

wandb.log({
    "final_pink": final_pink,
    "avg_pink": avg_pink,
})

print("Mean:")
print(f"Pink:{avg_pink}")
print("Final:")
print(f"Pink:{final_pink}")

In [None]:
# env = DMSuiteEnv("cartpole","balance_sparse")
# env = DMSuiteEnv("cartpole","swingup_sparse")
# env = DMSuiteEnv("ball_in_cup","catch")
# env = DMSuiteEnv("hopper","hop")
# env = DMSuiteEnv("walker","run")
# env = DMSuiteEnv("reacher","hard")
# env = DMSuiteEnv("pendulum","swingup")