In [24]:
from stable_baselines3 import DDPG
import gym
from gym import spaces
import pandas as pd
import numpy as np
import torch

In [29]:
# Custom environment
class PortfolioChoice(gym.Env):

  def __init__(self, T):
    super(PortfolioChoice, self).__init__()
    # Actions are (risk_alloc, consumption)
    self.action_space = spaces.Box(low=np.array([-1.0, -1.0]), high=np.array([1.0, 1.0]), dtype=np.float32)
    # Observations are (wealth, time)
    self.observation_space = spaces.Box(low=np.array([0.0, 0.0]), high=np.array([np.inf, 1.0]), dtype=np.float64)

    self.curr_wealth = 1.0
    self.curr_time = 0
    self.horizon = 10
    self.desired_bequest = 0
    self.risk_aversion = 2

  # action space is [-1, 1]
  def rescale_riskalloc(self, risk_alloc):
    return (risk_alloc)*2+1
  def rescale_consumption(self, consumption):
    return (consumption+1)/2

  def update_wealth(self, action, time_step):
    # scaling returns based on time_step
    exp_return = 1-(1-0.08)**(time_step)
    rf_rate = 1-(1-0.05)**(time_step)
    sd_return = 0.15/np.sqrt(1/time_step)
    risk_alloc = self.rescale_riskalloc(action[0])
    consumption = self.rescale_consumption(action[1])
    curr_wealth = self.curr_wealth
    self.curr_wealth = (exp_return + risk_alloc*(exp_return - rf_rate))*curr_wealth - consumption + curr_wealth*risk_alloc*sd_return*np.random.normal()
    return self.curr_wealth

  def utility(self, consumption):
    gamma = self.risk_aversion
    utils = consumption**(1-gamma)
    return utils

  # every round has at least 10 steps with an average of 20, advance time by random draw from (0,1)
  def step(self, action):
    time_step = np.random.random()
    reward = 0
    horizon = self.horizon
    curr_time = self.curr_time

    # check if end of simulation has been reached
    if curr_time + time_step > horizon:
      time_step = horizon - curr_time
      done = True
    else:
      done = False

    self.curr_time += curr_time + time_step
    self.update_wealth(action, time_step)

    consumption = self.rescale_consumption(action[1])
    reward = self.utility(consumption)

    # floor wealth at 0, trigger an exit
    if self.curr_wealth < 0:
      self.curr_wealth = 0
      done = True

    obs = np.array([self.curr_wealth, self.curr_time])
    info = {}

    return obs, reward, done, info

  def reset(self):
    self.curr_wealth = 1.0
    self.curr_time = 0
    return np.array([1.0, 0.0])


In [30]:
from stable_baselines3.common.env_checker import check_env
env = PortfolioChoice(10)
check_env(env)


  logger.warn(


In [32]:

from stable_baselines3.ddpg.policies import MlpPolicy
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG

env = PortfolioChoice(10)

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

model = DDPG(MlpPolicy, env, verbose=1, action_noise=action_noise)
model.learn(total_timesteps=1000)
# model.save("ddpg_estimates")

# demonstrate model
# while True:
    # action, _states = model.predict(obs)
    # obs, rewards, dones, info = env.step(action)
    # env.render()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 2.48     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 960      |
|    time_elapsed    | 0        |
|    total_timesteps | 4        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 3.95     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 955      |
|    time_elapsed    | 0        |
|    total_timesteps | 8        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 5.12     |
| time/              |          |
|    episodes        | 12       |
|    fps             |

  utils = consumption**(1-gamma)
  sd_return = 0.15/np.sqrt(1/time_step)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.31     |
|    ep_rew_mean     | nan      |
| time/              |          |
|    episodes        | 432      |
|    fps             | 123      |
|    time_elapsed    | 3        |
|    total_timesteps | 469      |
| train/             |          |
|    actor_loss      | nan      |
|    critic_loss     | nan      |
|    learning_rate   | 0.001    |
|    n_updates       | 363      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.5      |
|    ep_rew_mean     | nan      |
| time/              |          |
|    episodes        | 436      |
|    fps             | 121      |
|    time_elapsed    | 4        |
|    total_timesteps | 492      |
| train/             |          |
|    actor_loss      | nan      |
|    critic_loss     | nan      |
|    learning_rate   | 0.001    |
|    n_updates       | 386      |
--------------

<stable_baselines3.ddpg.ddpg.DDPG at 0x16bcd1654c0>