In [1]:
from stable_baselines3 import DDPG
import gym
from gym import spaces
import pandas as pd
import numpy as np
import torch

In [584]:
# My consumption is a bit weird because of the random intervals

# Custom environment
class PortfolioChoice(gym.Env):

  def __init__(self, T):
    super(PortfolioChoice, self).__init__()
    # Actions are (risk_alloc, consumption)
    self.action_space = spaces.Box(low=np.array([-1.0, -1.0]), high=np.array([1.0, 1.0]), dtype=np.float32)
    # Observations are (wealth, time)
    self.observation_space = spaces.Box(low=np.array([0.0, 0.0]), high=np.array([np.inf, 1.0]), dtype=np.float64)

    self.curr_wealth = 1.0
    self.curr_time = 0
    self.horizon = 10
    self.desired_bequest = 0
    self.risk_aversion = 1
    self.rho = 0.05

  # action space is [-1, 1]
  def rescale_riskalloc(self, risk_alloc):
    return (risk_alloc)*2+1
  def rescale_consumption(self, consumption):
    return (consumption+1)/2

  # not incorporating dt into my consumption measure!
  # when i choose consumption it should be scaled by the length of the period for that consumption!
  # ...do i just not really know how SDEs work?
  # need to think about SDEs more carefully here! maybe when I'm better rested
  # a puzzle to dig into! how can I make consumption independent of time step?
  def update_wealth(self, action, time_step):
    time_scale = time_step*self.horizon # scaling returns based on time
    exp_return = 0.08
    rf_rate = 0.05
    sd_return = 0.15 #/np.sqrt(1/time_scale)
    curr_wealth = self.curr_wealth
    risk_alloc = self.rescale_riskalloc(action[0])
    consumption = self.rescale_consumption(action[1])/time_step

    returns = (rf_rate + risk_alloc*(exp_return - rf_rate) - consumption)*time_step + risk_alloc*time_step*sd_return*np.random.normal()
    self.curr_wealth = curr_wealth*(1+returns)
  
  def calculate_utility(self, consumption):
    period = self.curr_time*self.horizon
    utility = np.log(1+consumption*1e6)*np.exp(-self.rho*period)
    return utility

  def step(self, action, time_step=0):
    # bit of a hack here so I can manually input timesteps
    if time_step==0:
      time_step = 1/4*1/self.horizon
    curr_time = self.curr_time

    # check if end of simulation has been reached
    if curr_time + time_step>=1:
      time_step = 1 - curr_time
      done = True
    else:
      done = False

    # calculate reward
    consumption = self.rescale_consumption(action[1])*self.curr_wealth
    reward = self.calculate_utility(consumption)

    self.curr_time += time_step
    self.update_wealth(action, time_step)

    # floor wealth at 0, trigger an exit
    if self.curr_wealth <= 0:
      self.curr_wealth = 0
      done = True

    obs = np.array([self.curr_wealth, self.curr_time])
    info = {}

    return obs, reward, done, info

  def reset(self):
    self.curr_wealth = 1.0
    self.curr_time = 0
    return np.array([1.0, 0.0])


In [585]:
from stable_baselines3.common.env_checker import check_env
env = PortfolioChoice(10)
check_env(env)

  logger.warn(


In [586]:

from stable_baselines3.ddpg.policies import MlpPolicy
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3 import DDPG
import torch as th

env = PortfolioChoice(10)

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3)*np.ones(n_actions))
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))

model = DDPG(MlpPolicy, env, verbose=1, action_noise=action_noise)
model.learn(total_timesteps=10000)
# model.save("ddpg_estimates")

# demonstrate model
#while True:
    #action, _states = model.predict(obs)
    #obs, rewards, dones, info = env.step(action)
    #env.render()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 38       |
|    ep_rew_mean     | 91.7     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 361      |
|    time_elapsed    | 0        |
|    total_timesteps | 152      |
| train/             |          |
|    actor_loss      | -4.75    |
|    critic_loss     | 11.8     |
|    learning_rate   | 0.001    |
|    n_updates       | 40       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21       |
|    ep_rew_mean     | 57.2     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 195      |
|    time_elapsed    | 0        |
|    total_timesteps | 168      |
| train/             |          |
|    actor_loss      | -3.06    |
|    critic_loss     |

<stable_baselines3.ddpg.ddpg.DDPG at 0x2ad116d47f0>

In [587]:
obs = env.reset()
for n in range(20):
    action, _states = model.predict(obs)
    print(action)
    obs, rewards, dones, info = env.step(action,0.05)
    print(obs)

[-0.9999985 -0.917827 ]
[0.95944113 0.05      ]
[-0.9999992  -0.92243534]
[0.93008022 0.1       ]
[-0.9999994 -0.9227529]
[0.90381278 0.15      ]
[-0.99999917 -0.9133485 ]
[0.86281685 0.2       ]
[-0.999997   -0.89769274]
[0.82601591 0.25      ]
[-0.99998885 -0.87894756]
[0.7746074 0.3      ]
[-0.9999283 -0.8629127]
[0.72857305 0.35      ]
[-0.99965334 -0.8461721 ]
[0.66281012 0.4       ]
[-0.996334   -0.83728886]
[0.61043558 0.45      ]
[-0.97675    -0.82985467]
[0.55156521 0.5       ]
[-0.8448576 -0.8257804]
[0.5078558 0.55     ]
[-0.704014  -0.8222807]
[0.46226661 0.6       ]
[-0.70764303 -0.82048875]
[0.42229434 0.65      ]
[-0.7519663  -0.81796676]
[0.38277749 0.7       ]
[-0.9077324 -0.8167898]
[0.34909046 0.75      ]
[-0.9685038 -0.8142281]
[0.31763592 0.8       ]
[-0.98946685 -0.8109508 ]
[0.28491137 0.85      ]
[-0.99620974 -0.80678034]
[0.25480204 0.9       ]
[-0.99846584 -0.8007318 ]
[0.2297313 0.95     ]
[-0.99939567 -0.7924794 ]
[0.20517175 1.        ]
