# In [9]:
from stable_baselines3 import DDPG
import gym
from gym import spaces
import pandas as pd
import numpy as np

# In [42]:
# Custom environment
class PortfolioChoice(gym.Env):
  """Finite-horizon consumption / portfolio-choice environment.

  The agent starts with ``initial_wealth`` and, at randomly spaced decision
  times, picks the fraction of wealth held in a risky asset and an absolute
  amount to consume. The reward of a step is the CRRA utility of that
  consumption. An episode ends when the horizon ``T`` is reached or when
  wealth falls below zero.

  Uses the classic Gym API: ``reset()`` returns the observation and
  ``step()`` returns ``(obs, reward, done, info)``.

  Args:
    T: Length of the horizon in years; must be positive.
    exp_return: Annual expected return of the risky asset.
    rf_rate: Annual risk-free rate.
    sd_return: Annual volatility of the risky asset.
    risk_aversion: CRRA coefficient (gamma); 1 selects log utility.
    initial_wealth: Wealth at the start of every episode.

  Raises:
    ValueError: If ``T`` is not positive.
  """

  # Consumption is floored at this value inside ``utility`` so that the reward
  # stays finite when the agent consumes nothing (CRRA utility diverges at 0).
  MIN_CONSUMPTION = 1e-6

  def __init__(self, T, *, exp_return=0.08, rf_rate=0.05, sd_return=0.15,
               risk_aversion=2, initial_wealth=1.0):
    super().__init__()
    if T <= 0:
      raise ValueError(f"T must be positive, got {T!r}")

    # Actions are (risk_alloc, consumption)
    self.action_space = spaces.Box(
        low=np.array([-1.0, 0.0]),
        high=np.array([2.0, 1.0]),
        dtype=np.float64,
    )
    # Observations are (wealth, time). Time runs from 0 up to the horizon, so
    # the upper bound must be T; otherwise observations leave the declared space.
    self.observation_space = spaces.Box(
        low=np.array([0.0, 0.0]),
        high=np.array([np.inf, float(T)]),
        dtype=np.float64,
    )

    self.horizon = float(T)
    self.exp_return = exp_return
    self.rf_rate = rf_rate
    self.sd_return = sd_return
    self.risk_aversion = risk_aversion
    self.initial_wealth = float(initial_wealth)
    self.desired_bequest = 0

    self.curr_wealth = self.initial_wealth
    self.curr_time = 0.0

  def update_wealth(self, action, time_step):
    """Advance wealth by ``time_step`` years under ``action`` and return it.

    ``action[0]`` is the fraction of wealth in the risky asset and
    ``action[1]`` is the absolute amount consumed. The result is stored in
    ``self.curr_wealth`` and may be negative; ``step`` applies the floor.
    """
    risk_alloc = float(action[0])
    consumption = float(action[1])

    # Scale the annual parameters to the length of this step: rates compound,
    # volatility grows with the square root of time (also safe for a 0 step).
    risky_growth = (1.0 + self.exp_return) ** time_step - 1.0
    riskless_growth = (1.0 + self.rf_rate) ** time_step - 1.0
    step_sd = self.sd_return * np.sqrt(time_step)

    wealth = self.curr_wealth
    # Gross expected growth: the whole portfolio earns the risk-free rate and
    # the risky share additionally earns the excess return.
    expected_factor = 1.0 + riskless_growth + risk_alloc * (risky_growth - riskless_growth)
    shock = wealth * risk_alloc * step_sd * np.random.normal()

    self.curr_wealth = wealth * expected_factor - consumption + shock
    return self.curr_wealth

  def utility(self, consumption):
    """Return the CRRA utility of ``consumption`` as a Python float.

    u(c) = c**(1 - gamma) / (1 - gamma) for gamma != 1 and log(c) for
    gamma == 1, with c floored at ``MIN_CONSUMPTION``.
    """
    gamma = self.risk_aversion
    c = max(float(consumption), self.MIN_CONSUMPTION)
    if gamma == 1:
      return float(np.log(c))
    # Dividing by (1 - gamma) keeps utility increasing in consumption for gamma > 1.
    return float(c ** (1 - gamma) / (1 - gamma))

  # Time advances by a random draw from (0, 1) each step, so an episode that is
  # not cut short by bankruptcy has at least T steps and about 2*T on average.
  def step(self, action):
    """Apply ``action`` for one random-length period.

    Returns:
      ``(obs, reward, done, info)`` where ``obs`` is ``[wealth, time]``,
      ``reward`` is the utility of the consumption chosen in ``action[1]``
      and ``done`` is True at the horizon or once wealth has gone negative.
    """
    time_step = np.random.random()

    # Check if the end of the simulation has been reached; truncate the last step.
    remaining = self.horizon - self.curr_time
    done = bool(time_step >= remaining)
    if done:
      time_step = remaining
      # Assign the horizon directly so rounding cannot push time past the bound.
      self.curr_time = self.horizon
    else:
      self.curr_time = self.curr_time + time_step

    self.update_wealth(action, time_step)
    reward = self.utility(action[1])

    # Floor wealth at 0, trigger an exit
    if self.curr_wealth < 0:
      self.curr_wealth = 0.0
      done = True

    obs = np.array([self.curr_wealth, self.curr_time], dtype=np.float64)
    info = {}

    return obs, reward, done, info

  def reset(self):
    """Restore initial wealth and time 0 and return the first observation."""
    self.curr_wealth = self.initial_wealth
    self.curr_time = 0.0
    return np.array([self.curr_wealth, self.curr_time], dtype=np.float64)


# In [43]:
# Run Stable-Baselines3's environment checker on an instance of the custom
# environment defined above.
from stable_baselines3.common.env_checker import check_env
# 10 is passed as the constructor's T argument.
env = PortfolioChoice(10)
check_env(env)


