In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import keras
import gym

import warnings
warnings.filterwarnings("ignore")

In [2]:
class OptionEnv(gym.Env):
    """Early-exercise put option on a simulated stock, as a gym environment.

    The stock price follows a geometric Brownian motion advanced one day at a
    time. On each step the agent either holds (action 0) or exercises
    (action 1). Exercising, or holding at maturity, ends the episode and pays
    the put payoff ``max(K - S, 0)`` discounted back to day 0.

    Observation: ``[S, tau]`` where ``S`` is the current stock price and
    ``tau = 1 - day_step / N`` is the fraction of the horizon still remaining
    (equal to the time to maturity in years when ``T == 1``).
    """

    def __init__(self):
        self.S0 = 100.0      # initial stock price
        self.K = 100.0       # strike price
        self.r = 0.02        # risk-free rate used for drift and discounting
        self.sigma = 0.20    # volatility
        self.T = 1.0         # horizon
        self.N = 365         # number of daily steps in the horizon
        self.S1 = 0          # current stock price; set to S0 by reset()
        self.reward = 0
        self.day_step = 0    # from day 0 taking N steps to day N

        self.action_space = gym.spaces.Discrete(2)         # 0: hold, 1: exercise
        # S in [0, inf], tau in [0, 1]
        self.observation_space = gym.spaces.Box(
            low=np.array([0.0, 0.0], dtype=np.float32),
            high=np.array([np.inf, 1.0], dtype=np.float32),
            dtype=np.float32,
        )

    def _observation(self):
        """Return the current ``[S, tau]`` observation in the declared dtype."""
        tau = 1.0 - self.day_step / self.N
        # Cast to the Box dtype so the observation is a member of observation_space.
        return np.array([self.S1, tau], dtype=self.observation_space.dtype)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Args:
            action: 0 to hold, 1 to exercise.

        Returns:
            A 4-tuple of the ``[S, tau]`` observation, the discounted payoff
            (0.0 while the option is still alive), the episode-finished flag
            and an empty info dict.
        """
        if action == 1:        # exercise
            # Payoff discounted from the current day back to day 0.
            reward = max(self.K - self.S1, 0.0) * np.exp(-self.r * self.T * (self.day_step / self.N))
            done = True
        elif self.day_step == self.N:    # holding at maturity: option settles
            reward = max(self.K - self.S1, 0.0) * np.exp(-self.r * self.T)
            done = True
        else:                  # hold: move to tomorrow
            reward = 0.0
            # lnS1 - lnS0 = (r - 0.5*sigma^2)*t + sigma * Wt
            dt = self.T / self.N
            self.S1 = self.S1 * np.exp(
                (self.r - 0.5 * self.sigma**2) * dt
                + self.sigma * np.sqrt(dt) * np.random.normal()
            )
            self.day_step += 1
            done = False

        return self._observation(), reward, done, {}

    def reset(self):
        """Restart the episode at day 0 with price ``S0`` and return the observation."""
        self.day_step = 0
        self.S1 = self.S0
        return self._observation()

    def render(self, mode="human"):
        """Placeholder for video rendering; not implemented.

        ``mode`` is accepted (and ignored) so gym-style callers that pass a
        render mode do not fail with a ``TypeError``.
        """
        pass

    def close(self):
        """Nothing to release."""
        pass

In [3]:
env = OptionEnv()
s = env.reset()

# Simulate one full price path by holding (action 0) on every day, so the
# option is never exercised and the price evolves all the way to maturity.
sim_prices = [s[0]]
for i in range(env.N):    # one step per day; use the env's own horizon, not a literal
    action = 0
    s_next, reward, done, info = env.step(action)
    sim_prices.append(s_next[0])

plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.plot(sim_prices);    # trailing ';' hides the [<Line2D>] repr in the cell output

In [4]:
!pip install tf_agents

In [5]:
from tf_agents.environments import  gym_wrapper           # wrap OpenAI gym
from tf_agents.environments import tf_py_environment      # gym to tf gym
from tf_agents.networks import q_network                  # Q net
from tf_agents.agents.dqn import dqn_agent                # DQN Agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer      # replay buffer
from tf_agents.trajectories import trajectory              # s->s' trajectory
from tf_agents.utils import common                       # loss function

In [6]:
class RLHyperparameters:
    """Container for the tunable settings of the DQN experiment.

    Everything is a plain class attribute, so the values can be read straight
    off the class without creating an instance.
    """

    # --- run length and data collection ---
    num_iterations = 20000  # @param {type:"integer"}
    collect_steps_per_iteration = 10  # @param {type:"integer"}

    # --- replay buffer and sampling ---
    replay_buffer_max_length = 100000  # @param {type:"integer"}
    batch_size = 256  # @param {type:"integer"}

    # --- optimisation ---
    learning_rate = 0.001  # @param {type:"number"}

    # --- evaluation and logging cadence ---
    num_eval_episodes = 10  # @param {type:"integer"}
    eval_interval = 1000  # @param {type:"integer"}
    log_interval = 200  # @param {type:"integer"}


# Alias to the class itself (not an instance); attributes are read from it directly.
hyperparameters = RLHyperparameters

In [7]:
# Two independent copies of the option environment: one to collect training
# experience, one reserved for evaluation.
train_env_gym, eval_env_gym = OptionEnv(), OptionEnv()

# Adapt the gym environments to the TF-Agents Python environment interface.
train_env_wrap, eval_env_wrap = map(
    gym_wrapper.GymWrapper, (train_env_gym, eval_env_gym)
)

# Lift the Python environments into TensorFlow environments.
train_env, eval_env = map(
    tf_py_environment.TFPyEnvironment, (train_env_wrap, eval_env_wrap)
)

In [8]:
# ---- Q-network -------------------------------------------------------------
# Sizes of the fully connected layers handed to QNetwork.
fc_layer_params = (100,)

q_net = q_network.QNetwork(
    input_tensor_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    fc_layer_params=fc_layer_params,
)

# ---- DQN agent -------------------------------------------------------------
optimizer = tf.compat.v1.train.AdamOptimizer(
    learning_rate=hyperparameters.learning_rate
)

# Counter variable handed to the agent as its train-step counter.
train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)

agent.initialize()