In [1]:
# Choose whether the DQN runs on CPU or GPU by controlling CUDA device visibility.
import os

os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",  # left empty so no GPU is used here
})

#### Import dependencies

In [2]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

#### Set parameters

In [3]:
# Create the environment. Here it is CartPole-v0; in the application it would
# be the real (or virtual) trading trials instead.
env = gym.make('CartPole-v0')

In [4]:
# Size of the state vector the model observes.
# In this example there are just 4 things to observe in each time period,
# but in the application this is the number of observable features
# we will use to train the model.
state_size = env.observation_space.shape[0]
state_size

4

In [7]:
# Number of discrete actions the agent can choose from, read from the
# environment's action space; this could also be customized for the application.
action_size = env.action_space.n
action_size

2

#### Set batch size for the stochastic gradient descent

In [8]:
# Minibatch size for stochastic gradient descent.
# NOTE(review): presumably the value passed to DQNAgent.replay as the number of
# stored experiences sampled per update -- confirm against the training loop.
batch_size = 32

In [9]:
# number of training episodes (trials) to run
episodes = 1001

In [10]:
# Store the output in a folder. exist_ok=True makes re-running this cell a
# no-op when the folder already exists and avoids the check-then-create race
# of a separate os.path.exists() test.
output_dir = 'model_output/cartpole'
os.makedirs(output_dir, exist_ok=True)

#### Define agent

In [None]:
# The agent holds the state/action sizes, the replay memory, the discount factor,
# the exploration rate (epsilon, its decay and lower bound) and Adam's step size
# (also called the learning rate).
class DQNAgent:
    """Deep Q-Network agent with experience replay and epsilon-greedy exploration.

    Attributes:
        state_size: number of inputs to the Q-network.
        action_size: number of outputs of the Q-network (one Q-value per action).
        memory: bounded deque of ``(state, action, reward, next_state, done)``
            tuples; the oldest entries are dropped once ``maxlen`` is reached.
        gamma: discount factor applied to the estimated future reward.
        epsilon: current probability of taking a random action.
        epsilon_decay: factor ``epsilon`` is multiplied by at the end of each
            ``replay`` call that actually trains.
        epsilon_min: once ``epsilon`` is no longer above this value, decay stops.
        learning_rate: step size handed to the Adam optimizer.
        model: the compiled Keras network returned by ``_build_model``.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_size: number of features in one observation.
            action_size: number of discrete actions available.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=200)  # replay buffer, keeps the 200 most recent experiences
        self.gamma = 0.95  # discount over time
        self.epsilon = 1  # starting exploration rate
        self.epsilon_decay = 0.995  # decay of the exploration rate, applied once per replay() call
        self.epsilon_min = 0.01  # the lower bound of the exploration rate
        self.learning_rate = 0.001  # learning rate (step size) of Adam
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the neural network that estimates the Q-values.

        The network has two hidden ReLU layers of 24 units each and a linear
        output layer with one unit per action; it is trained with MSE loss.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases name this argument `learning_rate`
        # instead of `lr` -- confirm against the installed Keras version.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one experience to the replay memory.

        The tuple layout ``(state, action, reward, next_state, done)`` must
        match what ``replay`` unpacks, so ``self`` is deliberately not stored.

        Args:
            state: observation before the action was taken.
            action: index of the action that was taken.
            reward: reward received for the action.
            next_state: observation after the action was taken.
            done: True if the episode ended with this step.
        """
        self.memory.append((state, action, reward, next_state, done))

    # Original (misspelled) method name, kept so existing callers keep working.
    remeber = remember

    def act(self, state):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned
        (exploration); otherwise the action with the largest predicted
        Q-value is returned (exploitation).

        Args:
            state: observation to act on. Assumes a leading batch dimension of
                size 1, since only row 0 of the prediction is used -- TODO
                confirm against the caller.

        Returns:
            The index of the chosen action.
        """
        if np.random.rand() <= self.epsilon:  # exploration
            return random.randrange(self.action_size)
        act_value = self.model.predict(state)  # exploitation
        # the index of the largest predicted Q-value, i.e. the best action
        return np.argmax(act_value[0])

    def replay(self, batch_size):
        """Train the network on a random minibatch drawn from the replay memory.

        For every sampled experience the Q-value of the action that was taken
        is regressed towards ``reward`` (terminal step) or
        ``reward + gamma * max_a Q(next_state, a)`` (non-terminal step); the
        Q-values of all other actions keep the network's own prediction, so
        they contribute no error. After the minibatch, ``epsilon`` is decayed
        once if it is still above ``epsilon_min``.

        Args:
            batch_size: number of experiences to sample. If the memory holds
                fewer experiences than this, the call returns without training
                and without decaying ``epsilon``.
        """
        # random.sample raises ValueError when asked for more items than exist.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # if the episode is done there is no future reward to add
            target = reward
            if not done:
                # otherwise add the discounted best Q-value predicted for the next state
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # start from the current predictions and overwrite only the entry
            # of the action that was actually taken
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # one gradient step towards the updated target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            