In [1]:
# choose whether the DQN runs on cpu or gpu by controlling which CUDA devices are visible
import os
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",  # empty, so no gpu is used here
})

#### Import dependencies

In [2]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

#### Set parameters

In [3]:
# set the environment the agent interacts with (the gym CartPole task here);
# in the application it would be the real (or virtual) trading trials
env = gym.make('CartPole-v0')

In [4]:
# decide the state size to model: the length of one observation vector
# in this example there are just 4 things to observe in each time period,
# but in the application this would be the number of observable features
# we use to train the model
state_size = env.observation_space.shape[0]
state_size

4

In [7]:
# the number of actions the agent can choose from (2 here); this could also be customized
action_size = env.action_space.n
action_size

2

#### Set batch size for the stochastic gradient descent

In [58]:
# number of stored experiences sampled from the agent's memory on each replay (training) call
batch_size = 32

In [9]:
# number of episodes (trials) the training loop plays
episodes = 1001

In [10]:
# store the output (saved model weights) in a folder, creating it if it does not exist yet
output_dir = 'model_output/cartpole'
# exist_ok=True replaces the separate exists() check: no error when the folder is
# already there, and no window between the check and the creation
os.makedirs(output_dir, exist_ok=True)

#### Define agent

In [61]:
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    The agent keeps a bounded memory of (state, action, reward, next_state, done)
    tuples, a small fully connected network that estimates one Q-value per
    action, and an exploration rate (epsilon) that decays after every replay.

    Parameters
    ----------
    state_size : int
        Number of input features of the network (length of one observation).
    action_size : int
        Number of actions, i.e. the number of network outputs.
    memory_size : int, optional
        Maximum number of experiences kept; the oldest are dropped first.
    gamma : float, optional
        Discount applied to the estimated future reward.
    epsilon : float, optional
        Starting exploration rate (probability of taking a random action).
    epsilon_decay : float, optional
        Factor epsilon is multiplied by after each call to `replay`.
    epsilon_min : float, optional
        Lower bound of the exploration rate.
    learning_rate : float, optional
        Step size of the Adam optimizer.
    """

    def __init__(self, state_size, action_size, memory_size=200, gamma=0.95,
                 epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01,
                 learning_rate=0.001):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)  # replay buffer, oldest entries evicted first
        self.gamma = gamma  # discount over time
        self.epsilon = epsilon  # current exploration rate
        self.epsilon_decay = epsilon_decay  # decay of the exploration rate per replay
        self.epsilon_min = epsilon_min  # the lower bound of the exploration rate
        self.learning_rate = learning_rate  # learning rate (step size) of Adam
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the network that estimates the Q-value of each action.

        Two hidden ReLU layers of 24 units feed a linear output layer with one
        unit per action; the model is trained with mean squared error and Adam.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases spell this argument `learning_rate`;
        # confirm against the installed version before changing it.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one experience tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def remeber(self, state, action, reward, next_state, done):
        """Misspelled original name of `remember`, kept so existing callers still work."""
        self.remember(state, action, reward, next_state, done)

    def act(self, state):
        """Choose an action for `state` with an epsilon-greedy policy.

        With probability epsilon a random action index is returned (exploration);
        otherwise the index of the largest predicted Q-value (exploitation).
        `state` is passed straight to `model.predict`, whose first output row is used.
        """
        if np.random.rand() <= self.epsilon:
            # exploration; in the trading application this could be replaced by
            # actions observed from other users on the exchange platform
            return random.randrange(self.action_size)
        act_value = self.model.predict(state)  # exploitation
        return np.argmax(act_value[0])  # index of the largest predicted value, i.e. the best action

    def replay(self, batch_size):
        """Train the network on a random sample of remembered experiences.

        At most `batch_size` experiences are sampled; when fewer are stored, all
        of them are used, and when the memory is empty the call does nothing.
        Each sampled experience is fitted individually, then epsilon is decayed
        once, never below `epsilon_min`.
        """
        if not self.memory:
            return  # nothing to learn from yet
        # never ask for more samples than are stored (random.sample would raise)
        minibatch = random.sample(self.memory, min(batch_size, len(self.memory)))
        for state, action, reward, next_state, done in minibatch:
            # terminal step: the target is just the observed reward
            target = reward
            if not done:
                # otherwise add the discounted best Q-value predicted for the next state
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # start from the current predictions so only the taken action's value changes
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            # clamp so the decayed value cannot drop below the configured lower bound
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the network weights to the file `name`."""
        self.model.save_weights(name)

In [62]:
# create the agent, sized from the environment's state and action sizes computed above
agent = DQNAgent(state_size, action_size)

#### Interact with environment

In [None]:
# Train the agent: play `episodes` episodes, remember every step, and replay a
# random sample of the memory after each step that does not end the episode.
done = False
for e in range(episodes):

    state = env.reset() # restart the environment
    state = np.reshape(state, [1, state_size]) # reshape to be one row

    for time in range(5000):
        env.render() # render the gym window, but there's a small rendering issue in Jupyter to fix
        action = agent.act(state) # action for the current state (while epsilon is 1 the actions are random)

        # get the next_state, reward, whether it's done and debugging info
        # in our application, we may create a virtual environment (online, offline, actual experience, imaginary experience all have different strategies)
        next_state, reward, done, _ = env.step(action)
        # when the environment reports the episode as done, replace the reward with a -10 penalty; in our application it would be the loss of the balance
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        # remember the new experience
        agent.remeber(state, action, reward, next_state, done)
        state = next_state
        if done: # print out the score, the episode and epsilon
            print("episode: {}/{}, score: {}, e: {:.2}".format(e, episodes, time, agent.epsilon/1.0))
            break

        # once more experiences are stored than one batch needs, start replaying
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    # store the weights every 50 episodes, once per episode (after it has finished);
    # os.path.join puts the file inside output_dir instead of gluing the file name
    # onto the directory name
    if e % 50 == 0:
        agent.save(os.path.join(output_dir, 'weights_' + '{:04d}'.format(e) + ".hdf5"))

episode: 0/1001, score: 111, e: 0.01
episode: 1/1001, score: 160, e: 0.01
episode: 2/1001, score: 111, e: 0.01
episode: 3/1001, score: 71, e: 0.01
episode: 4/1001, score: 126, e: 0.01
episode: 5/1001, score: 134, e: 0.01
episode: 6/1001, score: 9, e: 0.01
episode: 7/1001, score: 10, e: 0.01
episode: 8/1001, score: 8, e: 0.01
episode: 9/1001, score: 11, e: 0.01
episode: 10/1001, score: 45, e: 0.01
episode: 11/1001, score: 130, e: 0.01
episode: 12/1001, score: 152, e: 0.01
episode: 13/1001, score: 91, e: 0.01
episode: 14/1001, score: 8, e: 0.01
episode: 15/1001, score: 14, e: 0.01
episode: 16/1001, score: 53, e: 0.01
episode: 17/1001, score: 147, e: 0.01
episode: 18/1001, score: 199, e: 0.01
episode: 19/1001, score: 199, e: 0.01
episode: 20/1001, score: 199, e: 0.01
episode: 21/1001, score: 166, e: 0.01
episode: 22/1001, score: 168, e: 0.01
episode: 23/1001, score: 144, e: 0.01
episode: 24/1001, score: 137, e: 0.01
episode: 25/1001, score: 9, e: 0.01
episode: 26/1001, score: 22, e: 0.01


In [64]:
# render the environment once more after training (the recorded cell output is True)
env.render()

True

In [65]:
# close the environment; presumably this also shuts the render window — confirm against the gym docs
env.close()