In [1]:
import os
import json
import io
from data_generator import DataGenerator
from trade_env import TraderEnv

In [2]:
dg = DataGenerator()

In [3]:
trade = TraderEnv(dg)

In [4]:
import threading
import numpy as np
import tensorflow as tf
import pylab
import time
import gym
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K


# global variables for threading
episode = 0
scores = []

EPISODES = 20000

# This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole
# In this example, we use A3C algorithm
class A3CAgent:
    def __init__(self, state_size, action_size, env_name):
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # get gym environment name
        self.env_name = env_name

        # these are hyper parameters for the A3C
        self.actor_lr = 0.00002
        self.critic_lr = 0.00002
        self.discount_factor = .80
        self.hidden1, self.hidden2 = 24, 24
        self.threads = 32

        # create model for actor and critic network
        self.actor, self.critic = self.build_model()

        # method for training actor and critic network
        self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]

        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)
        self.sess.run(tf.global_variables_initializer())

    # approximate policy and value using Neural Network
    # actor -> state is input and probability of each action is output of network
    # critic -> state is input and value of state is output of network
    # actor and critic network share first hidden layer
    def build_model(self):
        state = Input(batch_shape=(None,  self.state_size))
        shared = Dense(self.state_size*2, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state)
        
        actor_hidden = Dense(self.state_size, activation='relu', kernel_initializer='glorot_uniform')(shared)
        actor_hidden = Dense(60, activation='relu', kernel_initializer='glorot_uniform')(actor_hidden)
        actor_hidden = Dense(30, activation='relu', kernel_initializer='glorot_uniform')(actor_hidden)
        actor_hidden = Dense(20, activation='relu', kernel_initializer='glorot_uniform')(actor_hidden)
        action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden)

        value_hidden = Dense(self.state_size, activation='relu', kernel_initializer='he_uniform')(shared)
        value_hidden = Dense(80, activation='relu', kernel_initializer='he_uniform')(value_hidden)
        value_hidden = Dense(50, activation='relu', kernel_initializer='he_uniform')(value_hidden)
        value_hidden = Dense(25, activation='relu', kernel_initializer='he_uniform')(value_hidden)
        state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden)

        actor = Model(inputs=state, outputs=action_prob)
        critic = Model(inputs=state, outputs=state_value)

        actor._make_predict_function()
        critic._make_predict_function()

        actor.summary()
        critic.summary()

        return actor, critic

    # make loss function for Policy Gradient
    # [log(action probability) * advantages] will be input for the back prop
    # we add entropy of action probability to loss
    def actor_optimizer(self):
        action = K.placeholder(shape=(None, self.action_size))
        advantages = K.placeholder(shape=(None, ))

        policy = self.actor.output

        good_prob = K.sum(action * policy, axis=1)
        eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
        loss = -K.sum(eligibility)

        entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)

        actor_loss = loss + 0.01*entropy

        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
        train = K.function([self.actor.input, action, advantages], [], updates=updates)
        return train

    # make loss function for Value approximation
    def critic_optimizer(self):
        discounted_reward = K.placeholder(shape=(None, ))

        value = self.critic.output

        loss = K.mean(K.square(discounted_reward - value))

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
        train = K.function([self.critic.input, discounted_reward], [], updates=updates)
        return train

    # make agents(local) and start training
    def train(self):
        # self.load_model('./save_model/cartpole_a3c.h5')
        agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor,
                        self.action_size, self.state_size) for i in range(self.threads)]

        for agent in agents:
            agent.start()

        while True:
            time.sleep(20)

            plot = scores[:]
            pylab.plot(range(len(plot)), plot, 'b')
            pylab.savefig("./save_graph/cartpole_a3c.png")

            self.save_model('./save_model/cartpole_a3c.h5')

    def save_model(self, name):
        self.actor.save_weights(name + "_actor.h5")
        self.critic.save_weights(name + "_critic.h5")

    def load_model(self, name):
        self.actor.load_weights(name + "_actor.h5")
        self.critic.load_weights(name + "_critic.h5")

# This is Agent(local) class for threading
class Agent(threading.Thread):
    def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size):
        threading.Thread.__init__(self)

        self.states = []
        self.rewards = []
        self.actions = []

        self.index = index
        self.actor = actor
        self.critic = critic
        self.optimizer = optimizer
        self.env_name = env_name
        self.discount_factor = discount_factor
        self.action_size = action_size
        self.state_size = state_size

    # Thread interactive with environment
    def run(self):
        global episode
        env = TraderEnv(DataGenerator())
        while episode < EPISODES:
            state = env.reset()
            score = 0
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = env.step(action)
                score += reward

                self.memory(state, action, reward)

                state = next_state

                if done:
                    episode += 1
                    print("episode: ", episode, "/ score : ", score)
                    scores.append(score)
                    self.train_episode(True)
                    break
        print("Train finished: ", episode)

    # In Policy Gradient, Q function is not available.
    # Instead agent uses sample returns for evaluating policy
    def discount_rewards(self, rewards, done=True):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        if not done:
            running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0]
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    # save <s, a ,r> of each step
    # this is used for calculating discounted rewards
    def memory(self, state, action, reward):
        self.states.append(state)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)
        self.rewards.append(reward)

    # update policy network and value network every episode
    def train_episode(self, done):
        discounted_rewards = self.discount_rewards(self.rewards, done)

        values = self.critic.predict(np.array(self.states))
        values = np.reshape(values, len(values))

        advantages = discounted_rewards - values

        self.optimizer[0]([self.states, self.actions, advantages])
        self.optimizer[1]([self.states, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

    def get_action(self, state):
        policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]




  return f(*args, **kwds)
Using TensorFlow backend.


In [5]:
env = TraderEnv(DataGenerator())

state_size = env.observation_space.shape[0]
action_size = env.action_space.n


In [None]:
state_size, action_size

(44, 5)

In [None]:
global_agent = A3CAgent(state_size, action_size, "TraderEnv")
global_agent.train()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 44)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 88)                3960      
_________________________________________________________________
dense_2 (Dense)              (None, 44)                3916      
_________________________________________________________________
dense_3 (Dense)              (None, 60)                2700      
_________________________________________________________________
dense_4 (Dense)              (None, 30)                1830      
_________________________________________________________________
dense_5 (Dense)              (None, 20)                620       
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 105       
Total para

episode:  157 / score :  -3290
episode: episode:   159158  / score : / score :   -3284-3287

episode:  160 / score :  -2201.576774999866
episode:  161 / score :  -3288
episode:  162 / score :  -2550.170799999888
episode:  163episode: episode:    / score : 165164   -3280/ score : / score : 
  -2987.3689750000017-3233.182774999988

episode:  166episode:  167  / score : / score :   -3213.769374999989-3278

episode:  168 / score :  -3213.769374999989
episode:  169episode:   / score : 170episode:    -3219.0924000000314/ score : 171
  / score : -3029.8239000000412 
-2134.9375
episode: episode:  episode:  173 172 174 / score :  / score :  / score :  -3282 -3291
-2813.1612750000145

episode: episode:   episode: episode: 175176   177 / score : / score :  178  / score :  -2440.5637499999643-2569.4573000001606 / score : 

-3280 -3113.907099999892

episode:  179 / score :  -3286
episode:  180 / score :  -3268.1918999999834
episode:  181 / score :  -3231.163975000062
episode:  182 / score :  -2479.

episode:  378 / score :  -2478.9375
episode:  379 / score : episode:  380  -3292/ score : 
 -3287
episode:  381 / score :  -2833.735900000118
episode:  382 / score :  -2551.148974999779
episode:  episode: 383  384/ score :   / score : -3287 
-3216.0924000000314
episode:  385 / score :  -2484.9375
episode:  386 / score :  -3286
episode:  387 / score :  -3293
episode:  388 episode: / score :   389-3208.0924000000314 
/ score :  -3228.7693749999885
episode:  390 / score :  -2598.2702999998396
episode:  391 / score :  -3273
episode:  392 / score : episode:  393 / score :   -3215.7693749999885episode: -2571.3203749998593
 
episode: 394  / score : 395  -2980.4567750000706/ score : 
 -3278
episode:  396 / score :  -3288
episode:  397 / score :  -2547.148974999779
episode:  398 / score :  -3291
episode: episode:  399  400/ score :  episode:  / score :  -3291 401
-3243.246774999977 / score : 
 -3137.3403999999864
episode:  402 / score :  episode: -3177.9084000000585 
403 / score :  episode: -28

episode:  613 / score :  episode: -3219.0924000000314 episode: 
614 615  / score : / score :   -3287episode: episode: -3285
  
617616  / score : / score :   -3012.576374999986-3287

episode:  618 / score :  -3290
episode: episode:  619  / score : 620  -2807.969375000031/ score : 
 -3033.8239000000412
episode:  621 / score :  -2206.576774999866
episode:  622 / score :  -2759.904575000074
episode:  623 / score :  -3285
episode:  624 / score :  -2589.63087499977
episode:  625 / score :  -2429.714999999832
episode:  626episode:  / score :  627  / score : -3283 
-3148.3403999999864
episode:  628 / score : episode:   -3148.3403999999864629
 / score :  -3294
episode: episode:   631630  / score : / score : episode:   -3295-2973.759975000093 

632 / score :  -3248.433874999986
episode:  633 / score :  -2822.969375000031
episode: episode:   635634  / score : / score :   -1861.2273749996352-3283

episode:  636 / score :  -2881.984375
episode:  637 / score :  -3126.8851750000313
episode:  638 / sc

episode:  838 / score :  -2605.0783999998566
episode:  839 episode: / score :  episode:  840 -3249.7949750000093 841
/ score :  episode:   / score : 842-3277.9036749999786  
-3278episode: / score :  
 843-2545.148974999779 
/ score :  -3216.7693749999885
episode: episode:  845  episode: / score : 844   846-3289/ score :  
 / score :  -3292-3213.7693749999885

episode: episode:  847  / score : 848  -2059.126849999899/ score : 
 -3236.2419750000026
episode:  849 / score :  -3264.8975000000014
episode:  850 / score :  -3218.976774999973
episode:  851 / score :  -2965.199274999922
episode:  852 / score :  -2582.63087499977
episode: episode:  854  / score : 853  -3235.976774999973/ score : 
 episode: -2490.5726750000204
 855 / score :  -3210.961274999972
episode:  856 / score :  -2245.3706750001556
episode:  857 / score :  -3216.1639750000622
episode:  episode: 858  859/ score :   / score : -2806.969375000031 
-3292episode:  
860 / score :  -3009.065875000044
episode:  861 / score : episode

/ score : episode: episode:   1053-3293
  / score :  -3257.246774999977
1054 / score :  -3225.0924000000314
episode:  1055 / score :  -2819.735900000118
episode:  1056 / score :  -3283
episode:  1057 / score :  -2549.148974999779
episode: episode: episode:  1060episode:   episode:   10581059 episode: / score : 1061  1062   / score : / score :  1063episode: -3037.8239000000412/ score :   / score :   
 -3289-2776.2630999997937 / score : 1064 -1809.0158999998564

-2807.969375000031 / score : 
-3262.1918999999834
 
-3270
episode:  1065 / score :  -3259.1918999999834
episode:  episode: 1066  1067/ score :   / score : -2142.1748749999047 
-3210.2187499999754
episode:  1068 / score :  -3272.8975000000014
episode:  1069 / score : episode: episode:    -3227.769374999988510701071
  episode:  / score : / score : 1072   -3279-3286/ score : 

 -3234.163975000062
episode:  1073 / score :  -3283
episode:  1074 / score :  -3297
episode:  1075 / score :  -3282
episode:  1076episode:   1077 / score :  -

/ score : episode: episode:  -3285.375599999991 
1257 1258  / score : / score :   -3290-3209.1639750000622

episode: episode:   12601259  / score : / score :   -2988.800675000014-3273.711774999995

episode:  1261 / score :  -3298
episode:  1262 / score :  -3284
episode:  1263 / score :  -3231.953749999998episode: 
 1264 / score :  -2541.325900000043
episode:  1265 / score :  -2457.3417500001237
episode: episode:  1267  1266/ score :   / score : -3174.597850000048 
-3216.961274999972
episode:  1268 / score :  -3175.9084000000585
episode:  1269episode:   / score : 1270  -3292/ score : 
 -3222.976774999973
episode:  1271 / score :  -2757.5050749997963
episode:  1272 / score :  -3215.769374999989
episode:  1273 / score :  -2577.3203749998593
episode:  1274 / score :  -3257.263974999998
episode:  1275 / score :  -2530.847774999845
episode:  1276 / score :  -2514.3909499997812
episode:  1277 / score :  -3283
episode:  1278 / score :  -3278
episode:  1279 / score :  -3160.6774999999675
episod

1476 / score :  -2579.3364749997886
episode:  1477 / score :  -3225.976774999973
episode:  1478 / score :  -3275
episode:  1479 / score :  -3020.84637499999
episode:  1480 / score :  -2994.7635750000754
episode:  1481 / score :  -1859.2273749996352
episode:  1482 / score :  -3281
episode:  1483 / score :  -3215.769374999989
episode:  1484 / score :  -1805.0158999998564episode: 
 1485 / score :  -3293
episode:  1486 / score :  -2198.576774999866
episode:  episode: 1487  1488episode: / score :   / score :  episode: -3031.8239000000412 1489 episode: 
-3273.8975000000014 1490 episode:  
/ score :  14911492  / score :  / score : -3277 / score :  
-3289 -3213.182774999988episode: 
-2078.11834999981
 
1493 / score :  episode: -3283 
1494episode:   1495/ score :   / score : -2822.969375000031 
-2748.904575000074
episode:  1496 / score : episode:   -2480.93751497
 / score :  -2538.079725000019
episode:  1498 / score :  -3203.1639750000622episode: 
 1499 / score :  -2572.3203749998593
episode:  

In [None]:

def get_action(state):
    policy =  global_agent.actor.predict(np.reshape(state, [1, state_size]))[0]
    return np.argmax(policy)

In [None]:
env = TraderEnv(DataGenerator())
state = env.reset()

In [None]:
get_action(state)

In [None]:
env = TraderEnv(DataGenerator())
state = env.reset()
score = 0
while True:
    action = get_action(state)
    print(action)
    next_state, reward, done, info = env.step(action)
    score += reward

    state = next_state
    if done:
        print ("score: ", score , "info", info)
        break