<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/a3c_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from datetime import datetime
import random
import copy
import pickle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

import math

from google.colab import drive

from concurrent.futures import ThreadPoolExecutor

# Run configuration. `mode` selects the input CSV (sp500_<mode>.csv) and is
# part of the result file name; `name` is the algorithm tag used in the
# exported file names.
mode = 'test'
name = 'a3c'

# Mount Google Drive so the dataset and export folders below are reachable.
drive.mount('/content/drive/')
# Input: price data for the selected mode.
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + f'sp500_{mode}.csv'

# Output: model directory and the per-episode result CSV.
exp_dir = 'Colab Notebooks/workspace/export/'
mdl_dir = '/content/drive/My Drive/' + exp_dir + 'models'
csv_path = '/content/drive/My Drive/' + exp_dir + f'csv_data/{name}_{mode}.csv'

# Load the price table and parse the 'Date' column (YYYY-MM-DD) into datetimes.
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Mounted at /content/drive/


In [None]:
class Environment:
    """Single-asset trading simulator driven by the ``SP500`` column of a DataFrame.

    The observation is ``[units held, current price, cash in hand]``.
    Actions are taken from ``action_space``: ``0`` = buy, ``1`` = hold,
    ``2`` = sell.  A buy spends as much cash as possible on whole units and
    is only executed when nothing is held; a sell liquidates everything.
    On the last row any open position is liquidated regardless of the action.

    In ``'test'`` mode the environment also counts closed trades
    (``trade_time``) and how many of them ended with more cash than was
    available just before the corresponding buy (``trade_win``).
    """

    def __init__(self, df, initial_money=100000, mode = 'test'):
        """Store the price table and put the simulator in its initial state.

        Args:
            df: DataFrame with an ``SP500`` price column.  Rows containing
                missing values are dropped.
            initial_money: cash available at the start of every episode.
            mode: ``'test'`` enables trade statistics; any other value
                disables them.
        """
        # Renumber after dropna so ``now_step`` can be used directly with .loc.
        self.df = df.dropna().reset_index()

        self.df_total_steps  = len(self.df)-1
        self.initial_money   = initial_money
        self.mode            = mode
        self.trade_time      = None
        self.trade_win       = None
        self.brfore_buy_cash = None
        self.action_space    = np.array([0, 1, 2]) # buy,hold,sell
        self.hold_a_position = None
        self.now_price       = None
        self.cash_in_hand    = None

        self.reset()

    def reset(self):
        """Rewind to the first row with no position and return the state."""
        self.trade_time      = 0
        self.trade_win       = 0
        self.brfore_buy_cash = 0
        self.end_step        = self.df_total_steps
        self.now_step        = 0
        self.hold_a_position = 0.0
        self.now_price       = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand    = self.initial_money

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute ``action`` at the new price.

        Returns:
            ``(state, reward, done, info)`` where ``reward`` is the change in
            total portfolio value caused by the price move, ``done`` is True
            on the last row, and ``info`` always holds ``'cur_revenue'`` and,
            in test mode, ``'trade_time'`` and ``'trade_win'`` as well.
        """
        prev_revenue = self._get_revenue()
        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        self._trade(action, done)
        cur_revenue = self._get_revenue()

        reward = cur_revenue - prev_revenue

        info = {'cur_revenue': cur_revenue}
        if self.mode == 'test':
            info['trade_time'] = self.trade_time
            info['trade_win'] = self.trade_win

        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return ``[units held, current price, cash in hand]`` as a float array."""
        state = np.empty(3)
        state[0] = self.hold_a_position
        state[1] = self.now_price
        state[2] = self.cash_in_hand
        return state

    def _get_revenue(self):
        """Return the total portfolio value: position at market price plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _close_position(self):
        """Sell every held unit at the current price and update trade statistics."""
        self.cash_in_hand += self.now_price * self.hold_a_position
        self.hold_a_position = 0
        if self.mode == 'test':
            self.trade_time += 1
            # A trade "wins" when it leaves more cash than before the buy.
            if self.cash_in_hand > self.brfore_buy_cash:
                self.trade_win += 1

    def _trade(self, action, lastorder = False):
        """Apply ``action`` at the current price.

        Args:
            action: one of ``action_space`` (0 buy, 1 hold, 2 sell).
            lastorder: when True the action is ignored and any open position
                is liquidated.
        """
        if lastorder:
            # Only count the forced liquidation as a trade when something is
            # actually sold; otherwise the statistics would record a trade
            # (and possibly a win) that never happened.
            if self.hold_a_position != 0:
                self._close_position()
            return

        if self.action_space[0] == action: # buy
            if self.hold_a_position == 0:
                if self.mode == 'test':
                    self.brfore_buy_cash = self.cash_in_hand
                # Buy whole units while cash strictly exceeds the price.
                # The price > 0 guard prevents an endless loop on bad data.
                while self.now_price > 0 and self.cash_in_hand > self.now_price:
                    self.hold_a_position += 1
                    self.cash_in_hand -= self.now_price
        elif self.action_space[2] == action: # sell
            if self.hold_a_position != 0:
                self._close_position()

In [None]:
class MasterBrain:
    """Owner of the shared (master) actor-critic network.

    The network takes a 3-feature state, passes it through one 128-unit ReLU
    layer and produces two heads: a softmax over ``n_action`` actions and a
    single linear value output.
    """

    def __init__(self,n_action = 3):
        """Build and compile the master model.

        Args:
            n_action: number of discrete actions of the policy head.
        """
        n_shape = 3
        self.n_action = n_action
        lr = 0.01

        # Tuple shape is accepted by every Keras version; a bare int is not.
        common = input_ = keras.layers.Input(shape=(n_shape,))
        common = keras.layers.Dense(128, activation="relu")(common)

        actor = keras.layers.Dense(self.n_action, activation="softmax")(common)
        critic = keras.layers.Dense(1, activation="linear")(common)

        mastermodel = keras.Model(input_, [actor, critic])
        # `lr` is deprecated (and rejected by newer Keras); use `learning_rate`.
        mastermodel.compile(optimizer=Adam(learning_rate=lr))
        mastermodel.summary()
        self.mastermodel = mastermodel

    def load(self, name):
        """Load the master model's weights from the file ``name``."""
        self.mastermodel.load_weights(name)

    def save(self, name):
        """Save the master model's weights to the file ``name``."""
        self.mastermodel.save_weights(name)

    def placement(self, model):
        """Copy the master's trainable weights into ``model``."""
        for m, mm in zip(model.trainable_weights, self.mastermodel.trainable_weights):
            m.assign(mm)

    def integration(self, model):
        """Overwrite the master's trainable weights with those of ``model``."""
        for mm, m in zip(self.mastermodel.trainable_weights, model.trainable_weights):
            mm.assign(m)

In [None]:
class Brain:
    """Worker-local actor-critic network tied to a shared master.

    The local model has the same layout as the one built here for the master
    side of the exchange: 3 inputs, one 128-unit ReLU layer, a softmax policy
    head with ``n_action`` outputs and a linear value head.  Weight files are
    always read from / written to the master, never the local model.
    """

    def __init__(self, masterbrain, n_action = 3):
        """Build and compile the local model and remember the master.

        Args:
            masterbrain: object exposing ``mastermodel``, ``load``, ``save``,
                ``placement`` and ``integration``.
            n_action: number of discrete actions of the policy head.
        """
        n_shape = 3
        self.n_action = n_action
        lr = 0.01

        # Tuple shape is accepted by every Keras version; a bare int is not.
        common = input_ = keras.layers.Input(shape=(n_shape,))
        common = keras.layers.Dense(128, activation="relu")(common)

        actor = keras.layers.Dense(self.n_action, activation="softmax")(common)
        critic = keras.layers.Dense(1, activation="linear")(common)

        model = keras.Model(input_, [actor, critic])
        # `lr` is deprecated (and rejected by newer Keras); use `learning_rate`.
        model.compile(optimizer=Adam(learning_rate=lr))
        model.summary()
        self.model = model

        self.masterbrain = masterbrain
        self.mastermodel = masterbrain.mastermodel

    def load(self, name):
        """Load weights from ``name`` into the master model."""
        self.masterbrain.load(name)

    def save(self, name):
        """Save the master model's weights to ``name``."""
        self.masterbrain.save(name)

    def layering(self):
        """Pull the master's weights into the local model."""
        self.masterbrain.placement(self.model)

    def integration(self):
        """Push the local model's weights into the master."""
        self.masterbrain.integration(self.model)

In [None]:
class Actor:
    """Samples actions from a brain's policy head and forwards weight operations."""

    def __init__(self, brain, n_action = 3):
        """Keep the brain, its local model and the number of actions."""
        self.model = brain.model
        self.n_action = n_action
        self.brain = brain

    def policynetwork(self, state):
        """Draw one action index according to the policy probabilities for ``state``."""
        # The model expects a batch, so present the state as a single row.
        single_row = state.reshape((1,-1))
        probabilities, _value = self.model(single_row)
        weights = probabilities[0].numpy()
        return np.random.choice(self.n_action, p=weights)

    def load(self, name):
        """Ask the brain to load weights from ``name``."""
        self.brain.load(name)

    def save(self, name):
        """Ask the brain to save weights to ``name``."""
        self.brain.save(name)

    def layering(self):
        """Forward to ``brain.layering()``."""
        self.brain.layering()

    def integration(self):
        """Forward to ``brain.integration()``."""
        self.brain.integration()

In [None]:
class Critic:
    """Computes the actor-critic loss for a batch of experiences and updates ``model``."""

    def __init__(self,model,n_action=3):
        """Store the model to train and the loss hyper-parameters.

        Args:
            model: two-headed Keras model returning ``(action_probs, value)``
                and carrying a compiled optimizer.
            n_action: number of discrete actions.
        """
        self.model = model
        self.n_action = n_action
        self.gamma = 0.9   # discount factor
        self.beta = 0.1    # weight of the entropy term

    def valuenetwork(self, experiences):
        """Run one gradient step on ``self.model`` from ``experiences``.

        Args:
            experiences: non-empty list of dicts with the keys ``"state"``,
                ``"action"``, ``"reward"``, ``"next_state"`` and ``"done"``.
        """
        discounted_return = self._discounted_return(experiences)

        state_batch = np.asarray([e["state"] for e in experiences])
        action_batch = np.asarray([e["action"] for e in experiences])

        onehot_actions = tf.one_hot(action_batch, self.n_action)

        with tf.GradientTape() as tape:

            act_p, v = self.model(state_batch, training=True)
            # Probability the policy assigned to the action actually taken.
            selct_pai = tf.reduce_sum(onehot_actions * act_p, axis=1, keepdims=True)
            selected_action_probs = tf.clip_by_value(selct_pai, 1e-10, 1.0)

            # Keep the gradient path through `v` for the value loss; the
            # policy term must treat the advantage as a constant.  Building
            # the value loss from a stop_gradient'ed value would leave the
            # value head without any training signal.
            td_error = tf.cast(discounted_return, v.dtype) - v
            advantage = tf.stop_gradient(td_error)

            value_losses = self._value_losses(td_error)
            policy_losses = self._policy_losses(advantage,selected_action_probs,v,discounted_return)
            total_loss = value_losses + policy_losses
            loss = tf.reduce_mean(total_loss)

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)

        # Pair each gradient with the variable of *this* critic's model (not
        # a module-level `model`), so every worker updates its own network.
        self.model.optimizer.apply_gradients(
            [(grad, var) for (grad, var) in zip(gradients, variables) if grad is not None]
        )

    def _discounted_return(self,experiences):
        """Return mean-centred discounted returns as a column vector.

        The return after the last experience is bootstrapped from the model's
        value of its ``"next_state"`` unless that experience is terminal.
        The accumulator is reset to 0 at every terminal experience.
        """
        if experiences[-1]["done"]:
            G = 0
        else:
            next_state = np.atleast_2d(experiences[-1]["next_state"])
            _, n_v = self.model(next_state)
            G = n_v[0][0].numpy()

        discounted_return = []
        for exp in reversed(experiences):
            if exp["done"]:
                G = 0
            G = exp["reward"] + self.gamma * G
            discounted_return.append(G)
        discounted_return.reverse()
        discounted_return = np.asarray(discounted_return).reshape((-1, 1))
        discounted_return -= np.mean(discounted_return)
        return discounted_return


    def _value_losses(self,advantage):
        """Return the element-wise squared error of the value estimate."""
        return (advantage)**2

    def _policy_losses(self,advantage,selected_action_probs,v,discounted_return):
        """Return the negated sum of the policy-gradient term and the entropy term.

        ``discounted_return`` is accepted for interface compatibility and is
        not used.
        """
        a = tf.math.log(selected_action_probs) * advantage
        b = self._entropy(v)
        policy_losses = - ( a + b )

        return policy_losses

    def _entropy(self, v):
        """Return a ``(batch, 1)`` constant derived from the spread of ``v``.

        NOTE(review): the result is computed with numpy/math from the values
        of ``v`` and wrapped in ``tf.constant``, so it is constant with
        respect to the model weights and contributes no gradient.
        """
        a,_ = v.shape

        ave = v.numpy()
        # Floor the spread so math.log never sees 0 (identical value outputs).
        sigma2 = max(float(np.std(ave)), 1e-10)
        entropy = self.beta*0.5*(math.log(2 * math.pi * sigma2) + 1)

        rank_1_tensor = tf.constant([[entropy]] * a, dtype=v.dtype)

        return rank_1_tensor

In [None]:
class Main:
    """Runs episodes for one worker: acts, optionally trains, and logs results to CSV."""

    def __init__(self, env, actor, critic, num, mdl_dir, name, batch_size = 32, episodes_times = 1000, mode = 'test', csv_path = None):
        """Prepare the worker and (re)create the result CSV with its header.

        Args:
            env: environment exposing ``reset``, ``step``, ``action_space``
                and ``df_total_steps``.
            actor: object exposing ``policynetwork``, ``load``, ``save``,
                ``layering`` and ``integration``.
            critic: object exposing ``valuenetwork(experiences)``.
            num: worker index; used in the scaler file name.
            mdl_dir: directory holding the weight and scaler files.
            name: algorithm tag used in the weight and scaler file names.
            batch_size: number of experiences per training update.
            episodes_times: number of episodes ``play_game`` runs.
            mode: ``'test'`` loads saved weights/scaler and logs trade
                statistics; any other value logs only the profit, and
                ``'train'`` additionally trains and saves.
            csv_path: result file.  When omitted, the module-level
                ``csv_path`` is used, which keeps existing callers working.
        """
        self.env = env
        self.actor = actor
        self.critic = critic
        self.num = str(num)
        self.mdl_dir = mdl_dir
        self.scaler = self._standard_scaler(self.env)
        self.episodes_times = episodes_times
        self.batch_size = batch_size
        self.mode = mode
        self.name = name
        self.csv_path = csv_path if csv_path is not None else globals()['csv_path']

        if self.mode == 'test':
            self._load()
            row = 'FixedProfit,TradeTimes,TradeWin'
        else:
            row = 'FixedProfit'
        # Opening with 'w' truncates any previous results before the header.
        with open(self.csv_path, 'w') as f:
            print(row, file=f)

        # Start from the master's current weights.
        self.actor.layering()

    def play_game(self):
        """Run ``episodes_times`` episodes, appending one CSV row per episode.

        In ``'train'`` mode the critic is updated every ``batch_size`` steps,
        and the weights and scaler are saved after the last episode.  In any
        non-test mode this worker's weights are pushed to the master and
        pulled back after each episode.
        """
        for episode in range(self.episodes_times):
            state = self.env.reset()
            state = self.scaler.transform([state])
            state = state.flatten()
            done = False
            start_time = datetime.now()
            experiences = []

            while not done:

                action = self.actor.policynetwork(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = self.scaler.transform([next_state])
                next_state = next_state.flatten()

                if self.mode == 'train':
                    experiences.append({"state": state, "action": action, "reward": reward, "next_state": next_state, "done": done,})
                    if len(experiences) == self.batch_size:
                        self.critic.valuenetwork(experiences)
                        experiences = []

                state = next_state

            play_time = datetime.now() - start_time
            # Use this worker's own settings and actor rather than
            # module-level names, which refer to whichever worker was
            # created last.
            if self.mode == 'test':
                print("Episode: {}/{} RapTime: {} FixedProfit: {:.0f} TradeTimes: {} TradeWin: {}".format(episode + 1, self.episodes_times, play_time, info['cur_revenue'], info['trade_time'], info['trade_win']))
                with open(self.csv_path, 'a') as f:
                    row = str(info['cur_revenue']) + ',' + str(info['trade_time']) + ',' + str(info['trade_win'])
                    print(row, file=f)
            else:
                # Push local weights to the master, then pull them back.
                self.actor.integration()
                self.actor.layering()
                print("Episode: {}/{} RapTime: {} FixedProfit: {:.0f}".format(episode + 1, self.episodes_times, play_time, info['cur_revenue']))
                with open(self.csv_path, 'a') as f:
                    row = str(info['cur_revenue'])
                    print(row, file=f)

        if self.mode == 'train':
            self._save()

    def _standard_scaler(self, env):
        """Fit a ``StandardScaler`` on states visited by random actions.

        The environment is stepped from its current position until it
        reports ``done`` or ``df_total_steps`` steps were taken.
        """
        states = []
        for _ in range(env.df_total_steps):
            action = np.random.choice(env.action_space)
            state, reward, done, info = env.step(action)
            states.append(state)
            if done:
                break

        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def _load(self):
        """Replace the scaler with the pickled one and load the saved weights."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load scaler files that this notebook produced itself.
        with open('{}/{}_{}.pkl'.format(self.mdl_dir, self.name, self.num), 'rb') as f:
            self.scaler = pickle.load(f)
        self.actor.load('{}/{}.h5'.format(self.mdl_dir, self.name))

    def _save(self):
        """Save the weights and pickle this worker's scaler."""
        self.actor.save('{}/{}.h5'.format(self.mdl_dir, self.name))
        with open('{}/{}_{}.pkl'.format(self.mdl_dir, self.name, self.num), 'wb') as f:
            pickle.dump(self.scaler, f)

In [None]:
# Experiment settings shared by every worker.
initial_money=1000000
episodes_times = 25
batch_size = 32
masterbrain = MasterBrain()

# Build one independent environment / brain / actor / critic per worker; all
# brains are attached to the same master.
thread_num = 4
envs = []
for i in range(thread_num):
    env = Environment(df, initial_money=initial_money,mode = mode)
    brain = Brain(masterbrain)
    model = brain.model
    actor = Actor(brain)
    critic = Critic(model)
    main = Main(env, actor, critic, i, mdl_dir, name, batch_size, episodes_times, mode)
    envs.append(main)

# Submit each worker's bound method directly.  A `lambda: env.play_game()`
# would look up the loop variable only when the job runs, so a job could end
# up executing a different worker than the one it was created for.
with ThreadPoolExecutor(max_workers=thread_num) as executor:
    datas = [executor.submit(runner.play_game) for runner in envs]

# Surface any exception raised inside a worker; an uninspected future would
# swallow it silently.
for future in datas:
    future.result()

  "The `lr` argument is deprecated, use `learning_rate` instead.")
  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          512         input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 3)            387         dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 1)            129         dense[0][0]                      
Total params: 1,028
Trainable params: 1,028
Non-trainable params: 0
__________________________

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
dense_6 (Dense)                 (None, 128)          512         input_3[0][0]                    
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 3)            387         dense_6[0][0]                    
__________________________________________________________________________________________________
dense_8 (Dense)                 (None, 1)            129         dense_6[0][0]                    
Total params: 1,028
Trainable params: 1,028
Non-trainable params: 0
________________________

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 128)          512         input_4[0][0]                    
__________________________________________________________________________________________________
dense_10 (Dense)                (None, 3)            387         dense_9[0][0]                    
__________________________________________________________________________________________________
dense_11 (Dense)                (None, 1)            129         dense_9[0][0]                    
Total params: 1,028
Trainable params: 1,028
Non-trainable params: 0
________________________

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
dense_12 (Dense)                (None, 128)          512         input_5[0][0]                    
__________________________________________________________________________________________________
dense_13 (Dense)                (None, 3)            387         dense_12[0][0]                   
__________________________________________________________________________________________________
dense_14 (Dense)                (None, 1)            129         dense_12[0][0]                   
Total params: 1,028
Trainable params: 1,028
Non-trainable params: 0
________________________