<a href="https://colab.research.google.com/github/sugiyama404/BuckTest/blob/main/DRQN-BurnIn/drqn-burnin_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy

from datetime import datetime
from matplotlib import pyplot as plt
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU, LSTM, Activation, Input, MaxPool1D, Conv1D
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import backend as K

from tensorflow.keras.utils import Progbar

# Run configuration. `mode` selects which price CSV is read (sp500_<mode>.csv)
# and is part of the result CSV name; `name` is the base name of the exported
# files.
mode  = 'test'
name  = 'drqn-burnin'
level = 1
if level == 2:
    # NOTE(review): this appends `name` to itself before the suffix, giving
    # 'drqn-burnindrqn-burninlv2'. `name += 'lv2'` looks like the intent, but
    # files already exported under the doubled name would no longer be found;
    # confirm against the saved artifacts before changing it.
    name += name + 'lv2'

# Mount Google Drive (Colab only); every path below lives under 'My Drive'.
drive.mount('/content/drive/')
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + f'sp500_{mode}.csv'

# Export locations: model directory and the per-episode result CSV.
exp_dir = 'Colab Notebooks/workspace/export/'
mdl_dir = '/content/drive/My Drive/' + exp_dir + 'models'
csv_path = '/content/drive/My Drive/' + exp_dir + f'csv_data/{name}_{mode}.csv'

# Load the price series and parse the 'Date' column as YYYY-MM-DD datetimes.
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Mounted at /content/drive/


In [None]:
class Environment:
    """Single-asset trading simulator over the ``SP500`` column of a DataFrame.

    The observation is ``[shares held, current price, cash in hand]``. An
    action is a value from ``action_space`` (0 = buy, 1 = hold, 2 = sell).
    A buy converts as much cash as possible into whole shares; a sell
    liquidates the entire position.
    """

    def __init__(self, df, initial_money=100000, mode = 'test', commission = 0):
        """Store the price data and settings, then reset to the first row.

        Args:
            df: DataFrame with an ``SP500`` price column. Rows containing NaN
                are dropped and the index is rebuilt.
            initial_money: Cash available at the start of every episode.
            mode: Stored on the instance; not read by this class.
            commission: Proportional fee charged on the traded value of every
                buy and sell (e.g. 0.002 for 0.2 %).
        """
        self.df = df.dropna().reset_index()

        self.df_total_steps  = len(self.df)-1
        self.initial_money   = initial_money
        self.mode            = mode
        self.commission      = commission
        self.action_space    = np.array([0, 1, 2]) # buy,hold,sell

        # Per-episode fields; reset() gives them their real starting values.
        self.trade_time      = None
        self.trade_win       = None
        self.brfore_buy_cash = None
        self.hold_a_position = None
        self.now_price       = None
        self.cash_in_hand    = None
        self.sell_price      = None
        self.buy_price       = None

        self.reset()

    def reset(self):
        """Rewind to the first row with no position and return the initial state."""
        self.trade_time      = 0
        self.trade_win       = 0
        self.brfore_buy_cash = 0
        self.end_step        = self.df_total_steps
        self.now_step        = 0
        self.hold_a_position = 0.0
        self.now_price       = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand    = self.initial_money
        self.sell_price      = 0
        self.buy_price       = 0

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute ``action`` and return the transition.

        Returns:
            ``(state, reward, done, info)``. ``reward`` is the relative gross
            profit ``(sell - buy) / buy`` on the step a position is sold and
            0 otherwise. ``done`` is True on the last row, where any open
            position is liquidated instead of executing ``action``. ``info``
            holds ``cur_revenue``, ``trade_time`` and ``trade_win``.
        """
        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        self.sell_price = 0
        self._trade(action, done)

        reward = 0
        if (self.sell_price > 0) and (self.buy_price > 0):
            reward = (self.sell_price - self.buy_price) / self.buy_price
            # Clear the cost basis after every completed sale, including a
            # break-even one; otherwise the next buy would be added on top of
            # the stale basis and distort the following reward.
            self.buy_price = 0

        info = {
            'cur_revenue': self._get_revenue(),
            'trade_time': self.trade_time,
            'trade_win': self.trade_win,
        }
        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return ``[shares held, current price, cash in hand]`` as a float array."""
        return np.array(
            [self.hold_a_position, self.now_price, self.cash_in_hand],
            dtype=float,
        )

    def _get_revenue(self):
        """Return the mark-to-market value of the position plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action, lastorder = False):
        """Apply ``action`` at the current price.

        When ``lastorder`` is True the action is ignored and any open position
        is closed out.
        """
        if lastorder:
            # NOTE(review): the final liquidation charges no commission and
            # does not set sell_price (so it earns no reward), unlike a
            # regular sell. Kept as-is; confirm whether that is intended.
            if self.hold_a_position != 0:
                self.cash_in_hand += self.now_price * self.hold_a_position
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1
            return

        if self.action_space[0] == action: # buy
            if self.hold_a_position == 0:
                self.brfore_buy_cash = copy.copy(self.cash_in_hand)
                # A share costs its price plus the commission on it. Checking
                # affordability against the full cost keeps cash from going
                # negative; the `> 0` guard stops a non-positive price from
                # looping forever.
                unit_cost = self.now_price + self.commission * self.now_price
                while unit_cost > 0 and self.cash_in_hand > unit_cost:
                    self.hold_a_position += 1
                    self.buy_price += self.now_price
                    self.cash_in_hand -= unit_cost
        elif self.action_space[2] == action: # sell
            if self.hold_a_position != 0:
                gross = self.now_price * self.hold_a_position
                self.sell_price += gross
                self.cash_in_hand += gross - self.commission * gross
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1

In [None]:
class ReplayMemory:
    """Fixed-size ring buffer of transitions with the LSTM state saved alongside each."""

    _SEQ_LEN = 10     # time steps in every sampled state window
    _STATE_DIM = 3    # features per observation
    _LSTM_UNITS = 16  # width of the stored hidden and cell vectors

    def __init__(self, max_size=500, batch_size=32):
        """Allocate zero-filled storage.

        Args:
            max_size: Capacity of the ring buffer; older entries are
                overwritten once it is full.
            batch_size: Number of transitions returned by random_sampling().
        """
        self.cntr = 0   # next write position
        self.size = 0   # number of valid entries (at most max_size)
        self.max_size = max_size
        self.batch_size = batch_size
        self.states_memory = np.zeros([self.max_size, self._STATE_DIM], dtype=np.float32)
        self.next_states_memory = np.zeros([self.max_size, self._STATE_DIM], dtype=np.float32)
        self.acts_memory = np.zeros(self.max_size, dtype=np.uint8)
        self.rewards_memory = np.zeros(self.max_size, dtype=np.float32)
        self.done_memory = np.zeros(self.max_size, dtype=np.uint8)
        self.hidden_state_memory = np.zeros([self.max_size, 2, self._LSTM_UNITS], dtype=np.float32)

    def store_transition(self, state, act, reward, next_state, done, hidden_state):
        """Write one transition at the cursor and advance it, wrapping at max_size."""
        slot = self.cntr
        self.states_memory[slot] = state
        self.next_states_memory[slot] = next_state
        self.acts_memory[slot] = act
        self.rewards_memory[slot] = reward
        self.done_memory[slot] = done
        self.hidden_state_memory[slot] = hidden_state
        self.cntr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def random_sampling(self):
        """Sample ``batch_size`` distinct transitions with their state windows.

        For each sampled index ``i`` the state window covers entries
        ``i-9 .. i`` inclusive, so its last row is the observation in which
        ``act[i]`` was taken, and the last row of the next-state window is the
        observation that action led to.

        Returns:
            dict with keys ``state`` and ``next_state`` (float32 arrays of
            shape ``(batch, 10, 3)``), ``act``, ``reward``, ``done`` (shape
            ``(batch,)``) and ``hidden_state`` (shape ``(batch, 2, 16)``).

        Raises:
            ValueError: from ``np.random.choice`` when fewer than
                ``batch_size`` indices have a full window behind them.
        """
        # NOTE(review): windows are taken by buffer position, so they can span
        # an episode boundary or, once the buffer has wrapped, the write
        # cursor. Left unchanged here.
        seq_len = self._SEQ_LEN
        candidates = np.arange(seq_len - 1, self.size)
        mb_index = np.random.choice(candidates, self.batch_size, replace=False)

        # Row k holds the seq_len consecutive buffer positions ending at
        # mb_index[k]; one fancy-index gather replaces the per-sample append.
        window_index = mb_index[:, None] + np.arange(1 - seq_len, 1)

        return {
            'state': self.states_memory[window_index],
            'next_state': self.next_states_memory[window_index],
            'act': self.acts_memory[mb_index],
            'reward': self.rewards_memory[mb_index],
            'done': self.done_memory[mb_index],
            'hidden_state': self.hidden_state_memory[mb_index],
        }

In [None]:
class Brain:
    """Builds the Q-network: Conv1D -> MaxPool1D -> ReLU -> stateful LSTM -> Dense(3)."""

    def __init__(self):
        """Create, compile and print the model, then expose it as ``self.model``."""
        look_back = 10     # time steps per input window
        units = 16         # LSTM width
        conv_filter = 12   # Conv1D output channels
        optimizer = Adam(learning_rate=0.001)

        # The batch size is pinned to 1 because the LSTM is stateful.
        layer_stack = [
            Conv1D(filters=conv_filter, kernel_size=1, padding="same", activation="tanh",
                   batch_input_shape=(1, look_back, 3)),
            MaxPool1D(pool_size=1, padding='same'),
            Activation("relu"),
            LSTM(units, stateful=True, name="lstm"),
            Dense(3, kernel_initializer='random_uniform'),
        ]

        network = Sequential()
        for layer in layer_stack:
            network.add(layer)

        network.compile(loss="mean_absolute_error", optimizer=optimizer)
        network.summary()
        self.model = network

In [None]:
class Agent(Brain, ReplayMemory):
    """Epsilon-greedy recurrent Q-learning agent.

    The Keras model comes from Brain and the transition buffer from
    ReplayMemory. The agent keeps a rolling window of the most recent
    observations and feeds it to the stateful LSTM model.
    """

    _LOOK_BACK = 10   # time steps per model input
    _STATE_DIM = 3    # features per observation

    def __init__(self, max_size=500, batch_size=32):
        """Set the exploration schedule and initialise both base classes.

        Args:
            max_size: Replay buffer capacity.
            batch_size: Transitions drawn per replay() call.
        """
        self.gamma = 0.95          # discount factor
        self.epsilon = 1.0         # current exploration rate
        self.epsilon_min = 0.01
        self.r = 0.9995            # multiplicative epsilon decay per act() call
        self.batch_size = batch_size
        self.local_state = np.empty((0, self._STATE_DIM), float)
        Brain.__init__(self)
        ReplayMemory.__init__(self, max_size, batch_size)

    def reset(self):
        """Forget the rolling observation window."""
        self.local_state = np.empty((0, self._STATE_DIM), float)

    def update_replay_memory(self, state, action, reward, next_state, done, hidden_state):
        """Store one transition in the replay buffer."""
        self.store_transition(state, action, reward, next_state, done, hidden_state)

    def act(self, state):
        """Choose an action for ``state`` and return it with the LSTM state.

        Until the window holds 10 observations the action is uniformly random
        and the model is not run. Afterwards the model is always run on the
        window (so the stateful LSTM advances even on exploratory steps) and
        the greedy action is used with probability ``1 - epsilon``.

        Args:
            state: Observation of shape ``(1, 3)``.

        Returns:
            ``(action, hidden_state)`` where ``hidden_state`` is the LSTM
            state after this call, as returned by _get_hidden_state().
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.r

        window_ready = len(self.local_state) >= self._LOOK_BACK
        if window_ready:
            # Slide the window: drop the oldest row before appending.
            self.local_state = self.local_state[1:]
        self.local_state = np.append(self.local_state, np.array(state), axis=0)

        if not window_ready:
            return np.random.choice(3), self._get_hidden_state()

        window = np.reshape(self.local_state, (1, self._LOOK_BACK, self._STATE_DIM))
        explore = np.random.rand() <= self.epsilon
        act_values = self.model.predict(window)
        if explore:
            return np.random.choice(3), self._get_hidden_state()
        return np.argmax(act_values[0]), self._get_hidden_state()

    def replay(self):
        """Run one Q-learning update per sampled transition.

        Does nothing until the buffer holds at least ``batch_size + 11``
        entries. The live LSTM state is saved first and restored afterwards so
        training does not disturb the ongoing episode.
        """
        if self.size < (self.batch_size + 11):
            return

        m_batch = self.random_sampling()
        live_hidden_state = self._get_hidden_state()
        window_shape = (1, self._LOOK_BACK, self._STATE_DIM)

        for state, next_state, action, reward, done, hidden_state in zip(
                m_batch['state'], m_batch['next_state'], m_batch['act'],
                m_batch['reward'], m_batch['done'], m_batch['hidden_state']):
            state = np.reshape(state, window_shape)
            next_state = np.reshape(next_state, window_shape)

            # The LSTM is stateful, so every predict()/fit() call advances its
            # recurrent state. Re-seed the stored state before each pass so
            # that all three start from the same point. The two extra
            # predict(next_state) calls whose results were never used have
            # been removed; they also advanced the state.
            self._restore_hidden_state(hidden_state)
            next_q = self.model.predict(next_state)
            target = reward + (1 - int(done)) * self.gamma * np.amax(next_q[0])

            self._restore_hidden_state(hidden_state)
            target_full = self.model.predict(state)
            target_full[0][action] = target

            self._restore_hidden_state(hidden_state)
            self.model.fit(state, target_full, verbose=0)

        self._restore_hidden_state(live_hidden_state)

    def _restore_hidden_state(self, hidden_states):
        """Clear the model's recurrent state, then load ``hidden_states`` into the LSTM."""
        self.reset_model_state()
        self._set_hidden_states(hidden_states)

    def _get_hidden_state(self):
        """Return the LSTM's two state tensors stacked along axis 0.

        With the batch size of 1 and 16 units used by Brain the result has
        shape ``(2, 16)``.
        """
        lstm = self.model.get_layer("lstm")
        return np.concatenate([K.get_value(lstm.states[0]), K.get_value(lstm.states[1])])

    def reset_model_state(self):
        """Reset the recurrent state of every stateful layer in the model."""
        self.model.reset_states()

    def _set_hidden_states(self, hidden_states):
        """Load a stacked state from _get_hidden_state() back into the LSTM."""
        # Split the stack back into its two halves along axis 0.
        first, second = np.split(hidden_states, 2, 0)
        self.model.get_layer("lstm").reset_states([first, second])

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)

In [None]:
class Main:
    """Runs train or test episodes of an agent in an environment and logs results to CSV."""

    def __init__(self, env, agent, mdl_dir, name, episodes_times = 200, mode = 'test', result_csv = None):
        """Prepare the scaler, result file and (in test mode) the saved model.

        Args:
            env: Environment exposing reset(), step(), action_space and
                df_total_steps.
            agent: Agent exposing act(), replay(), reset(), load(), save(),
                reset_model_state() and update_replay_memory().
            mdl_dir: Directory holding ``<name>.pkl`` (scaler) and
                ``<name>.h5`` (weights).
            name: Base file name of the saved scaler and weights.
            episodes_times: Number of episodes to run.
            mode: ``'train'`` trains and saves; ``'test'`` loads the saved
                scaler and weights and fixes epsilon at 0.01.
            result_csv: Path of the per-episode result CSV. Defaults to the
                module-level ``csv_path``.
        """
        self.env            = env
        self.agent          = agent
        self.mdl_dir        = mdl_dir
        self.episodes_times = episodes_times
        self.mode           = mode
        self.name           = name
        self.csv_path       = csv_path if result_csv is None else result_csv

        # In test mode _load() replaces the scaler, so the random rollout
        # needed to fit a fresh one is skipped.
        self.scaler = None if self.mode == 'test' else self._standard_scaler(self.env)

        with open(self.csv_path, 'w') as f:
            print('FixedProfit,TradeTimes,TradeWin', file=f)

        if self.mode == 'test':
            self._load()
            self.agent.epsilon = 0.01

    def play_game(self):
        """Run all episodes, appending one result row per episode to the CSV.

        A progress bar is shown per group of up to 10 episodes with the
        running means of final revenue, trade count and winning trades. In
        train mode transitions are stored, replay() runs every 100 steps and
        the model and scaler are saved at the end.
        """
        metrics_names = ['FixedProfit', 'TradeTimes', 'TradeWin']

        for episode in range(self.episodes_times):
            if episode % 10 == 0:
                pb_i = Progbar(self._progbar_target(episode), stateful_metrics=metrics_names)
                p_mean, trade_time, win_time = [], [], []

            state = self.scaler.transform([self.env.reset()])
            done = False
            self.agent.reset_model_state()
            # Drop the previous episode's observation window too, so it is
            # not mixed with observations from the new episode.
            self.agent.reset()
            i = 0

            while not done:
                action, hidden_state = self.agent.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = self.scaler.transform([next_state])
                reward = self._reward_clipping(reward)

                if self.mode == 'train':
                    self.agent.update_replay_memory(state, action, reward, next_state, done, hidden_state)
                    if (i > 0) and (i % 100 == 0):
                        self.agent.replay()

                state = next_state
                i += 1

            p_mean.append(info['cur_revenue'])
            trade_time.append(info['trade_time'])
            win_time.append(info['trade_win'])
            values = [
                ('FixedProfit', int(np.mean(p_mean))),
                ('TradeTimes', int(np.mean(trade_time))),
                ('TradeWin', int(np.mean(win_time))),
            ]
            pb_i.add(1, values=values)

            with open(self.csv_path, 'a') as f:
                row = str(info['cur_revenue']) + ',' + str(info['trade_time']) + ',' + str(info['trade_win'])
                print(row, file=f)

        if self.mode == 'train':
            self._save()

    def _progbar_target(self, episode):
        """Return the number of episodes in the group of 10 starting at ``episode``.

        This is 10 for every group except a shorter final one. Computing it
        from the remaining episode count also works when ``episodes_times``
        is below 10, where slicing the decimal string produced ``int('')``.
        """
        return min(10, self.episodes_times - episode)

    def _standard_scaler(self, env):
        """Fit a StandardScaler on states visited by a random-action rollout.

        The rollout continues from the environment's current position and
        stops after ``env.df_total_steps`` steps or when the episode ends,
        whichever comes first.
        """
        states = []
        for _ in range(env.df_total_steps):
            action = np.random.choice(env.action_space)
            state, reward, done, info = env.step(action)
            states.append(state)
            if done:
                break
        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def _reward_clipping(self, val):
        """Return 1 for a positive ``val``, 0 for zero and -1 otherwise."""
        if val > 0:
            return 1
        elif val == 0:
            return 0
        else:
            return -1

    def _load(self):
        """Load the pickled scaler and the model weights from ``mdl_dir``."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load scaler files that this project wrote itself.
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'rb') as f:
            self.scaler = pickle.load(f)
        self.agent.load('{}/{}.h5'.format(self.mdl_dir, self.name))

    def _save(self):
        """Write the pickled scaler and the model weights to ``mdl_dir``."""
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'wb') as f:
            pickle.dump(self.scaler, f)
        self.agent.save('{}/{}.h5'.format(self.mdl_dir, self.name))

In [None]:
# Experiment settings. Level 1 trades without commission; any other level
# pays 0.2 % per trade.
max_size = 500
batch_size = 32
episodes_times = 100
initial_money = 1000000
commission = 0.002 if level != 1 else 0

# Wire up environment, agent and runner, then run every episode.
env = Environment(df, initial_money=initial_money, mode=mode, commission=commission)
agent = Agent(max_size=max_size, batch_size=batch_size)
main = Main(env, agent, mdl_dir, name, episodes_times=episodes_times, mode=mode)
main.play_game()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (1, 10, 12)               48        
_________________________________________________________________
max_pooling1d (MaxPooling1D) (1, 10, 12)               0         
_________________________________________________________________
activation (Activation)      (1, 10, 12)               0         
_________________________________________________________________
lstm (LSTM)                  (1, 16)                   1856      
_________________________________________________________________
dense (Dense)                (1, 3)                    51        
Total params: 1,955
Trainable params: 1,955
Non-trainable params: 0
_________________________________________________________________
