<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/Rainbow/rainbow_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow-addons



In [2]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy

from datetime import datetime
from matplotlib import pyplot as plt
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU, Input, Lambda
from tensorflow.keras.optimizers import RMSprop, Adam
from statistics import mean

from tensorflow.keras.utils import Progbar
import tensorflow.keras as keras
from tensorflow.keras.losses import Huber
from tensorflow.keras import backend as K

import tensorflow_addons as tfa
from tensorflow_addons.layers import NoisyDense

from sklearn.preprocessing import StandardScaler


# Run configuration: `mode` selects which CSV is read and is forwarded to the
# environment/agent/driver; `name` is the stem used for exported artifacts.
mode = 'train'
name = 'rainbow'

# `level` also selects the commission in the run cell (level 1 -> no commission).
level = 1
if level == 2:
    # NOTE(review): `name += name + 'lv2'` yields 'rainbowrainbowlv2' (the base
    # name is doubled); `name += 'lv2'` was presumably intended. Left unchanged
    # because other notebooks may already read/write files under this stem —
    # confirm before fixing.
    name += name + 'lv2'

# Mount Google Drive; every path below lives under '/content/drive/My Drive/'.
drive.mount('/content/drive/')
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + f'sp500_{mode}.csv'

# Export locations: model/scaler directory and the per-episode results CSV.
exp_dir = 'Colab Notebooks/workspace/export/'
mdl_dir = '/content/drive/My Drive/' + exp_dir + 'models'
csv_path = '/content/drive/My Drive/' + exp_dir + f'csv_data/{name}_{mode}.csv'

# Load the price table and parse the 'Date' column (expects YYYY-MM-DD strings).
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
class Environment:
    """Single-asset trading simulator driven by the 'SP500' column of a table.

    The account is always either fully in cash or fully invested: a buy turns
    as much cash as possible into whole units, a sell liquidates the whole
    position. The observation is ``[units held, current price, cash]``.

    Args:
        df: DataFrame with an 'SP500' column. Rows containing NaN are dropped
            and the index is rebuilt before use.
        initial_money: Cash available at the start of every episode.
        mode: Stored on the instance; the environment itself does not read it.
        commission: Proportional fee applied to the traded value of regular
            buy and sell orders (0 disables fees).
    """

    def __init__(self, df, initial_money=100000, mode = 'test', commission = 0):

        self.df = df.dropna().reset_index()

        self.df_total_steps  = len(self.df)-1
        self.initial_money   = initial_money
        self.mode            = mode
        self.commission      = commission
        self.trade_time      = None
        self.trade_win       = None
        self.brfore_buy_cash = None
        self.action_space    = np.array([0, 1, 2]) # buy,hold,sell
        self.hold_a_position = None
        self.now_price       = None
        self.cash_in_hand    = None
        self.sell_price      = None
        self.buy_price       = None

        self.reset()

    def reset(self):
        """Rewind to the first row, restore the starting cash, return the state."""
        self.trade_time      = 0
        self.trade_win       = 0
        self.brfore_buy_cash = 0
        self.end_step        = self.df_total_steps
        self.now_step        = 0
        self.hold_a_position = 0.0
        self.now_price       = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand    = self.initial_money
        self.sell_price      = 0
        self.buy_price       = 0

        return self._get_now_state()

    def step(self, action):
        """Advance one row, apply ``action`` and report the outcome.

        Args:
            action: One of ``action_space`` (0 = buy, 1 = hold, 2 = sell). On
                the final row the action is ignored and any open position is
                liquidated instead.

        Returns:
            ``(state, reward, done, info)``. ``reward`` is the relative profit
            ``(proceeds - cost) / cost`` on the step a sell action closes a
            position and 0 otherwise (the forced liquidation on the last row
            yields no reward). ``info`` carries 'cur_revenue', 'trade_time'
            and 'trade_win'.
        """
        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        self.sell_price = 0
        self._trade(action, done)

        reward = 0
        if self.sell_price > 0:
            if self.buy_price > 0:
                reward = (self.sell_price - self.buy_price) / self.buy_price
            # Always clear the cost basis once a position has been sold.
            # Previously a break-even sale left it in place, so the next buy
            # accumulated on top of the stale cost and skewed later rewards.
            self.buy_price = 0
        cur_revenue = self._get_revenue()

        info = { 'cur_revenue' : cur_revenue , 'trade_time' : self.trade_time, 'trade_win' : self.trade_win }

        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return the observation ``[units held, current price, cash]``."""
        state = np.empty(3)
        state[0] = self.hold_a_position
        state[1] = self.now_price
        state[2] = self.cash_in_hand
        return state

    def _get_revenue(self):
        """Return total account value: position marked at the current price plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action,lastorder = False):
        """Execute ``action`` at the current price.

        Args:
            action: 0 = buy, 1 = hold, 2 = sell (ignored when ``lastorder``).
            lastorder: When True, close any open position at the current price
                without commission and without recording ``sell_price``.
        """
        if lastorder:
            if self.hold_a_position != 0:
                self.cash_in_hand += self.now_price * self.hold_a_position
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1
            return

        if self.action_space[0] == action: # buy
            if self.hold_a_position == 0:
                # Remember the cash before entering so the later sale can be
                # classified as a win or a loss.
                self.brfore_buy_cash = self.cash_in_hand
                # Cost of one unit including the fee; checking against it
                # keeps cash from going negative when commission > 0.
                unit_cost = self.now_price + self.commission * self.now_price
                # A non-positive cost would never exhaust the cash, so skip
                # buying instead of looping forever.
                if unit_cost > 0:
                    while self.cash_in_hand > unit_cost:
                        self.hold_a_position += 1
                        self.buy_price += self.now_price
                        self.cash_in_hand -= unit_cost
        elif self.action_space[2] == action: # sell
            if self.hold_a_position != 0:
                gross = self.now_price * self.hold_a_position
                self.sell_price += gross
                self.cash_in_hand += gross - self.commission * gross
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1

In [4]:
class Brain:
    """Owns the two Q-networks (``model1`` and ``model2``) used by the agent.

    Both networks share one architecture: a 3-feature input, two hidden ReLU
    layers (48 and 96 units) and a 4-unit linear layer whose first column is
    added to each of the remaining three columns to give 3 action values.

    Args:
        mode: 'test' builds the networks from plain ``Dense`` layers; any other
            value uses ``NoisyDense`` layers.
    """

    def __init__(self, mode):
        # Built in this order so Keras' auto-generated model/layer names stay
        # the same as when the two builds were written out inline.
        self.model1 = self._build_model(mode)
        self.model2 = self._build_model(mode)

    @staticmethod
    def _build_model(mode):
        """Build, compile and summarise one Q-network for the given mode."""
        learning_rate = 0.00001
        neurons_per_layer = 24
        optimizer = Adam(learning_rate=learning_rate, epsilon=0.001)

        layer_cls = Dense if mode == 'test' else NoisyDense

        inputs = Input(shape=(3,))
        common = layer_cls(neurons_per_layer*2, activation='relu')(inputs)
        common = layer_cls(neurons_per_layer*4, activation='relu')(common)
        common = layer_cls(4, activation='linear')(common)
        # Column 0 is broadcast-added to columns 1..3. The mean term is
        # multiplied by 0.0, so it does not change the resulting values.
        output = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - 0.0*K.mean(a[:, 1:], keepdims=True),output_shape=(3,))(common)

        model = keras.Model(inputs=inputs, outputs=output)
        model.compile(loss=Huber(), optimizer=optimizer)
        model.summary()
        return model

In [5]:
class Memory:
    """Fixed-capacity ring buffer of transitions plus one TD error per slot.

    Args:
        max_size: Number of slots; once full, the oldest slot is overwritten.
        batch_size: Number of transitions returned by ``sampling``.
    """

    def __init__(self, max_size=500, batch_size=32):

        self.cntr = 0   # next slot to write
        self.size = 0   # number of slots filled so far (capped at max_size)
        self.max_size = max_size
        self.batch_size = batch_size
        self.states_memory = np.zeros([self.max_size, 3], dtype=np.float32)
        self.next_states_memory = np.zeros([self.max_size, 3], dtype=np.float32)
        self.acts_memory = np.zeros(self.max_size, dtype=np.uint8)
        self.rewards_memory = np.zeros(self.max_size, dtype=np.float32)
        self.done_memory = np.zeros(self.max_size, dtype=np.uint8)
        self.tderrors_memory = np.zeros(self.max_size, dtype=np.float32)

    def store_transition(self, state, act, reward, next_state, done, tderror):
        """Write one transition into the current slot and advance the pointer."""
        slot = self.cntr
        self.states_memory[slot] = state
        self.next_states_memory[slot] = next_state
        self.acts_memory[slot] = act
        self.rewards_memory[slot] = reward
        self.done_memory[slot] = done
        self.tderrors_memory[slot] = tderror
        self.cntr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sampling(self, mb_index = None):
        """Return a mini-batch as a dict of arrays.

        Args:
            mb_index: Slot indices to fetch. When omitted, or when its length
                differs from ``batch_size``, ``batch_size`` distinct filled
                slots are drawn uniformly at random instead.

        Returns:
            Dict with keys 'state', 'next_state', 'act', 'reward', 'done'.
        """
        # None is the "no indices given" marker. The earlier array default
        # [1, 2, 3] was mistaken for real indices whenever batch_size was 3.
        if mb_index is None or len(mb_index) != self.batch_size:
            mb_index = np.random.choice(self.size, self.batch_size, replace=False)
        return {
            'state': self.states_memory[mb_index],
            'next_state': self.next_states_memory[mb_index],
            'act': self.acts_memory[mb_index],
            'reward': self.rewards_memory[mb_index],
            'done': self.done_memory[mb_index],
        }

    def findall(self):
        """Return the full backing arrays (including slots not yet filled)."""
        return self.states_memory,self.next_states_memory,self.acts_memory,self.rewards_memory,self.done_memory,self.tderrors_memory

    def update_memory_tderror(self, val):
        """Replace the whole TD-error array with ``val``."""
        self.tderrors_memory = val

In [6]:
class LocalMemory:
    """Sliding window over the three most recent transitions.

    The window supplies what a 3-step return needs: the oldest transition plus
    the two following rewards and the state reached after the third step.
    """

    _WINDOW = 3  # number of consecutive transitions kept

    def __init__(self):
        self.local_states_memory = np.empty((0,3), float)
        self.local_next_states_memory = np.empty((0,3), float)
        self.local_acts_memory = np.zeros(0, dtype=np.int32)
        self.local_rewards_memory = np.zeros(0, dtype=np.uint8)
        self.local_done_memory = np.zeros(0, dtype=np.uint8)
        self.local_safe_flag = False

    def store_local_transition(self, state, act, reward, next_state, done):
        """Append one transition, discarding the oldest once the window is full.

        Args:
            state, next_state: Arrays of shape (1, 3); they are stacked along
                axis 0 of the (n, 3) window arrays.
            act, reward, done: Scalars for the same transition.
        """
        keep = self._WINDOW
        # Append first, then keep the newest `keep` entries. This is the same
        # as dropping the oldest entry before appending to a full window.
        self.local_states_memory = np.append(self.local_states_memory, np.array(state), axis=0)[-keep:]
        self.local_next_states_memory = np.append(self.local_next_states_memory, np.array(next_state), axis=0)[-keep:]
        self.local_acts_memory = np.append(self.local_acts_memory, np.array(act))[-keep:]
        self.local_rewards_memory = np.append(self.local_rewards_memory, np.array(reward))[-keep:]
        self.local_done_memory = np.append(self.local_done_memory, np.array(done))[-keep:]

        # The window is readable as soon as it holds `keep` transitions.
        # Raising the flag only on the following store dropped the very first
        # transition before it could ever be read.
        if self.local_done_memory.shape[0] >= keep:
            self.local_safe_flag = True

    def isLocal_safe(self):
        """Return True once ``get_localmemory`` can be called."""
        return bool(self.local_safe_flag)

    def get_localmemory(self):
        """Return the pieces of the window used for a 3-step TD error.

        Returns:
            Tuple ``(state_0, next_state_0, next_state_2, act_0, reward_0,
            reward_1, reward_2, done_0)``, where index 0 is the oldest
            transition in the window; the three states have shape (1, 3).
        """
        return (np.array([self.local_states_memory[0]]), np.array([self.local_next_states_memory[0]]),
                np.array([self.local_next_states_memory[2]]),self.local_acts_memory[0], self.local_rewards_memory[0],
                self.local_rewards_memory[1], self.local_rewards_memory[2], self.local_done_memory[0])

In [7]:
class Agent(Brain, Memory, LocalMemory):
    """Double-Q agent combining the networks, replay buffer and 3-step window.

    Args:
        max_size: Replay buffer capacity.
        batch_size: Mini-batch size used for every training step.
        mode: Forwarded to ``Brain`` to choose the layer type.
    """

    def __init__(self, max_size=500, batch_size=32, mode ='test'):
        self.gamma = 0.997          # discount factor
        self.alpha = 0.5            # exponent applied to |TD error| for priorities
        self.mode = mode
        self.td_epsilon = 0.0001    # keeps every priority strictly positive
        self.batch_size = batch_size
        self.max_size = max_size
        Brain.__init__(self, mode)
        Memory.__init__(self, max_size, batch_size)
        LocalMemory.__init__(self)

    def act(self, state):
        """Return the greedy action index for a single state of shape (1, 3)."""
        act_values = self._predict(state)
        return np.argmax(act_values[0])

    def _predict(self, state):
        """Return the element-wise sum of both networks' Q-value predictions."""
        q1 = self.model1.predict(state)
        q2 = self.model2.predict(state)
        return (q1 + q2)

    def replay(self):
        """Train on one uniformly sampled mini-batch (no-op until enough data)."""
        if self.size < self.batch_size:
            return

        self._train_on_minibatch(self.sampling())

    def prioritized_experience_replay(self):
        """Train on one mini-batch drawn with probability from the stored TD errors."""
        if self.size < self.batch_size:
            return

        prob = self._tderror_prob()
        num_np = np.random.choice(self.size, self.batch_size, p=prob, replace=False)

        self._train_on_minibatch(self.sampling(num_np))

    def _train_on_minibatch(self, m_batch):
        """Update one randomly chosen network, bootstrapping from the other one."""
        states, next_states, actions, rewards, done = m_batch['state'], m_batch['next_state'], m_batch['act'], m_batch['reward'], m_batch['done']

        # Coin flip decides which network learns; the other provides next_q.
        if np.random.random() <= 0.5:
            learner, other = self.model1, self.model2
        else:
            learner, other = self.model2, self.model1

        q = learner.predict(states)
        next_q = other.predict(next_states)
        target_full = np.copy(q)

        # Only the taken action's entry is replaced; the rest keep the
        # network's own prediction and so contribute zero loss.
        target = rewards + (1 - done) * self.gamma * np.amax(next_q, axis=1)
        target_full[np.arange(self.batch_size), actions] = target
        learner.train_on_batch(states, target_full)

    def tderror(self, states, next_states, n3_states, actions, rewards, next_rewards, n2_rewards, done):
        """Return the 3-step TD error of the oldest transition in the window.

        Args:
            states: State the action was taken in, shape (1, 3).
            next_states: Unused; kept for interface compatibility.
            n3_states: State reached after the third step, shape (1, 3).
            actions: Action taken in ``states``.
            rewards, next_rewards, n2_rewards: Rewards of the three steps.
            done: Terminal flag of the first step; when set, nothing after the
                first reward is added to the target.
        """
        n3_q = np.amax(self._predict(n3_states), axis = 1)
        lookahead = self.gamma * next_rewards + (self.gamma ** 2) * n2_rewards + (self.gamma ** 3) * n3_q
        # After a terminal first step the later rewards/states belong to the
        # next episode, so they must not enter the target.
        target = rewards + (1 - done) * lookahead
        # Compare against the value of the action actually taken, not the
        # maximum over all actions.
        q_now = self._predict(states)
        tderror = target - q_now[np.arange(q_now.shape[0]), actions]

        return tderror

    def _tderror_prob(self):
        """Return sampling probabilities over the filled slots of the buffer."""
        # Restrict to the first `size` slots so the vector length matches the
        # population passed to np.random.choice in the caller.
        tderror = self.tderrors_memory[:self.size]
        absolute_tderror = np.power(np.abs(tderror) + self.td_epsilon, self.alpha)
        return absolute_tderror / np.sum(absolute_tderror)

    def update_tderror(self):
        """Recompute the one-step TD error of every filled slot in the buffer."""
        states, next_states, acts, rewards, done, tderror = self.findall()
        filled = self.size
        if filled == 0:
            return

        rows = np.arange(filled)
        q_now = self._predict(states[:filled])
        q_next = self._predict(next_states[:filled])
        # Per-row target and per-row Q(s, a). The previous version used only
        # the first row's predictions for every slot and read the current
        # value from next_states.
        target = rewards[:filled] + (1 - done[:filled]) * self.gamma * np.amax(q_next, axis=1)
        refreshed = np.array(tderror, dtype=np.float32)  # copy; length stays max_size
        refreshed[:filled] = target - q_now[rows, acts[:filled]]
        self.update_memory_tderror(refreshed)

    def load(self, name, name2):
        """Load weights for model1 from ``name`` and model2 from ``name2``."""
        self.model1.load_weights(name)
        self.model2.load_weights(name2)

    def save(self, name, name2):
        """Save model1's weights to ``name`` and model2's to ``name2``."""
        self.model1.save_weights(name)
        self.model2.save_weights(name2)

In [8]:
class Main:
    """Runs episodes of the agent in the environment and logs per-episode results.

    Args:
        env: Environment exposing ``reset``, ``step``, ``action_space`` and
            ``df_total_steps``.
        agent: Agent exposing ``act``, the replay/memory methods and
            ``save``/``load``.
        mdl_dir: Directory holding the pickled scaler and the two weight files.
        name: File stem for those artifacts.
        episodes_times: Number of episodes to play.
        mode: 'train' stores transitions, trains and saves at the end; 'test'
            loads the saved scaler and weights on construction.
        csv_file: Path of the results CSV. When omitted, the notebook-level
            ``csv_path`` variable is used, as before.
    """

    def __init__(self, env, agent, mdl_dir, name, episodes_times = 200, mode = 'test', csv_file = None):
        self.env            = env
        self.agent          = agent
        self.mdl_dir        = mdl_dir
        self.scaler         = self._standard_scaler(self.env)
        self.episodes_times = episodes_times
        self.mode           = mode
        self.name           = name
        # Explicit path wins; otherwise fall back to the global so existing
        # calls keep working.
        self.csv_file       = csv_path if csv_file is None else csv_file

        # Start a fresh results file containing only the header row.
        with open(self.csv_file, 'w') as f:
            row = 'FixedProfit,TradeTimes,TradeWin'
            print(row, file=f)

        if self.mode == 'test':
            self._load()

    def play_game(self):
        """Play ``episodes_times`` episodes, training and logging along the way."""
        # Seed value for the running mean of final account values.
        # NOTE(review): 1000000 and the 1020000 threshold below appear tied to
        # the initial_money=1000000 used by the run cell — confirm before
        # changing the starting capital.
        total_reward = [1000000]

        for episode in range(self.episodes_times):

            # A new progress bar and new running averages every 10 episodes.
            if (episode % 10 == 0):
                metrics_names = ['FixedProfit','TradeTimes','TradeWin']
                pb_i = Progbar(10, stateful_metrics=metrics_names)
                p_mean,trade_time,win_time = np.array([]),np.array([]),np.array([])

            state = self.env.reset()
            state = self.scaler.transform([state])
            done  = False

            while not done:
                action = self.agent.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = self.scaler.transform([next_state])
                reward = self._reward_clipping(reward)

                if self.mode == 'train':
                    self.agent.store_local_transition(state, action, reward, next_state, done)
                    if self.agent.isLocal_safe():
                        # The oldest transition in the 3-step window goes into
                        # the replay buffer together with its TD error.
                        l_states, l_next_states, l_n3_states, l_acts, l_rewards, l_next_rewards, l_n2_rewards, l_done = self.agent.get_localmemory()
                        tderror = self.agent.tderror(l_states, l_next_states, l_n3_states, l_acts, l_rewards, l_next_rewards, l_n2_rewards, l_done)
                        self.agent.store_transition(l_states, l_acts, l_rewards, l_next_states, l_done, tderror)

                    # Train whenever the buffer's write pointer is a multiple
                    # of 10: uniform replay while the mean final value is
                    # below the threshold, prioritized replay afterwards.
                    if self.agent.cntr % 10 == 0:
                        if mean(total_reward) < 1020000:
                            self.agent.replay()
                        else:
                            self.agent.prioritized_experience_replay()
                            self.agent.update_tderror()

                state = next_state

            if self.mode == 'train':
                total_reward.append(info['cur_revenue'])

            p_mean,trade_time,win_time = np.append(p_mean,info['cur_revenue']),np.append(trade_time,info['trade_time']),np.append(win_time,info['trade_win'])
            values=[('FixedProfit',int(np.mean(p_mean))), ('TradeTimes',int(np.mean(trade_time))), ('TradeWin',int(np.mean(win_time)))]
            pb_i.add(1, values=values)
            with open(self.csv_file, 'a') as f:
                row = str(info['cur_revenue']) + ',' + str(info['trade_time']) + ',' + str(info['trade_win'])
                print(row, file=f)

        if self.mode == 'train':
            self._save()

    def _standard_scaler(self, env):
        """Fit a StandardScaler on states visited by one pass of random actions."""
        states = []
        for _ in range(env.df_total_steps):
            action = np.random.choice(env.action_space)
            state, reward, done, info = env.step(action)
            states.append(state)
            if done:
                break

        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def _reward_clipping(self, val):
        """Map a reward to its sign: 1 if positive, 0 if zero, -1 otherwise."""
        result = 1 if val > 0 else 0 if val == 0 else -1
        return result

    def _load(self):
        """Restore the scaler and both networks' weights from ``mdl_dir``."""
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # scaler files written by this project's own _save().
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'rb') as f:
            self.scaler = pickle.load(f)
        self.agent.load('{}/{}_1.h5'.format(self.mdl_dir, self.name), '{}/{}_2.h5'.format(self.mdl_dir, self.name))

    def _save(self):
        """Write the scaler and both networks' weights to ``mdl_dir``."""
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'wb') as f:
            pickle.dump(self.scaler, f)
        self.agent.save('{}/{}_1.h5'.format(self.mdl_dir, self.name), '{}/{}_2.h5'.format(self.mdl_dir, self.name))

In [9]:
# Hyper-parameters for this run.
initial_money=1000000   # starting cash for every episode
episodes_times = 20     # number of episodes to play
# level 1 trades without commission; any other level pays a 0.002 proportional fee.
commission = 0 if level == 1 else 0.002
batch_size = 32         # mini-batch size for each training step
max_size = 300          # replay buffer capacity

# Wire environment, agent and driver together, then run all episodes.
# Constructing Main also fits the state scaler and (re)creates the results CSV.
env = Environment(df, initial_money=initial_money, mode = mode, commission = commission)
agent = Agent(max_size, batch_size, mode)
main = Main(env, agent, mdl_dir, name, episodes_times, mode)
main.play_game()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
noisy_dense (NoisyDense)     (None, 48)                384       
_________________________________________________________________
noisy_dense_1 (NoisyDense)   (None, 96)                9408      
_________________________________________________________________
noisy_dense_2 (NoisyDense)   (None, 4)                 776       
_________________________________________________________________
lambda (Lambda)              (None, 3)                 0         
Total params: 10,568
Trainable params: 10,568
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shap