<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/gorila_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy

from datetime import datetime
from matplotlib import pyplot as plt
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from tensorflow.keras.optimizers import RMSprop

# Module-level RMSprop instance (default hyperparameters); the model-building
# classes defined later in this notebook reference it when compiling networks.
optimizer = RMSprop()

from sklearn.preprocessing import StandardScaler

from concurrent.futures import ThreadPoolExecutor

# Mount Google Drive; every path below lives under '/content/drive/My Drive/'.
drive.mount('/content/drive/')
# Input: CSV with the price history that is loaded into `df` below.
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + 'sp500_train.csv'
# Output: CSV that receives one result row per finished episode.
exp_dir = 'Colab Notebooks/workspace/export/'
csv_path = '/content/drive/My Drive/' + exp_dir + 'gorila_train.csv'

# models_folder is where the master network weights are loaded from / saved to.
# rewards_folder is defined here but not referenced by the code visible in this notebook.
models_folder = '/content/drive/My Drive/' + exp_dir + 'rl_models'
rewards_folder = '/content/drive/My Drive/' + exp_dir + 'rl_rewards'

# Load the data and parse the 'Date' column (expected as YYYY-MM-DD) into datetimes.
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
class Environment:
    """Single-asset trading simulator driven by the 'SP500' column of a DataFrame.

    Actions are the entries of ``action_space``: 0 sells the whole position,
    1 does nothing, 2 buys as many whole units as the cash allows. The state
    is ``[units held, current price, cash in hand]`` and the reward of a step
    is the change in total portfolio value (position value + cash).

    Args:
        df: DataFrame containing an 'SP500' column; rows with NaN are dropped.
        initial_money: Cash available at the start of every episode.
        mode: In 'test' mode trade statistics (number of sells, number of
            profitable sells) are tracked and reported through ``info``.
    """

    def __init__(self, df, initial_money=1000, mode='test'):
        # Drop incomplete rows and rebuild a 0..n-1 index so that
        # ``now_step`` can be used directly as a row label.
        self.df = df.dropna().reset_index()
        self.df_total_steps = len(self.df) - 1
        self.initial_money = initial_money
        self.mode = mode
        self.action_space = np.array([0, 1, 2])

        # Per-episode fields; placeholders until reset() assigns real values.
        self.trade_time = None
        self.trade_win = None
        self.brfore_buy_cash = None
        self.hold_a_position = None
        self.now_price = None
        self.cash_in_hand = None

        self.reset()

    def reset(self):
        """Rewind to the first row with no position and full cash; return the state."""
        self.now_step = 0
        self.end_step = self.df_total_steps

        self.trade_time = 0
        self.trade_win = 0
        self.brfore_buy_cash = 0

        self.hold_a_position = 0.0
        self.cash_in_hand = self.initial_money
        self.now_price = self.df.loc[self.now_step, 'SP500']

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute ``action`` at the new price.

        Returns:
            ``(state, reward, done, info)``. ``done`` becomes True on the last
            row, where any open position is liquidated regardless of ``action``.
            ``info`` always carries 'cur_revenue'; in 'test' mode it also
            carries 'trade_time' and 'trade_win'.
        """
        value_before = self._get_revenue()

        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']
        done = (self.now_step == self.end_step)

        self._trade(action, done)
        value_after = self._get_revenue()

        info = {'cur_revenue': value_after}
        if self.mode == 'test':
            info['trade_time'] = self.trade_time
            info['trade_win'] = self.trade_win

        return self._get_now_state(), value_after - value_before, done, info

    def _get_now_state(self):
        """Return ``[units held, current price, cash]`` as a float array of length 3."""
        return np.array(
            [self.hold_a_position, self.now_price, self.cash_in_hand],
            dtype=float,
        )

    def _get_revenue(self):
        """Total portfolio value: cash plus the position marked at the current price."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _liquidate(self):
        """Convert the whole position into cash at the current price."""
        self.cash_in_hand += self.now_price * self.hold_a_position
        self.hold_a_position = 0

    def _trade(self, action, lastorder=False):
        """Apply ``action`` to the portfolio; ``lastorder`` forces a liquidation."""
        if lastorder:
            # End of episode: close the position without touching the statistics.
            self._liquidate()
            return

        track_stats = (self.mode == 'test')

        if action == self.action_space[0]:  # sell
            if self.hold_a_position == 0:
                return
            self._liquidate()
            if track_stats:
                self.trade_time += 1
                # A sell counts as a win when cash exceeds the pre-buy cash.
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1
            return

        if action == self.action_space[2] and self.hold_a_position == 0:  # buy
            if track_stats:
                self.brfore_buy_cash = copy.copy(self.cash_in_hand)
            # Buy one unit at a time while cash strictly exceeds the price.
            while self.cash_in_hand > self.now_price:
                self.hold_a_position += 1
                self.cash_in_hand -= self.now_price

In [3]:
class ParameterServer:
    """Owner of the master Q-network whose weights are exchanged with workers.

    The network is a fixed 3 -> 3 -> 3 -> 3 dense stack with ReLU activations
    (3 state inputs, 3 action outputs), compiled with an MSE loss.

    Args:
        n_hidden_layers: Accepted for interface compatibility; not used, the
            topology is fixed.
        hidden_dim: Accepted for interface compatibility; not used.
    """

    def __init__(self, n_hidden_layers=1, hidden_dim=32):

        n_mid = 3
        n_state = 3
        n_action = 3

        mastermodel = Sequential()
        mastermodel.add(Dense(n_mid, input_shape=(n_state,)))
        mastermodel.add(ReLU())
        mastermodel.add(Dense(n_mid))
        mastermodel.add(ReLU())
        mastermodel.add(Dense(n_action))

        # Compile with a private copy of the module-level optimizer: a Keras
        # optimizer instance keeps per-variable state, so one instance must not
        # be shared between different models. Cloning through the config keeps
        # the hyperparameters configured on the module-level object.
        own_optimizer = type(optimizer).from_config(optimizer.get_config())
        mastermodel.compile(loss="mse", optimizer=own_optimizer)

        # summary() prints the table itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None" line).
        mastermodel.summary()
        self.mastermodel = mastermodel

    def load(self, name):
        """Load master weights from the file ``name``."""
        self.mastermodel.load_weights(name)

    def save(self, name):
        """Save master weights to the file ``name``."""
        self.mastermodel.save_weights(name)

    def placement(self, model):
        """Copy the master's trainable weights into ``model`` (master -> worker)."""
        for m, mm in zip(model.trainable_weights, self.mastermodel.trainable_weights):
            m.assign(mm)

    def integration(self, model):
        """Overwrite the master's trainable weights with those of ``model``.

        This is a plain copy (worker -> master), not an averaging or gradient
        step: the most recent caller's weights replace the master's.
        """
        for mm, m in zip(self.mastermodel.trainable_weights, model.trainable_weights):
            mm.assign(m)

In [4]:
class Brain:
    """Worker-side Q-network plus the link to the parameter server.

    The local network has the same fixed 3 -> 3 -> 3 -> 3 dense/ReLU layout as
    the master network, so weights can be copied in both directions.

    Args:
        masterbrain: Parameter server exposing ``mastermodel``,
            ``placement(model)`` and ``integration(model)``.
        n_hidden_layers: Accepted for interface compatibility; not used.
        hidden_dim: Accepted for interface compatibility; not used.
    """

    def __init__(self, masterbrain, n_hidden_layers=1, hidden_dim=32):

        n_mid = 3
        n_state = 3
        n_action = 3

        model = Sequential()
        model.add(Dense(n_mid, input_shape=(n_state,)))
        model.add(ReLU())
        model.add(Dense(n_mid))
        model.add(ReLU())
        model.add(Dense(n_action))

        # Each worker model is trained independently (and from its own thread),
        # so it needs its own optimizer: a Keras optimizer instance keeps
        # per-variable state and must not be shared between models. Cloning via
        # the config keeps the hyperparameters of the module-level optimizer.
        own_optimizer = type(optimizer).from_config(optimizer.get_config())
        model.compile(loss="mse", optimizer=own_optimizer)

        # summary() prints the table itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None" line).
        model.summary()
        self.model = model
        self.masterbrain = masterbrain
        self.mastermodel = masterbrain.mastermodel

    def layering(self):
        """Pull the master's weights into the local model."""
        self.masterbrain.placement(self.model)

    def integration(self):
        """Push the local model's weights to the master."""
        self.masterbrain.integration(self.model)

In [5]:
def make_scaler(env):
    """Fit a StandardScaler on the states visited by one random-action episode.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``action_space`` and ``df_total_steps``.

    Returns:
        A ``StandardScaler`` fitted on the states returned by ``env.step``.
        The environment is left at the end of that episode; callers should
        ``reset()`` it before use.
    """
    # Start from a fresh episode so this also works on an environment that has
    # already been stepped to its end. For a newly constructed environment
    # (which resets itself) this does not change the sampled states.
    env.reset()

    states = []
    for _ in range(env.df_total_steps):
        action = np.random.choice(env.action_space)
        state, _reward, done, _info = env.step(action)
        states.append(state)
        if done:
            break

    scaler = StandardScaler()
    scaler.fit(states)
    return scaler

In [6]:
class Actor:
    """Epsilon-greedy action selector that records experience into a replay memory.

    Args:
        brain: Object exposing ``model`` (with ``predict``) and ``layering()``.
        memory: Replay memory exposing ``store(state, action, reward, next_state, done)``.

    ``epsilon`` starts at 1.0; no method of this class changes it.
    """

    def __init__(self, brain, memory):
        self.brain = brain
        self.model = brain.model
        self.memory = memory
        self.state_size = 3
        self.action_size = 3
        self.epsilon = 1.0

    def store(self, state, action, reward, next_state, done):
        """Forward one transition to the replay memory."""
        self.memory.store(state, action, reward, next_state, done)

    def act(self, state):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        explore = np.random.rand() <= self.epsilon
        if not explore:
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])
        return np.random.choice(self.action_size)

    def layering(self):
        """Refresh the local model from the master via the brain."""
        self.brain.layering()

In [7]:
class ReplayMemory:
    """Fixed-capacity ring buffer of transitions with uniform random sampling.

    Args:
        size: Maximum number of transitions kept; older ones are overwritten.
        batch_size: Number of transitions returned by ``sampling()``.

    States are stored as float32 rows of length ``state_dim`` (3), actions and
    done flags as uint8, rewards as float32.
    """

    def __init__(self, size, batch_size=32):
        self.max_size = size
        self.batch_size = batch_size
        self.state_dim = 3

        self.cntr = 0  # next write position
        self.size = 0  # number of valid entries

        state_shape = (size, self.state_dim)
        self.states_buf = np.zeros(state_shape, dtype=np.float32)
        self.next_states_buf = np.zeros(state_shape, dtype=np.float32)
        self.acts_buf = np.zeros(size, dtype=np.uint8)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.uint8)

    def store(self, state, act, rew, next_state, done):
        """Write one transition at the cursor and advance it, wrapping at capacity."""
        slot = self.cntr
        self.states_buf[slot] = state
        self.next_states_buf[slot] = next_state
        self.acts_buf[slot] = act
        self.rews_buf[slot] = rew
        self.done_buf[slot] = done

        self.cntr = (slot + 1) % self.max_size
        if self.size < self.max_size:
            self.size += 1

    def sampling(self):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            dict with keys 's', 's2', 'a', 'r', 'd' (states, next states,
            actions, rewards, done flags).
        """
        picks = np.random.randint(0, self.size, size=self.batch_size)
        return {
            's': self.states_buf[picks],
            's2': self.next_states_buf[picks],
            'a': self.acts_buf[picks],
            'r': self.rews_buf[picks],
            'd': self.done_buf[picks],
        }

In [8]:
class Learner:
    """Q-learning updater that trains the brain's model from replay samples.

    Args:
        brain: Object exposing ``model`` (with ``predict`` / ``train_on_batch``)
            and ``integration()``.
        memory: Replay memory exposing ``size`` and ``sampling()``.
        batch_size: Minimum number of stored transitions before learning
            starts; also the number of rows expected from ``sampling()``.

    The ``epsilon`` kept here is decayed after every update but is not read by
    any method of this class.
    """

    def __init__(self, brain, memory, batch_size=32):
        self.brain = brain
        self.model = brain.model
        self.memory = memory
        self.batch_size = batch_size

        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

    def learn(self):
        """Run one training step on a sampled minibatch (no-op until enough data)."""
        if self.memory.size < self.batch_size:
            return

        batch = self.memory.sampling()
        states, next_states = batch['s'], batch['s2']
        actions, rewards, done = batch['a'], batch['r'], batch['d']

        # Bellman target: r + gamma * max_a' Q(s', a'), with the bootstrap
        # term removed for terminal transitions.
        best_next_q = np.amax(self.model.predict(next_states), axis=1)
        target = rewards + (1 - done) * self.gamma * best_next_q

        # Only the Q-value of the action actually taken is moved to the target;
        # the other outputs keep the model's current prediction.
        q_targets = self.model.predict(states)
        rows = np.arange(self.batch_size)
        q_targets[rows, actions] = target
        self.model.train_on_batch(states, q_targets)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def integration(self):
        """Push the trained local weights to the master via the brain."""
        self.brain.integration()

In [9]:
def play_game(env, actor, learner, scaler, episodes_times=25, mode='test'):
    """Run ``episodes_times`` episodes for one worker and log each result.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        actor: Policy exposing ``act``, ``store``, ``layering`` and ``epsilon``.
        learner: Trainer exposing ``learn``, ``integration`` and ``epsilon``.
        scaler: Fitted scaler exposing ``transform``; applied to every state.
        episodes_times: Number of episodes to play.
        mode: 'train' stores transitions, learns after every step and
            exchanges weights with the master after every episode; 'test'
            only plays and logs the trade statistics.

    Side effects:
        Prints one line per episode and appends one row per episode to the
        file at the module-level ``csv_path``.
    """
    # Start from the master's current weights.
    actor.layering()

    for episode in range(episodes_times):
        state = env.reset()
        state = scaler.transform([state])
        done = False
        start_time = datetime.now()

        while not done:
            action = actor.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = scaler.transform([next_state])

            if mode == 'train':
                actor.store(state, action, reward, next_state, done)
                learner.learn()
                # The learner owns the exploration schedule; without this sync
                # the actor's epsilon never decays and every training action
                # stays random.
                actor.epsilon = learner.epsilon

            # Advance the observation inside the step loop. (Previously this
            # ran once per episode, so the actor always acted on - and stored -
            # the episode's initial state.)
            state = next_state

        play_time = datetime.now() - start_time
        if mode == 'test':
            print("Episode: {}/{} RapTime: {} FixedProfit: {:.0f} TradeTimes: {} TradeWin: {}".format(episode + 1, episodes_times, play_time, info['cur_revenue'], info['trade_time'], info['trade_win']))
            with open(csv_path, 'a') as f:
                row = str(info['cur_revenue']) + ',' + str(info['trade_time']) + ',' + str(info['trade_win'])
                print(row, file=f)
        else:
            # Publish this worker's weights, then pull the master's back.
            learner.integration()
            actor.layering()
            print("Episode: {}/{} RapTime: {} FixedProfit: {:.0f}".format(episode + 1, episodes_times, play_time, info['cur_revenue']))
            with open(csv_path, 'a') as f:
                row = str(info['cur_revenue'])
                print(row, file=f)

In [None]:
# --- Run configuration -------------------------------------------------------
initial_money=1000000
episodes_times = 2
batch_size = 32
mode = 'train'      # 'train' learns and saves weights; 'test' loads weights and evaluates
memorysize = 500    # replay memory capacity per worker

masterbrain = ParameterServer()

# Start a fresh results CSV with the header that matches the rows written by
# play_game() for the selected mode.
if mode == 'test':
    masterbrain.load(f'{models_folder}/gorila_model.h5')

    with open(csv_path, 'w') as f:
        row = 'FixedProfit,TradeTimes,TradeWin'
        print(row, file=f)
else:
    with open(csv_path, 'w') as f:
        row = 'FixedProfit'
        print(row, file=f)

# --- Build one (environment, actor, learner, scaler) bundle per worker ---------
thread_num = 4
envs = []
for i in range(thread_num):
    e = Environment(df, initial_money=initial_money,mode = mode)
    brain = Brain(masterbrain)
    model = brain.model
    memory = ReplayMemory(memorysize,batch_size)
    a = Actor(brain, memory)
    l = Learner(brain, memory,batch_size)
    if mode == 'test':
        a.epsilon = 0.01
    s = make_scaler(e)
    arr = [e,a,l,s]
    envs.append(arr)

# --- Run the workers in parallel ---------------------------------------------
datas = []
with ThreadPoolExecutor(max_workers=thread_num) as executor:
    for env in envs:
        # Pass the arguments to submit() directly. A `lambda: play_game(env[0], ...)`
        # looks up `env` only when it runs, so a worker that starts after the
        # loop has advanced would use another worker's environment/actor/learner.
        datas.append(executor.submit(play_game, env[0], env[1], env[2], env[3], episodes_times, mode))

# The executor has finished every job at this point. Calling result() re-raises
# any exception from a worker, so a failed run is reported here instead of
# being silently ignored (and no weights are saved from a failed run).
for future in datas:
    future.result()

if mode == 'train':
    masterbrain.save(f'{models_folder}/gorila_model.h5')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 3)                 12        
_________________________________________________________________
re_lu (ReLU)                 (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 12        
_________________________________________________________________
re_lu_1 (ReLU)               (None, 3)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 12        
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Outp