<a href="https://colab.research.google.com/github/sugiyama404/BuckTest/blob/main/R2D2/R2D2_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy

from datetime import datetime
from matplotlib import pyplot as plt
import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, ReLU, Input, Lambda, LSTM, Activation, MaxPool1D, Conv1D

from tensorflow.keras.optimizers import Adam
from statistics import mean
import tensorflow.keras as keras
from tensorflow.keras.losses import Huber
from tensorflow.keras import backend as K

from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from time import sleep
import random, string

from tensorflow.keras.utils import Progbar

# --- Run configuration -------------------------------------------------------
mode = 'train'   # selects the sp500_{mode}.csv dataset and the CSV log name
name = 'r2d2'    # prefix used for the exported CSV log (and passed on as the model name)
level = 1        # level 2 only changes the artifact name here
if level == 2:
    # Append the suffix exactly once. The previous `name += name + 'lv2'`
    # produced 'r2d2r2d2lv2'; artifacts already saved under that doubled
    # name have to be renamed to be picked up again.
    name += 'lv2'

# --- Google Drive locations --------------------------------------------------
drive.mount('/content/drive/')
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + f'sp500_{mode}.csv'

exp_dir = 'Colab Notebooks/workspace/export/'
mdl_dir = '/content/drive/My Drive/' + exp_dir + 'models'
csv_path = '/content/drive/My Drive/' + exp_dir + f'csv_data/{name}_{mode}.csv'

# --- Price data --------------------------------------------------------------
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
class Environment:
    """Single-asset trading simulator driven by the 'SP500' column of a DataFrame.

    The account is either flat or invested: a buy spends as much cash as it
    can on whole units, a sell liquidates the whole position.

    Observation (returned by ``reset`` and ``step``): ``np.ndarray`` of three
    floats ``[units held, current price, cash in hand]``.

    Args:
        df: price table containing an 'SP500' column; rows with NaN are dropped.
        initial_money: cash available at the start of every episode.
        mode: stored on the instance; not read by this class.
        commission: proportional fee charged on agent-initiated buys and sells.
    """

    def __init__(self, df, initial_money=100000, mode = 'test', commission = 0):

        self.df = df.dropna().reset_index()

        self.df_total_steps  = len(self.df)-1
        self.initial_money   = initial_money
        self.mode            = mode
        self.commission      = commission
        self.trade_time      = None
        self.trade_win       = None
        self.brfore_buy_cash = None
        self.action_space    = np.array([0, 1, 2]) # buy,hold,sell
        self.hold_a_position = None
        self.now_price       = None
        self.cash_in_hand    = None
        self.sell_price      = None
        self.buy_price       = None

        self.reset()

    def reset(self):
        """Rewind to the first row, restore the initial cash and return the first observation."""
        self.trade_time      = 0
        self.trade_win       = 0
        self.brfore_buy_cash = 0
        self.end_step        = self.df_total_steps
        self.now_step        = 0
        self.hold_a_position = 0.0
        self.now_price       = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand    = self.initial_money
        self.sell_price      = 0
        self.buy_price       = 0

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute ``action`` and return ``(state, reward, done, info)``.

        ``reward`` is the relative profit of a round-trip and is non-zero only
        on the step where the agent sells; ``done`` becomes True on the last
        row, where any open position is liquidated instead of executing
        ``action``. ``info`` carries 'cur_revenue', 'trade_time' and 'trade_win'.
        """
        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        self.sell_price = 0
        self._trade(action, done)

        reward = 0
        if (self.sell_price > 0) and (self.buy_price > 0):
            reward = (self.sell_price - self.buy_price) / self.buy_price
            # Always clear the cost basis once the position is closed. It used
            # to survive a break-even sale and inflate the basis (and thus
            # distort the reward) of the next round-trip.
            self.buy_price = 0
        cur_revenue = self._get_revenue()

        info = { 'cur_revenue' : cur_revenue , 'trade_time' : self.trade_time, 'trade_win' : self.trade_win }

        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return the observation ``[units held, current price, cash in hand]``."""
        return np.array([self.hold_a_position, self.now_price, self.cash_in_hand], dtype=float)

    def _get_revenue(self):
        """Return the mark-to-market account value (position value plus cash)."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action,lastorder = False):
        """Apply ``action`` to the account.

        With ``lastorder`` set, ``action`` is ignored and any open position is
        liquidated. That forced liquidation charges no commission and leaves
        ``sell_price`` at 0, so ``step`` reports a reward of 0 for it.
        """
        if lastorder:
            if self.hold_a_position != 0:
                self.cash_in_hand += self.now_price * self.hold_a_position
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1
            return

        if self.action_space[0] == action: # buy
            if self.hold_a_position == 0:
                self.brfore_buy_cash = copy.copy(self.cash_in_hand)
                unit_cost = self.now_price + self.commission * self.now_price
                # Buy whole units while cash exceeds the price. The price must
                # be positive, otherwise cash would never shrink and the loop
                # would not terminate.
                while self.now_price > 0 and self.cash_in_hand > self.now_price:
                    self.hold_a_position += 1
                    self.buy_price += self.now_price
                    self.cash_in_hand -= unit_cost
        elif self.action_space[2] == action: # sell
            if self.hold_a_position != 0:
                proceeds = self.now_price * self.hold_a_position
                self.sell_price += proceeds
                self.cash_in_hand += proceeds - self.commission * self.now_price * self.hold_a_position
                self.hold_a_position = 0
                self.trade_time += 1
                # A trade counts as a win when cash ends above its pre-buy level.
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1

In [3]:
class Brain:
    """Builds and compiles the recurrent Q-network; the result is exposed as ``self.model``.

    Layer stack: Conv1D -> MaxPool1D -> ReLU -> stateful LSTM (named "lstm")
    -> Dense(4) -> Lambda that adds column 0 of the dense output to the
    remaining three columns, yielding one value per action. The mean term in
    the Lambda is multiplied by 0.0, so no mean is actually subtracted.
    The batch size is fixed to 1 by ``batch_input_shape``.
    """

    def __init__(self):

        n_filters = 12
        lstm_units = 16
        window = 10

        # The Lambda stays an inline lambda on purpose (same callable form as before).
        layers = [
            Conv1D(filters=n_filters, kernel_size=1, padding="same", activation="tanh",
                   batch_input_shape=(1, window, 3)),
            MaxPool1D(pool_size=1, padding='same'),
            Activation("relu"),
            LSTM(lstm_units, stateful=True, name="lstm"),
            Dense(4, kernel_initializer='random_uniform'),
            Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - 0.0*K.mean(a[:, 1:], keepdims=True),
                   output_shape=(3,)),
        ]

        network = Sequential(layers)
        network.compile(loss = "mean_absolute_error",
                        optimizer=Adam(learning_rate=0.001, clipvalue=40))
        network.summary()
        self.model = network

In [4]:
class Learner:
    """Trains the two Q-networks from batches delivered through a queue.

    Each queue item is either the string ``'done_prosess'`` (stop), ``None``
    (skipped) or a batch dict with the keys 'state', 'next_state', 'act',
    'reward', 'done', 'hidden_state1' and 'hidden_state2', as produced by
    ``Memory.sampling``.

    Args:
        model1, model2: the two networks; each transition updates one of them
            using the other one's estimate of the next state.
        memory: replay memory; receives the refreshed TD errors and the
            updated weights after every batch.
    """

    def __init__(self, model1, model2, memory):

        self.model1 = model1
        self.model2 = model2
        self.memory = memory
        self.gamma  = 0.99

    def learn(self, queue):
        """Consume batches from ``queue`` until the ``'done_prosess'`` sentinel arrives."""
        while True:
            val = queue.get()
            if isinstance(val, str) and val == 'done_prosess':
                queue.task_done()
                break
            if val is None:
                # Nothing was sampled for this item; there is nothing to train on.
                queue.task_done()
                continue

            # Unpack the sampled batch. The loop previously iterated over
            # undefined names (states, next_states, ...) and raised NameError.
            transitions = zip(val['state'], val['next_state'], val['act'], val['reward'],
                              val['done'], val['hidden_state1'], val['hidden_state2'])
            for state, next_state, action, reward, done, hidden_state1, hidden_state2 in transitions:
                self._fit_transition(state, next_state, action, reward, done, hidden_state1, hidden_state2)

            self._tderror()
            self.memory.placement(self.model1, self.model2)
            queue.task_done()

    def _fit_transition(self, state, next_state, action, reward, done, hidden_state1, hidden_state2):
        """Run one fit step on a single transition, updating a randomly chosen network."""
        self.reset_model_state()
        self._set_hidden_states(hidden_state1, hidden_state2)

        next_state = np.reshape(next_state, [1, 10, 3])
        state = np.reshape(state, [1, 10, 3])

        # Coin flip: the chosen network is fitted, the other one supplies the
        # bootstrap value for the next state.
        if np.random.random() <= 0.5:
            fitted, other = self.model1, self.model2
        else:
            fitted, other = self.model2, self.model1

        bootstrap = float(np.amax(other.predict(next_state)))
        target = float(reward) + (1 - int(done)) * self.gamma * bootstrap
        target_full = fitted.predict(state)

        target_full[0][action] = target
        fitted.fit(state, target_full, verbose=0)

    def _predict(self, state, s_flag = 12):
        """Return summed Q-values: q1+q2 (``s_flag`` 12), 2*q1 (11) or 2*q2 (otherwise).

        ``batch_size=1`` matches ``Agent._predict``; the networks are built
        with a fixed batch size of 1.
        """
        values = None
        q1 = self.model1.predict(state, batch_size=1)
        q2 = self.model2.predict(state, batch_size=1)
        if s_flag == 12:
            values = q1 + q2
        elif s_flag == 11:
            values = q1 + q1
        else:
            values = q2 + q2
        return values

    def _tderror(self):
        """Recompute 3-step TD errors over the whole memory and store them back."""
        states, next_states, actions, rewards, done, hidden_state1, hidden_state2, tderror = self.memory.findall()

        # Row i is paired with the rewards of rows i+1 and i+2 and the next
        # state of row i+3, hence the last three rows are dropped.
        n2_rewards = rewards[2:-1]
        next_rewards = rewards[1:-2]
        n3_next_states = next_states[3:]

        states = states[:-3]
        rewards = rewards[:-3]

        next_predict = self._predict(n3_next_states)
        predict = self._predict(states)

        n3_q = np.max(next_predict, axis = 1)
        target = rewards + self.gamma * next_rewards + (self.gamma ** 2) * n2_rewards + (self.gamma ** 3) * n3_q
        tderror = target - np.max(predict, axis = 1)
        self.memory.replace_tderror(tderror)

    def _get_hidden_state(self):
        """Return the concatenated LSTM states of model1 and of model2."""
        # Previously both lookups used a non-existent ``self.model``.
        lstm1 = self.model1.get_layer("lstm")
        hidden_states1 = np.concatenate([K.get_value(lstm1.states[0]), K.get_value(lstm1.states[1])])
        lstm2 = self.model2.get_layer("lstm")
        hidden_states2 = np.concatenate([K.get_value(lstm2.states[0]), K.get_value(lstm2.states[1])])
        return hidden_states1, hidden_states2

    def reset_model_state(self):
        """Reset the recurrent state of both networks."""
        self.model1.reset_states()
        self.model2.reset_states()

    def _set_hidden_states(self, hidden_states1, hidden_states2):
        """Split each stored state in two along axis 0 and load the halves into the LSTM layers."""
        a0, a1 = np.split(hidden_states1, 2, 0)
        self.model1.get_layer("lstm").reset_states([a0, a1])
        a2, a3 = np.split(hidden_states2, 2, 0)
        self.model2.get_layer("lstm").reset_states([a2, a3])

In [5]:
class Memory:
    """Fixed-size ring buffer of transitions plus a weight exchange slot.

    Transitions are stored in parallel NumPy arrays (state windows of shape
    (10, 3), hidden states of shape (2, 16)). ``model1``/``model2`` hold the
    most recently published weights: ``placement`` writes them and
    ``integration`` copies them out; ``code`` is a random tag regenerated by
    every ``placement``.

    Args:
        model1, model2: models used only as containers for published weights.
        max_size: capacity of the ring buffer.
        batch_size: number of transitions returned by ``sampling``.
    """

    def __init__(self, model1, model2, max_size=500, batch_size=32):

        self.cntr = 0   # next write position
        self.size = 0   # number of valid rows (saturates at max_size)
        self.max_size = max_size
        self.batch_size = batch_size
        self.model1 = model1
        self.model2 = model2
        self.code = ''
        self.states_memory = np.zeros([self.max_size, 10, 3], dtype=np.float32)
        self.next_states_memory = np.zeros([self.max_size, 10, 3], dtype=np.float32)
        self.acts_memory = np.zeros(self.max_size, dtype=np.uint8)
        self.rewards_memory = np.zeros(self.max_size, dtype=np.float32)
        self.done_memory = np.zeros(self.max_size, dtype=np.uint8)
        self.hidden_state1_memory = np.zeros([self.max_size, 2, 16], dtype=np.float32)
        self.hidden_state2_memory = np.zeros([self.max_size, 2, 16], dtype=np.float32)
        self.tderrors_memory = np.zeros(self.max_size, dtype=np.float32)

    def store_transition(self, state, act, reward, next_state, done, hidden_state1, hidden_state2):
        """Write one transition at the cursor, overwriting the oldest row once full."""
        slot = self.cntr
        self.states_memory[slot] = state
        self.next_states_memory[slot] = next_state
        self.acts_memory[slot] = act
        self.rewards_memory[slot] = reward
        self.done_memory[slot] = done
        self.hidden_state1_memory[slot] = hidden_state1
        self.hidden_state2_memory[slot] = hidden_state2
        self.cntr = (self.cntr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sampling(self, mb_index = None):
        """Return a batch dict for ``mb_index``, or for a uniform random draw.

        A random draw without replacement is used when ``mb_index`` is omitted
        or does not contain exactly ``batch_size`` indices. (The old default,
        the array ``[1, 2, 3]``, was meant as a "no indices" marker but was
        used verbatim whenever ``batch_size`` was 3.)

        Raises:
            ValueError: from ``np.random.choice`` when fewer than
                ``batch_size`` transitions are stored.
        """
        if mb_index is None or len(mb_index) != self.batch_size:
            mb_index = np.random.choice(self.size, self.batch_size, replace=False)

        return {
            'state': self.states_memory[mb_index],
            'next_state': self.next_states_memory[mb_index],
            'act': self.acts_memory[mb_index],
            'reward': self.rewards_memory[mb_index],
            'done': self.done_memory[mb_index],
            'hidden_state1': self.hidden_state1_memory[mb_index],
            'hidden_state2': self.hidden_state2_memory[mb_index],
        }

    def findall(self):
        """Return all eight backing arrays (not copies), TD errors last."""
        return self.states_memory,self.next_states_memory,self.acts_memory,self.rewards_memory,self.done_memory,self.hidden_state1_memory,self.hidden_state2_memory,self.tderrors_memory

    def replace_tderror(self, val):
        """Overwrite the first ``len(val)`` TD errors with ``val``."""
        np.put(self.tderrors_memory, range(val.shape[0]), val)

    def placement(self, model1, model2):
        """Publish the weights of ``model1``/``model2`` and issue a new code."""
        self.model1.set_weights(model1.get_weights())
        self.model2.set_weights(model2.get_weights())
        self.code = self._make_random_code()

    def integration(self, model1, model2, code):
        """Copy the published weights into ``model1``/``model2`` unless ``code`` is current.

        Returns:
            The current published code, so the caller can remember which
            version it holds. (The method used to end with a dead assignment
            to the local ``code`` and returned None.)
        """
        if self.code == code:
            return self.code
        model1.set_weights(self.model1.get_weights())
        model2.set_weights(self.model2.get_weights())
        return self.code

    def _make_random_code(self, n=10):
        """Return ``n`` random ASCII letters/digits."""
        return ''.join(random.choices(string.ascii_letters + string.digits, k=n))

In [6]:
class Agent:
    """Acting worker: keeps a sliding 10-step window, chooses actions and feeds the memory.

    Args:
        model1, model2: local copies of the two Q-networks.
        memory: shared replay memory (also the source of published weights).
        epsilon: upper bound of the exploration interval used by ``act``.
        batch_size: number of transitions requested per sampled batch.
    """

    def __init__(self, model1, model2, memory, epsilon, batch_size=32):
        self.model1 = model1
        self.model2 = model2
        self.memory = memory
        self.gamma = 0.99
        self.alpha = 0.5           # exponent applied to |TD error| for prioritised sampling
        self.td_epsilon = 0.0001   # keeps every sampling priority strictly positive
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.epsilon_min = 0.01
        self.code = 'init'         # code of the weight version last pulled from memory
        self.local_state = np.empty((0,3), float)
        self.local_next_state = np.empty((0,3), float)
        self.safe_lag = False

    def local_states_reset(self):
        """Empty both sliding windows (called at the start of an episode)."""
        self.local_state = np.empty((0,3), float)
        self.local_next_state = np.empty((0,3), float)

    def local_states_moemory_and_replay(self, next_state):
        """Push ``next_state`` into the next-state window.

        Returns ``(state window, next-state window, safe_lag)``; ``safe_lag``
        is True only when the next-state window already held 10 rows before
        this push.
        """
        window_full = len(self.local_next_state) >= 10
        if window_full:
            self.local_next_state = self.local_next_state[1:]
        self.local_next_state = np.append(self.local_next_state, np.array(next_state), axis=0)
        self.safe_lag = window_full

        return self.local_state, self.local_next_state, self.safe_lag

    def update_replay_memory(self, state, action, reward, next_state, done, hidden_states1, hidden_states2):
        """Store one transition in the shared memory."""
        self.memory.store_transition(state, action, reward, next_state, done, hidden_states1, hidden_states2)

    def act(self, state, s_flag=12):
        """Push ``state`` into the window and return ``(action, hidden_state1, hidden_state2)``.

        While the window holds fewer than 10 rows the action is always 1
        (hold). Afterwards a random action is taken when a uniform draw falls
        in ``(epsilon_min, epsilon]``; otherwise the arg-max of the predicted
        values. The networks are run in both cases, and the hidden states are
        read after that run.
        """
        if len(self.local_state) >= 10:
            self.local_state = self.local_state[1:]
            self.local_state = np.append(self.local_state, np.array(state), axis=0)
            tmp_state = copy.deepcopy(self.local_state)
            tmp_state  = np.reshape(tmp_state, (1, 10, 3))

            ran_num = np.random.rand()
            if (ran_num <= self.epsilon) and (ran_num > self.epsilon_min):
                _ = self._predict(tmp_state,s_flag)
                hidden_state1, hidden_state2 = self._get_hidden_state()
                return np.random.choice(3), hidden_state1, hidden_state2
            act_values = self._predict(tmp_state,s_flag)
            hidden_state1, hidden_state2 = self._get_hidden_state()
            return np.argmax(act_values), hidden_state1, hidden_state2
        else:
            self.local_state = np.append(self.local_state, np.array(state), axis=0)
            hidden_state1, hidden_state2 = self._get_hidden_state()
            return 1, hidden_state1, hidden_state2

    def _predict(self, state, s_flag = 12):
        """Return summed Q-values: q1+q2 (``s_flag`` 12), 2*q1 (11) or 2*q2 (otherwise)."""
        values = None
        q1 = self.model1.predict(state, batch_size=1)
        q2 = self.model2.predict(state, batch_size=1)
        if s_flag == 12:
            values = q1 + q2
        elif s_flag == 11:
            values = q1 + q1
        else:
            values = q2 + q2
        return values

    def replay(self):
        """Return a uniformly sampled batch from the memory."""
        m_batch = self.memory.sampling()
        return m_batch

    def pioritized_experience_replay(self):
        """Return a batch sampled in proportion to the TD-error priorities.

        Returns None when the memory holds fewer than ``batch_size`` rows.
        """
        if self.memory.size < self.batch_size:
            return

        prob = self._tderror_prob()
        num_np = np.random.choice(self.memory.size, self.batch_size, p=prob, replace=False)

        m_batch = self.memory.sampling(num_np)
        return m_batch

    def tderror(self):
        """Recompute 3-step TD errors over the whole memory and store them back."""
        states, next_states, actions, rewards, done, hidden_state1, hidden_state2, tderror = self.memory.findall()

        # Row i is paired with the rewards of rows i+1 and i+2 and the next
        # state of row i+3, hence the last three rows are dropped.
        n2_rewards = rewards[2:-1]
        next_rewards = rewards[1:-2]
        n3_next_states = next_states[3:]

        states = states[:-3]
        rewards = rewards[:-3]

        next_predict = self._predict(n3_next_states)
        predict = self._predict(states)

        n3_q = np.max(next_predict, axis = 1)
        target = rewards + self.gamma * next_rewards + (self.gamma ** 2) * n2_rewards + (self.gamma ** 3) * n3_q
        tderror = target - np.max(predict, axis = 1)
        self.memory.replace_tderror(tderror)

    def _absolute_tderror(self):
        """Sum ``|td| + 0.0001`` over all stored TD errors except the last one.

        NOTE(review): not called anywhere in this notebook; skipping the last
        element looks unintentional - confirm before relying on it.
        """
        absolute_tderror = 0
        tderror = self.memory.tderrors_memory
        for i in range(0, (len(tderror)-1)):
            absolute_tderror += abs(tderror[i]) + 0.0001
        return absolute_tderror

    def _tderror_prob(self):
        """Return sampling probabilities for the ``memory.size`` stored rows.

        Only the filled part of the buffer is used: the caller draws indices
        from ``range(memory.size)``, and ``np.random.choice`` requires ``p``
        to have exactly that length. The full-length array used before made
        the draw raise until the buffer was completely filled.
        """
        tderror = self.memory.tderrors_memory[:self.memory.size]
        priority = np.power(np.abs(tderror).astype(np.float64) + self.td_epsilon, self.alpha)
        return priority / np.sum(priority)

    def _get_hidden_state(self):
        """Return the concatenated LSTM states of model1 and of model2."""
        lstm1 = self.model1.get_layer("lstm")
        hidden_states1 = np.concatenate([K.get_value(lstm1.states[0]), K.get_value(lstm1.states[1])])
        lstm2 = self.model2.get_layer("lstm")
        hidden_states2 = np.concatenate([K.get_value(lstm2.states[0]), K.get_value(lstm2.states[1])])
        return hidden_states1, hidden_states2

    def reset_model_state(self):
        """Reset the recurrent state of both local networks."""
        self.model1.reset_states()
        self.model2.reset_states()

    def integration(self):
        """Pull the published weights from memory if they changed since the last pull.

        ``self.code`` was never updated before, so the weights were copied on
        every call. The code is read *before* the copy: if new weights are
        published in between, the remembered code is stale at worst and the
        next call simply copies again.
        """
        published_code = self.memory.code
        self.memory.integration(self.model1, self.model2, self.code)
        self.code = published_code

    def load(self, name, name2):
        """Load weights for model1 from ``name`` and for model2 from ``name2``."""
        self.model1.load_weights(name)
        self.model2.load_weights(name2)

    def save(self, name, name2):
        """Save the weights of model1 to ``name`` and of model2 to ``name2``."""
        self.model1.save_weights(name)
        self.model2.save_weights(name2)

In [7]:
class Main:
    """One acting worker: runs episodes, fills the replay memory and logs results.

    Constructing an instance fits the state scaler and (re)writes the header
    of the CSV log at the module-level ``csv_path``; every finished episode
    appends one row to that file.

    Args:
        env: trading environment.
        agent: acting agent bound to the shared memory.
        mdl_dir: directory for the weight files and the scaler pickle.
        name: file-name prefix of the saved artifacts.
        i: index of this worker; selects its slot in ``term``.
        term: shared boolean array, one completion flag per worker.
        episodes_times: number of episodes to play.
        mode: 'train' enables memory updates and saving; 'test' loads the
            saved weights/scaler and sets the agent's epsilon to 0.01.
    """

    def __init__(self, env, agent, mdl_dir, name, i, term, episodes_times = 200, mode = 'test'):
        self.env            = env
        self.agent          = agent
        self.mdl_dir        = mdl_dir
        self.scaler         = self._standard_scaler(self.env)
        self.episodes_times = episodes_times
        self.mode           = mode
        self.name           = name
        self.agent_num      = i
        self.term           = term

        with open(csv_path, 'w') as f:
            row = 'FixedProfit,TradeTimes,TradeWin'
            print(row, file=f)

        if self.mode == 'test':
            self._load()
            self.agent.epsilon = 0.01

    def play_game(self, q):
        """Play ``episodes_times`` episodes; in train mode push sampled batches to ``q``.

        In train mode the worker's completion flag is always set when this
        method exits - also when it exits with an exception - and the last
        worker to finish puts ``"done_prosess"`` on ``q``. Without that, a
        failing worker would leave the queue consumer waiting forever.
        """
        total_reward = [1000000]

        try:
            for episode in range(self.episodes_times):

                # A fresh progress bar and fresh running statistics every 10 episodes.
                if (episode % 10 == 0):
                    metrics_names = ['FixedProfit','TradeTimes','TradeWin']
                    pb_i = Progbar(10, stateful_metrics=metrics_names)
                    p_mean,trade_time,win_time = np.array([]),np.array([]),np.array([])

                state = self.env.reset()
                state = self.scaler.transform([state])
                done  = False
                self.agent.reset_model_state()
                self.agent.local_states_reset()

                while not done:
                    action, hidden_states1, hidden_states2 = self.agent.act(state)
                    next_state, reward, done, info = self.env.step(action)
                    next_state = self.scaler.transform([next_state])
                    l_state, ln_state, safe_lag = self.agent.local_states_moemory_and_replay(next_state)
                    reward = self._reward_clipping(reward)

                    if (self.mode == 'train') and safe_lag:
                        self.agent.update_replay_memory(l_state, action, reward, ln_state, done, hidden_states1, hidden_states2)
                        # Every 100th write position, once more than one batch is stored:
                        # refresh the TD errors and hand a batch to the queue.
                        if (self.agent.memory.size > self.agent.memory.batch_size) and (self.agent.memory.cntr % 100 == 0):
                            self.agent.tderror()
                            # Uniform sampling until the mean final revenue
                            # reaches 1,050,000; prioritised sampling afterwards.
                            if mean(total_reward) < 1050000:
                                m_batch = self.agent.replay()
                            else:
                                m_batch = self.agent.pioritized_experience_replay()
                            q.put(m_batch)
                        self.agent.integration()

                    state = next_state

                if self.mode == 'train':
                    self.agent.integration()
                    total_reward.append(info['cur_revenue'])

                p_mean,trade_time,win_time = np.append(p_mean,info['cur_revenue']),np.append(trade_time,info['trade_time']),np.append(win_time,info['trade_win'])
                values=[('FixedProfit',int(np.mean(p_mean))), ('TradeTimes',int(np.mean(trade_time))), ('TradeWin',int(np.mean(win_time)))]
                pb_i.add(1, values=values)
                with open(csv_path, 'a') as f:
                    row = str(info['cur_revenue']) + ',' + str(info['trade_time']) + ',' + str(info['trade_win'])
                    print(row, file=f)

            if self.mode == 'train':
                self._save()
        finally:
            if self.mode == 'train':
                self.term[self.agent_num] = True
                if self.term.all():
                    q.put("done_prosess")

    def _standard_scaler(self, env):
        """Fit a StandardScaler on states visited by stepping ``env`` with random actions."""
        states = []
        for _ in range(env.df_total_steps):
            action = np.random.choice(env.action_space)
            state, reward, done, info = env.step(action)
            states.append(state)
            if done:
                break

        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def _reward_clipping(self, val):
        """Return the sign of ``val`` as 1, 0 or -1."""
        if val > 0:
            return 1
        if val == 0:
            return 0
        return -1

    def _load(self):
        """Restore this worker's scaler pickle and the agent's two weight files."""
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load pickles this project wrote itself.
        with open('{}/{}_{}.pkl'.format(self.mdl_dir, self.name, str(self.agent_num)), 'rb') as f:
            self.scaler = pickle.load(f)
        self.agent.load('{}/{}_1.h5'.format(self.mdl_dir, self.name), '{}/{}_2.h5'.format(self.mdl_dir, self.name))

    def _save(self):
        """Write this worker's scaler pickle and the agent's two weight files.

        The weight file names do not contain the worker index, so every
        worker writes to the same two files.
        """
        with open('{}/{}_{}.pkl'.format(self.mdl_dir, self.name, str(self.agent_num)), 'wb') as f:
            pickle.dump(self.scaler, f)
        self.agent.save('{}/{}_1.h5'.format(self.mdl_dir, self.name), '{}/{}_2.h5'.format(self.mdl_dir, self.name))

In [None]:
# --- Hyper-parameters --------------------------------------------------------
initial_money=1000000
episodes_times = 20
commission = 0 if level == 1 else 0.002
batch_size = 32
max_size = 1000

# --- Learner-side networks and shared replay memory --------------------------
brain1 = Brain()
model1 = brain1.model
brain2 = Brain()
model2 = brain2.model

# NOTE(review): per the Keras docs clone_model creates new layers/weights
# rather than copying them - confirm the clones are meant to start from
# their own initialisation.
memory = Memory(clone_model(model1), clone_model(model2), max_size, batch_size)
learner = Learner(model1, model2, memory)

# --- Per-worker exploration rates --------------------------------------------
# Worker i gets epsilon ** (1 + i * alfa / (thread_num - 1)), so later workers
# explore less; a single worker uses epsilon unchanged.
thread_num = 4
epsilon_list = np.array([])
epsilon = 0.4
alfa = 7
for i in range(thread_num):
    num = 0.0
    if (not (thread_num - 1) == 0):
        num = epsilon ** (1 + i * alfa / (thread_num - 1))
    else:
        num = epsilon
    epsilon_list = np.append(epsilon_list, num)

# --- Workers -----------------------------------------------------------------
workers = []
term = np.full((thread_num,), False)
for i in range(thread_num):
    env = Environment(df, initial_money=initial_money, mode = mode, commission = commission)
    epsilon = epsilon_list[i]
    agent = Agent(clone_model(model1), clone_model(model2), memory, epsilon, batch_size)
    main  = Main(env, agent, mdl_dir, name, i, term, episodes_times, mode)
    workers.append(main)

# --- Run ---------------------------------------------------------------------
data = []
q = Queue()
# One thread per worker plus one for the learner, so that all of them can run
# at the same time regardless of the machine's CPU count.
with ThreadPoolExecutor(max_workers=thread_num + 1) as executor:
    for i, w in enumerate(workers):
        print(f'start worker_{i}')
        # Submit the bound method directly. The former nested `job1` looked
        # up the loop variable `w` only when the thread started running, so a
        # thread could end up executing a different worker than intended.
        data.append(executor.submit(w.play_game, q))

    if mode == 'train':
        print('start learner')
        data.append(executor.submit(learner.learn, q))

    # result() re-raises any exception raised inside a worker or the learner.
    for future in as_completed(data):
        print(future.result())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (1, 10, 12)               48        
_________________________________________________________________
max_pooling1d (MaxPooling1D) (1, 10, 12)               0         
_________________________________________________________________
activation (Activation)      (1, 10, 12)               0         
_________________________________________________________________
lstm (LSTM)                  (1, 16)                   1856      
_________________________________________________________________
dense (Dense)                (1, 4)                    68        
_________________________________________________________________
lambda (Lambda)              (1, 3)                    0         
Total params: 1,972
Trainable params: 1,972
Non-trainable params: 0
______________________________________________________