<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/simple_rl_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy

from datetime import datetime
from matplotlib import pyplot as plt
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from tensorflow.keras.optimizers import RMSprop

optimizer = RMSprop()

from sklearn.preprocessing import StandardScaler

drive.mount('/content/drive/')
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + 'sp500_test.csv'
exp_dir = 'Colab Notebooks/workspace/export/'
csv_path = '/content/drive/My Drive/' + exp_dir + 'sp500_train.csv'

models_folder = '/content/drive/My Drive/' + exp_dir + 'rl_models'
rewards_folder = '/content/drive/My Drive/' + exp_dir + 'rl_rewards'

df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
class Environment:
    def __init__(self, df, initial_money=1000, mode = 'test'):


        self.df = df.dropna().reset_index()

        self.df_total_steps = len(self.df)-1
        self.initial_money = initial_money
        self.mode = mode
        self.trade_time = None
        self.trade_win = None
        self.brfore_buy_cash = None
        self.action_space = np.array([0, 1, 2])
        self.hold_a_position = None
        self.now_price = None
        self.cash_in_hand = None

        self.reset()
        
    def reset(self):

        self.trade_time = 0
        self.trade_win = 0
        self.brfore_buy_cash = 0
        self.end_step = self.df_total_steps
        self.now_step = 0
        self.hold_a_position = 0.0
        self.now_price = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand = self.initial_money

        return self._get_now_state()

    def step(self, action):

        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']
 
        prev_revenue = self._get_revenue()

        done = (self.end_step == self.now_step)

        self._trade(action,done)
        cur_revenue = self._get_revenue()
        reward = cur_revenue - prev_revenue

        if self.mode == 'test':
            info = { 'cur_revenue' : cur_revenue , 'trade_time' : self.trade_time, 'trade_win' : self.trade_win }
        else:
            info = { 'cur_revenue' : cur_revenue }


        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        state = np.empty(3)
        state[0] = self.hold_a_position
        state[1] = self.now_price
        state[2] = self.cash_in_hand

        return state

    def _get_revenue(self):
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action,lastorder = False):
        '''
        0 = sell
        1 = hold
        2 = buy
        売買ルール：
        1.空売りは認めない
        2.ポジションを持っている場合、追加注文を出せない。
        3.トレーニング期間は最後のステップでポジションを全て売却する。
        4.ポジションは全売り
        '''
        if lastorder:
            self.cash_in_hand += self.now_price * self.hold_a_position
            self.hold_a_position = 0
        else:
            if self.action_space[0] == action: # sell
                if self.hold_a_position != 0:
                    self.cash_in_hand += self.now_price * self.hold_a_position
                    self.hold_a_position = 0
                    if self.mode == 'test':
                        self.trade_time += 1
                        if self.cash_in_hand > self.brfore_buy_cash:
                            self.trade_win += 1 
            if self.action_space[2] == action: # buy
                if self.hold_a_position == 0:
                    buy_flag = True
                    if self.mode == 'test':
                        self.brfore_buy_cash = copy.copy(self.cash_in_hand)
                    while buy_flag:
                        if self.cash_in_hand > self.now_price:
                            self.hold_a_position += 1
                            self.cash_in_hand -= self.now_price
                        else:
                            buy_flag = False

In [3]:
### The experience replay memory ###
class ReplayBuffer:
  def __init__(self, obs_dim, act_dim, size):
    # obs_dim = 3
    # act_dim = 3
    # size = 500
    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.acts_buf = np.zeros(size, dtype=np.uint8)
    self.rews_buf = np.zeros(size, dtype=np.float32)
    self.done_buf = np.zeros(size, dtype=np.uint8)
    self.ptr, self.size, self.max_size = 0, 0, size
    '''
    self.obs1_buf.shape = (500, 3)
    '''
  def store(self, obs, act, rew, next_obs, done):
    self.obs1_buf[self.ptr] = obs
    self.obs2_buf[self.ptr] = next_obs
    self.acts_buf[self.ptr] = act
    self.rews_buf[self.ptr] = rew
    self.done_buf[self.ptr] = done
    self.ptr = (self.ptr+1) % self.max_size
    self.size = min(self.size+1, self.max_size)

  def sample_batch(self, batch_size=32):
    idxs = np.random.randint(0, self.size, size=batch_size)
    return dict(s=self.obs1_buf[idxs],
                s2=self.obs2_buf[idxs],
                a=self.acts_buf[idxs],
                r=self.rews_buf[idxs],
                d=self.done_buf[idxs])

In [4]:
class Brain:
    def __init__(self, n_hidden_layers=1, hidden_dim=32):

        n_mid = 3
        n_state = 3
        n_action = 3

        model = Sequential()
        model.add(Dense(n_mid, input_shape=(n_state,)))
        model.add(ReLU()) 
        model.add(Dense(n_mid))
        model.add(ReLU()) 
        model.add(Dense(n_action))
        model.compile(loss="mse", optimizer=optimizer)

        print((model.summary()))
        self.model = model

In [5]:
def get_scaler(env):
  # return scikit-learn scaler object to scale the states
  # Note: you could also populate the replay buffer here

  states = []
  for _ in range(env.df_total_steps):
    action = np.random.choice(env.action_space)
    state, reward, done, info = env.step(action)
    states.append(state)
    if done:
      break

  scaler = StandardScaler()
  scaler.fit(states)
  return scaler

In [6]:
class Agent:
  def __init__(self, state_size, action_size, brain):
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayBuffer(state_size, action_size, size=500)
    self.gamma = 0.95  # discount rate
    self.epsilon = 1.0  # exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.model = brain.model

  def update_replay_memory(self, state, action, reward, next_state, done):
    self.memory.store(state, action, reward, next_state, done)

  def act(self, state):
    if np.random.rand() <= self.epsilon:
      return np.random.choice(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0])  # returns action

  def replay(self, batch_size=32):
    # first check if replay buffer contains enough data
    if self.memory.size < batch_size:
      return

    # sample a batch of data from the replay memory
    minibatch = self.memory.sample_batch(batch_size)
    states = minibatch['s']
    actions = minibatch['a']
    rewards = minibatch['r']
    next_states = minibatch['s2']
    done = minibatch['d']

    # Calculate the tentative target: Q(s',a)
    target = rewards + (1 - done) * self.gamma * np.amax(self.model.predict(next_states), axis=1)

    # target[done] = rewards[done]

    target_full = self.model.predict(states)

    '''
    batch_size = 32
    np.arange(batch_size)
    array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

    actions:[0 1 2 1 1 2 2 0 0 1 0 2 2 0 1 1 0 0 1 1 0 0 2 2 2 0 1 1 2 2 1 1]
    '''
    
    target_full[np.arange(batch_size), actions] = target

    '''
    Run one training step,ここでエラーが出ている。
    Runs a single gradient update on a single batch of data.
    1つのデータバッチに対して1回の勾配更新を実行します。
    states = [2420. 15.27 8.40000001]

    returns
    スカラーの学習損失（モデルが単一の出力を持ち、メトリクスを持たない場合）、
    またはスカラーのリスト（モデルが複数の出力やメトリクスを持つ場合）。
    model.metrics_namesという属性は、スカラー出力の表示ラベルを与えます。

    model.fitは1つ以上のエポックを学習します。model.train_on_batchは、その名の通り、1つのバッチのみを学習します。
    具体的な例として、10枚の画像を使ってモデルを学習する場合を考えてみましょう。
    model.fitは10枚の画像すべてで学習するので、勾配を5回更新することになります。
    model.train_on_batchは、バッチでしかモデルを与えていないので、勾配の更新は1回です。
    model.train_on_batchは、バッチでモデルを与えるだけなので、バッチサイズが2の場合は、2枚の画像を与えます。
    model.fitがmodel.train_on_batchを呼び出していると仮定すると、model.train_on_batchは複数回、おそらくループで呼び出されることになります。

    '''
    self.model.train_on_batch(states, target_full)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    self.model.load_weights(name)

  def save(self, name):
    self.model.save_weights(name)

In [7]:
def play_game(env, agent , episodes_times = 50, mode = 'test', batch_size = 32):
    if mode == 'test':
        df_rec = pd.DataFrame(index=[], columns=['FixedProfit','TradeTimes','TradeWin'])
    else:
        df_rec = pd.DataFrame(index=[], columns=['FixedProfit'])

    for episode in range(episodes_times):
        state = env.reset()
        state = scaler.transform([state])
        done = False
        start_time = datetime.now()
       
        while not done:
            #乱数で1,2,3を出力
            # action = agent.act(state)
            # action = np.random.randint(3, size=1)[0]

            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = scaler.transform([next_state])

            if mode == 'train':
                agent.update_replay_memory(state, action, reward, next_state, done)
                agent.replay(batch_size)
            
        play_time = datetime.now() - start_time
        if mode == 'test':
            record = pd.Series([info['cur_revenue'],info['trade_time'],info['trade_win']], index=df_rec.columns)
            print(f"Episode: {episode + 1}/{episodes_times} RapTime: {play_time} FixedProfit: {info['cur_revenue']:.0f} TradeTimes: {info['trade_time']} TradeWin: {info['trade_win']}")
        else:
            record = pd.Series(info['cur_revenue'], index=df_rec.columns)
            print(f"Episode: {episode + 1}/{episodes_times} RapTime: {play_time} FixedProfit: {info['cur_revenue']:.0f}")

        
        state = next_state
        df_rec = df_rec.append(record, ignore_index=True)
    return df_rec

In [9]:
initial_money=1000000
episodes_times = 100
batch_size = 32
mode = 'test'
state_size = 3
action_size = 3
brain = Brain()
agent = Agent(state_size, action_size, brain)
# テストの場合は上書き処理
if mode == 'test':
  # then load the previous scaler
  with open(f'{models_folder}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

  # make sure epsilon is not 1!
  # no need to run multiple episodes if epsilon = 0, it's deterministic
  agent.epsilon = 0.01

  # load trained weights
  agent.load(f'{models_folder}/dqn.h5')

env = Environment(df, initial_money=initial_money, mode = mode)
scaler = get_scaler(env)
df_rec = play_game(env, agent , episodes_times = episodes_times, mode = mode,batch_size = batch_size)
'''
print(df_rec.describe())
if mode == 'test':
    df_rec = df_rec.drop(['TradeTimes','TradeWin'], axis=1)
df_rec.to_csv(csv_path)
df_rec.plot()
plt.show()
'''
# save the weights when we are done
if mode == 'train':
  # save the DQN
  agent.save(f'{models_folder}/dqn.h5')

    # save the scaler
  with open(f'{models_folder}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

df_rec.to_csv(csv_path)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 3)                 12        
_________________________________________________________________
re_lu_2 (ReLU)               (None, 3)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 12        
_________________________________________________________________
re_lu_3 (ReLU)               (None, 3)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 12        
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________
None
Episode: 1/100 RapTime: 0:00:34.126828 FixedProfit: 1010732 TradeTimes: 5 TradeWin: 2
Episode: 2/100 RapTime: 0:00:34.4

KeyboardInterrupt: ignored