<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/a2c_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0-1. インストール

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from datetime import datetime
import random
import copy

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

from google.colab import drive

# Mount Google Drive so files under "My Drive" are readable from this Colab runtime.
drive.mount('/content/drive/')
# Build the dataset path from the Drive mount point and the dataset directory.
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + 'sp500_test.csv'
# NOTE(review): the bare triple-quoted string below is disabled code (an export
# path); it is evaluated as a no-op expression and defines no variables.
'''
exp_dir = 'Colab Notebooks/workspace/export/'
csv_path = '/content/drive/My Drive/' + exp_dir + 'random_test.csv'
'''

# Load the price table and parse the 'Date' column (YYYY-MM-DD) into datetimes.
# NOTE(review): the file is named sp500_test.csv although the run cell uses
# mode = 'train' -- confirm this is the intended dataset.
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Mounted at /content/drive/


In [2]:
def make_scaler(env):
    """Fit a ``StandardScaler`` on the states visited by one random-action episode.

    The environment is reset before sampling so the function does not depend on
    where a previous cell left it. Without the reset, calling this on an
    environment that was already stepped would walk past the last data row.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``action_space`` and ``df_total_steps``.

    Returns:
        A ``StandardScaler`` fitted on the states returned by ``env.step``.
        The environment is left at the end of the sampled episode; callers
        must ``reset()`` it before reuse.
    """
    # Start from the first row regardless of any earlier use of `env`.
    env.reset()

    states = []
    for _ in range(env.df_total_steps):
        action = np.random.choice(env.action_space)
        state, _reward, done, _info = env.step(action)
        states.append(state)
        if done:
            break

    scaler = StandardScaler()
    scaler.fit(states)
    return scaler

In [3]:
class Environment:
    """Single-asset trading simulator driven by the 'SP500' column of a DataFrame.

    The state is ``[shares held, current price, cash in hand]``. The actions are
    the entries of ``action_space``: ``0`` sells the whole position, ``2`` buys
    as many whole shares as the cash allows, and any other value holds. The
    reward of a step is the change in total revenue (position value + cash).

    Args:
        df: Price table; rows containing NaN are dropped and the index is reset.
        initial_money: Cash available at the start of every episode.
        mode: ``'test'`` additionally tracks the number of closed trades and
            how many of them ended with more cash than before the buy.
    """

    def __init__(self, df, initial_money=100000, mode = 'test'):

        self.df = df.dropna().reset_index()
        # The last valid row index; an episode ends when now_step reaches it.
        self.df_total_steps = len(self.df)-1
        self.initial_money = initial_money
        self.mode = mode
        self.trade_time = None
        self.trade_win = None
        self.brfore_buy_cash = None
        self.action_space = np.array([0, 1, 2])
        self.hold_a_position = None
        self.now_price = None
        self.cash_in_hand = None

        self.reset()

    def reset(self):
        """Rewind to the first row with no position and return the initial state."""

        self.trade_time = 0
        self.trade_win = 0
        self.brfore_buy_cash = 0
        self.end_step = self.df_total_steps
        self.now_step = 0
        self.hold_a_position = 0.0
        self.now_price = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand = self.initial_money

        return self._get_now_state()

    def step(self, action):
        """Advance one row, apply ``action`` at the new price and score it.

        Returns:
            ``(state, reward, done, info)`` where ``reward`` is the revenue
            change over the step and ``info`` holds ``cur_revenue`` (plus
            ``trade_time`` / ``trade_win`` in ``'test'`` mode).
        """

        # Revenue is measured at the previous price before the clock moves.
        prev_revenue = self._get_revenue()
        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        # On the final step the position is force-closed instead of trading.
        self._trade(action,done)
        cur_revenue = self._get_revenue()

        reward = cur_revenue - prev_revenue

        if self.mode == 'test':
            info = { 'cur_revenue' : cur_revenue , 'trade_time' : self.trade_time, 'trade_win' : self.trade_win }
        else:
            info = { 'cur_revenue' : cur_revenue }

        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return ``[shares held, current price, cash in hand]`` as a float array."""
        state = np.empty(3)
        state[0] = self.hold_a_position
        state[1] = self.now_price
        state[2] = self.cash_in_hand
        return state

    def _get_revenue(self):
        """Return the position value at the current price plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action,lastorder = False):
        """Apply a sell / hold / buy decision at the current price.

        Args:
            action: One of ``action_space``; ignored when ``lastorder`` is set.
            lastorder: When true, liquidate the whole position. This forced
                close is not counted in ``trade_time`` / ``trade_win``.
        """
        if lastorder:
            self.cash_in_hand += self.now_price * self.hold_a_position
            self.hold_a_position = 0
            return

        if self.action_space[0] == action: # sell
            if self.hold_a_position != 0:
                self.cash_in_hand += self.now_price * self.hold_a_position
                self.hold_a_position = 0
                if self.mode == 'test':
                    self.trade_time += 1
                    # A trade wins when it ends with more cash than before the buy.
                    if self.cash_in_hand > self.brfore_buy_cash:
                        self.trade_win += 1
        elif self.action_space[2] == action: # buy
            if self.hold_a_position == 0:
                if self.mode == 'test':
                    self.brfore_buy_cash = self.cash_in_hand
                # Buy one share at a time while cash strictly exceeds the price.
                # The price guard prevents an endless loop on a zero or
                # negative quote, where cash would never drop below the price.
                while self.now_price > 0 and self.cash_in_hand > self.now_price:
                    self.hold_a_position += 1
                    self.cash_in_hand -= self.now_price

## 2-3. A3C/A2C

In [4]:
class Brain:
    """Builds and owns the shared actor-critic Keras model.

    The network takes a 3-feature state, passes it through two 10-unit ReLU
    layers and branches into a linear policy head (``n_action`` logits) and a
    linear value head (one scalar). The model is compiled with an Adam
    optimizer only, with no loss attached.

    Args:
        n_action: Number of outputs of the policy head.
    """

    def __init__(self,n_action = 3):

        n_shape = 3  # number of state features fed to the network
        lr = 0.01  # learning rate

        # `shape` must be a tuple of per-sample dimensions, not a bare int.
        c = input_ = keras.layers.Input(shape=(n_shape,))
        c = keras.layers.Dense(10, activation="relu")(c)
        c = keras.layers.Dense(10, activation="relu")(c)
        actor_layer = keras.layers.Dense(n_action, activation="linear")(c)
        critic_layer = keras.layers.Dense(1, activation="linear")(c)

        model = keras.Model(input_, [actor_layer, critic_layer])
        # `lr` is deprecated in favour of `learning_rate`.
        model.compile(optimizer=Adam(learning_rate=lr))
        model.summary()
        self.model = model

    def predict(self, state):
        """Return the model's ``predict`` output for ``state``."""
        return self.model.predict(state)

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)

In [5]:
class Actor:
    """Draws actions from the policy output of a two-headed model."""

    def __init__(self,model):
        self.model = model

    def policynetwork(self, state, n_action):
        """Sample one action index for ``state``.

        Args:
            state: Array of state features; it is reshaped to a batch of one.
            n_action: Number of actions to choose from.

        Returns:
            The sampled action index, drawn with the softmax of the model's
            first output as probabilities.
        """
        single_batch = state.reshape((1, -1))
        logits, _value = self.model(single_batch)
        # Softmax over the logits of the single batch row gives the sampling weights.
        action_probs = tf.nn.softmax(logits)[0].numpy()
        drawn = np.random.choice(n_action, 1, p=action_probs)
        return drawn[0]

In [6]:
class Critic:
    """Runs one actor-critic gradient update from a batch of experiences.

    Args:
        brain: Owner of the model (stored but not used by the update itself).
        model: Two-headed model returning ``(action logits, state value)`` and
            carrying a compiled optimizer.
        gamma: Discount rate applied to future rewards.
        value_loss_weight: Weight of the value-head squared error in the loss.
        entropy_weight: Weight of the policy-entropy bonus in the loss.
    """

    def __init__(self,brain,model, gamma=0.9, value_loss_weight=0.5, entropy_weight=0.1):
        self.brain = brain
        self.model = model
        self.gamma = gamma
        self.value_loss_weight = value_loss_weight
        self.entropy_weight = entropy_weight

    def valuenetwork(self, experiences):
        """Update the model once from ``experiences``.

        Args:
            experiences: Time-ordered list of dicts with the keys ``state``,
                ``action``, ``reward``, ``next_state`` and ``done``. An empty
                list is a no-op.
        """
        if not experiences:
            return

        gamma = self.gamma
        # Seed the return that follows the last transition of the batch.
        if experiences[-1]["done"]:
            # The batch ends on a terminal step: nothing follows it.
            G = 0
        else:
            # Otherwise bootstrap from the value head's estimate of the next state.
            next_state = np.atleast_2d(experiences[-1]["next_state"])
            _, n_v = self.model(next_state)
            G = n_v[0][0].numpy()

        # Accumulate discounted returns from the back of the batch, restarting
        # at every episode boundary inside the batch.
        discounted_rewards = []
        for exp in reversed(experiences):
            if exp["done"]:
                G = 0
            G = exp["reward"] + gamma * G
            discounted_rewards.append(G)
        discounted_rewards.reverse()

        # Shape (batch_size, 1); float64 so the in-place mean subtraction below
        # also works when the rewards are integers.
        discounted_rewards = np.asarray(discounted_rewards, dtype=np.float64).reshape((-1, 1))
        # Baseline: subtract the batch mean of the returns.
        discounted_rewards -= np.mean(discounted_rewards)

        state_batch = np.asarray([e["state"] for e in experiences])
        action_batch = np.asarray([e["action"] for e in experiences])

        with tf.GradientTape() as tape:

            action_eval, v = self.model(state_batch, training=True)
            returns = tf.cast(discounted_rewards, v.dtype)

            # Take the number of actions from the policy head itself instead of
            # relying on a notebook-level global.
            n_action = action_eval.shape[-1]
            onehot_actions = tf.one_hot(action_batch, n_action, dtype=action_eval.dtype)

            # pi(a|s) of the action that was actually taken.
            action_probs = tf.nn.softmax(action_eval)
            selected_action_probs = tf.reduce_sum(onehot_actions * action_probs, axis=1, keepdims=True)

            # Advantage: v is used as a constant here, so it is excluded from
            # the policy gradient with stop_gradient.
            advantage = returns - tf.stop_gradient(v)

            # log(pi(a|s)) * A(s, a); clip so the log never sees 0.
            selected_action_probs = tf.clip_by_value(selected_action_probs, 1e-10, 1.0)
            policy_loss = tf.math.log(selected_action_probs) * advantage

            # Value loss: squared error between the returns and the value head.
            value_loss = tf.reduce_mean((returns - v) ** 2, axis=1, keepdims=True)

            # Policy entropy H = -sum_a pi(a|s) * log pi(a|s) over ALL actions.
            # The earlier version summed p*log(p) of the chosen action only and,
            # given its sign, ended up penalising exploration.
            log_probs = tf.nn.log_softmax(action_eval)
            entropy = -tf.reduce_sum(action_probs * log_probs, axis=1, keepdims=True)

            # Per-sample loss: maximise the policy objective and the entropy,
            # minimise the value error; then average over the batch.
            loss = -policy_loss + self.value_loss_weight * value_loss - self.entropy_weight * entropy
            loss = tf.reduce_mean(loss)

        # Apply the gradients with the optimizer attached to the model.
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

In [7]:
def play_game(env, actor,critic, scaler, episodes_times = 100, batch_size = 32, n_action = 3, mode = 'train'):
    """Run episodes of the environment and, in training mode, update the model.

    Args:
        env: Environment with ``reset()`` and ``step(action)``.
        actor: Object whose ``policynetwork(state, n_action)`` picks an action.
        critic: Object whose ``valuenetwork(experiences)`` performs an update.
        scaler: Fitted scaler; ``transform`` is applied to every state.
        episodes_times: Number of episodes to play.
        batch_size: Number of transitions collected before each update.
        n_action: Number of actions passed to the actor.
        mode: Updates are performed only when this is ``'train'``.

    Returns:
        List with the total reward of every episode, in order.
    """
    experiences = []
    episode_rewards = []

    for episode in range(episodes_times):
        state = scaler.transform([env.reset()]).flatten()
        done = False
        total_reward = 0
        step = 0

        while not done:

            action = actor.policynetwork(state, n_action)
            next_state, reward, done, _ = env.step(action)
            next_state = scaler.transform([next_state]).flatten()

            total_reward += reward

            if mode == 'train':
                experiences.append({"state": state, "action": action, "reward": reward, "next_state": next_state, "done": done,})
                # The buffer deliberately spans episode boundaries; the critic
                # receives the per-transition `done` flags.
                if len(experiences) >= batch_size:
                    critic.valuenetwork(experiences)
                    experiences = []

            state = next_state
            step += 1

        episode_rewards.append(total_reward)
        if episode % 10 == 0:
            print("{}/{}, {} step, reward: {}".format(episode, episodes_times, step, np.round(np.mean(episode_rewards[-10:]), decimals=2)))

    # Train on the final partial batch instead of silently discarding it.
    if mode == 'train' and experiences:
        critic.valuenetwork(experiences)

    return episode_rewards

### 学習の実行

In [8]:
# Run configuration.
initial_money=1000000
mode = 'train'

# Build the environment first so the size of the action set comes from a
# single place; `n_action` was previously read without being defined in any cell.
env = Environment(df, initial_money = initial_money, mode=mode)
n_action = len(env.action_space)

brain = Brain(n_action)
model = brain.model

actor = Actor(model)
critic = Critic(brain,model)

# Evaluation reuses the weights written by an earlier training run.
if mode == 'test':
    brain.load('a2c_model.h5')

# Fit the state scaler on a random-action pass over the environment.
scaler = make_scaler(env)

play_game(env, actor, critic, scaler, n_action = n_action, mode = mode)

if mode == 'train':
    brain.save('a2c_model.h5')

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 10)           40          input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 10)           110         dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 3)            33          dense_1[0][0]                    
______________________________________________________________________________________________