<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/A2C/a2c_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from datetime import datetime
import random
import copy
import pickle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

from dataclasses import dataclass

from tensorflow.keras.utils import Progbar

import math

from google.colab import drive

# --- Run configuration -------------------------------------------------------
# `mode` selects the dataset split and, further down, whether weights are
# loaded or trained; `name` is the stem used for model and CSV artifacts.
mode = 'test'
name = 'a2c'
level = 1
if level == 2:
    # NOTE(review): this produces 'a2ca2clv2' (the stem is repeated), which
    # looks unintended. Kept as-is because previously saved artifacts may
    # already carry that name -- confirm before changing it.
    name = name + name + 'lv2'

# Google Drive must be mounted before any of the paths below can be read.
drive.mount('/content/drive/')

# Input dataset location.
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = ''.join(['/content/drive/My Drive/', nov_dir, f'sp500_{mode}.csv'])

# Output locations: saved models and the per-episode results CSV.
exp_dir = 'Colab Notebooks/workspace/export/'
mdl_dir = ''.join(['/content/drive/My Drive/', exp_dir, 'models'])
csv_path = ''.join(['/content/drive/My Drive/', exp_dir, f'csv_data/{name}_{mode}.csv'])

# Load the price table and parse the 'Date' column into datetimes.
df = pd.read_csv(nov_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d')
)

Mounted at /content/drive/


In [None]:
class Environment:
    """Single-asset trading simulator driven by the 'SP500' column of a table.

    Actions follow ``action_space``: 0 = buy, 1 = hold, 2 = sell.
    A buy spends as much cash as possible on whole units at the current
    price; a sell closes the whole position. On the final step the position
    is always liquidated, whatever action was requested.

    The observation is ``[units held, current price, cash in hand]``.

    Args:
        df: table with an 'SP500' price column; rows containing NaN are
            dropped and the index is rebuilt.
        initial_money: cash available at the start of every episode.
        mode: stored on the instance; not read by this class.
        commission: proportional fee charged on buys and on regular sells
            (the forced liquidation on the last step charges none).
    """

    def __init__(self, df, initial_money=100000, mode = 'test', commission = 0):

        self.df = df.dropna().reset_index()

        self.df_total_steps  = len(self.df)-1
        self.initial_money   = initial_money
        self.mode            = mode
        self.commission      = commission
        self.trade_time      = None
        self.trade_win       = None
        self.brfore_buy_cash = None
        self.action_space    = np.array([0, 1, 2]) # buy,hold,sell
        self.hold_a_position = None
        self.now_price       = None
        self.cash_in_hand    = None
        self.sell_price      = None
        self.buy_price       = None

        self.reset()

    def reset(self):
        """Rewind to the first row, restore the starting cash and return the state."""
        self.trade_time      = 0
        self.trade_win       = 0
        self.brfore_buy_cash = 0
        self.end_step        = self.df_total_steps
        self.now_step        = 0
        self.hold_a_position = 0.0
        self.now_price       = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand    = self.initial_money
        self.sell_price      = 0
        self.buy_price       = 0

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute ``action`` and report the outcome.

        Returns:
            ``(state, reward, done, info)``. ``reward`` is the relative
            profit of a round trip, ``(sell - buy) / buy``, emitted on the
            step where a regular sell happens and 0 otherwise (the forced
            liquidation on the last step does not produce a reward).
            ``info`` holds 'cur_revenue', 'trade_time' and 'trade_win'.
        """
        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        self.sell_price = 0
        self._trade(action, done)

        reward = 0
        if self.sell_price > 0:
            if self.buy_price > 0:
                reward = (self.sell_price - self.buy_price) / self.buy_price
            # The position is closed, so its cost basis must always be
            # cleared. Previously a break-even sell left it in place and the
            # next purchase was added on top of it, skewing later rewards.
            self.buy_price = 0

        info = {
            'cur_revenue': self._get_revenue(),
            'trade_time': self.trade_time,
            'trade_win': self.trade_win,
        }

        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return ``[units held, current price, cash in hand]`` as a float array."""
        state = np.empty(3)
        state[0] = self.hold_a_position
        state[1] = self.now_price
        state[2] = self.cash_in_hand
        return state

    def _get_revenue(self):
        """Return the portfolio value: position marked at the current price plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action, lastorder = False):
        """Apply ``action`` to the portfolio; liquidate instead when ``lastorder``."""
        if lastorder:
            if self.hold_a_position != 0:
                self.cash_in_hand += self.now_price * self.hold_a_position
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1
            return

        if self.action_space[0] == action: # buy
            if self.hold_a_position == 0:
                self.brfore_buy_cash = self.cash_in_hand
                # Cash needed per unit, fee included. With commission == 0
                # this equals the price, i.e. the same check as before; with
                # a fee it prevents the cash balance from going negative.
                unit_cost = self.now_price + self.commission * self.now_price
                # A non-positive price would never reduce the cash balance
                # and the loop would not terminate, so such rows are skipped.
                while self.now_price > 0 and self.cash_in_hand > unit_cost:
                    self.hold_a_position += 1
                    self.buy_price += self.now_price
                    self.cash_in_hand -= unit_cost

        if self.action_space[2] == action: # sell
            if self.hold_a_position != 0:
                proceeds = self.now_price * self.hold_a_position
                self.sell_price += proceeds
                self.cash_in_hand += proceeds - self.commission * proceeds
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.brfore_buy_cash:
                    self.trade_win += 1

In [None]:
class Brain:
    """Builds the shared actor-critic network and exposes weight (de)serialisation.

    The network takes a 3-feature state, passes it through one 12-unit ReLU
    layer and branches into a 3-way softmax policy head and a scalar linear
    value head. The compiled model is stored on the class itself
    (``Brain.model``), so every subclass instance works on the same network;
    constructing another ``Brain`` replaces it.
    """

    def __init__(self):

        opt = Adam(learning_rate=0.0001, epsilon=0.001)

        # `shape` is given as a tuple: a bare int is not a valid shape in
        # newer Keras releases, while the tuple form works everywhere.
        input_ = Input(shape=(3,))
        common = Dense(12, activation="relu")(input_)

        actor = Dense(3, activation="softmax")(common)
        critic = Dense(1, activation="linear")(common)

        model = Model(input_, [actor, critic])
        # Compiled only to attach the optimizer; the loss is computed
        # manually by the code that trains the model.
        model.compile(optimizer = opt)
        model.summary()
        Brain.model = model

    def load(self, name):
        """Load network weights from the file path ``name``."""
        Brain.model.load_weights(name)

    def save(self, name):
        """Write network weights to the file path ``name``."""
        Brain.model.save_weights(name)

In [None]:
class Actor(Brain):
    """Action selector that samples from the policy head of ``Brain.model``."""

    def __init__(self):
        # Constructing the actor is what builds the shared network.
        super().__init__()

    def policynetwork(self, state):
        """Sample an action index (0, 1 or 2) for a single ``state`` vector."""
        single_batch = state.reshape((1, -1))
        probabilities, _value = Brain.model(single_batch)
        first_row = probabilities[0].numpy()
        return np.random.choice(3, p=first_row)

In [None]:
class Critic(Brain):
    """Performs the gradient update of the shared actor-critic network.

    Attributes:
        gamma: discount factor applied to the next-state value.
        beta: weight of the regularisation term returned by ``_entropy``.
    """

    def __init__(self):
        # Brain.__init__ is intentionally not called: it would build a new
        # network and replace Brain.model. This class therefore requires
        # that Brain.model already exists (e.g. an Actor was created first).
        self.gamma = 0.997
        self.beta  = 0.1

    def valuenetwork(self, val):
        """Run one optimisation step on a mini-batch.

        Args:
            val: mapping with keys 'state', 'next_state', 'act', 'reward'
                and 'done'; each value holds one entry per sample.
        """
        states, next_states = val['state'], val['next_state']

        # one_hot needs integer indices; the replay memory stores them as floats.
        actions = tf.cast(val['act'], tf.int32)
        onehot_actions = tf.one_hot(actions, 3)

        with tf.GradientTape() as tape:

            act_p, v = Brain.model(states, training=True)
            _, next_v = Brain.model(next_states, training=True)

            # Rewards and done flags arrive as flat (batch,) vectors while
            # the value head returns (batch, 1). Without the reshape the
            # arithmetic below broadcasts to (batch, batch) and combines
            # rewards and values of different samples.
            rewards = tf.reshape(tf.cast(val['reward'], v.dtype), (-1, 1))
            dones = tf.reshape(tf.cast(val['done'], v.dtype), (-1, 1))

            a_pi = tf.reduce_sum(onehot_actions * act_p, axis=1, keepdims=True)
            a_pi = tf.clip_by_value(a_pi, 1e-10, 1.0)

            # One-step TD target; terminal transitions do not bootstrap.
            q = rewards + (1 - dones) * self.gamma * next_v
            # NOTE(review): gradients flow through `next_v` and through the
            # advantage used in the policy term (no stop_gradient) -- confirm
            # that this is intended.
            advantage = q - v

            value_losses = self._value_losses(advantage)
            policy_losses = self._policy_losses(advantage, a_pi, v)
            total_loss = value_losses + policy_losses
            loss = tf.reduce_mean(total_loss)

        gradients = tape.gradient(loss, Brain.model.trainable_variables)
        Brain.model.optimizer.apply_gradients(zip(gradients, Brain.model.trainable_variables))

    def _value_losses(self, advantage):
        """Return the squared advantage (value-head regression loss)."""
        return (advantage)**2

    def _policy_losses(self, advantage, a_pi, v):
        """Return ``-(log(pi(a|s)) * advantage + regulariser)``."""
        a = tf.math.log(a_pi) * advantage
        b = self._entropy(v)
        policy_losses = - ( a + b )
        return policy_losses

    def _entropy(self, v):
        """Return ``beta * 0.5 * (log(2 * pi * var(v)) + 1)``.

        NOTE(review): this is computed from the spread of the value
        predictions, not from the action probabilities -- verify that this
        is the regulariser that was meant.
        """
        sigma = tf.math.reduce_std(v)
        sigma = tf.math.square(sigma)
        # Identical predictions give zero variance; the floor keeps log() finite.
        sigma = tf.maximum(sigma, 1e-10)
        entropy = self.beta*0.5*(tf.math.log(2 * math.pi * sigma) + 1)
        return entropy

In [None]:
@dataclass
class ExperiencesMemory:
    """Append-only buffer of transitions with random mini-batch sampling.

    Every array field defaults to ``None`` and is replaced by an empty array
    in ``__post_init__``. Array instances are not used as field defaults
    because Python 3.11+ rejects unhashable dataclass defaults, and a default
    instance would be shared by every object of the class.
    """

    state : np.ndarray = None       # (n, 3) states
    next_state : np.ndarray = None  # (n, 3) successor states
    action : np.ndarray = None      # (n,) action indices
    reward : np.ndarray = None      # (n,) rewards
    done : np.ndarray = None        # (n,) episode-end flags
    batch_size : int = 32

    def __post_init__(self):
        # Give each instance its own empty buffers for fields not supplied.
        if self.state is None:
            self.state = np.empty((0,3), int)
        if self.next_state is None:
            self.next_state = np.empty((0,3), int)
        if self.action is None:
            self.action = np.array([])
        if self.reward is None:
            self.reward = np.array([])
        if self.done is None:
            self.done = np.array([])

    def reset_experiences(self):
        """Discard every stored transition."""
        self.state = np.empty((0,3), int)
        self.next_state = np.empty((0,3), int)
        self.action = np.array([])
        self.reward = np.array([])
        self.done = np.array([])

    def set_experiences(self, state, next_state, action, reward, done):
        """Append one transition; ``state`` and ``next_state`` must hold 3 values."""
        self.state = np.append(self.state, np.reshape(state, [1, 3]), axis=0)
        self.next_state = np.append(self.next_state, np.reshape(next_state, [1, 3]), axis=0)
        self.action = np.append(self.action, np.array(action))
        self.reward = np.append(self.reward, np.array(reward))
        self.done = np.append(self.done, np.array(done))

    def get_experiences(self):
        """Return ``batch_size`` distinct transitions picked at random.

        Returns:
            dict with keys 'state', 'next_state', 'act', 'reward', 'done'.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored
                (raised by ``np.random.choice`` with ``replace=False``).
        """
        mb_index = np.random.choice(len(self.action), self.batch_size, replace=False)
        return {
            'state': self.state[mb_index],
            'next_state': self.next_state[mb_index],
            'act': self.action[mb_index],
            'reward': self.reward[mb_index],
            'done': self.done[mb_index],
        }

    def isGetter(self):
        """Return True once strictly more than ``batch_size`` transitions are stored."""
        return len(self.action) > self.batch_size

In [None]:
class Main:
    """Runs episodes of the agent against the environment and records results.

    Args:
        env: environment exposing ``reset()``, ``step(action)``,
            ``action_space`` and ``df_total_steps``.
        actor: object with ``policynetwork(state)``, ``load(path)`` and
            ``save(path)``.
        critic: object with ``valuenetwork(batch)``.
        experiences: transition buffer (see the calls in ``play_game``).
        mdl_dir: directory holding the ``<name>.h5`` weights and the
            ``<name>.pkl`` scaler.
        name: file stem for the saved artifacts.
        episodes_times: number of episodes to play.
        mode: 'train' enables learning and saving; 'test' loads the saved
            weights and scaler in the constructor.
    """

    def __init__(self, env, actor, critic, experiences, mdl_dir, name, episodes_times = 1000, mode = 'test'):
        self.env = env
        self.actor = actor
        self.critic = critic
        self.experiences = experiences
        self.mdl_dir = mdl_dir
        self.scaler = self._standard_scaler(self.env)
        self.episodes_times = episodes_times
        self.mode = mode
        self.name = name
        self.df_rec = pd.DataFrame(index=[], columns=['FixedProfit','TradeTimes','TradeWin'])

        if self.mode == 'test':
            # Replaces the freshly fitted scaler with the saved one.
            self._load()

    def play_game(self):
        """Play ``episodes_times`` episodes, then persist the results.

        One row per episode is added to ``df_rec``. Progress is shown in
        chunks of ten episodes, each bar displaying running means over the
        chunk. In 'train' mode the weights and scaler are saved at the end;
        the results CSV is written in every mode.
        """
        metrics_names = ['FixedProfit','TradeTimes','TradeWin']

        for episode in range(self.episodes_times):

            if episode % 10 == 0:
                # A new chunk starts: fresh progress bar and fresh averages.
                pb_i = Progbar(self._progbar_target(episode), stateful_metrics=metrics_names)
                p_mean, trade_time, win_time = [], [], []

            state = self.scaler.transform([self.env.reset()]).flatten()
            done = False
            self.experiences.reset_experiences()

            while not done:

                action = self.actor.policynetwork(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = self.scaler.transform([next_state]).flatten()
                reward = self._reward_clipping(reward)

                # Use the instance's mode; the module-level `mode` variable
                # may differ from what this object was constructed with.
                if self.mode == 'train':
                    self.experiences.set_experiences(state, next_state, action, reward, done)
                    if self.experiences.isGetter():
                        m_batch = self.experiences.get_experiences()
                        self.critic.valuenetwork(m_batch)
                        self.experiences.reset_experiences()

                state = next_state

            self._record_episode(info)
            p_mean.append(info['cur_revenue'])
            trade_time.append(info['trade_time'])
            win_time.append(info['trade_win'])
            values = [('FixedProfit', int(np.mean(p_mean))),
                      ('TradeTimes', int(np.mean(trade_time))),
                      ('TradeWin', int(np.mean(win_time)))]
            pb_i.add(1, values=values)

        if self.mode == 'train':
            self._save()
        self._save_csv()

    def _progbar_target(self, episode):
        """Return the length of the progress chunk that starts at ``episode``.

        Ten, except for a shorter final chunk. Computed arithmetically so
        that runs with fewer than ten episodes work as well.
        """
        return min(10, self.episodes_times - episode)

    def _record_episode(self, info):
        """Append the final ``info`` of an episode as a new row of ``df_rec``."""
        # Row assignment is used because DataFrame.append no longer exists
        # in pandas 2.x.
        self.df_rec.loc[len(self.df_rec)] = [
            info['cur_revenue'], info['trade_time'], info['trade_win']
        ]

    def _standard_scaler(self, env):
        """Fit a StandardScaler on the states of one random-action episode."""
        # Start from the first row so a previously used env can be passed in.
        env.reset()
        states = []
        for _ in range(env.df_total_steps):
            action = np.random.choice(env.action_space)
            state, reward, done, info = env.step(action)
            states.append(state)
            if done:
                break

        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def _reward_clipping(self, val):
        """Return the sign of ``val``: 1, 0 or -1."""
        if val > 0:
            return 1
        if val == 0:
            return 0
        return -1

    def _load(self):
        """Restore the pickled scaler and the network weights from ``mdl_dir``."""
        # NOTE: unpickling can execute arbitrary code; only load scaler files
        # that come from a trusted source.
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'rb') as f:
            self.scaler = pickle.load(f)
        self.actor.load('{}/{}.h5'.format(self.mdl_dir, self.name))

    def _save(self):
        """Write the network weights and the pickled scaler to ``mdl_dir``."""
        self.actor.save('{}/{}.h5'.format(self.mdl_dir, self.name))
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'wb') as f:
            pickle.dump(self.scaler, f)

    def _save_csv(self):
        """Write ``df_rec`` to the path held in the module-level ``csv_path``."""
        self.df_rec.to_csv(csv_path)

In [None]:
# --- Experiment settings -----------------------------------------------------
initial_money = 1000000
episodes_times = 100
batch_size = 32
# Level 1 trades without fees; any other level pays a 0.2 % commission.
if level == 1:
    commission = 0
else:
    commission = 0.002

# The actor is created before the critic because constructing it builds the
# shared network that the critic updates.
actor = Actor()
critic = Critic()
experiences = ExperiencesMemory(batch_size=batch_size)
env = Environment(
    df,
    initial_money=initial_money,
    mode=mode,
    commission=commission,
)
main = Main(
    env=env,
    actor=actor,
    critic=critic,
    experiences=experiences,
    mdl_dir=mdl_dir,
    name=name,
    episodes_times=episodes_times,
    mode=mode,
)
main.play_game()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 12)           48          input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 3)            39          dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 1)            13          dense[0][0]                      
Total params: 100
Trainable params: 100
Non-trainable params: 0
______________________________