<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/Double_Q-Learning/w_q_learning_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy

from datetime import datetime
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from tensorflow.keras.optimizers import RMSprop
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.utils import Progbar

# --- Run configuration -------------------------------------------------------
# `mode` selects which CSV is read (sp500_<mode>.csv) and is part of the output
# CSV name; `name` is the base name used for the exported files.
mode = 'train'
name = 'w_qlearning'
level = 1
if level == 2:
    # Append the suffix once. The previous `name += name + 'lv2'` duplicated
    # the base name ('w_qlearningw_qlearninglv2').
    name += 'lv2'

# --- Input data on Google Drive ----------------------------------------------
drive.mount('/content/drive/')
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = '/content/drive/My Drive/' + nov_dir + f'sp500_{mode}.csv'

# --- Export locations (model weights / scaler, per-episode CSV) ---------------
exp_dir = 'Colab Notebooks/workspace/export/'
mdl_dir = '/content/drive/My Drive/' + exp_dir + 'models'
csv_path = '/content/drive/My Drive/' + exp_dir + f'csv_data/{name}_{mode}.csv'

# Load the price table and parse the 'Date' column as YYYY-MM-DD datetimes.
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
class Environment:
    """Single-asset trading simulator driven by the ``SP500`` column of ``df``.

    The observation is a length-3 array ``[shares held, current price, cash]``.
    Actions are ``0`` (buy), ``1`` (hold) and ``2`` (sell). The reward returned
    by :meth:`step` is the change in total asset value (shares * price + cash)
    between the previous and the current step.

    Args:
        df: Price table containing an ``SP500`` column. Rows with missing
            values are dropped and the index is reset.
        initial_money: Cash available after every :meth:`reset`.
        mode: Stored on the instance; not used by the environment itself.
        commission: Proportional fee charged on buys and on agent-initiated
            sells (``commission * price`` per share).
    """

    def __init__(self, df, initial_money=100000, mode = 'test', commission = 0):
        # reset_index() gives a 0..n-1 index so that .loc[step, ...] works
        # with the integer step counter.
        self.df = df.dropna().reset_index()

        self.df_total_steps = len(self.df) - 1
        self.initial_money = initial_money
        self.mode = mode
        self.commission = commission
        self.action_space = np.array([0, 1, 2])  # buy,hold,sell

        # Per-episode state; real values are assigned in reset().
        self.trade_time = None
        self.trade_win = None
        self.brfore_buy_cash = None
        self.hold_a_position = None
        self.now_price = None
        self.cash_in_hand = None
        self.sell_price = None
        self.buy_price = None

        self.reset()

    def reset(self):
        """Start a new episode at step 0 and return the initial state."""
        self.trade_time = 0
        self.trade_win = 0
        self.brfore_buy_cash = 0
        self.end_step = self.df_total_steps
        self.now_step = 0
        self.hold_a_position = 0.0
        self.now_price = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand = self.initial_money

        return self._get_now_state()

    def step(self, action):
        """Advance one row, apply ``action`` and return ``(state, reward, done, info)``.

        ``done`` becomes True on the last row; on that step the action is
        ignored and any open position is liquidated instead.
        ``info`` carries ``cur_revenue``, ``trade_time`` and ``trade_win``.
        """
        prev_revenue = self._get_revenue()
        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        self._trade(action, done)
        cur_revenue = self._get_revenue()

        reward = cur_revenue - prev_revenue

        info = {
            'cur_revenue': cur_revenue,
            'trade_time': self.trade_time,
            'trade_win': self.trade_win,
        }

        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return ``[shares held, current price, cash]`` as a float array."""
        state = np.empty(3)
        state[0] = self.hold_a_position
        state[1] = self.now_price
        state[2] = self.cash_in_hand
        return state

    def _get_revenue(self):
        """Return total asset value: shares valued at the current price plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action, lastorder = False):
        """Apply ``action``; with ``lastorder`` the position is closed instead."""
        if lastorder:
            # Final liquidation: no commission is charged and it is always
            # counted as one trade.
            self.cash_in_hand += self.now_price * self.hold_a_position
            self.hold_a_position = 0
            self.trade_time += 1
            if self.cash_in_hand > self.brfore_buy_cash:
                self.trade_win += 1
            return

        if self.action_space[0] == action:  # buy
            self._buy_all()
        if self.action_space[2] == action:  # sell
            self._sell_all()

    def _buy_all(self):
        """Buy whole shares one at a time while cash strictly exceeds the per-share cost."""
        if self.hold_a_position != 0:
            return  # already invested; buy is a no-op

        self.brfore_buy_cash = self.cash_in_hand
        unit_cost = self.now_price + self.commission * self.now_price
        if unit_cost <= 0:
            return  # a non-positive cost would make the loop below endless

        # Compare against the full cost including commission. Checking only
        # the bare price allowed the last purchase to drive cash negative
        # whenever commission > 0.
        while self.cash_in_hand > unit_cost:
            self.hold_a_position += 1
            self.cash_in_hand -= unit_cost

    def _sell_all(self):
        """Sell the whole position, pay commission and update the trade counters."""
        if self.hold_a_position == 0:
            return  # nothing to sell

        gross = self.now_price * self.hold_a_position
        self.cash_in_hand += gross - self.commission * self.now_price * self.hold_a_position
        self.hold_a_position = 0
        self.trade_time += 1
        # A trade is a "win" when cash after selling exceeds cash before the buy.
        if self.cash_in_hand > self.brfore_buy_cash:
            self.trade_win += 1

In [3]:
class Brain:
    """Holds two identically shaped Q-value networks and their update rule.

    Each network maps a 3-feature state to 3 action values through
    Dense(3) -> ReLU -> Dense(3) -> ReLU -> Dense(3), compiled with MSE loss
    and the RMSprop optimizer.
    """

    def __init__(self):
        self.gamma = 0.9  # discount factor applied to the bootstrapped value
        self.model = self._make_network()
        self.model_2 = self._make_network()

    @staticmethod
    def _make_network():
        """Build, compile and print the summary of one 3-3-3-3 network."""
        net = Sequential()
        for layer in (
            Dense(3, input_shape=(3,)),
            ReLU(),
            Dense(3),
            ReLU(),
            Dense(3),
        ):
            net.add(layer)
        net.compile(loss="mse", optimizer=RMSprop())
        net.summary()
        return net

    def train(self, state, action, reward, next_state, done, s_flag):
        """Run one update on one of the two networks.

        ``s_flag == 11`` updates ``model`` using ``model_2`` to value the next
        state; any other value updates ``model_2`` using ``model``.
        """
        if s_flag == 11:
            learner, evaluator = self.model, self.model_2
        else:
            learner, evaluator = self.model_2, self.model

        current_q = learner.predict(state)
        following_q = evaluator.predict(next_state)

        # Only the taken action's value is moved towards the new target;
        # the other outputs keep the learner's own predictions.
        target = np.copy(current_q)
        best_following = np.amax(following_q, axis=1)
        target[:, action] = reward + (1 - done) * self.gamma * best_following
        learner.train_on_batch(state, target)

    def _predict(self, state, s_flag = 12):
        """Return 3 action scores computed from the first row of the predictions.

        ``12`` sums both networks, ``11`` doubles ``model``'s output and any
        other value doubles ``model_2``'s output.
        """
        first = self.model.predict(state)[0]
        second = self.model_2.predict(state)[0]

        if s_flag == 12:
            left, right = first, second
        elif s_flag == 11:
            left, right = first, first
        else:
            left, right = second, second
        return np.array([left[idx] + right[idx] for idx in range(3)])

    def load(self, name, name2):
        """Load weights for ``model`` from ``name`` and ``model_2`` from ``name2``."""
        for net, path in ((self.model, name), (self.model_2, name2)):
            net.load_weights(path)

    def save(self, name, name2):
        """Save weights of ``model`` to ``name`` and ``model_2`` to ``name2``."""
        for net, path in ((self.model, name), (self.model_2, name2)):
            net.save_weights(path)

In [4]:
class Agent(Brain):
    """Epsilon-greedy policy on top of :class:`Brain`'s action values.

    Attributes:
        epsilon: Current probability of picking a random action.
        epsilon_min: Floor below which ``epsilon`` is not decayed.
        r: Multiplicative decay applied to ``epsilon`` on each call to ``act``.
    """

    def __init__(self):
        super().__init__()
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.r = 0.995

    def act(self, state, s_flag=12):
        """Choose an action index (0, 1 or 2) for ``state``.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the argmax of ``_predict(state, s_flag)``.
        """
        explore = np.random.rand() <= self.epsilon

        # Decay on every call. Previously the decay sat after the exploration
        # return; with epsilon starting at 1.0 and rand() in [0, 1) that line
        # was never reached, so epsilon stayed at 1.0 and the agent never
        # acted greedily.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.r)

        if explore:
            return np.random.choice(3)
        return np.argmax(self._predict(state, s_flag))

In [5]:
class Main:
    """Runs episodes of ``agent`` against ``env`` and records per-episode results.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``df_total_steps`` and ``action_space``.
        agent: Agent exposing ``act``, ``train``, ``load``, ``save`` and
            ``epsilon``.
        mdl_dir: Directory for the weight files and the pickled scaler.
        name: Base file name used inside ``mdl_dir``.
        episodes_times: Number of episodes played by :meth:`play_game`.
        mode: ``'train'`` trains and saves; ``'test'`` loads saved weights and
            scaler and fixes ``epsilon`` at 0.01.
    """

    def __init__(self, env, agent, mdl_dir, name, episodes_times = 1000, mode = 'test'):
        self.env = env
        self.agent = agent
        self.mdl_dir = mdl_dir
        self.scaler = self._standard_scaler(self.env)
        self.episodes_times = episodes_times
        self.mode = mode
        self.name = name
        self.df_rec = pd.DataFrame(index=[], columns=['FixedProfit','TradeTimes','TradeWin'])

        if self.mode == 'test':
            # Replaces the freshly fitted scaler with the one saved at training time.
            self._load()
            self.agent.epsilon = 0.01

    def play_game(self):
        """Play ``episodes_times`` episodes, then save the model (train mode) and the CSV."""
        metrics_names = ['FixedProfit', 'TradeTimes', 'TradeWin']

        for episode in range(self.episodes_times):

            # A new progress bar and fresh running averages every 10 episodes.
            if episode % 10 == 0:
                pb_i = Progbar(10, stateful_metrics=metrics_names)
                p_mean, trade_time, win_time = [], [], []

            state = self.env.reset()
            state = self.scaler.transform([state])
            done = False

            while not done:
                action = self.agent.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = self.scaler.transform([next_state])
                reward = self._reward_clipping(reward)

                if self.mode == 'train':
                    # Pick which of the two networks is updated with equal probability.
                    s_flag = 11 if np.random.random() <= 0.5 else 22
                    # Use the agent owned by this instance; this previously
                    # went through a module-level `agent` global.
                    self.agent.train(state, action, reward, next_state, done, s_flag)

                state = next_state

            self._record_episode(info)

            p_mean.append(info['cur_revenue'])
            trade_time.append(info['trade_time'])
            win_time.append(info['trade_win'])
            values = [
                ('FixedProfit', int(np.mean(p_mean))),
                ('TradeTimes', int(np.mean(trade_time))),
                ('TradeWin', int(np.mean(win_time))),
            ]
            pb_i.add(1, values=values)

        if self.mode == 'train':
            self._save()
        self._save_csv()

    def _record_episode(self, info):
        """Append one episode's final revenue and trade counters to ``df_rec``."""
        # DataFrame.append was removed in pandas 2.0; enlarging through .loc
        # works on old and new versions and keeps a 0..n-1 index.
        self.df_rec.loc[len(self.df_rec)] = [
            info['cur_revenue'],
            info['trade_time'],
            info['trade_win'],
        ]

    def _standard_scaler(self, env):
        """Fit a StandardScaler on states visited by one random-action pass over ``env``."""
        # Start from the first row even if the environment was used before.
        env.reset()
        states = []
        for _ in range(env.df_total_steps):
            action = np.random.choice(env.action_space)
            state, reward, done, info = env.step(action)
            states.append(state)
            if done:
                break

        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def _reward_clipping(self, val):
        """Return the sign of ``val`` as 1, 0 or -1 (anything not >= 0 maps to -1)."""
        if val > 0:
            return 1
        elif val == 0:
            return 0
        else:
            return -1

    def _load(self):
        """Load the pickled scaler and both networks' weights from ``mdl_dir``."""
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # load scaler files that this notebook wrote itself.
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'rb') as f:
            self.scaler = pickle.load(f)
        self.agent.load('{}/{}.h5'.format(self.mdl_dir, self.name), '{}/{}_2.h5'.format(self.mdl_dir, self.name))

    def _save(self):
        """Save both networks' weights and the fitted scaler to ``mdl_dir``."""
        self.agent.save('{}/{}.h5'.format(self.mdl_dir, self.name), '{}/{}_2.h5'.format(self.mdl_dir, self.name))
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'wb') as f:
            pickle.dump(self.scaler, f)

    def _save_csv(self):
        """Write the per-episode record table to the module-level ``csv_path``."""
        # NOTE(review): `csv_path` is a global defined by the setup cell, not
        # an attribute of this class.
        self.df_rec.to_csv(csv_path)

In [6]:
# Experiment settings: starting cash, number of episodes, and the trading fee
# (no fee at level 1, 0.2% of the traded value otherwise).
initial_money = 1_000_000
episodes_times = 200
if level == 1:
    commission = 0
else:
    commission = 0.002

# Build the agent and environment, then run all episodes.
agent = Agent()
env = Environment(
    df,
    initial_money=initial_money,
    mode=mode,
    commission=commission,
)
main = Main(env, agent, mdl_dir, name, episodes_times, mode)
main.play_game()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 3)                 12        
_________________________________________________________________
re_lu (ReLU)                 (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 12        
_________________________________________________________________
re_lu_1 (ReLU)               (None, 3)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 12        
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Sh