<a href="https://colab.research.google.com/github/sugiyama404/ReinfoceLearningForTrading/blob/main/q_learning_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy

from datetime import datetime
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from tensorflow.keras.optimizers import RMSprop
from sklearn.preprocessing import StandardScaler

# Mount Google Drive first: every path below lives under the mounted folder.
drive.mount('/content/drive/')

# Input: test-period S&P 500 price table.
nov_dir = 'Colab Notebooks/dataset/reinforcement_learning/'
nov_path = f'/content/drive/My Drive/{nov_dir}sp500_test.csv'

# Outputs: per-episode results CSV plus folders for models and rewards.
exp_dir = 'Colab Notebooks/workspace/export/'
csv_path = f'/content/drive/My Drive/{exp_dir}qlearning_test.csv'
models_folder = f'/content/drive/My Drive/{exp_dir}rl_models'
rewards_folder = f'/content/drive/My Drive/{exp_dir}rl_rewards'

# Optimizer instance referenced by Brain when it compiles its network.
optimizer = RMSprop()

# Load the price data and parse the 'Date' column (ISO year-month-day).
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

Mounted at /content/drive/


In [None]:
def make_scaler(env):
    """Fit a ``StandardScaler`` on states visited by a random policy.

    Steps ``env`` with uniformly random actions for at most
    ``env.df_total_steps`` steps (stopping early once the environment
    reports ``done``) and fits the scaler on the states returned by
    ``env.step``. The environment is not reset here, either before or after
    sampling.

    Args:
        env: Environment exposing ``df_total_steps``, ``action_space`` and
            ``step(action)``.

    Returns:
        The fitted ``StandardScaler``.
    """
    sampled_states = []
    for _step in range(env.df_total_steps):
        random_action = np.random.choice(env.action_space)
        observation, _reward, finished, _info = env.step(random_action)
        sampled_states.append(observation)
        if finished:
            break

    # ``fit`` returns the estimator itself, so it can be returned directly.
    return StandardScaler().fit(sampled_states)

In [None]:
class Environment:
    """Single-asset trading environment driven by the 'SP500' column of a DataFrame.

    The state is a length-3 vector ``[shares held, current price, cash]``.
    Actions are taken from ``action_space``: ``0`` sells the whole position,
    ``1`` does nothing, ``2`` buys as many whole shares as the cash allows.
    """

    def __init__(self, df, initial_money=100000, mode = 'test'):
        """Store the price table and start a fresh episode.

        Args:
            df: DataFrame with an 'SP500' price column; rows containing NaN
                are dropped and the index is rebuilt.
            initial_money: Cash available at the start of every episode.
            mode: ``'test'`` additionally tracks trade statistics
                (``trade_time`` / ``trade_win``) and reports them in ``info``.
        """
        self.df = df.dropna().reset_index()
        # Index of the last row; the episode ends when ``now_step`` reaches it.
        self.df_total_steps = len(self.df)-1
        self.initial_money = initial_money
        self.mode = mode
        self.trade_time = None
        self.trade_win = None
        self.brfore_buy_cash = None
        self.action_space = np.array([0, 1, 2])
        self.hold_a_position = None
        self.now_price = None
        self.cash_in_hand = None

        self.reset()

    def reset(self):
        """Rewind to the first row with no position and return the initial state."""
        self.trade_time = 0
        self.trade_win = 0
        self.brfore_buy_cash = 0
        self.end_step = self.df_total_steps
        self.now_step = 0
        self.hold_a_position = 0.0
        self.now_price = self.df.loc[self.now_step, 'SP500']
        self.cash_in_hand = self.initial_money

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute ``action`` at the new price and score it.

        Args:
            action: One of the values in ``action_space``.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``reward`` is the
            change in total portfolio value (position value plus cash) over
            this step and ``info`` always holds ``'cur_revenue'`` (plus
            ``'trade_time'`` and ``'trade_win'`` in test mode).
        """
        prev_revenue = self._get_revenue()

        self.now_step += 1
        self.now_price = self.df.loc[self.now_step, 'SP500']

        done = (self.end_step == self.now_step)

        # On the final step the requested action is ignored and the position
        # is liquidated instead (see ``_trade``).
        self._trade(action, done)
        cur_revenue = self._get_revenue()
        reward = cur_revenue - prev_revenue

        if self.mode == 'test':
            info = { 'cur_revenue' : cur_revenue , 'trade_time' : self.trade_time, 'trade_win' : self.trade_win }
        else:
            info = { 'cur_revenue' : cur_revenue }

        return self._get_now_state(), reward, done, info

    def _get_now_state(self):
        """Return ``[shares held, current price, cash]`` as a float array."""
        state = np.empty(3)
        state[0] = self.hold_a_position
        state[1] = self.now_price
        state[2] = self.cash_in_hand

        return state

    def _get_revenue(self):
        """Return total portfolio value: position at the current price plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action,lastorder = False):
        """Apply ``action`` at the current price.

        Args:
            action: Value from ``action_space`` (0 = sell, 2 = buy; anything
                else leaves the portfolio untouched).
            lastorder: When true, ignore ``action`` and sell everything. This
                forced liquidation is not counted in the trade statistics.
        """
        if lastorder:
            self.cash_in_hand += self.now_price * self.hold_a_position
            self.hold_a_position = 0
            return

        if self.action_space[0] == action: # sell
            if self.hold_a_position != 0:
                self.cash_in_hand += self.now_price * self.hold_a_position
                self.hold_a_position = 0
                if self.mode == 'test':
                    self.trade_time += 1
                    # A trade "wins" when cash after selling exceeds the
                    # cash recorded just before the matching buy.
                    if self.cash_in_hand > self.brfore_buy_cash:
                        self.trade_win += 1
        elif self.action_space[2] == action: # buy
            if self.hold_a_position == 0:
                if self.mode == 'test':
                    self.brfore_buy_cash = self.cash_in_hand
                # Buy whole shares one at a time while cash strictly exceeds
                # the price. The positive-price guard matters: with a zero or
                # negative price the cash test alone would stay true forever
                # and the loop would never terminate.
                while self.now_price > 0 and self.cash_in_hand > self.now_price:
                    self.hold_a_position += 1
                    self.cash_in_hand -= self.now_price

In [None]:
class Brain:
    """Small fully connected Q-network with one-step Q-learning updates."""

    def __init__(self, n_hidden_layers=1, hidden_dim=32):
        """Build and compile the network.

        Args:
            n_hidden_layers: Accepted for interface compatibility; currently
                unused (the network always has two hidden layers).
            hidden_dim: Accepted for interface compatibility; currently
                unused (hidden layers are fixed at 3 units).

        NOTE(review): the layer sizes are left hard-coded on purpose here;
        wiring the two arguments in would presumably make previously saved
        weight files incompatible with ``load`` — confirm before changing.
        """
        self.gamma = 0.9  # discount factor applied to the next state's best Q-value

        n_mid = 3
        n_state = 3
        n_action = 3

        model = Sequential()
        model.add(Dense(n_mid, input_shape=(n_state,)))
        model.add(ReLU())
        model.add(Dense(n_mid))
        model.add(ReLU())
        model.add(Dense(n_action))
        # ``optimizer`` is the module-level optimizer instance.
        model.compile(loss="mse", optimizer=optimizer)

        # ``summary()`` prints the table itself and returns None, so it must
        # not be wrapped in ``print`` (that emitted a stray "None" line).
        model.summary()
        self.model = model

    def train(self, state, action, reward, next_state, done):
        """Run one Q-learning update on a single transition.

        The target equals the current prediction except for the taken
        action's column, which is set to ``reward`` on terminal transitions
        and to ``reward + gamma * max(Q(next_state))`` otherwise.

        Args:
            state: 2-D batch of states (3 features per row).
            action: Column index of the action that was taken.
            reward: Reward observed for the transition.
            next_state: 2-D batch of successor states.
            done: Whether the transition ended the episode.
        """
        q = self.model.predict(state)
        next_q = self.model.predict(next_state)
        t = np.copy(q)
        if done:
            t[:, action] = reward
        else:
            t[:, action] = reward + self.gamma*np.max(next_q, axis=1)
        self.model.train_on_batch(state, t)

    def predict(self, state):
        """Return the network's Q-values for each row of ``state``."""
        return self.model.predict(state)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [None]:
class Agent:
    """Epsilon-greedy policy wrapper around a ``Brain``-like Q-value model."""

    def __init__(self, brain, state_size=3, action_size=3):
        """Store the model and the exploration schedule.

        Args:
            brain: Object providing ``predict``, ``train``, ``load``, ``save``.
            state_size: Number of state features (stored, not otherwise used
                by this class).
            action_size: Number of discrete actions to choose from.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.brain = brain
        self.epsilon = 1.0       # current exploration probability
        self.epsilon_min = 0.01  # decay never pushes epsilon below this
        self.r = 0.995           # multiplicative decay applied per action

    def act(self, state):
        """Pick an action epsilon-greedily and decay epsilon.

        Args:
            state: Batch containing one state row, as expected by
                ``brain.predict``.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value for the first row.
        """
        explore = np.random.rand() <= self.epsilon

        # Decay on every call. Previously the decay sat after the early
        # return of the exploration branch, so with the initial epsilon of
        # 1.0 it was never reached and the agent stayed fully random.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.r)

        if explore:
            return np.random.choice(self.action_size)
        act_values = self.brain.predict(state)
        return np.argmax(act_values[0])

    def train(self, state, action, reward, next_state, done):
        """Forward one transition to the underlying model's update step."""
        self.brain.train(state, action, reward, next_state, done)

    def load(self, name):
        """Load model weights from ``name``."""
        self.brain.load(name)

    def save(self, name):
        """Save model weights to ``name``."""
        self.brain.save(name)

In [None]:
def play_game(env, agent , episodes_times = 1000, mode = 'test', batch_size = 32):
    """Run ``episodes_times`` episodes and collect one result row per episode.

    States are standardised with the module-level ``scaler`` before being
    passed to the agent. In ``'train'`` mode the agent is updated after every
    step.

    Args:
        env: Environment providing ``reset()`` and ``step(action)``.
        agent: Agent providing ``act(state)`` and ``train(...)``.
        episodes_times: Number of episodes to play.
        mode: ``'test'`` records FixedProfit, TradeTimes and TradeWin;
            any other value records FixedProfit only. ``'train'`` also
            enables learning.
        batch_size: Accepted for interface compatibility; currently unused.

    Returns:
        DataFrame with one row per episode, built from the final ``info``
        dictionary of that episode.
    """
    if mode == 'test':
        columns = ['FixedProfit','TradeTimes','TradeWin']
    else:
        columns = ['FixedProfit']

    # Rows are gathered in a list and turned into a frame once at the end;
    # ``DataFrame.append`` no longer exists in pandas 2.x.
    records = []

    for episode in range(episodes_times):
        state = scaler.transform([env.reset()])
        done = False
        start_time = datetime.now()

        while not done:
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = scaler.transform([next_state])

            if mode == 'train':
                agent.train(state, action, reward, next_state, done)

            # Advance the observation inside the step loop; doing this only
            # once per episode made the agent act on the initial state for
            # the whole episode.
            state = next_state

        play_time = datetime.now() - start_time
        if mode == 'test':
            record = pd.Series([info['cur_revenue'],info['trade_time'],info['trade_win']], index=columns)
            print(f"Episode: {episode + 1}/{episodes_times} RapTime: {play_time} FixedProfit: {info['cur_revenue']:.0f} TradeTimes: {info['trade_time']} TradeWin: {info['trade_win']}")
        else:
            record = pd.Series(info['cur_revenue'], index=columns)
            print(f"Episode: {episode + 1}/{episodes_times} RapTime: {play_time} FixedProfit: {info['cur_revenue']:.0f}")

        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [None]:
# Run configuration.
initial_money=1000000
episodes_times = 100
batch_size = 32
mode = 'test'

brain = Brain()
agent = Agent(brain=brain)
env = Environment(df, initial_money=initial_money, mode = mode)

if mode == 'test':
    # Evaluate with the scaler and weights saved by the training run. The
    # scaler must NOT be refit here: previously it was loaded and then
    # immediately overwritten by a scaler fitted on the test data.
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # files you created yourself.
    with open(f'{models_folder}/scaler_ql.pkl', 'rb') as f:
        scaler = pickle.load(f)
    agent.epsilon = 0.01  # near-greedy policy for evaluation
    agent.load(f'{models_folder}/dqn_ql.h5')
else:
    # No saved scaler to reuse: fit one from a random rollout.
    scaler = make_scaler(env)

df_rec = play_game(env, agent , episodes_times = episodes_times, mode = mode, batch_size = batch_size)

if mode == 'train':
    # Persist the trained weights together with the scaler they were
    # trained with so a later test run can restore both.
    agent.save(f'{models_folder}/dqn_ql.h5')
    with open(f'{models_folder}/scaler_ql.pkl', 'wb') as f:
        pickle.dump(scaler, f)

df_rec.to_csv(csv_path)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 3)                 12        
_________________________________________________________________
re_lu (ReLU)                 (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 12        
_________________________________________________________________
re_lu_1 (ReLU)               (None, 3)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 12        
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________
None
Episode: 1/100 RapTime: 0:00:27.516604 FixedProfit: 1012608 TradeTimes: 3 TradeWin: 2
Episode: 2/100 RapTime: 0:00:25.302