In [None]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from datetime import datetime, timedelta
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import sgd
import talib
import traceback
import pickle

%matplotlib inline

# Deep Reinforcement Learning for Trading

## What is RL?
- Crossover between supervised and unsupervised learning
- Solving the problem of learning with delayed reward
- A system has a state S
- For every state we perform an action based on the state and prior experience
- A chain of actions leads to a reward (win/loss)
- Every action in the chain can be assigned a fractional reward

## Concepts
- greedy vs long-term
- exploration vs exploitation

## Q-Learning
- Q-tables are normally based on Markov chains
- For each state (x-axis) and each action (y-axis) we get a reward (matrix)
- Deep-RL uses neural networks to predict the Q-table
- The Q-table is now represented implicitly by the weights of the NN
- We update the Q-table at every step based on prior experience
- Actions are predicted based on the existing Q-table

## Gamification
- Actions are the moves of the game, e.g. turn or go straight (in trading: buy, sell, hold)
- State is the screen (technical indicators etc.)
- Reward is a win or loss at the end of the game (exit)

In [None]:
# Load the pickled price data used throughout the notebook.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('PriceData10.pick','rb') as fh:
    dft = pickle.load(fh)

In [None]:
# Visual sanity check of the close series; labelled so the figure stands alone.
ax = dft['close'].plot(title='Close price')
ax.set(xlabel='Time', ylabel='Close');

In [None]:
def init_net(df,lkbk,START_IDX,max_mem):
    """
    Set up everything a reinforcement-learning run needs: the environment,
    the Q-value network and the replay memory.

    The network is a fully connected model with two ReLU hidden layers, each
    twice as wide as the environment's state vector, and one linear output
    per action. It is compiled with SGD and a mean-squared-error loss.

    Args:
        df: data frame with the market data, handed to Game unchanged
        lkbk: lookback passed to Game (number of bars kept per timeframe)
        START_IDX: row index at which the first Game starts; must leave
            enough history for the lookback windows
        max_mem: capacity of the experience-replay memory

    Returns:
        env: an instance of Game, our environment
        model: the compiled neural network
        exp_replay: an instance of ExperienceReplay
    """
    env = Game(df, lkbk=lkbk, init_idx=START_IDX)

    # Layer widths are derived from the size of the state the Game produced.
    n_inputs = len(env.state)
    n_hidden = n_inputs*2
    n_outputs = 3  # one Q-value per action (the file uses 0, 1 and 2)

    model = Sequential()
    for layer in (Dense(n_hidden, input_shape=(n_inputs,), activation='relu'),
                  Dense(n_hidden, activation='relu'),
                  Dense(n_outputs)):
        model.add(layer)
    model.compile(sgd(lr=.005), "mse")

    exp_replay = ExperienceReplay(max_memory=max_mem)
    return env,model,exp_replay

In [None]:
class Game(object):
    """Single-trade trading environment ("game") for the Q-learning agent.

    A game starts flat at ``init_idx``, advances one row of ``df`` per action
    and is over as soon as an open position is closed by the opposite order.

    Actions:
        0: hold
        1: sell - open a short when flat, close a long when long
        2: buy  - open a long when flat, close a short when short

    Args:
        df: price frame with a sorted DatetimeIndex and the columns
            'open', 'high', 'low' and 'close'.
        lkbk: number of bars kept per timeframe (base bars, 1h bars, 1d bars).
        max_game_len: stored for reference only; the class does not enforce it.
        init_idx: integer row position in ``df`` at which the game starts.
            Must leave enough history to fill all three lookback windows.

    Raises:
        ValueError: if ``init_idx`` is missing or there is not enough history
            to build ``lkbk`` bars on every timeframe.
    """

    def __init__(self, df, lkbk=20, max_game_len=1000, init_idx=None):
        if init_idx is None:
            raise ValueError('init_idx is required: it must point at a row of df '
                             'with enough history for the lookback windows')
        self.df = df
        self.lkbk = lkbk
        self.max_game_len = max_game_len

        self.is_over = False
        self.reward = 0
        self.pnl_sum = 0
        self.init_idx = init_idx
        self.reset()

    def _update_state(self, action):
        '''Advance one bar, apply the action and rebuild the state.

        Order matters: price/time are moved first, the open-trade pnl is
        marked to the new price, the order is applied, and only then is the
        observation assembled - so the state returned by ``act`` reflects
        the position and time features that result from this step.

        Args:
            action: 0 (hold), 1 (sell) or 2 (buy); other values are ignored.
        '''
        self.curr_idx += 1
        self.curr_time = self.df.index[self.curr_idx]
        # .iloc: curr_idx is a row position, not an index label.
        self.curr_price = self.df['close'].iloc[self.curr_idx]

        # Fractional pnl of the open trade; defined as 0 while flat
        # (entry is 0 then, so the plain formula would divide by zero).
        if self.position != 0 and self.entry:
            self.pnl = (self.curr_price - self.entry)*self.position/self.entry
        else:
            self.pnl = 0

        # Time features scaled to [0, 1]; weekday() is 0 (Mon) .. 6 (Sun).
        self._time_of_day = (self.curr_time.hour*60 + self.curr_time.minute)/(24*60)
        self._day_of_week = self.curr_time.weekday()/6

        # This is where we define our policy and update our position.
        if action == 2:
            self._apply_order(1)     # buy
        elif action == 1:
            self._apply_order(-1)    # sell

        self._assemble_state()

    def _apply_order(self, direction):
        '''Apply a buy (+1) or sell (-1) order to the current position.

        Flat: open a position in ``direction`` at the current price.
        Opposite position open: close it, which ends the game and fixes the reward.
        Same-direction position open: nothing happens.
        '''
        if self.position == 0:
            self.position = direction
            self.entry = self.curr_price
            self.start_idx = self.curr_idx
        elif self.position == -direction:
            self.is_over = True
            self._get_reward()
            self.trade_len = self.curr_idx - self.start_idx

    def _assemble_state(self):
        '''Build the observation vector ``self.state``.

        Layout before scaling:
          1. the last (up to) 10 open/high/low/close values of each timeframe,
          2. position, sign of ``pnl_sum``, time of day, day of week,
          3. per timeframe: SMA spread, RSI, MOM, BOP and AROONOSC (TA-Lib).
        The whole vector is then z-scored as one sample.
        '''
        # ---Adding Candlesticks---
        self._get_last_N_timebars()
        bars = [self.last5m,self.last1h,self.last1d]
        fields = ['open','high','low','close']
        candles = {}
        features = []
        for j,bar in enumerate(bars):
            # float64 arrays: TA-Lib only accepts doubles.
            candles[j] = {col: np.asarray(bar[col], dtype=float) for col in fields}
            for col in fields:
                features.extend(candles[j][col][-10:])

        # ---Adding State Variables---
        features.append(self.position)
        features.append(np.sign(self.pnl_sum))
        features.append(self._time_of_day)
        features.append(self._day_of_week)

        # ---Adding Technical Indicators---
        for c in candles:
            try:
                ohlc = candles[c]
                sma1 = talib.SMA(ohlc['close'],self.lkbk-1)[-1]
                sma2 = talib.SMA(ohlc['close'],self.lkbk-8)[-1]
                indicators = [
                    (sma1-sma2)/sma2,
                    talib.RSI(ohlc['close'],self.lkbk-1)[-1],
                    talib.MOM(ohlc['close'],self.lkbk-1)[-1],
                    talib.BOP(ohlc['open'],ohlc['high'],ohlc['low'],ohlc['close'])[-1],
                    talib.AROONOSC(ohlc['high'],ohlc['low'],self.lkbk-3)[-1],
                ]
            except Exception:
                # Best effort: report the failure and skip this timeframe's indicators.
                print(traceback.format_exc())
            else:
                features.extend(indicators)

        # ---Z-scoring State---
        state = np.asarray(features, dtype=float)
        spread = np.std(state)
        self.state = (state-np.mean(state))/(spread if spread > 0 else 1.0)

    def _get_last_N_timebars(self):
        '''Select the last ``lkbk`` bars of each timeframe up to ``curr_time``.

        Fills ``last5m`` (rows of ``df``), ``last1h`` and ``last1d``.

        Raises:
            ValueError: if any timeframe has fewer than ``lkbk`` bars available.
                (Restarting at the same index cannot help and used to recurse
                without end, so the problem is reported instead.)
        '''
        # Calendar-day windows that are searched for the most recent bars.
        wdw5m = 9
        wdw1h = float(np.ceil(self.lkbk*15/24.))
        wdw1d = float(np.ceil(self.lkbk*15))
        now = self.curr_time

        # ---creating the candlesticks based on windows---
        self.last5m = self.df.loc[now-timedelta(wdw5m):now].iloc[-self.lkbk:]
        self.last1h = self.bars1h.loc[now-timedelta(wdw1h):now].iloc[-self.lkbk:]
        self.last1d = self.bars1d.loc[now-timedelta(wdw1d):now].iloc[-self.lkbk:]

        # ---Making sure that window lengths agree with lookback---
        lengths = (len(self.last5m),len(self.last1h),len(self.last1d))
        if any(n != self.lkbk for n in lengths):
            raise ValueError(
                'Window length too short at {} (row {}): got {} bars for the '
                'base/1h/1d timeframes, need {} each; use a larger start index'
                .format(now, self.curr_idx, lengths, self.lkbk))

    def _get_reward(self):
        """Fix the reward of a finished game.

        The reward is the fractional pnl of the closed trade:
        (exit - entry) / entry for a long, (entry - exit) / entry for a short,
        i.e. the same quantity as ``self.pnl``.
        Other metrics such as Sharpe ratio or average return are conceivable.

        Returns:
            self.reward: unchanged while the game is still running.
        """
        if self.is_over and self.position != 0 and self.entry:
            self.reward = (self.curr_price - self.entry)*self.position/self.entry
        return self.reward

    def observe(self):
        """This function returns the state of the system.
        Returns:
            The state (indicators, position and times) as a 1 x N array.
        """
        return np.array([self.state])

    def act(self, action):
        """Advance the game by one bar using an action chosen by the agent.

        Returns:
            (observation, reward, game_over) after the step.
        """
        self._update_state(action)
        reward = self.reward
        game_over = self.is_over
        return self.observe(), reward, game_over

    def reset(self):
        """Resetting the system for a new trading game starting at ``init_idx``.
        Here, we also resample the bars for 1h and 1d.
        Ideally, we should do this on every update but this will take very long.
        """
        self.is_over = False
        self.reward = 0
        self.pnl = 0
        self.entry = 0
        self.trade_len = 0
        self._time_of_day = 0
        self._day_of_week = 0
        self.curr_idx = self.init_idx
        self.t_in_secs = (self.df.index[-1]-self.df.index[0]).total_seconds()
        self.start_idx = self.curr_idx
        self.curr_time = self.df.index[self.curr_idx]
        close = self.df['close']
        # '1h' (lower case) is accepted by old and new pandas; '1H' is deprecated.
        self.bars1h = close.resample('1h',label='right',closed='right').ohlc().dropna()
        self.bars1d = close.resample('1D',label='right',closed='right').ohlc().dropna()
        self._get_last_N_timebars()
        self.state = []
        self.position = 0
        # One hold step so that a first observation exists.
        self._update_state(0)

In [None]:
class ExperienceReplay(object):
    '''Replay memory that turns past transitions into training batches.

    Each stored experience is ``[[state_t, action_t, reward_t, state_tp1], game_over]``.
    ``q_table`` samples a batch of them (with replacement) and uses the
    network's own predictions to build regression targets for the Q-values.

    Args:
        max_memory: maximum number of experiences kept; the oldest is dropped first
        discount: gamma, the weight given to the predicted value of the next state

    Terms used below:
        state_t: state at time t, array of shape (1, env_dim)
        state_tp1: state at time t+1, same shape
        action_t: int {0..2} (hold, sell, buy) taken at state_t
        reward_t: reward received for that step
        inputs: N x env_dim array of sampled state_t
        targets: N x num_actions array of target Q-values for those states
    '''
    def __init__(self, max_memory=100, discount=.9):
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):
        '''Store one transition and evict the oldest one when the memory is full.

        Args:
            states: [state_t, action_t, reward_t, state_tp1]
            game_over: whether the step ended the game
        '''
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def q_table(self, model, batch_size=10):
        '''Sample a training batch and compute its Q-value targets.

        For every sampled transition the target row starts as the network's
        current prediction for state_t; only the entry of the action actually
        taken is replaced:
            game over:  reward_t
            otherwise:  (1-gamma) * reward_t + gamma * max_a' Q(state_tp1, a')

        The network is queried twice per batch (all state_t, all state_tp1)
        rather than twice per sample.

        Args:
            model: object with ``output_shape`` and a batch ``predict`` method
            batch_size: upper bound on the number of sampled transitions

        Returns:
            inputs, targets: arrays of shape (N, env_dim) and (N, num_actions),
            with N = min(len(memory), batch_size)

        Raises:
            ValueError: if nothing has been remembered yet
        '''
        len_memory = len(self.memory)
        if len_memory == 0:
            raise ValueError('ExperienceReplay memory is empty; call remember() first')
        num_actions = model.output_shape[-1]
        env_dim = self.memory[0][0][0].shape[1]
        n_samples = min(len_memory, batch_size)

        # ---Initialise the batch arrays---
        inputs = np.zeros((n_samples, env_dim))
        next_states = np.zeros((n_samples, env_dim))
        actions = np.zeros(n_samples, dtype=int)
        rewards = np.zeros(n_samples)
        finished = np.zeros(n_samples, dtype=bool)

        # Step randomly through the memory and gather the Bellman inputs.
        for i, idx in enumerate(np.random.randint(0, len_memory, size=n_samples)):
            (state_t, action_t, reward_t, state_tp1), game_over = self.memory[idx]
            inputs[i] = state_t
            next_states[i] = state_tp1
            actions[i] = action_t
            rewards[i] = reward_t
            finished[i] = game_over

        # ---Current predictions are the default targets (copied, so the
        # model's own output buffer is never modified)---
        targets = np.array(model.predict(inputs), dtype=float).reshape(n_samples, num_actions)

        # ---Best predicted value of the follow-up state---
        q_next = np.max(np.asarray(model.predict(next_states)), axis=1)

        # ---Overwrite only the entry of the action that was taken---
        bootstrapped = (1-self.discount)*rewards + self.discount*q_next
        targets[np.arange(n_samples), actions] = np.where(finished, rewards, bootstrapped)

        return inputs, targets

In [None]:
def run(df, epsilon_0=.001, max_mem=600, max_trade_time=1000, batch_size=500,
        lkbk=20, start_idx=3000, out_path='pnls.pick'):
    """Train the agent online, one trade (game) after another, over ``df``.

    Each game starts where the previous one ended. Actions are chosen
    epsilon-greedily from the network's Q-values, every step is stored in
    the replay memory and followed by one training batch. After each closed
    trade the realised pnls and their dates are written to ``out_path``.

    Args:
        df: market data frame handed to Game
        epsilon_0: base of the exploration decay,
            epsilon = epsilon_0**log10(trade number) + 0.001
        max_mem: capacity of the replay memory
        max_trade_time: number of steps after which an open trade is closed;
            also the safety margin kept at the end of the data
        batch_size: replay batch size per training step
        lkbk: lookback handed to Game
        start_idx: row of ``df`` at which the first game starts
        out_path: pickle file receiving ``[pnl_dates, pnls]`` after every trade

    Returns:
        None. Results are printed and written to ``out_path``.
    """
    pnls = []
    pnl_dates = []
    pnl_total = 0      # running sum of pnls, avoids re-summing the list every step
    trade_no = 0
    last_row = len(df) - 1

    # ---Initialise a NN and set up initial game parameters---
    env,model,exp_replay = init_net(df,lkbk,start_idx,max_mem)

    # ---Loop that steps through one trade (game) at a time;
    # stop when the end is near to avoid running out of data---
    while env.curr_idx < len(df)-max_trade_time:

        trade_no += 1

        # ---Calculate epsilon for exploration vs exploitation random action generator---
        epsilon = epsilon_0**(np.log10(trade_no))+0.001

        # ---Initialise a new game---
        env = Game(df, lkbk=lkbk, init_idx=env.curr_idx)
        env.pnl_sum = pnl_total
        game_over = False
        state_tp1 = env.observe()
        exit_action = None
        cnt = 0

        # ---Walk through time steps starting from the end of the last game---
        while not game_over:
            # A game that never opens or closes a trade must not step past the data.
            if env.curr_idx >= last_row:
                break

            cnt += 1
            state_t = state_tp1

            if np.random.rand() <= epsilon:
                # ---Generate a random action---
                action = np.random.randint(0, 3, size=1)[0]
                if env.position == 0 and action:
                    # The opposite order (1 <-> 2) closes the trade being opened.
                    exit_action = 3 - action

            elif env.position == 0:
                # ---Action for opening a trade---
                q = model.predict(state_t)
                action = np.argmax(q[0])
                if action:
                    exit_action = 3 - action

            elif cnt > max_trade_time:
                # ---Time Exit---
                action = exit_action

            else:
                # ---Action for closing a trade---
                q = model.predict(state_t)
                action = np.argmax(q[0])

            # ---Updating the Game---
            state_tp1, reward, game_over = env.act(action)

            # ---Adding state to memory---
            exp_replay.remember([state_t, action, reward, state_tp1], game_over)

            # ---Creating a new Q-Table---
            inputs, targets = exp_replay.q_table(model, batch_size=batch_size)

            # ---Update the NN model with a new Q-Table---
            model.train_on_batch(inputs, targets)

        if not game_over:
            # Data exhausted before the trade was closed: nothing to record.
            break

        pnls.append(env.pnl)
        pnl_dates.append(env.curr_time)
        pnl_total += env.pnl

        # The cumulative pnl shown includes the trade that was just closed.
        print("Trade {:03d} | pos {} | len {} | pnl {:,.2f}% | eps {:,.4f} | {} | {}".format(
            trade_no,
            env.position,
            env.trade_len,
            pnl_total*100,
            epsilon,
            env.curr_time,
            env.curr_idx))

        # Checkpoint after every trade; the context manager closes the file.
        with open(out_path,'wb') as fh:
            pickle.dump([pnl_dates,pnls],fh)


In [None]:
# Seed NumPy's global generator before training so that the random draws
# made through np.random during the run are repeatable.
SEED = 34
np.random.seed(SEED)
run(dft)

In [None]:
# Load stored per-trade results and plot the cumulative pnl.
# NOTE(review): this reads 'pnls_ma3_98-11.pick', not the 'pnls.pick' file that
# run() writes - presumably a saved earlier run; confirm the file is available.
with open('pnls_ma3_98-11.pick','rb') as fh:
    dates,pnls = pickle.load(fh)

plt.plot(dates,np.cumsum(pnls))
plt.xticks(rotation=45);