In [None]:
import numpy as np
import pandas as pd
# import  as tf

# import gym
# import gym_anytrading

# import matplotlib.animation as animation 

from keras.models import Model,load_model,save_model
from keras.layers import Dense, Input,Dropout,LSTM
from keras.optimizers import Adam

# from keras import layers, models,
from keras import backend as K

from datetime import datetime
import itertools
import argparse
import re
import os
import pickle
import matplotlib.pyplot as plt
import talib
from sklearn.preprocessing import StandardScaler

In [1]:
class Memory():
    """Replay buffer that collects experiences for actor-critic training.

    Args:
        gamma (float): The "discount rate" used to assess state values.
        batch_size (int): The number of experiences at which `check_full`
            starts reporting the buffer as full.
    """
    def __init__(self, gamma, batch_size):
        self.buffer = []
        self.gamma = gamma
        self.batch_size = batch_size

    def add(self, experience):
        """Adds an experience into the memory buffer.

        Args:
            experience: (state, action, reward, done, state_prime_value) tuple.
                The order matters: `sample` unpacks the fields by position.
        """
        self.buffer.append(experience)

    def check_full(self):
        """Returns True once at least `batch_size` experiences are stored."""
        return len(self.buffer) >= self.batch_size

    def sample(self):
        """Returns the buffered experiences as arrays and clears the buffer.

        Returns:
            (tuple): Five numpy arrays
                (states, actions, rewards, dones, state_prime_values)
                with dtypes float32, int8, float32, int8 and float32. The
                values array has its singleton dimensions squeezed out.

        Raises:
            IndexError: If the buffer is empty.
        """
        if not self.buffer:
            raise IndexError("cannot sample from an empty memory buffer")

        # Columns have different shapes and data types, so transpose with zip
        # rather than going through one (ragged) numpy array, which recent
        # NumPy versions refuse to build.
        columns = list(zip(*self.buffer))

        states_mb = np.array(columns[0], dtype=np.float32)
        actions_mb = np.array(columns[1], dtype=np.int8)
        rewards_mb = np.array(columns[2], dtype=np.float32)
        dones_mb = np.array(columns[3], dtype=np.int8)
        value_mb = np.squeeze(np.array(columns[4], dtype=np.float32))

        self.buffer = []
        return states_mb, actions_mb, rewards_mb, dones_mb, value_mb

In [2]:
class Agent():
    """Actor-critic reinforcement learning agent for a discrete action space."""
    def __init__(self, actor, critic, policy, memory, action_size):
        """Initializes the agent with its networks and memory.

        Args:
            actor: Trainable model; `learn` calls its `train_on_batch` with
                inputs [states, advantages] and targets [actions, returns].
            critic: Model whose `predict` returns state values.
            policy: Model whose `predict` returns action probabilities.
            memory: A Memory class object.
            action_size (int): The number of possible actions to take.
        """
        self.actor = actor
        self.critic = critic
        self.policy = policy
        self.action_size = action_size
        self.memory = memory

    def act(self, state):
        """Samples an action from the policy's probabilities for a state.

        Args:
            state (array of numbers): The state of the environment to act on.
                A 1-D or 2-D state is reshaped to (1, 1, n_features) before
                being passed to the policy; a 3-D state is passed unchanged.

        Returns:
            (int) The index of the action to take.
        """
        state = np.asarray(state)
        if state.ndim < 3:
            # Use the last axis so a flat observation works as well as the
            # (1, n_features) rows that were already supported.
            state = np.reshape(state, (1, 1, state.shape[-1]))

        probabilities = self.policy.predict(state)[0]

        # Sample (rather than argmax) so the agent keeps exploring.
        action = np.random.choice(self.action_size, p=probabilities)
        return action

    def learn(self, print_variables=False):
        """Trains the actor and critic on the stored experiences.

        Drains the memory buffer, builds TD(0) targets and advantages, and
        runs a single `train_on_batch` on the combined actor model.

        Args:
            print_variables (bool): If True, print the intermediate values
                and skip the training step (the buffer is still drained).
        """
        gamma = self.memory.gamma
        experiences = self.memory.sample()
        state_mb, action_mb, reward_mb, dones_mb, next_value = experiences

        # One hot encode actions
        actions = np.zeros([len(action_mb), self.action_size])
        actions[np.arange(len(action_mb)), action_mb] = 1

        # Apply TD(0): the bootstrap term is dropped on terminal transitions.
        discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)

        state_values = self.critic.predict([state_mb])
        advantages = discount_mb - np.squeeze(state_values)

        if print_variables:
            print("discount_mb", discount_mb)
            print("next_value", next_value)
            print("state_values", state_values)
            print("advantages", advantages)
        else:
            self.actor.train_on_batch(
                [state_mb, advantages], [actions, discount_mb])

In [3]:
def build_networks(state_shape, action_size, learning_rate, critic_weight,
                   hidden_neurons, entropy):
    """Creates Actor Critic Neural Networks.

    Builds an actor head and a critic head, each made of two stacked LSTM
    layers followed by a Dense output, on top of a shared state input. The
    actor loss is a log-likelihood weighted by an action's advantage, with
    an entropy term added.

    Args:
        state_shape: a tuple of ints representing the observation space
            (without the batch dimension).
        action_size (int): the number of possible actions.
        learning_rate (float): the neural network's learning rate.
        critic_weight (float): how much to weigh the critic's training loss.
        hidden_neurons (int): the number of units to use per LSTM layer.
        entropy (float): how much to encourage exploration versus exploitation.

    Returns:
        (actor, critic, policy) models sharing the same layers:
            actor: inputs [state, advantages], outputs [probabilities, values];
                the only compiled (trainable) model.
            critic: input state, output values.
            policy: input state, output probabilities.
    """
    state_input = Input(state_shape, name='frames')
    advantages = Input((1,), name='advantages')  # PG, A instead of G

    # Actor (policy gradient) head. The layers take their input shape from
    # `state_input`, so no input_shape is given here.
    actor_1 = LSTM(units=hidden_neurons, return_sequences=True)(state_input)
    actor_2 = LSTM(units=hidden_neurons, return_sequences=False)(actor_1)
    probabilities = Dense(action_size, activation='softmax')(actor_2)

    # Critic (state value) head.
    critic_1 = LSTM(units=hidden_neurons, return_sequences=True)(state_input)
    critic_2 = LSTM(units=hidden_neurons, return_sequences=False)(critic_1)
    values = Dense(1, activation='linear')(critic_2)

    def actor_loss(y_true, y_pred):  # PG
        # Clip once so the log stays finite, then reuse it for both terms.
        y_pred_clipped = K.clip(y_pred, CLIP_EDGE, 1-CLIP_EDGE)
        log_lik = y_true*K.log(y_pred_clipped)
        entropy_loss = y_pred * K.log(y_pred_clipped)
        # `advantages` is the input tensor above, captured by closure.
        return K.sum(-log_lik * advantages) - (entropy * K.sum(entropy_loss))

    # Train both actor and critic at the same time.
    # NOTE(review): `lr` is the legacy spelling of Adam's learning-rate
    # argument; newer Keras releases expect `learning_rate` - confirm against
    # the installed version before changing.
    actor = Model(
        inputs=[state_input, advantages], outputs=[probabilities, values])
    actor.compile(
        loss=[actor_loss, 'mean_squared_error'],  # [actor, critic]
        loss_weights=[1, critic_weight],  # [actor, critic]
        optimizer=Adam(lr=learning_rate))

    critic = Model(inputs=[state_input], outputs=[values])
    policy = Model(inputs=[state_input], outputs=[probabilities])
    return actor, critic, policy

In [4]:
class currencyEnv:
    """
    Trading environment for a single currency pair with a Gym-like API.

    State: vector of length `state_dim` (n_currencies*2 + n_features + 2):
     - [0]    1 if a buy position is open, else 0
     - [1]    1 if a sell position is open, else 0
     - [2]    current Close price
     - [3:-1] every column of the current row except the first
     - [-1]   unrealized pips of the currently open positions
    Action: categorical variable with 5 ** n_currencies possibilities:
     - 0 = open buy
     - 1 = open sell
     - 2 = close buy
     - 3 = close sell
     - 4 = hold (no-op)

    Buys open at the Ask price and close at the Close price; sells open at
    the Close price and close at the Ask price. At most one buy and one sell
    can be open at a time. Pips are price differences divided by 0.0001.

    Args:
        data: DataFrame with at least `Close` and `Ask` columns, one row per
            step.
        initial_investment: starting value stored in `cash_in_hand`.
    """

    # Action codes understood by `_trade`.
    BUY, SELL, CLOSE_BUY, CLOSE_SELL, HOLD = 0, 1, 2, 3, 4

    def __init__(self,data,initial_investment=20000):
        # data
        self.currency_price_history = data
        self.n_step, self.n_features = self.currency_price_history.shape
        # The first column is left out of the feature slice (see `iloc[..., 1:]`).
        self.n_features = self.n_features-1
        self.n_currencies=1
        self.op_buy = None
        self.op_sell = None
        # instance attributes
        self.initial_investment = initial_investment
        self.cur_step = None
        self.pos_open = None  # ex-stock_owned - max positions opened at same time
        self.buy_open = None  # 1 while a buy position is open
        self.sell_open = None  # 1 while a sell position is open
        self.buy_open_price = None
        self.sell_open_price = None
        self.cur_price = None
        self.pips_cum = None
        # experimental
        self.prev_price = None
        self.ask_price = None
        self.cur_features = None
        self.index = None
        self.cash_in_hand = None

        self.action_space = np.arange(5**self.n_currencies)

        # Action permutations: one list per action index holding the action
        # code for each currency, e.g. [[0], [1], [2], [3], [4]] for one
        # currency.
        self.action_list = list(map(list, itertools.product([0,1,2,3,4], repeat=self.n_currencies)))

        # calculate size of state
        self.state_dim = self.n_currencies*2+self.n_features+1+1

        self.reset()

    def reset(self):
        """Rewinds to the first row, clears all positions, returns the observation."""
        self.op_buy = {}
        self.op_sell = {}
        self.cur_step = 0
        self.pos_open = np.zeros(self.n_currencies)
        self.buy_open = 0
        self.sell_open = 0
        self.pips_cum = 0
        self.cur_price = self.currency_price_history.Close.iloc[self.cur_step]
        self.ask_price = self.currency_price_history.Ask.iloc[self.cur_step]
        self.sell_open_price = 0.0
        self.buy_open_price = 0.0
        self.cur_features = self.currency_price_history.iloc[self.cur_step,1:]
        # Keep the row label in sync so it is not left over from a previous episode.
        self.index = self.currency_price_history.index[self.cur_step]
        self.cash_in_hand = self.initial_investment
        return self._get_obs()

    def step(self, action,ops):
        """Advances one row, applies `action`, and returns the transition.

        Args:
            action (int): index into `action_space`.
            ops (list): closed trades are appended to this list as dicts.

        Returns:
            (observation, reward, done, info) where reward is the change in
            realized cumulative pips and info is {"cur_val": <cumulative pips>}.

        Raises:
            IndexError: if called after the last row has been reached.
        """
        assert action in self.action_space

        # Fail before touching any state so the environment stays consistent.
        if self.cur_step >= self.n_step-1:
            raise IndexError(
                "step() called past the end of the price history; call reset() first")

        # get current value before performing the action
        prev_val = self._get_val()

        # update price, i.e. go to next day or candle
        self.cur_step += 1

        self.cur_price = self.currency_price_history.Close.iloc[self.cur_step]
        self.ask_price = self.currency_price_history.Ask.iloc[self.cur_step]
        # current features
        self.cur_features = self.currency_price_history.iloc[self.cur_step,1:]
        # row label, recorded on trades to see where they happen -- experimental
        self.index = self.currency_price_history.index[self.cur_step]
        # perform a trade
        self._trade(action,ops)

        # get the new value after taking the action
        cur_val = self._get_val()

        # reward is the increase or decrease of the realized pips - for now
        # needs to change to sharpe ratio, kratio or some ratio
        reward = cur_val - prev_val

        # done if we have run out of data
        done = self.cur_step == self.n_step-1

        # store the current value of the portfolio here
        info = {"cur_val": cur_val}

        # conform to the Gym API
        return self._get_obs(), reward,done,info

    def _get_obs(self):
        """Builds the observation vector described in the class docstring."""
        obs = np.empty(self.state_dim)
        obs[:self.n_currencies] = self.buy_open  # number of open buy positions
        obs[self.n_currencies:self.n_currencies+1] = self.sell_open  # number of open sell positions
        obs[self.n_currencies+1:self.n_currencies+2] = self.cur_price  # current price of the pair
        obs[self.n_currencies+2:-1] = self.cur_features  # current features
        # unrealized pips of the open positions in the current state
        obs[-1] = self.buy_open*((self.cur_price-self.buy_open_price)/0.0001) +\
            self.sell_open*((self.sell_open_price-self.ask_price)/0.0001)
        return obs

    def _get_val(self):
        """Returns the realized cumulative pips."""
        return self.pips_cum

    def _trade(self, action,ops):
        """Applies the action at index `action` to the open positions.

        Opening is ignored while a position of that type is already open and
        closing is ignored while none is open. Closed trades are appended to
        `ops` and their pips added to `pips_cum`.
        """
        # eg: [0] means open a buy on the currency
        requested = set(self.action_list[action])

        # we buy at ask price and close buys at bid/cur price.
        # we sell at bid/cur price and close sells at ask price.
        # Sells are handled before buys, and opens before closes.
        if self.SELL in requested and self.sell_open==0:
            self.sell_open_price = self.cur_price
            self.op_sell = {"Magic_number":0, "Item":"EURUSD", "Type":"sell",
                            "Open_Time":self.index, "Open_Price":self.cur_price}
            self.sell_open = 1

        if self.BUY in requested and self.buy_open==0:
            self.buy_open_price = self.ask_price
            self.op_buy = {"Magic_number":0, "Item":"EURUSD", "Type":"buy",
                           "Open_Time":self.index, "Open_Price":self.ask_price}
            self.buy_open = 1

        if self.CLOSE_BUY in requested and self.buy_open==1:
            self.op_buy["Close_Time"] = self.index
            self.op_buy["Close_Price"] = self.cur_price
            ops.append(self.op_buy)
            self.buy_open = 0
            self.pips_cum+= (self.cur_price-self.buy_open_price)/0.0001

        if self.CLOSE_SELL in requested and self.sell_open==1:
            self.op_sell["Close_Time"] = self.index
            self.op_sell["Close_Price"] = self.ask_price
            ops.append(self.op_sell)
            self.sell_open = 0
            self.pips_cum+= (self.sell_open_price-self.ask_price)/0.0001

In [5]:
def get_scaler(env,ops):
    """
    Returns scikit-learn scaler object to scale the states.

    Plays one episode with uniformly random actions and fits a StandardScaler
    on the observations returned by `env.step`.

    Args:
        env: environment exposing `reset`, `step(action, ops)`, `n_step` and
            `action_space`.
        ops (list): passed through to `env.step`.

    Returns:
        The fitted StandardScaler.
    """
    # Always start from the first step so the result does not depend on where
    # a previous rollout left the environment (e.g. when re-running the cell).
    env.reset()

    states = []
    for _ in range(env.n_step):
        action = np.random.choice(env.action_space)
        state, _reward, done, _info = env.step(action, ops)
        states.append(state)
        if done:
            break

    scaler = StandardScaler()
    scaler.fit(states)
    return scaler

In [6]:
# Let the agent "see" not only the current values but also the previous
# ones, just like a person looking at a chart would.
def create_window(data,window_size = 1):
    """Appends `window_size` lagged copies of `data` as extra columns.

    The lagged copies are built from `data.values`, so their columns are
    labelled with integer positions instead of the original names. Rows
    containing any NaN (including the first `window_size` rows introduced
    by shifting) are dropped. `data` itself is not modified.
    """
    lag_source = pd.DataFrame(data.values, index=data.index)
    lagged_frames = [lag_source.shift(lag) for lag in range(1, window_size + 1)]

    windowed = data.copy()
    if lagged_frames:
        windowed = pd.concat([windowed, *lagged_frames], axis=1)

    return windowed.dropna(axis=0)

In [None]:
# Predicted probabilities are clipped to [CLIP_EDGE, 1 - CLIP_EDGE] inside
# actor_loss before their log is taken, which keeps the log finite.
CLIP_EDGE = 1e-8

In [None]:
# Network input shape: one timestep holding a single observation vector.
# NOTE(review): `env` is not created in any visible cell - it has to exist
# before this cell runs; confirm where it is built.
space_shape = (1, env._get_obs().shape[0])
action_size = len(env.action_space)

space_shape,action_size

In [None]:
# Hyperparameters for the test run.
test_gamma = 0.9              # discount rate handed to Memory
test_batch_size = 128         # buffer size at which Memory.check_full() is True
test_learning_rate = 1e-4     # optimizer learning rate
test_hidden_neurons = 300     # units per LSTM layer
test_critic_weight = 0.5      # weight of the critic loss in the joint loss
test_entropy = 1e-4           # weight of the entropy term in the actor loss

In [None]:
# Fit the state scaler on observations gathered by get_scaler(). `ops` is
# handed through to env.step(), so it may collect trade records as a side
# effect.
ops=[]
scaler = get_scaler(env,ops)

In [None]:
N_EPISODES = 2000

test_memory = Memory(test_gamma, test_batch_size)
test_actor, test_critic, test_policy = build_networks(
    space_shape, action_size,
    test_learning_rate, test_critic_weight,
    test_hidden_neurons, test_entropy)
test_agent = Agent(
    test_actor, test_critic, test_policy, test_memory, action_size)

rew = []  # total reward of each episode

for episode in range(N_EPISODES):
    # scaler.transform on a one-row list already yields a (1, n_features) array.
    state = scaler.transform([env.reset()])
    episode_reward = 0
    ops = []  # trades closed during this episode
    done = False

    t0 = datetime.now()

    while not done:
        action = test_agent.act(state)
        state_prime, reward, done, _ = env.step(action, ops)
        episode_reward += reward

        state_prime = scaler.transform([state_prime])
        # Add the batch axis explicitly: (1, n_features) -> (1, 1, n_features).
        next_value = test_agent.critic.predict(state_prime[np.newaxis, ...])
        test_agent.memory.add((state, action, reward, done, next_value))
        state = state_prime

    # NOTE(review): learning happens once per episode on everything collected;
    # Memory.check_full()/batch_size are not consulted here - confirm intended.
    test_agent.learn()
    t1 = datetime.now()-t0
    rew.append(episode_reward)

    # Count closed trades by type directly from the records.
    total_buy = sum(op["Type"] == "buy" for op in ops)
    total_sell = sum(op["Type"] == "sell" for op in ops)
    print("Episode", episode, "Elapsed: ",t1,", Total buy:",total_buy,", Total sell:",total_sell,
          ", pips acum:",np.round(env.pips_cum,decimals=4),", Score =",np.round(episode_reward,decimals=4))