In [5]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
from utils import plot_learning_curve

In [6]:
class TwoInputsNet(nn.Module):
    # ****
    # tengo que revisar mejor sobre los channels y toda esta locura porque hay algo raro. sobretodo en la parte de los filtros.
    
    # revisa que todas tus dimensiones esten correctas.
    
    # puede ser que nn.sequential corre mejor que esta forma, tendre que ver si lo refactorizo. 
    # ****
    def __init__(self, lr, in_channels, out_channels, kernel_size, input_dims, fc1_dims, fc2_dims, n_actions):
        super(TwoInputsNet, self).__init__()
        self.conv_open = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_high = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_low = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_close = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_volume = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.fc_state = nn.Linear(2, 2)
        
        self.fc1 = nn.Linear(input_dims, fc1_dims) #input dims is the sum of all outputs of the conv layers.
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)
        
        self.optimizer = optim.Adam(self.parameters(), lr = lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        
        # Open High Low Close Volume in that order (also for the numpy array)
        # input6 son dos nodos de input para saber si el agente ha comprado/vendido y en cuanto.
    def forward(self, input1, input2, input3, input4, input5, input6):
        c1 = self.conv_open(input1.unsqueeze(dim = 1))
        c2 = self.conv_high(input2.unsqueeze(dim = 1))
        c3 = self.conv_low(input3.unsqueeze(dim = 1))
        c4 = self.conv_close(input4.unsqueeze(dim = 1))
        c5 = self.conv_volume(input5.unsqueeze(dim = 1))
        f1 = self.fc_state(input6)
        
        # now we can reshape to 2D and concat them
        combined = T.cat((c1.view(c1.size(0), -1), 
                          c2.view(c2.size(0), -1),
                          c3.view(c3.size(0), -1),
                          c4.view(c4.size(0), -1),
                          c5.view(c5.size(0), -1),
                          f1.view(f1.size(0), -1)), dim=1)
        
        x = F.relu(self.fc1(combined))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)
        
        return actions
    
class Agent():
    # ***hyperparameters
    def __init__(self, gamma, epsilon, lr, input_dims1, input_dims2, batch_size, n_actions,
                max_mem_size = 100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0
        
        # self.Q_eval = DeepQNetwork(self.lr, n_actions = n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256)
        # (self, lr, in_channels, out_channels, kernel_size, input_dims, fc1_dims, fc2_dims, n_actions)
        # en otras parabras (learning rate, cantidad de filtros de entrada, cantidad de filtros de salida, tamaño del kernel)
        #  ***hyperparameters
        self.Q_eval = TwoInputsNet(self.lr, 1, 4, 3, 562, 128, 128, n_actions)
        #                                             ^
        # tengo que encontrar una forma para hacer que se haga el tamaño del fc layer automaticamente
        
        # ***
        # tengo que reviar si el replay memory esta funcionando correctamente, sobre todo el *input_dims
        # as of now I checked the state_memory shape (10000, 30, 5) and it looks correct
        # ***
        self.state_memory = np.zeros((self.mem_size, *input_dims1), dtype = np.float32)
        self.state_memory2 = np.zeros((self.mem_size, *input_dims2), dtype = np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims1), dtype=np.float32)
        self.new_state_memory2 = np.zeros((self.mem_size, *input_dims2), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool)
        
    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state[0]
        self.state_memory2[index] = state[1]
        self.new_state_memory[index] = state_[0]
        self.new_state_memory2[index] = state_[1]
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        
        self.mem_cntr += 1
        
    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            # aqui tuve que separar la observacion en las 2 partes y transformarlas a pytorch tensor. 
            # no se si es lo mejor, pero sirve asi que...
            
            state = T.tensor([observation[0]]).to(self.Q_eval.device)
            state2 = T.tensor([observation[1]]).to(self.Q_eval.device)
#             print(state)
            actions = self.Q_eval.forward(state[:,:,0], 
                                          state[:,:,1], 
                                          state[:,:,2], 
                                          state[:,:,3], 
                                          state[:,:,4],
                                          state2[:])
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
            
        return action
    
    def learn(self):
        if self.mem_cntr < self.batch_size:
            return
        self.Q_eval.optimizer.zero_grad()
        
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        
        state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
        state_batch2 = T.tensor(self.state_memory2[batch]).to(self.Q_eval.device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        new_state_batch2 = T.tensor(self.new_state_memory2[batch]).to(self.Q_eval.device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)
        
        action_batch = self.action_memory[batch]

        q_eval = self.Q_eval.forward(state_batch[:,:,0],
                                     state_batch[:,:,1],
                                     state_batch[:,:,2],
                                     state_batch[:,:,3],
                                     state_batch[:,:,4],
                                     state_batch2[:])[batch_index, action_batch]
    
        q_next = self.Q_eval.forward(new_state_batch[:,:,0],
                                     new_state_batch[:,:,1],
                                     new_state_batch[:,:,2],
                                     new_state_batch[:,:,3],
                                     new_state_batch[:,:,4],
                                     new_state_batch2[:])
        q_next[terminal_batch] = 0.0
        
        q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]
        
        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()
        
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min \
                        else self.eps_min


In [None]:
from environment import environment

historical_data = pd.read_csv(r'C:\Users\ratatosck\Desktop\pythonScripts\TradeBot\HistoricalData\EURUSD15.csv', sep='\t',header=None)
historical_data.drop(0, axis=1, inplace = True)
historical_data_np = historical_data.to_numpy(dtype = 'float32')

### hyperparameters
observation_size = 30

# env = environment(historical_data_np, observation_size)

# agent = Agent(gamma = 0.99, epsilon = 1.0, batch_size=64, n_actions = 3, 
#              eps_end = 0.01, input_dims1 = [observation_size, 5], input_dims2 = [2], lr=0.001)

###
scores, eps_history, all_scores_sum, all_scores_average = [], [], [], []
n_games = 1000
n_run = 10


# the nn is not reseting on new run, youll have to decide if this is a good or bad thing.

# also changing the graph to use the sum of scores (overall profit) after each episode, instead of individual scores for 
# each episode might be more clear as a metric for performance.

for j in range(n_run):
    scores = []
    sum_scores = []
    avg_scores = []
    eps_history = []
    
    env = environment(historical_data_np, observation_size)
    agent = Agent(gamma = 0.99, epsilon = 1.0, batch_size=64, n_actions = 3, 
             eps_end = 0.01, input_dims1 = [observation_size, 5], input_dims2 = [2], lr=0.001)
    
    for i in range (n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            agent.learn()
            observation = observation_
        scores.append(score)
        eps_history.append(agent.epsilon)

#         avg_score = np.mean(scores[-100:])
        avg_score = np.mean(scores)
        sum_score = np.sum(scores)
        avg_scores.append(avg_score)
        sum_scores.append(sum_score)

        print('episode', i, 'score %.2f' % score, 
                 'average score %.2f' % avg_score,
                 'sum score %.2f' % sum_score,
                 'epsilon %.2f'% agent.epsilon)
        
    all_scores_sum.append(sum(scores))
    all_scores_average.append(sum(scores)/len(scores))
        
    x = [i+1 for i in range(n_games)]
    # filename = 'results-v2'
    

    filename1 = 'plots/score-graph-' + str(j+1)
    filename2 = 'plots/sum-graph-' + str(j+1)
    filename3 = 'plots/average-graph-' + str(j+1)
    plot_learning_curve(x, scores, eps_history, filename1)
    plot_learning_curve(x, sum_scores, eps_history, filename2)
    plot_learning_curve(x, avg_scores, eps_history, filename3)

In [12]:
print("all_scores_sum")
print(all_scores_sum)
print("all_scores_average")
print(all_scores_average)


all_scores_sum
[142.50312328338623, 10.277478694915771, 95.39838194847107, 575.3482830524445, 2.6060569286346436, 846.4772915840149, -13.758081197738647, 83.10495138168335, 115.80050230026245, 359.78622794151306]
all_scores_average
[0.14250312328338624, 0.010277478694915772, 0.09539838194847107, 0.5753482830524445, 0.0026060569286346434, 0.8464772915840149, -0.013758081197738647, 0.08310495138168335, 0.11580050230026245, 0.35978622794151305]


There are a couple of problems right now. first, the training is'nt stable. This might be due to many things, there could be some catastrophic forgetting happening here, or it might just be the limit of the vanilla dqn, or it might just be that the environment is too random, or that the reward function is not good enough, maybe a bit of hyperparameter tuning might be good enough to solve the problem.

There are a lot of questions that remain from this implementation: is the experience replay working? is the pipeline working? does the environment work? this will take a while to figure out.

concerning the reward function, maybe giving negative reward when the agent is on the negative on a trade is good, enforcing early stopping through the reward function might be a good idea. 

also remember that as of now the reward scale is bound to increase as the portfolio increases, so you have to refactor it so that it is based not on profit but on percent return on investment.

other solutions might involve implementing the target network, hyperparameter tuning; like the observation size, the replay memory size, the neural network size/ arquitecture, etc.

however, there is no doubt that im going to have to try other algorithms.

*I did the sum of the scores, and it seems that it ends up positive. this is an indicator that the algorithm might actually be working? I'll have to test and average over multiple runs to see if this is actually the case. thought this brings up an interesting question, I sort of expected the shape of the graph to be trending towards positive, but now that I think about it it seems obvious that it might never be the case, and it's not necessarily a bad thing, you cant expect the algorithm to have a positive trade every time, just like with people, it is normal to have bad trades, but its the average return that determines if the trader is proficient or not. And just like this, you cant expect an algorithm to always have positive returns. in conclusion, the metric for success is not only the shape of the graph, but the overall return from the whole training procedure; by doing the sum of all the scores "sum(scores)" and the higher the number  the better, and then doing an average over multiple training loops.

*There is another idea that has to be investigated, should the agent be always "greedy" when deployed to production or should you always have a non zero epsilon? and should the agent still have a learning call or should you stop learning when the agent is deployed?

*And finally the question of transfer learning, will it be possible? the training environment would have to be very close to the actual production environment for it to work. unless I come up with some revolutionary way to do transfer learning in rl (not likely).

*another interesting idea might be to implement a sort of dual experience replay, that being 2 replay buffers, one for positive rewards and one for negative rewards and then sample from them equally. this might help with catastrophic forgetting.