In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
from utils import plot_learning_curve

In [None]:
class TwoInputsNet(nn.Module):
    """Q-network with one Conv1d branch per price series plus a dense state branch.

    Five independent ``nn.Conv1d`` layers process the open, high, low, close
    and volume series (in that order). A small ``nn.Linear(2, 2)`` layer
    processes a 2-feature state vector. All branch outputs are flattened,
    concatenated and passed through three fully connected layers that emit
    one value per action.

    The module owns its optimizer (Adam), its loss (MSE) and its target
    device, and moves its own parameters to that device on construction.

    TODO (original author): double-check the channel/filter configuration and
    verify every tensor dimension end to end.
    """

    def __init__(self, lr, in_channels, out_channels, kernel_size, input_dims, fc1_dims, fc2_dims, n_actions):
        """Build the layers, move them to the target device and create the optimizer.

        Args:
            lr: Learning rate for the Adam optimizer.
            in_channels: Input channels of every conv branch. ``forward``
                inserts exactly one channel axis, so this has to be 1 for
                batched input.
            out_channels: Number of filters of every conv branch.
            kernel_size: Kernel size of every conv branch.
            input_dims: Number of input features of the first dense layer,
                i.e. the total size of all flattened conv outputs plus the
                2 outputs of the state branch.
            fc1_dims: Width of the first dense layer.
            fc2_dims: Width of the second dense layer.
            n_actions: Number of outputs (one value per action).
        """
        super(TwoInputsNet, self).__init__()
        self.conv_open = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_high = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_low = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_close = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.conv_volume = nn.Conv1d(in_channels, out_channels, kernel_size)
        self.fc_state = nn.Linear(2, 2)

        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        # Callers send their input tensors to ``self.device``; the weights must
        # live there too, otherwise every forward pass fails on a CUDA machine.
        self.to(self.device)
        # Created after the move so the optimizer is built from the on-device
        # parameters.
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, input1, input2, input3, input4, input5, input6):
        """Compute one value per action.

        Args:
            input1: Open series, expected as ``(batch, length)``.
            input2: High series, same layout.
            input3: Low series, same layout.
            input4: Close series, same layout.
            input5: Volume series, same layout.
            input6: State vector with 2 features per sample. According to the
                original author it encodes whether the agent has bought/sold
                and for how much.

        Returns:
            Tensor with ``n_actions`` values per sample.
        """
        conv_branches = (
            (self.conv_open, input1),
            (self.conv_high, input2),
            (self.conv_low, input3),
            (self.conv_close, input4),
            (self.conv_volume, input5),
        )

        flat_features = []
        for conv, series in conv_branches:
            # Conv1d wants (batch, channels, length): add the channel axis.
            conv_out = conv(series.unsqueeze(dim=1))
            flat_features.append(conv_out.reshape(conv_out.size(0), -1))

        state_out = self.fc_state(input6)
        flat_features.append(state_out.reshape(state_out.size(0), -1))

        # Every branch is now 2-D, so they can be joined feature-wise.
        combined = T.cat(flat_features, dim=1)

        x = F.relu(self.fc1(combined))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)

        return actions
    
class Agent():
    """Epsilon-greedy deep Q-learning agent with a fixed-size replay memory.

    Every state is a pair ``(market_window, position_state)``:

    * ``market_window`` has shape ``input_dims1``; its last axis is indexed
      0..4 to obtain the five series handed to the network.
    * ``position_state`` has shape ``input_dims2`` and is passed to the
      network unchanged.

    Transitions are stored in pre-allocated NumPy arrays that are overwritten
    in ring-buffer fashion once ``max_mem_size`` entries have been written.

    TODO (original author): verify that the replay memory behaves correctly
    for every ``input_dims`` layout; the state memory shape was checked for
    windows of shape (30, 5).
    """

    def __init__(self, gamma, epsilon, lr, input_dims1, input_dims2, batch_size, n_actions,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        """Create the Q-network and allocate the replay memory.

        Args:
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial probability of taking a random action.
            lr: Learning rate forwarded to the network's optimizer.
            input_dims1: Shape of the market window; ``input_dims1[0]`` is the
                length of each series and the last axis needs at least 5 entries.
            input_dims2: Shape of the position-state vector.
            batch_size: Number of transitions sampled per learning step.
            n_actions: Number of discrete actions.
            max_mem_size: Capacity of the replay memory.
            eps_end: Lower bound for epsilon.
            eps_dec: Amount subtracted from epsilon after every learning step.

        Raises:
            ValueError: If the series is shorter than the convolution kernel.
        """
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0

        # Network hyper-parameters: input filters, output filters, kernel size.
        in_channels, out_channels, kernel_size = 1, 4, 3

        # Size of the first dense layer, derived instead of hard-coded:
        # each of the 5 conv branches (stride 1, no padding) yields
        # out_channels * (length - kernel_size + 1) features and the state
        # branch adds 2. For a (30, 5) window this is the former constant 562.
        conv_out_len = input_dims1[0] - kernel_size + 1
        if conv_out_len < 1:
            raise ValueError(
                "input_dims1[0] must be at least the kernel size "
                f"({kernel_size}), got {input_dims1[0]}"
            )
        fc_input_dims = 5 * out_channels * conv_out_len + 2

        self.Q_eval = TwoInputsNet(self.lr, in_channels, out_channels, kernel_size,
                                   fc_input_dims, 128, 128, n_actions)
        # Inputs are sent to Q_eval.device below, so make sure the weights are
        # there as well (a no-op when the network already moved itself).
        self.Q_eval.to(self.Q_eval.device)

        self.state_memory = np.zeros((self.mem_size, *input_dims1), dtype=np.float32)
        self.state_memory2 = np.zeros((self.mem_size, *input_dims2), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims1), dtype=np.float32)
        self.new_state_memory2 = np.zeros((self.mem_size, *input_dims2), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # np.bool_ rather than the np.bool alias, which NumPy 1.24 removed.
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def _q_values(self, market, position):
        """Split a batch of market windows into its five series and run the network."""
        series = [market[:, :, k] for k in range(5)]
        return self.Q_eval(*series, position)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the replay memory.

        Args:
            state: Pair ``(market_window, position_state)`` before the action.
            action: Index of the action taken.
            reward: Reward received.
            state_: Pair ``(market_window, position_state)`` after the action.
            done: Whether ``state_`` is terminal.
        """
        # Wrap around so the oldest entry is overwritten once memory is full.
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state[0]
        self.state_memory2[index] = state[1]
        self.new_state_memory[index] = state_[0]
        self.new_state_memory2[index] = state_[1]
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Pick an action epsilon-greedily.

        Args:
            observation: Pair ``(market_window, position_state)``.

        Returns:
            The greedy action index with probability ``1 - epsilon``, otherwise
            a uniformly random element of the action space.
        """
        if np.random.random() > self.epsilon:
            device = self.Q_eval.device
            # Cast to float32 (the dtype of the network weights and of the
            # replay memory) and add the batch axis the network expects.
            market = T.as_tensor(np.asarray(observation[0], dtype=np.float32),
                                 device=device).unsqueeze(0)
            position = T.as_tensor(np.asarray(observation[1], dtype=np.float32),
                                   device=device).unsqueeze(0)
            # Pure inference: no autograd graph needed.
            with T.no_grad():
                actions = self._q_values(market, position)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        """Run one Q-learning update on a random mini-batch and decay epsilon.

        Does nothing until at least ``batch_size`` transitions were stored.
        """
        if self.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device

        # Only sample from the part of the memory that has been filled.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        state_batch = T.as_tensor(self.state_memory[batch], device=device)
        state_batch2 = T.as_tensor(self.state_memory2[batch], device=device)
        new_state_batch = T.as_tensor(self.new_state_memory[batch], device=device)
        new_state_batch2 = T.as_tensor(self.new_state_memory2[batch], device=device)
        reward_batch = T.as_tensor(self.reward_memory[batch], device=device)
        terminal_batch = T.as_tensor(self.terminal_memory[batch], device=device)
        action_batch = T.as_tensor(self.action_memory[batch], dtype=T.long, device=device)

        self.Q_eval.optimizer.zero_grad()

        # Value of the action that was actually taken in each sampled state.
        q_eval = self._q_values(state_batch, state_batch2) \
            .gather(1, action_batch.unsqueeze(1)).squeeze(1)

        # The TD target is a constant for this update: build it without
        # autograd so no gradient flows through the bootstrapped estimate.
        with T.no_grad():
            q_next = self._q_values(new_state_batch, new_state_batch2)
            # Terminal states have no future value.
            q_next[terminal_batch] = 0.0
            q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_eval, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear decay, clamped so epsilon never drops below its floor.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)
