# Algorithm Implement

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import quantstats as qs
import pandas as pd
import numpy as np
import sys
time_period = 2
sys.path.append('./')
from utlis import get_data, Stock_Env
import warnings
warnings.filterwarnings('ignore')
from collections import deque

class Q_Network(nn.Module):

    def __init__(self, state_size, action_size, hidden=[64, 64]):
        super(Q_Network, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden[0])
        self.fc2 = nn.Linear(hidden[0], hidden[1])
        self.fc3 = nn.Linear(hidden[1], action_size)

    def forward(self, state):
        x = state
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Data Loading

In [2]:
stock_df_train, stock_df_test, stock_df_train_, stock_df_test_, codes = get_data()

100%|███████████████████████████████████████████████████████████████████████████████| 502/502 [00:01<00:00, 420.94it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 251/251 [00:00<00:00, 485.62it/s]


# Technical Indicators

# RL

In [3]:
codes_dict = dict(zip(codes, range(len(codes))))

In [4]:
tst = None
import random
from collections import deque
import torch
import torch.optim as optim
import numpy as np

# from networks import *

class Agent:

    def __init__(self, state_size, action_size, bs, lr, tau, gamma, device):
        '''
        When dealing with visual inputs, state_size should work as num_of_frame
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.bs = bs
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.device = device
        self.Q_local = Q_Network(self.state_size, self.action_size).to(device)
        self.Q_target = Q_Network(self.state_size, self.action_size).to(device)
        self.soft_update(1)
        self.optimizer = optim.Adam(self.Q_local.parameters(), self.lr)
        self.memory = deque(maxlen=100000)
        self.tst = None
        self.mu = [0]
        self.last_action = 0

    def act(self, state, eps=0):
        if random.random() > eps:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            with torch.no_grad():
                action_values = self.Q_local(state).reshape(-1)
            if (action_values).max() > np.max(self.mu):
                # self.mu = 0.95*self.mu + 0.05*action_values.max()

                self.mu.append(action_values.max().cpu().data.numpy())                
                if len(self.mu) > 10:
                    self.mu = self.mu[-10:]
                self.last_action = np.argmax(action_values.cpu().data.numpy())
                return self.last_action
            else:
                return self.last_action
        else:
            action = random.choice(np.arange(self.action_size))
            self.last_action = action
            return action

    def learn(self):
        experiences = random.sample(self.memory, self.bs)
        states = torch.from_numpy(np.vstack([e[0] for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e[4] for e in experiences]).astype(np.uint8)).float().to(self.device)
        self.tst = states
        Q_values = self.Q_local(states).reshape(-1,11)
        Q_values = torch.gather(input=Q_values, dim=-1, index=actions)
        with torch.no_grad():
            Q_targets = self.Q_target(next_states)
            Q_targets, _ = torch.max(input=Q_targets, dim=-1, keepdim=True)
            Q_targets = rewards + self.gamma * (1 - dones) * Q_targets

        loss = (Q_values - Q_targets).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_update(self, tau):
        for target_param, local_param in zip(self.Q_target.parameters(), self.Q_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# Train and Test

In [5]:
import warnings
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 500
max_t = 1000
reward_log = []
average_log = [] # monitor training process
eps = 1
eps_decay = 0.997
eps_min = 0.01
C = 4 # update weights every C steps

def validation(env, agent):
    # agent.mu=0
    env.mu=[0]
    rewards_log = []
    average_log = []
    episodic_reward = 0
    done = False
    frame = env.reset()
    state = frame
    t = 0
    while not done and t < max_t:
        t += 1
        action = agent.act(state, eps)
        frame, reward, done = env.step(action)
        rewards_log.append(reward)
        episodic_reward += reward
    sharpe = qs.stats.sharpe(pd.DataFrame(rewards_log))
    return env.asset, episodic_reward, sharpe


def train(env, agent, num_episode, eps_init, eps_decay, eps_min, max_t, num_frame=1, constant=0):
    # global rewards_log, average_log, state_history, action_history, done_history, reward_history
    rewards_log = []
    average_log = []
    state_history = []
    action_history = []
    done_history = []
    reward_history = []
    validation_log = []
    validation_average_log = []
    sharpe_log = []
    average_sharpe = []
    eps = eps_init
    for i in range(1, 1 + num_episode):
        env.mu=[0]
        episodic_reward = 0
        done = False
        frame = env.reset()
        state_deque = deque(maxlen=num_frame)
        for _ in range(num_frame):
            state_deque.append(frame)
        state = np.stack(state_deque, axis=0)
        state = np.expand_dims(state, axis=0)
        t = 0

        while not done and t < max_t:

            t += 1
            action = agent.act(state, eps)
            frame, reward, done = env.step(action)
            state_deque.append(frame)
            next_state = np.stack(state_deque, axis=0)
            next_state = np.expand_dims(next_state, axis=0)
            agent.memory.append((state, action, reward, next_state, done))

            if t % 5 == 0 and len(agent.memory) >= agent.bs:
                agent.learn()
                agent.soft_update(agent.tau)

            state = next_state.copy()
            episodic_reward += reward

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        val_asset, val_reward, val_sharpe = validation(env_test, agent)

        validation_log.append(val_reward)
        validation_average_log.append(np.mean(validation_log[-100:]))
        sharpe_log.append(val_sharpe.values[0])
        average_sharpe.append(np.mean(sharpe_log[-100:]))
        print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}, valReward {:.3f}, val Average Reward {:.3f}, Asset {:.2f}, Validation Asset {:.2f}, Average Validation Sharpe {:.2f}'.format(i, episodic_reward, average_log[-1], val_reward, validation_average_log[-1], env.asset, val_asset, average_sharpe[-1]), end='')
        # print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
        if i % 100 == 0:
            print()

        eps = max(eps * eps_decay, eps_min)

    return rewards_log

In [6]:
for code in codes:
    print(code, ' Begins')
    print('---------------------------------------------')
    env = Stock_Env(1000000, stock_df_train, 0.001, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, codes_dict=codes_dict, train=True, code=code, time_period = time_period, codes=codes)
    env_test = Stock_Env(1000000, stock_df_test, 0.001, time = [x[0] for x in stock_df_test.index], record = stock_df_test_, codes_dict=codes_dict, train=False, code=code, time_period = time_period,  codes=codes)
    agent = Agent(2*5, 11, 64, 0.001, 0.001, 0.99, 'cuda')
    train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

AAPL  Begins
---------------------------------------------
Episode 100, Reward 0.016, Average Reward 0.034, valReward 0.337, val Average Reward 0.271, Asset 1012388.21, Validation Asset 1383553.93, Average Validation Sharpe 1.98
Episode 200, Reward -0.017, Average Reward 0.049, valReward 0.275, val Average Reward 0.265, Asset 983067.05, Validation Asset 1304301.24, Average Validation Sharpe 1.90
Episode 300, Reward 0.055, Average Reward 0.053, valReward 0.207, val Average Reward 0.265, Asset 1051481.58, Validation Asset 1217639.48, Average Validation Sharpe 1.96
Episode 400, Reward -0.052, Average Reward 0.040, valReward 0.298, val Average Reward 0.247, Asset 948315.80, Validation Asset 1334452.48, Average Validation Sharpe 1.79
Episode 500, Reward 0.029, Average Reward 0.054, valReward 0.275, val Average Reward 0.263, Asset 1027383.16, Validation Asset 1305168.92, Average Validation Sharpe 1.91
NFLX  Begins
---------------------------------------------
Episode 100, Reward 0.054, Avera

In [7]:
stock_df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,dividends,stock splits,pctchange,Bearish,Nay,...,To the Moon!!,SMA42,SMA5,SMA15,AO,OVB,VW_MACD,MACD_SIGNAL,RSI,CMO
date,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019-01-02,AAPL,37.166266,38.116480,37.007896,37.893322,148158800.0,0.0,0.0,0.019562,-1.710099,-1.197546,...,-2.621091,37.851179,37.858785,37.853029,-0.056491,1.164988e+09,0.003596,0.000719,1.000000e+02,100.000000
2019-01-02,AMZN,73.260002,77.667999,73.046501,76.956497,159662000.0,0.0,0.0,0.050457,-2.674999,-2.481966,...,0.591993,75.142741,75.470102,75.222369,0.027849,8.350510e+09,0.164550,0.032910,1.000000e+02,100.000000
2019-01-02,GOOGL,51.360001,53.039501,51.264000,52.734001,31868000.0,0.0,0.0,0.026752,-2.832723,-2.270680,...,-0.144085,52.275953,52.345201,52.280401,-0.042477,4.704500e+09,0.026870,-0.007829,9.765337e+01,98.631824
2019-01-03,AAPL,34.548387,34.965906,34.073282,34.118874,365248800.0,0.0,0.0,-0.012432,-2.151437,-1.261331,...,-2.134226,37.803454,37.138432,37.641692,-0.519031,7.997388e+08,-0.546423,-0.109285,0.000000e+00,-100.000000
2019-01-03,AMZN,76.000504,76.900002,74.855499,75.014000,139512000.0,0.0,0.0,-0.012980,-1.399991,7.789349,...,-2.883474,76.910247,76.567998,76.826997,0.088791,8.210998e+09,-0.139385,-0.027877,0.000000e+00,-100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,NFLX,332.959991,333.820007,326.010010,329.089996,5036100.0,0.0,0.0,-0.011623,-1.110599,-1.486563,...,-1.727317,332.545719,331.922003,332.394004,-0.736940,4.626725e+09,-0.362229,-0.072448,3.698702e-09,-100.000000
2019-12-30,AAPL,70.500293,71.286988,69.467609,71.002022,144114400.0,0.0,0.0,0.007117,-3.005409,-2.026314,...,0.194338,70.593089,70.666896,70.611042,-0.086838,4.284226e+09,0.033036,0.006607,1.000000e+02,100.000000
2019-12-30,AMZN,93.699997,94.199997,92.030998,92.344498,73494000.0,0.0,0.0,-0.014466,-1.702069,-2.459761,...,8.046641,93.462724,93.260898,93.413631,-0.182487,9.786208e+09,-0.059640,-0.011870,5.704353e-03,-99.999315
2019-12-30,GOOGL,67.840500,67.849998,66.891998,66.985497,19994000.0,0.0,0.0,-0.012603,-1.892095,-1.283335,...,-2.685248,67.742192,67.582701,67.682235,-0.107514,4.959846e+09,-0.047176,-0.002702,4.939676e+00,-97.175735
