In [3]:
import os
import math
import time
import datetime
import argparse

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import talib as ta
import qlib
from qlib.data import D
from qlib.data.dataset.loader import QlibDataLoader
from qlib.constant import REG_CN, REG_US


from utlis import get_base_company, get_data, get_features_n_labels, get_windows
from model import DCGNN, TotalLoss

# Cap the number of CPU threads PyTorch uses for intra-op parallelism at 6.
# NOTE(review): 6 is a hard-coded, machine-specific choice — confirm it suits the target host.
torch.set_num_threads(6)

In [2]:
# Experiment configuration, expressed as an argparse parser so the notebook and a
# script entry point can share the same option names. Each row is (flag, type, default).
parser = argparse.ArgumentParser(description='Stock Prediction')
for flag, kind, fallback in (
    ('--prestart_time', str, '2000-01-01'),
    ('--start_time', str, '2004-10-31'),
    ('--end_time', str, '2020-01-01'),
    ('--lagend_time', str, '2020-10-31'),
    ('--save_path', str, './output'),
    ('--lr', float, 0.0001),
    ('--weight_decay', float, 5e-4),
    ('--epochs', int, 400),
    ('--device', str, 'cpu'),
    ('--window_size', int, 12),
):
    parser.add_argument(flag, type=kind, default=fallback)

# Inside a notebook there is no real command line, so the overrides are passed explicitly.
args = parser.parse_args(args=[
    '--save_path=./output',
    '--prestart_time=2015-06-01',
    '--start_time=2016-07-01',
    '--end_time=2020-01-01',
    '--lagend_time=2020-10-30',
])

In [4]:
# Obtain the ticker list from get_base_company for the 2014-01-01 .. 2020-01-01 range,
# then fetch timestamps, tickers and data for those tickers via get_data on 'nasdaq100'.
# NOTE(review): presumably get_base_company keeps companies listed over the whole range — confirm in utlis.
# NOTE(review): get_data is called with an end date of 2016-06-30 while the ticker selection
# (and args.end_time) run to 2020-01-01 — confirm the shorter window is intentional.
selected_tickers = get_base_company('2014-01-01', '2020-01-01')
all_timestamps, all_tickers, all_data = get_data('2014-01-01', '2016-06-30', selected_tickers, market='nasdaq100')

In [5]:
## features
# Build the model inputs and raw targets for the tickers returned by get_data.
features, labels, company_final, final_timestamps = get_features_n_labels(
    args=args,
    selected_tickers=all_tickers,
)
# Binarise the targets: 1 where the raw label is strictly positive, 0 elsewhere,
# kept in the same dtype as `labels`.
binary_labels = (labels > 0).to(labels.dtype)

Loading base technical data...
Loading indicators...


In [None]:
# Training hyper-parameters used by the optimizer and the epoch loop below.
learning_rate = 0.0005
total_epoch = 200

# Prefer the second GPU (index 1) as before, but fall back to GPU 0 when the host
# exposes only one CUDA device; previously "cuda:1" was selected unconditionally,
# which fails on single-GPU machines as soon as a tensor is moved to the device.
if not torch.cuda.is_available():
    device = "cpu"
elif torch.cuda.device_count() > 1:
    device = "cuda:1"
else:
    device = "cuda:0"
print("Device: '{}'.".format(device))

In [1]:
# Every run writes into its own timestamped sub-directory of args.save_path.
output_path = '/'.join((args.save_path, time.strftime(r'%Y-%m-%d_%H-%M-%S', time.localtime())))
output_filename = '{}/{}'.format(output_path, 'dcgnn.pt')
train_log_filename = '{}/{}'.format(output_path, 'trainlog.txt')

if os.path.exists(output_path):
    print("Output dir '{}' is existed.".format(output_path))
else:
    os.makedirs(output_path)
    print("Output dir '{}' has been created.".format(output_path))

# Start the training log afresh with its header line.
with open(train_log_filename, mode='w', encoding='utf-8') as f:
    f.write('Train Log:\n')

In [8]:
def train(model, optimizer, criterion, rmv_feature_num=None):
    """Run one training epoch over the 'train' windows and return the mean batch loss.

    Batches come from ``get_windows`` applied to the module-level ``features`` and
    ``binary_labels`` on ``device`` with shuffling enabled; one optimizer step is
    taken per batch.

    Args:
        model: Network invoked as ``model(x)``; put into training mode here.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss invoked as ``criterion(y_hat, y.long())``.
        rmv_feature_num: Not used by this function. It is kept for backward
            compatibility and is now optional, because ``main`` calls ``train``
            with only three arguments and previously raised ``TypeError``.

    Returns:
        The mean of the per-batch loss values (``nan`` when no batch is yielded).
    """
    model.train()
    batch_losses = []
    for x, y in get_windows(inputs=features, targets=binary_labels, dataset='train', device=device, shuffle=True):
        y_hat = model(x)
        loss = criterion(y_hat, y.long())
        optimizer.zero_grad()
        # NOTE(review): retain_graph=True is preserved from the original; it keeps the
        # autograd graph alive after backward. Confirm the model really needs it.
        loss.backward(retain_graph=True)
        # Clamp every gradient element to [-3, 3] before the update.
        torch.nn.utils.clip_grad_value_(model.parameters(), 3.)
        optimizer.step()
        batch_losses.append(loss.item())
    # Collect in a list and convert once instead of re-allocating with np.append per batch.
    return np.array(batch_losses).mean()

In [9]:
def test(model, dataset, cls_report=False):
    """Evaluate ``model`` on one dataset split and return its overall accuracy.

    Batches come from ``get_windows`` applied to the module-level ``features`` and
    ``binary_labels`` on ``device``. The predicted class of each sample is the
    arg-max of ``model(x)`` along dimension 1.

    Args:
        model: Network invoked as ``model(x)``; put into evaluation mode here.
        dataset: Split name forwarded to ``get_windows`` (callers in this file
            pass 'train', 'valid' or 'test').
        cls_report: When true, also return the collected labels and predictions
            so the caller can compute further classification metrics.

    Returns:
        ``accuracy`` (a 0-d NumPy array) when ``cls_report`` is false, otherwise
        the tuple ``(accuracy, ys, predictions)`` with both tensors on the CPU.
    """
    model.eval()
    # Each list starts with an empty float tensor so the single concatenation below
    # applies the same dtype promotion as the original incremental torch.cat calls.
    prediction_chunks = [torch.Tensor([])]
    label_chunks = [torch.Tensor([])]

    with torch.no_grad():
        for x, y in get_windows(inputs=features, targets=binary_labels, dataset=dataset, device=device):
            y_hat = model(x)
            # max(1) returns (values, indices); the indices are the predicted classes.
            prediction_chunks.append(y_hat.max(1)[1].cpu())
            label_chunks.append(y.cpu())

    # Concatenate once rather than re-copying the accumulated tensor on every batch.
    predictions = torch.cat(prediction_chunks, dim=0)
    ys = torch.cat(label_chunks, dim=0)
    accuracy = torch.eq(predictions, ys).float().mean().cpu().numpy()

    if not cls_report:
        return accuracy
    return accuracy, ys, predictions

In [10]:
def main(model, optimizer, criterion, total_epoch, scheduler, pt=True):
    """Train ``model`` for ``total_epoch`` epochs, log metrics, and report the best checkpoint.

    After every epoch the model is evaluated on the 'train', 'valid' and 'test'
    splits and the results are appended to ``train_log_filename``. Whenever the
    validation accuracy improves, the whole model object is saved to
    ``output_filename``. At the end the saved model is reloaded and its accuracy
    on all three splits is printed and logged.

    Args:
        model: Network to train.
        optimizer: Optimizer passed to ``train``; its last param group supplies
            the learning rate shown in the log.
        criterion: Loss passed to ``train``.
        total_epoch: Number of epochs to run.
        scheduler: Learning-rate scheduler, stepped once per epoch after training.
        pt: When true, per-epoch lines and classification reports are also
            printed. The final summary line is printed regardless.

    Returns:
        None.
    """
    def _log(line, echo):
        """Optionally print ``line``, then append it to the training log file."""
        if echo:
            print(line)
        with open(train_log_filename, 'a', encoding='utf-8') as f:
            f.write(line + '\n')

    # Truncate the log so each call to main starts from a fresh header.
    with open(train_log_filename, 'w', encoding='utf-8') as f:
        f.write('Train Log:' + '\n')

    best_val = -math.inf
    checkpoint_saved = False
    for epoch in range(1, total_epoch+1):
        # ---------training------------
        train_loss = train(model, optimizer, criterion)
        # Read the learning rate before stepping the scheduler so the log shows
        # the rate that was actually used for this epoch.
        lr_temp = optimizer.param_groups[-1]['lr']
        scheduler.step()
        # --------evaluation-----------
        train_acc = test(model=model, dataset='train')
        val_acc, val_ys, val_preds = test(model=model, dataset='valid', cls_report=True)
        _log("| Epoch {:3d} | TrainLoss {:6.4f} | TrainAcc {:6.4} | ValAcc {:6.4f} | ValMCC {:6.4f} | lr {:6.8f} |".format(epoch, train_loss, train_acc, val_acc, metrics.matthews_corrcoef(val_ys, val_preds), lr_temp), pt)

        test_acc, test_ys, test_preds = test(model=model, dataset='test', cls_report=True)
        test_line = "| TestAcc {:6.4f} | TestMCC {:6.4f} |".format(test_acc, metrics.matthews_corrcoef(test_ys, test_preds))
        _log(test_line, pt)
        # -----------------------------
        if (epoch % 5 == 0) or (val_acc > best_val):
            if val_acc > best_val:
                torch.save(model, output_filename)
                best_val = val_acc
                checkpoint_saved = True
            # The model has not changed since the evaluations above, so their
            # results are reused here instead of running two more full passes.
            if pt:
                print('VALID CLASSIFICATION: ')
                print(metrics.classification_report(val_ys, val_preds))
            # The test line is deliberately emitted a second time to keep the
            # log format identical to earlier runs.
            _log(test_line, pt)
            if pt:
                print('TEST CLASSIFICATION: ')
                print(metrics.classification_report(test_ys, test_preds))

    if checkpoint_saved:
        # NOTE(review): the checkpoint is a fully pickled model object. Recent PyTorch
        # releases changed torch.load's default for `weights_only` — confirm this call
        # still loads it on the installed version. Only load files this run produced.
        testmodel = torch.load(output_filename)
    else:
        # No checkpoint exists (zero epochs, or validation accuracy never compared
        # greater than -inf, e.g. NaN); evaluate the in-memory model instead of failing.
        testmodel = model
    final_train_acc = test(model=testmodel, dataset='train')
    final_val_acc = test(model=testmodel, dataset='valid')
    final_test_acc = test(model=testmodel, dataset='test')
    _log("| TrainAcc {:6.4} | ValAcc {:6.4f} | TestAcc {:6.4f} |".format(final_train_acc, final_val_acc, final_test_acc), True)

In [2]:
print("Creating model...")
# Feature and node counts are read from dimensions 2 and 1 of `features`.
num_nodes, n_feat = features.size(1), features.size(2)

model = DCGNN(n_feat=n_feat)
model = model.to(device)
criterion = TotalLoss()
criterion = criterion.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.95 every 10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, gamma=0.95, step_size=10)
print("Done.")

train_log_filename = '{}/{}'.format(output_path, 'trainlog.txt')
main(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    total_epoch=total_epoch,
    scheduler=scheduler,
    pt=False,
)