## DATA GENERATION

In [109]:
import numpy as np
import pandas as pd
import quandl
import ta
import finta
import os
import pathlib
import shutil
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from PIL import Image
import time
import torch
import copy

In [110]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.get_device_name(0))
torch.cuda.is_available()

GeForce RTX 2070


True

In [111]:
training_directory = os.path.join(os.getcwd(), 'data','train')
test_directory = os.path.join(os.getcwd(), 'data','test')
ordered_directory = os.path.join(os.getcwd(), 'data','ordered')
WINDOW_LOOK_AHEAD = 5

In [112]:
#equites to train models for
equity_list = ['MMM','AXP','AAPL','BA','CAT','CVX','CSCO','KO','DIS','XOM','GE','GS','HD','IBM','INTC','JNJ','JPM','MCD','MRK','MSFT','NKE','PFE','PG','TRV','UTX','UNH','VZ','WMT']
# equity_list = ['MMM','AXP','AAPL','BA']  #Short version for debugging
# equity_list = ['AAPL']  #Short version for debugging
feature_list = ['RSI','Williams','WMA','EMA','SMA','HMA','3EMA','CCI','CMO','MACD','PPO','ROC','CMFI','DMI','PSI']

In [113]:
# For each equity you chose above, downloads the data and computes the financial indicators
def get_eq_dict(eq_list):
    equities_dfs = {}
    for equity in eq_list:
#         print(equity)
        output_df = pd.DataFrame()
        #Add your quandl key below
        temp_df = quandl.get("EOD/"+equity, authtoken="YOUR KEY HERE")
        date_range_df = temp_df.loc['2001-11-01':'2017-2-28']
        data_range_df_finta = date_range_df.copy() 
        data_range_df_finta.columns = data_range_df_finta.columns.str.lower()
        output_df['RSI'] = ta.momentum.rsi(date_range_df['Close'])
        output_df['Williams'] = ta.momentum.wr(date_range_df['High'],date_range_df['Low'],date_range_df['Close'])
        output_df['WMA'] = ta.trend.wma_indicator(date_range_df['Close'])
        output_df['EMA'] = ta.trend.ema_indicator(date_range_df['Close'])
        output_df['SMA'] = ta.trend.sma_indicator(date_range_df['Close'])
        output_df['HMA'] =  finta.TA.HMA(data_range_df_finta[['open','high','low','close']])
        output_df['3EMA'] = ta.trend.trix(date_range_df['Close'])  #Tripple EMA
        output_df['CCI'] = ta.trend.cci(date_range_df['High'],date_range_df['Low'],date_range_df['Close'])  
        output_df['CMO'] =  finta.TA.CMO(data_range_df_finta[['open','high','low','close']])
        output_df['MACD'] = ta.trend.macd(date_range_df['Close'])  
        output_df['PPO'] = ta.momentum.PercentagePriceOscillator(date_range_df['Close']).ppo()
        output_df['ROC'] = ta.momentum.ROCIndicator(date_range_df['Close']).roc()
        output_df['CMFI'] = ta.volume.ChaikinMoneyFlowIndicator(date_range_df['High'],date_range_df['Low'],date_range_df['Close'], date_range_df['Volume']).chaikin_money_flow()
        output_df['DMI'] =  ta.trend.ADXIndicator(date_range_df['High'],date_range_df['Low'],date_range_df['Close']).adx() # ADX is average direction movement index
        output_df['PSI'] = ta.trend.PSARIndicator(date_range_df['High'],date_range_df['Low'],date_range_df['Close']).psar()
        output_df['Close'] = date_range_df['Close']
        output_df['Label'] = 'hold'
        output_df = output_df.reset_index()
        equities_dfs[equity] = output_df
    return equities_dfs

In [114]:
#Scales all features to be between 0 and 1
def scale_and_get_scalers(df_dict,eq_list, features, scaler = None):
    if scaler == None:
        scalers = {}
        for eq in eq_list:
            scalers[eq] = MinMaxScaler()
            scalers[eq].fit(df_dict[eq][features])
            df_dict[eq][features] = scalers[eq].transform(df_dict[eq][features])
        return scalers
    else:
        scalers = scaler
        for eq in eq_list:
            df_dict[eq][features] = scalers[eq].transform(df_dict[eq][features])
        return scalers

In [115]:
# Lables all days buy, hold or sell. See paper in repository for more details.
def label_data(df_dict,eq_list):
    for eq in eq_list:
        for i in range(len(df_dict[eq])):
            # I went 2 months beyond at each end so we could have data for all dates
            if i > 6 and i+5 < len(df_dict[eq]):
                buy = True
                sell = True
                for j in range(1,6):
                    if df_dict[eq].loc[i-j,'Close'] <= df_dict[eq].loc[i,'Close'] or df_dict[eq].loc[i+j,'Close'] <= df_dict[eq].loc[i,'Close']:
                        sell = False
                    if df_dict[eq].loc[i-j,'Close'] >= df_dict[eq].loc[i,'Close'] or df_dict[eq].loc[i+j,'Close'] >= df_dict[eq].loc[i,'Close']:
                        buy = False
                if buy == True:
                    df_dict[eq].loc[i,'Label'] = 'buy'
                elif sell == True:
                    df_dict[eq].loc[i,'Label'] = 'sell'
                else:
                    df_dict[eq].loc[i,'Label'] = 'hold'

In [116]:
# For each equity, generates the training data for this time window in datadir.  Ordered_dir is for keeping the
#  dates in order for predictions.  Otherwise the dates are sorted by the labels.
def gen_training_data(df_dict,eq_list,start, end, features, data_dir, test = False, ordered_dir=None):
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)
    for eq in eq_list:
        os.makedirs(os.path.join(data_dir,eq))
        os.makedirs(os.path.join(data_dir,eq,'buy'))
        os.makedirs(os.path.join(data_dir,eq,'hold'))
        os.makedirs(os.path.join(data_dir,eq,'sell'))

    
    if test == True:
        if os.path.exists(ordered_dir):
            shutil.rmtree(ordered_dir)
        os.makedirs(ordered_dir)
        for eq in eq_list:
            os.makedirs(os.path.join(ordered_dir,eq))
            os.makedirs(os.path.join(ordered_dir,eq,'hold'))
        
    for eq in eq_list:
        train_df = df_dict[eq][(df_dict[eq]['Date']>=(pd.to_datetime(start)- pd.DateOffset(days=25)))&(df_dict[eq]['Date']<=end)]
        train_df.sort_values(by=['Date'])
        train_df.reset_index(drop=True, inplace = True)
        train_df.fillna(0)
        far_enough = False
        first_row = train_df[train_df.Date >= start].index[0]
        for i in range(first_row,len(train_df)):
            np_im = train_df.loc[i-14:i,features].to_numpy()
            img = Image.fromarray(np.uint8(np_im*255))
            if train_df.loc[i,'Label'] == 'buy':
                img.save(os.path.join(data_dir,eq,'buy', str(eq) + str(i)  +'.png'))                    
            elif train_df.loc[i,'Label'] == 'sell':
                img.save(os.path.join(data_dir,eq,'sell', str(eq) + str(i)  +'.png'))                    
            else:
                img.save(os.path.join(data_dir,eq,'hold', str(eq) + str(i)  +'.png')) 
            if test == True:
                img.save(os.path.join(ordered_dir,eq,'hold', str(eq) + str(i)  +'.png'))

## CNN

In [117]:
# from __future__ import print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os

In [118]:
num_epochs = 100
num_classes = 3
batch_size = 256
learning_rate = 0.01

In [119]:
# Used to weight the loss function
def get_num_per_class(dataset):
    labels = torch.zeros(3)
    for _, target in dataset:
#         print(target)
        labels[target] += 1
    return labels

In [120]:
#Called in train model, all of the data loaders are built here for each time window
def get_loaders(equity_name,batch_size=128):
    
    ## Use start and end date to generate the data here?
    
    data_transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([0.5, 0.5, 0.5], [0.25, 0.25, 0.25]) ])
    train_data_dir = os.path.join(os.getcwd(), 'data','train',equity_name)
    test_data_dir = os.path.join(os.getcwd(), 'data','test',equity_name)
    train_data = datasets.ImageFolder(train_data_dir, data_transform)
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,shuffle = True)
    test_data = datasets.ImageFolder(test_data_dir, data_transform)
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=300, shuffle=False, num_workers=4)
    
    ordered_data_dir = os.path.join(os.getcwd(), 'data','ordered',equity_name)
    test_data_ordered = datasets.ImageFolder(ordered_data_dir, data_transform)
    test_dataloader_ordered = torch.utils.data.DataLoader(test_data_ordered, batch_size=300, shuffle=False, num_workers=4)

    
    
    train_dataset_size = len(train_data)
    test_dataset_size = len(test_data)
    ordered_dataset_size = len(test_data_ordered)
    
    class_names = train_data.classes
    return train_dataloader, test_dataloader, train_dataset_size, test_dataset_size, train_data, test_dataloader_ordered,ordered_dataset_size

In [121]:
from torch.nn import Linear, ReLU, Sequential, Conv2d, MaxPool2d, Softmax, BatchNorm2d, Dropout, Module, CrossEntropyLoss, Flatten

In [122]:
# Simple CNN-Tar model.  See paper for architecture
class Basic_CNN(Module):   
    def __init__(self,linear_size=64*7*7):
        super(Basic_CNN, self).__init__()
        self.layer1 = Sequential(
            Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            ReLU(inplace=True),
            BatchNorm2d(32))
        self.layer2 =  Sequential(
            Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            ReLU(inplace=True),
            BatchNorm2d(64),
            MaxPool2d(kernel_size=2, stride=2),
            Flatten()
        )
            
        self.Dropout1 = Dropout(0.25)
        
        self.Dropout2 = Dropout(0.5)

        self.linear_layer1 = Linear(linear_size, 128)
        self.linear_layer2 = Linear(128, 3)
        self.soft_m = Softmax()
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.Dropout1(x)
        x = self.linear_layer1(x)
        x = self.Dropout2(x)
        x = self.linear_layer2(x)
        x = self.soft_m(x)
        return x

In [138]:
# A custom inception block used for the inception experiments.  If changing the number of features change the linear_size as well
class Inception(Module):   
    def __init__(self,linear_size=173475):
        super(Inception, self).__init__()
        self.CONV1 = Sequential(Conv2d(3, 64, kernel_size=1),ReLU(inplace=True),BatchNorm2d(64))
        self.CONV2 = Sequential(Conv2d(3, 64, kernel_size=3, padding=1),ReLU(inplace=True),BatchNorm2d(64))
        self.CONV3 = Sequential(Conv2d(3, 64, kernel_size=5, padding=2),ReLU(inplace=True),BatchNorm2d(64))
        self.CONV4 = Sequential(Conv2d(3, 64, kernel_size=7, padding=3),ReLU(inplace=True),BatchNorm2d(64))
        self.MP1 = Sequential(MaxPool2d(kernel_size = 3,stride=1,padding=1))
        
        self.CONV5 = Sequential(Conv2d(259, 128, kernel_size=1),ReLU(inplace=True),BatchNorm2d(128))
        self.CONV6 = Sequential(Conv2d(259, 128, kernel_size=3, padding=1),ReLU(inplace=True),BatchNorm2d(128))
        self.CONV7 = Sequential(Conv2d(259, 128, kernel_size=5, padding=2),ReLU(inplace=True),BatchNorm2d(128))
        self.CONV8 =Sequential( Conv2d(259, 128, kernel_size=7, padding=3),ReLU(inplace=True),BatchNorm2d(128))
        self.MP2 = Sequential(MaxPool2d(kernel_size = 3,stride=1,padding=1))
        self.Flat = Sequential( Flatten())
        self.Dropout1 = Dropout(0.25)
        
        self.Dropout2 = Dropout(0.5)

        self.linear_layer1 = Linear(linear_size, 128)
        self.linear_layer2 = Linear(128, 3)
        self.soft_m = Softmax()
        
    def forward(self, x):
        x1 = self.CONV1(x)
        x2 = self.CONV2(x)
        x3 = self.CONV3(x)
        x4 = self.CONV4(x)
        m1 = self.MP1(x)
                              
        x = torch.cat([x1,x2,x3,x4,m1], 1)
                              
        x5 = self.CONV5(x)
        x6 = self.CONV6(x)
        x7 = self.CONV7(x)
        x8 = self.CONV8(x)
        m2 = self.MP1(x)
                              
        x = torch.cat([x5,x6,x7,x8,m2], 1)
                              
        x = self.Flat(x)
        print(x.shape)
        x = self.Dropout1(x)
        x = self.linear_layer1(x)
        x = self.Dropout2(x)
        x = self.linear_layer2(x)
        x = self.soft_m(x)
        return x


In [125]:
# Initialized the model, criterion and loss fucniton for the baseline and additional data experiments
def initialize(train_data, linear_size=(64*7*7)):
    model = Basic_CNN(linear_size).cuda()
    class_weights = 1/ get_num_per_class(train_data).cuda()
    class_weights[0] *=2.0
    class_weights[2] *=2.0
    criterion = CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    return model, criterion, optimizer

In [126]:
# Initialized the model, criterion and loss fucniton for the inception experiments
def initialize_inception(train_data, linear_size=(173475)):
    model = Inception(linear_size).cuda()
    class_weights = 1/ get_num_per_class(train_data).cuda()
    class_weights[0] *=1.15
    class_weights[2] *=1.15
    criterion = CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    return model, criterion, optimizer

In [127]:
# I swapped the columns to match the order in the paper so it's easier to compare
def PrintConfMatrix(data):
    print('                       Predicted')
    print('                  Hold       Buy      Sell')
    print('         Hold  ' ,data[1,1].item() ,' ' ,data[1,0].item() ,'  ',data[1,2].item())
    print('Actual   Buy   ' ,data[0,1].item() ,'  ' ,data[0,0].item() ,'  ',data[0,2].item())
    print('         Sell  ' ,data[2,1].item() ,'  ' ,data[2,0].item() ,'   ',data[2,2].item())

In [128]:
# For reporting recall, precision and F1 in the order I wanted them
def eval_mat(data):
    print('                   Hold                  Buy                    Sell')
    Hrecall = data[1,1].item()/(data[1,1].item() + data[1,0].item() +data[1,2].item())
    Brecall = data[0,0].item()/(data[0,0].item() + data[0,1].item() + data[0,2].item())
    Srecall = data[2,2].item()/(data[2,2].item()+data[2,0].item()+data[2,1].item())
    
    Hprec = data[1,1].item()/(data[1,1].item()+data[0,1].item() +data[2,1].item())
    Bprec = data[0,0].item()/(data[0,0].item()+data[1,0].item() +data[2,0].item())
    Sprec = data[2,2].item()/(data[2,2].item()+data[0,2].item() +data[1,2].item())
    
    
    print('Recall      ' ,Hrecall,'  '  ,Brecall ,'   ',Srecall)
    print('Precision   ', Hprec ,'  ' ,Bprec ,'  ',Sprec)
    print('F1 Score    ', 2*(Hrecall*Hprec)/(Hrecall+Hprec) ,'  ' ,2*(Brecall*Bprec)/(Brecall+Bprec) ,'   ',2*(Srecall*Sprec)/(Srecall+Sprec))
    
    
    

In [129]:
# Trains on one time window of 5 years.  Called for all models
def train_model(equity, old_model = None,linear_size=(64*7*7), inception = False, batch_size = 128):
    # Train the model
    print(equity)
    if inception == True:
        linear_size = 173475
    
    train_dataloader, test_dataloader, train_dataset_size, test_dataset_size, train_data, test_dataloader_ordered,ordered_dataset_size = get_loaders(equity,batch_size)
    if inception == False:
        model, criterion, optimizer = initialize(train_data, linear_size)
    else:
        model, criterion, optimizer = initialize_inception(train_data, linear_size)
    if old_model != None:
        model = old_model
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    total_step = len(train_dataloader)
    loss_list = []
    acc_list = []
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_corrects = 0
        start_time = time.time()
        for i, (inputs, labels) in enumerate(train_dataloader):
            inputs.to(device)
            labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs.cuda())
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels.cuda())
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.cuda().data)

        epoch_loss = running_loss / train_dataset_size
        epoch_acc = running_corrects.double() / train_dataset_size
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())
        #This was outputting more info than we needed.  I'm leaving it here in case someone wants it for testing
#         print('Epoch: {} Loss: {:.4f} Acc: {:.4f}   Time: {:.4f}'.format(
#             epoch, epoch_loss, epoch_acc,time.time()-start_time))   




    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, test_dataloader, test_dataloader_ordered


In [130]:
# Tests 1 year(or whatever you set it to) window following the training period
def test_model(model, test_dataloader):
    nb_classes = 3
    model.eval()
    predictions = None
    confusion_matrix = torch.zeros(nb_classes, nb_classes)
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(test_dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            if predictions == None:
                predictions = preds
            else:
                torch.cat((predictions, preds))
            for t, p in zip(labels.view(-1), preds.view(-1)):
                    confusion_matrix[t.long(), p.long()] += 1

#     PrintConfMatrix(confusion_matrix)
    return model, confusion_matrix, predictions

## BASELINE

In [None]:
equity_list = ['MMM','AXP','AAPL','BA','CAT','CVX','CSCO','KO','DIS','XOM','GE','GS','HD','IBM','INTC','JNJ','JPM','MCD','MRK','MSFT','NKE','PFE','PG','TRV','UTX','UNH','VZ','WMT']
# equity_list = ['MMM','AXP','AAPL','BA']  #Short version for debugging
# equity_list = ['AAPL']  #Short version for debugging
feature_list = ['RSI','Williams','WMA','EMA','SMA','HMA','3EMA','CCI','CMO','MACD','PPO','ROC','CMFI','DMI','PSI']

# These calls build the dataframe, scale it and get the labels
equities_dfs = get_eq_dict(equity_list)
scaler = scale_and_get_scalers(equities_dfs,equity_list,feature_list)
label_data(equities_dfs,equity_list)

In [89]:
total_confusion_matrix = torch.zeros(3, 3)
start_date = pd.to_datetime('2002-01-01')
end_date = pd.to_datetime('2006-12-31')
model_dict_base = {}
output_dict = {}
for equity in equity_list:
    output_dict[equity] = []
    model_dict_base[equity] = []
# 10 sliding windows
for i in range(10):
    print(i)
    start = start_date +pd.offsets.DateOffset(years=i)
    end = end_date + pd.offsets.DateOffset(years=i)
    test_start = start_date +pd.offsets.DateOffset(years=(i+5))
    test_end = end_date +pd.offsets.DateOffset(years=(i+1))
    # These generate the training data for all equites.  See the folder structure generated if needed
    gen_training_data(equities_dfs,equity_list,start,end,feature_list,training_directory)
    gen_training_data(equities_dfs,equity_list,test_start,test_end,feature_list,test_directory, test = True, ordered_dir = ordered_directory)
    # For each equity in this time window
    for equity in equity_list:
        # This is here if it predicts all of one class.  The imbalanced data caused this to happen a small amount of the time
        tries = 1
        while tries <= 10:
            tries += 1
            model, test_dataloader, test_dataloader_ordered = train_model(equity,batch_size=batch_size)
            _, confusion_matrix, _ = test_model(model, test_dataloader)
            
            _, _, predictions_ord = test_model(model, test_dataloader_ordered)
            
            if torch.max(torch.sum(confusion_matrix, dim = 0)).item() != torch.sum(confusion_matrix).item():
                total_confusion_matrix = torch.add(total_confusion_matrix, confusion_matrix)
                break
            #If it didn't get it after 10 move on.  Generally gets it second try though
            if tries == 11:
                total_confusion_matrix = torch.add(total_confusion_matrix, confusion_matrix)
        output_dict[equity].append([test_start,predictions_ord])
        model_dict_base[equity].append([test_start,model])

0
MMM


  x = self.soft_m(x)


AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
1
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
2
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
3
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
4
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
5
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
6
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
7
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
8
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
9
MMM
A

In [90]:
PrintConfMatrix(total_confusion_matrix)

                       Predicted
                  Hold       Buy      Sell
         Hold   23693.0   21588.0    16622.0
Actual   Buy    504.0    3672.0    78.0
         Sell   558.0    133.0     3656.0


In [91]:
eval_mat(total_confusion_matrix)

                   Hold                  Buy                    Sell
Recall       0.38274397040531155    0.8631875881523272     0.8410397975615367
Precision    0.957099575843264    0.14460678139644784    0.1796030654352525
F1 Score     0.5468162200835468    0.24771477721185955     0.2959964376796341


In [92]:
#Get the predicitons for each date
start_date = pd.to_datetime('2002-01-01')
end_date = pd.to_datetime('2006-12-31')
output_dict = {}
for equity in equity_list:
    output_dict[equity] = []
    
for i in range(10):
    print(i)
    start = start_date +pd.offsets.DateOffset(years=i)
    end = end_date + pd.offsets.DateOffset(years=i)
    test_start = start_date +pd.offsets.DateOffset(years=(i+5))
    test_end = end_date +pd.offsets.DateOffset(years=(i+1))
    gen_training_data(equities_dfs,equity_list,start,end,feature_list,training_directory)
    gen_training_data(equities_dfs,equity_list,test_start,test_end,feature_list,test_directory, test = True, ordered_dir = ordered_directory)    
    
    for equity in equity_list:
        _, _, _, _, _, test_dataloader_ordered,_ = get_loaders(equity)
        _, _, predictions_ord = test_model(model_dict_base[equity][i][1], test_dataloader_ordered)
        output_dict[equity].append([model_dict_base[equity][i][0],predictions_ord])

0


  x = self.soft_m(x)


1
2
3
4
5
6
7
8
9


In [93]:
# Write to CSV for analysis
for equity in equity_list:
    eq_df = pd.DataFrame()
    for preds in output_dict[equity]:
        if eq_df.empty == True:
            eq_df = pd.DataFrame({str(preds[0]):preds[1].tolist()})
        else:
            temp_df = pd.DataFrame({str(preds[0]):preds[1].tolist()})
            eq_df = pd.concat([eq_df, temp_df],axis = 1)
    eq_df.to_csv(equity+'_baseline.csv')

## Additional Features

In [94]:
num_epochs = 150

In [95]:
equity_list = ['MMM','AXP','AAPL','BA','CAT','CVX','CSCO','KO','DIS','XOM','GE','GS','HD','IBM','INTC','JNJ','JPM','MCD','MRK','MSFT','NKE','PFE','PG','TRV','UTX','UNH','VZ','WMT']
# equity_list = ['MMM']
feature_list = ['RSI','Williams','WMA','EMA','SMA','HMA','3EMA','CCI','CMO','MACD','PPO','ROC','CMFI','DMI','PSI','SSMA','EFI','CFI','QSTICK','EVWMA','VFI','FVE','STC','MOM','SAR','VAMA','PERCENT_B','FISH','ER']

In [96]:
# Buildes the eq dict for the new and original features
def get_eq_dict_additional_ft(eq_list):
    equities_dfs = {}
    for equity in eq_list:
#         print(equity)
        output_df = pd.DataFrame()
        temp_df = quandl.get("EOD/"+equity, authtoken="6y4QKxqZxio2nBP3VSwZ")
        date_range_df = temp_df.loc['2001-11-01':'2018-2-28']
        data_range_df_finta = date_range_df.copy() 
        data_range_df_finta.columns = data_range_df_finta.columns.str.lower()
        output_df['RSI'] = ta.momentum.rsi(date_range_df['Close'])
        output_df['Williams'] = ta.momentum.wr(date_range_df['High'],date_range_df['Low'],date_range_df['Close'])
        output_df['WMA'] = ta.trend.wma_indicator(date_range_df['Close'])
        output_df['EMA'] = ta.trend.ema_indicator(date_range_df['Close'])
        output_df['SMA'] = ta.trend.sma_indicator(date_range_df['Close'])
        output_df['HMA'] =  finta.TA.HMA(data_range_df_finta[['open','high','low','close']])
        output_df['3EMA'] = ta.trend.trix(date_range_df['Close'])  #Tripple EMA
        output_df['CCI'] = ta.trend.cci(date_range_df['High'],date_range_df['Low'],date_range_df['Close'])  
        output_df['CMO'] =  finta.TA.CMO(data_range_df_finta[['open','high','low','close']])
        output_df['MACD'] = ta.trend.macd(date_range_df['Close'])  
        output_df['PPO'] = ta.momentum.PercentagePriceOscillator(date_range_df['Close']).ppo()
        output_df['ROC'] = ta.momentum.ROCIndicator(date_range_df['Close']).roc()
        output_df['CMFI'] = ta.volume.ChaikinMoneyFlowIndicator(date_range_df['High'],date_range_df['Low'],date_range_df['Close'], date_range_df['Volume']).chaikin_money_flow()
        output_df['DMI'] =  ta.trend.ADXIndicator(date_range_df['High'],date_range_df['Low'],date_range_df['Close']).adx() # ADX is average direction movement index
        output_df['PSI'] = ta.trend.PSARIndicator(date_range_df['High'],date_range_df['Low'],date_range_df['Close']).psar()
        
        #New Features
        output_df['SSMA'] = finta.TA.SSMA(data_range_df_finta[['open','high','low','close','volume']])
        output_df['EFI'] = finta.TA.EFI(data_range_df_finta[['open','high','low','close','volume']])
        output_df['CFI'] = finta.TA.CFI(data_range_df_finta[['open','high','low','close','volume']])
        output_df['QSTICK'] = finta.TA.QSTICK(data_range_df_finta[['open','high','low','close','volume']])
        output_df['EVWMA'] =  finta.TA.EVWMA(data_range_df_finta[['open','high','low','close','volume']])
        output_df['VFI'] = finta.TA.VFI(data_range_df_finta[['open','high','low','close','volume']])
        output_df['FVE'] = finta.TA.FVE(data_range_df_finta[['open','high','low','close','volume']])
        output_df['STC'] = finta.TA.STC(data_range_df_finta[['open','high','low','close','volume']])
        output_df['MOM'] = finta.TA.MOM(data_range_df_finta[['open','high','low','close','volume']])
        output_df['SAR'] = finta.TA.SAR(data_range_df_finta[['open','high','low','close','volume']])
        output_df['VAMA'] = finta.TA.VAMA(data_range_df_finta[['open','high','low','close','volume']])
        output_df['PERCENT_B'] = finta.TA.PERCENT_B(data_range_df_finta[['open','high','low','close','volume']])
        output_df['FISH'] = finta.TA.FISH(data_range_df_finta[['open','high','low','close','volume']])
        output_df['ER'] = finta.TA.ER(data_range_df_finta[['open','high','low','close','volume']])

        
        output_df['Close'] = date_range_df['Close']
        output_df['Label'] = 'hold'
        output_df = output_df.reset_index()
        equities_dfs[equity] = output_df
    return equities_dfs

In [97]:
#Everything after this point functions the same as the baseline model
equities_dfs = get_eq_dict_additional_ft(equity_list)
scaler = scale_and_get_scalers(equities_dfs,equity_list,feature_list)
label_data(equities_dfs,equity_list)

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


In [104]:
total_confusion_matrix = torch.zeros(3, 3)
start_date = pd.to_datetime('2002-01-01')
end_date = pd.to_datetime('2006-12-31')
model_dict_ad = {}
output_dict_ad = {}
for equity in equity_list:
    output_dict_ad[equity] = []
    model_dict_ad[equity] = []
for i in range(10):
    print(i)
    start = start_date +pd.offsets.DateOffset(years=i)
    end = end_date + pd.offsets.DateOffset(years=i)
    test_start = start_date +pd.offsets.DateOffset(years=(i+5))
    test_end = end_date +pd.offsets.DateOffset(years=(i+1))
    gen_training_data(equities_dfs,equity_list,start,end,feature_list,training_directory)
    gen_training_data(equities_dfs,equity_list,test_start,test_end,feature_list,test_directory, test = True, ordered_dir = ordered_directory)
    for equity in equity_list:
        tries = 1
        while tries <= 10:
            tries += 1
            model, test_dataloader, test_dataloader_ordered = train_model(equity,linear_size=(64*7*14))
            _, confusion_matrix, _ = test_model(model, test_dataloader)
            _, _, predictions_ord = test_model(model, test_dataloader_ordered)
            if torch.max(torch.sum(confusion_matrix, dim = 0)).item() != torch.sum(confusion_matrix).item():
                total_confusion_matrix = torch.add(total_confusion_matrix, confusion_matrix)
                break
            if tries == 11:
                total_confusion_matrix = torch.add(total_confusion_matrix, confusion_matrix)
        output_dict_ad[equity].append([test_start,predictions_ord])
        model_dict_ad[equity].append([test_start,model])

0
MMM


  x = self.soft_m(x)


AXP
AXP
AAPL
AAPL
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
XOM
GE
GE
GS
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
PG
PG
TRV
UTX
UNH
UNH
UNH
VZ
WMT
1
MMM
AXP
AAPL
AAPL
AAPL
AAPL
AAPL
BA
CAT
CVX
CSCO
KO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JNJ
JPM
MCD
MCD
MCD
MRK
MSFT
NKE
PFE
PG
TRV
TRV
UTX
UNH
VZ
WMT
2
MMM
MMM
AXP
AAPL
AAPL
AAPL
AAPL
AAPL
BA
CAT
CAT
CVX
CSCO
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
MSFT
NKE
NKE
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
3
MMM
MMM
AXP
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
BA
CAT
CVX
CSCO
KO
DIS
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JNJ
JPM
MCD
MRK
MSFT
NKE
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
4
MMM
AXP
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
BA
CAT
CVX
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
MSFT
NKE
PFE
PG
PG
PG
TRV
UTX
UNH
VZ
WMT
5
MMM
MMM
AXP
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
AAPL
BA
CAT
CVX
CVX
CSCO
KO
DIS
XOM
XOM
GE
GS
HD
HD
IBM
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
PG
TRV
UTX
UNH
UNH
VZ
WMT
6
MMM
AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE


In [105]:
PrintConfMatrix(total_confusion_matrix)

                       Predicted
                  Hold       Buy      Sell
         Hold   26796.0   19205.0    15902.0
Actual   Buy    714.0    3436.0    104.0
         Sell   755.0    149.0     3443.0


In [106]:
eval_mat(total_confusion_matrix)

                   Hold                  Buy                    Sell
Recall       0.43287078170686394    0.8077103902209685     0.7920404876926616
Precision    0.9480275959667434    0.15076788064940763    0.17702709650881793
F1 Score     0.5943571998935321    0.2541044224227185     0.2893763657757606


In [107]:
start_date = pd.to_datetime('2002-01-01')
end_date = pd.to_datetime('2006-12-31')
output_dict_ad = {}
for equity in equity_list:
    output_dict_ad[equity] = []
    
for i in range(10):
    print(i)
    start = start_date +pd.offsets.DateOffset(years=i)
    end = end_date + pd.offsets.DateOffset(years=i)
    test_start = start_date +pd.offsets.DateOffset(years=(i+5))
    test_end = end_date +pd.offsets.DateOffset(years=(i+1))
    gen_training_data(equities_dfs,equity_list,start,end,feature_list,training_directory)
    gen_training_data(equities_dfs,equity_list,test_start,test_end,feature_list,test_directory, test = True, ordered_dir = ordered_directory)    
    
    for equity in equity_list:
        _, _, _, _, _, test_dataloader_ordered,_ = get_loaders(equity)
        _, _, predictions_ord = test_model(model_dict_ad[equity][i][1], test_dataloader_ordered)
        output_dict_ad[equity].append([model_dict_ad[equity][i][0],predictions_ord])

0


  x = self.soft_m(x)


1
2
3
4
5
6
7
8
9


In [108]:
for equity in equity_list:
    eq_df = pd.DataFrame()
    for preds in output_dict_ad[equity]:
        if eq_df.empty == True:
            eq_df = pd.DataFrame({str(preds[0]):preds[1].tolist()})
        else:
            temp_df = pd.DataFrame({str(preds[0]):preds[1].tolist()})
            eq_df = pd.concat([eq_df, temp_df],axis = 1)
    eq_df.to_csv(equity+'_additional.csv')

## INCEPTION

In [140]:
# equity_list = ['MMM','AXP','AAPL','BA','CAT','CVX','CSCO','KO','DIS','XOM','GE','GS','HD','IBM','INTC','JNJ','JPM','MCD','MRK','MSFT','NKE','PFE','PG','TRV','UTX','UNH','VZ','WMT']
equity_list = ['MMM','AXP','AAPL','BA','CAT','CVX','CSCO']


# equity_list = ['MMM','AXP','AAPL','BA']  #Short version for debugging
# equity_list = ['AAPL']  #Short version for debugging
feature_list = ['RSI','Williams','WMA','EMA','SMA','HMA','3EMA','CCI','CMO','MACD','PPO','ROC','CMFI','DMI','PSI']

equities_dfs = get_eq_dict(equity_list)
scaler = scale_and_get_scalers(equities_dfs,equity_list,feature_list)
label_data(equities_dfs,equity_list)


  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


In [141]:
num_epochs = 100
num_classes = 3
batch_size = 512
learning_rate = 0.01

In [None]:
total_confusion_matrix = torch.zeros(3, 3)
start_date = pd.to_datetime('2002-01-01')
end_date = pd.to_datetime('2006-12-31')
model_dict_in = {}
output_dict_in = {}
for equity in equity_list:
    output_dict_in[equity] = []
    model_dict_in[equity] = []
for i in range(10):
    print(i)
    start = start_date +pd.offsets.DateOffset(years=i)
    end = end_date + pd.offsets.DateOffset(years=i)
    test_start = start_date +pd.offsets.DateOffset(years=(i+5))
    test_end = end_date +pd.offsets.DateOffset(years=(i+1))
    gen_training_data(equities_dfs,equity_list,start,end,feature_list,training_directory)
    gen_training_data(equities_dfs,equity_list,test_start,test_end,feature_list,test_directory, test = True, ordered_dir = ordered_directory)
    for equity in equity_list:
        tries = 1
        while tries <= 10:
            tries += 1
            #note that inception = True below
            model, test_dataloader, test_dataloader_ordered = train_model(equity,inception = True,batch_size = batch_size)
            _, confusion_matrix, _ = test_model(model, test_dataloader)
            _, _, predictions_ord = test_model(model, test_dataloader_ordered)
            # Here I had it retry if any of the labels was never chosen.  Inception was more prone to falling into local minima.
            if torch.min(torch.sum(confusion_matrix, dim = 0)).item() != 0.0:
                total_confusion_matrix = torch.add(total_confusion_matrix, confusion_matrix)
                break
            #If it didn't get it after 10 move on.  Generally gets it second try though
            if tries == 11:
                total_confusion_matrix = torch.add(total_confusion_matrix, confusion_matrix)
        output_dict_in[equity].append([test_start,predictions_ord])
        model_dict_in[equity].append([test_start,model])

In [None]:
PrintConfMatrix(total_confusion_matrix)

In [None]:
eval_mat(total_confusion_matrix)

In [None]:
start_date = pd.to_datetime('2002-01-01')
end_date = pd.to_datetime('2006-12-31')
output_dict_in = {}
for equity in equity_list:
    output_dict_in[equity] = []
    
for i in range(10):
    print(i)
    start = start_date +pd.offsets.DateOffset(years=i)
    end = end_date + pd.offsets.DateOffset(years=i)
    test_start = start_date +pd.offsets.DateOffset(years=(i+5))
    test_end = end_date +pd.offsets.DateOffset(years=(i+1))
    gen_training_data(equities_dfs,equity_list,start,end,feature_list,training_directory)
    gen_training_data(equities_dfs,equity_list,test_start,test_end,feature_list,test_directory, test = True, ordered_dir = ordered_directory)    
    
    for equity in equity_list:
        _, _, _, _, _, test_dataloader_ordered,_ = get_loaders(equity)
        _, _, predictions_ord = test_model(model_dict_in[equity][i][1], test_dataloader_ordered)
        output_dict_in[equity].append([model_dict_in[equity][i][0],predictions_ord])

In [None]:
for equity in equity_list:
    eq_df = pd.DataFrame()
    for preds in output_dict_in[equity]:
        if eq_df.empty == True:
            eq_df = pd.DataFrame({str(preds[0]):preds[1].tolist()})
        else:
            temp_df = pd.DataFrame({str(preds[0]):preds[1].tolist()})
            eq_df = pd.concat([eq_df, temp_df],axis = 1)
    eq_df.to_csv(equity+'_inception.csv')