In [1]:
import numpy as np
import pandas as pd
import quandl
import ta
import finta
import os
import pathlib
import shutil
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from PIL import Image
import time

In [2]:
# Number of rows to look ahead when labelling.
WINDOW_LOOK_AHEAD = 5

# Image output folders, both rooted at the current working directory.
training_directory, test_directory = (
    os.path.join(os.getcwd(), split_name) for split_name in ('train', 'test')
)

In [3]:
# The 28 ticker symbols whose price history is downloaded.
equity_list = [
    'MMM', 'AXP', 'AAPL', 'BA', 'CAT', 'CVX', 'CSCO',
    'KO', 'DIS', 'XOM', 'GE', 'GS', 'HD', 'IBM',
    'INTC', 'JNJ', 'JPM', 'MCD', 'MRK', 'MSFT', 'NKE',
    'PFE', 'PG', 'TRV', 'UTX', 'UNH', 'VZ', 'WMT',
]
# equity_list = ['MMM','AXP','AAPL','BA']  #Short version for debugging

# Indicator columns, in the order they appear across each generated image.
feature_list = [
    'RSI', 'Williams', 'WMA', 'EMA', 'SMA',
    'HMA', '3EMA', 'CCI', 'CMO', 'MACD',
    'PPO', 'ROC', 'CMFI', 'DMI', 'PSI',
]

In [4]:
def get_eq_dict(eq_list, authtoken=None, start='2001-11-01', end='2017-2-28'):
    """Download end-of-day prices and build one indicator frame per equity.

    Parameters
    ----------
    eq_list : iterable of str
        Ticker symbols; each is fetched from Quandl as ``"EOD/" + ticker``.
    authtoken : str, optional
        Quandl API token. When omitted, the ``QUANDL_API_KEY`` environment
        variable is used, falling back to the token that used to be
        hard-coded here.
    start, end : str, optional
        Bounds of the label-based date slice applied to the downloaded data
        (defaults match the previously hard-coded range).

    Returns
    -------
    dict
        Maps ticker -> DataFrame with the indicator columns, the raw
        ``Close`` and a ``Label`` column initialised to ``'hold'``. The
        index is reset, so the original index becomes a regular column.
    """
    if authtoken is None:
        # SECURITY: credentials should not live in the notebook. The literal
        # fallback only keeps existing runs working; rotate that key and
        # supply the new one through QUANDL_API_KEY instead.
        authtoken = os.environ.get("QUANDL_API_KEY", "6y4QKxqZxio2nBP3VSwZ")

    equities_dfs = {}
    for equity in eq_list:
        print(equity)  # progress indicator: one line per download
        raw_df = quandl.get("EOD/" + equity, authtoken=authtoken)
        date_range_df = raw_df.loc[start:end]

        # finta is fed a copy with lower-cased column names.
        finta_ohlc = date_range_df.copy()
        finta_ohlc.columns = finta_ohlc.columns.str.lower()
        finta_ohlc = finta_ohlc[['open', 'high', 'low', 'close']]

        close = date_range_df['Close']
        high = date_range_df['High']
        low = date_range_df['Low']
        volume = date_range_df['Volume']

        # Columns are added one by one so their order matches the original.
        output_df = pd.DataFrame()
        output_df['RSI'] = ta.momentum.rsi(close)
        output_df['Williams'] = ta.momentum.wr(high, low, close)
        output_df['WMA'] = ta.trend.wma_indicator(close)
        output_df['EMA'] = ta.trend.ema_indicator(close)
        output_df['SMA'] = ta.trend.sma_indicator(close)
        output_df['HMA'] = finta.TA.HMA(finta_ohlc)
        # NOTE(review): the column is called '3EMA' (triple EMA) but is filled
        # from ta.trend.trix -- confirm TRIX is the intended feature.
        output_df['3EMA'] = ta.trend.trix(close)
        output_df['CCI'] = ta.trend.cci(high, low, close)
        output_df['CMO'] = finta.TA.CMO(finta_ohlc)
        output_df['MACD'] = ta.trend.macd(close)
        output_df['PPO'] = ta.momentum.PercentagePriceOscillator(close).ppo()
        output_df['ROC'] = ta.momentum.ROCIndicator(close).roc()
        output_df['CMFI'] = ta.volume.ChaikinMoneyFlowIndicator(high, low, close, volume).chaikin_money_flow()
        # ADX is the average directional movement index.
        output_df['DMI'] = ta.trend.ADXIndicator(high, low, close).adx()
        output_df['PSI'] = ta.trend.PSARIndicator(high, low, close).psar()
        output_df['Close'] = close
        output_df['Label'] = 'hold'

        equities_dfs[equity] = output_df.reset_index()
    return equities_dfs

In [5]:
def scale_and_get_scalers(df_dict,eq_list, features):
    """Min-max scale the feature columns of every equity frame in place.

    A separate ``MinMaxScaler`` is fitted per equity on that equity's own
    ``features`` columns, and those columns are overwritten with the
    transformed values.

    Returns a dict mapping each equity to its fitted scaler.
    """
    fitted_scalers = {}
    for ticker in eq_list:
        frame = df_dict[ticker]
        ticker_scaler = MinMaxScaler()
        fitted_scalers[ticker] = ticker_scaler
        # fit() returns the scaler itself, so fitting and transforming chain.
        frame[features] = ticker_scaler.fit(frame[features]).transform(frame[features])
    return fitted_scalers

In [6]:
def label_data(df_dict,eq_list):
    """Label each row 'buy', 'sell' or 'hold' from an 11-row centred window.

    A row is a 'buy' when its ``Close`` is strictly lower than each of the
    5 rows before and the 5 rows after it (a local trough), a 'sell' when it
    is strictly higher than all of them (a local peak), and 'hold'
    otherwise. The previous version had the two labels swapped, so it bought
    at peaks and sold at troughs.

    Only rows 7 .. len-6 (by position) are labelled; rows outside that range
    keep whatever ``Label`` they already had. Each frame in ``df_dict`` is
    modified in place and must already contain ``Close`` and ``Label``
    columns. Nothing is returned.
    """
    half_window = 5
    # Same lower cut-off as the original `i > 6` test; the data is loaded
    # with spare rows at both ends, so the skipped edge rows are never used.
    first_row = 7

    for eq in eq_list:
        frame = df_dict[eq]
        # One array read replaces a .loc lookup per comparison.
        close = frame['Close'].to_numpy()
        stop_row = len(close) - half_window  # exclusive: needs 5 rows after i

        labels = []
        for i in range(first_row, stop_row):
            current = close[i]
            is_trough = True
            is_peak = True
            for j in range(1, half_window + 1):
                before = close[i - j]
                after = close[i + j]
                if before <= current or after <= current:
                    is_trough = False
                if before >= current or after >= current:
                    is_peak = False
            if is_trough:
                labels.append('buy')
            elif is_peak:
                labels.append('sell')
            else:
                labels.append('hold')

        if labels:
            # Single positional write-back instead of one .loc store per row.
            label_col = frame.columns.get_loc('Label')
            frame.iloc[first_row:stop_row, label_col] = labels

In [7]:
def gen_training_data(df_dict,eq_list,start, end, features, data_dir, window_size=15):
    """Render sliding windows of scaled features as PNG images, sorted by label.

    ``data_dir`` is deleted if it exists and recreated with one
    ``<equity>/<label>`` folder per equity and per label ('buy', 'hold',
    'sell'). For each equity, the rows from 10 calendar days before
    ``start`` up to ``end`` are selected by index label, then every row
    that has ``window_size - 1`` predecessors in that selection yields one
    image of shape ``(window_size, len(features))``. The image is saved as
    ``<equity><row number>.png`` under the folder of the window's last-row
    label; any label other than 'buy' or 'sell' goes to 'hold'.

    Feature values are multiplied by 255 and cast to uint8, so they are
    expected to be already scaled to [0, 1].

    The previous version tested ``i > 14`` and so skipped row 14, although
    the inclusive slice ``i-14 .. i`` already gives a full 15-row window
    there; that row now produces an image as well.

    TODO: the 10-day padding before ``start`` is a rough allowance for
    weekends and holidays; selecting the lookback by row count would be
    exact.
    """
    # Start from an empty output tree on every run.
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)
    for eq in eq_list:
        for label_name in ('buy', 'hold', 'sell'):
            # makedirs also creates the intermediate <data_dir>/<eq> folder.
            os.makedirs(os.path.join(data_dir, eq, label_name))

    padded_start = pd.to_datetime(start) - pd.DateOffset(days=10)
    for eq in eq_list:
        train_df = pd.DataFrame(df_dict[eq].loc[padded_start:end]).reset_index()
        feature_values = train_df[features].to_numpy()
        row_labels = train_df['Label'].to_numpy()

        # The first row with a full window behind it is window_size - 1.
        for i in range(window_size - 1, len(train_df)):
            window = feature_values[i - window_size + 1:i + 1]
            img = Image.fromarray(np.uint8(window * 255))
            label_name = row_labels[i] if row_labels[i] in ('buy', 'sell') else 'hold'
            img.save(os.path.join(data_dir, eq, label_name, str(eq) + str(i) + '.png'))

In [8]:
# Download every equity and compute its indicator columns, printing the
# elapsed wall-clock time in seconds.
start_t = time.time()
equities_dfs = get_eq_dict(equity_list)
print(time.time()-start_t)
# Scale the feature columns in place; `scaler` is a dict of one fitted
# MinMaxScaler per equity.
scaler = scale_and_get_scalers(equities_dfs,equity_list,feature_list)
# label_data looks rows up by integer position-like labels, so it has to run
# while the frames still carry the reset integer index.
label_data(equities_dfs,equity_list)
# Switch each frame to a Date index; gen_training_data slices by date.
for eq in equity_list:
    equities_dfs[eq] = equities_dfs[eq].set_index('Date')

MMM


  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


AXP
AAPL
BA
CAT
CVX
CSCO
KO
DIS
XOM
GE
GS
HD
IBM
INTC
JNJ
JPM
MCD
MRK
MSFT
NKE
PFE
PG
TRV
UTX
UNH
VZ
WMT
154.012051820755


In [None]:
# Build the training images for 2002-01-01 .. 2006-12-31 and print how long
# it took in seconds. Note: this wipes and recreates `training_directory`.
start_time = time.time()
gen_training_data(equities_dfs,equity_list,'2002-01-01','2006-12-31',feature_list,training_directory)
print(time.time()-start_time)