In [1]:
# IMPORT STATEMENTS

import sqlalchemy
# import pymysql
import os
import shutil
import model_config as mcfig
import pandas as pd
import numpy as np
import datetime as dt

import sys
sys.path.append('../')
import private_info.db_info as dbi

In [2]:
#CUSTOM ERROR CLASSES

In [3]:
class AssetNotExistInDB(Exception):
    """One of the assets requested does not exist in the database!"""
    pass

In [4]:
#GET NECESSARY VARIABLES FROM MODEL CONFIGURATION FILE

In [5]:
asset_pred_list = mcfig.asset_pred_list

In [6]:
train_start = mcfig.train_start
train_end = mcfig.train_end

In [7]:
train_start_dt = dt.datetime.strptime(train_start, '%Y-%m-%d')
train_end_dt = dt.datetime.strptime(train_end, '%Y-%m-%d')

In [8]:
test_start = mcfig.test_start
test_end = mcfig.test_end

In [9]:
test_start_dt = dt.datetime.strptime(test_start, '%Y-%m-%d')
test_end_dt = dt.datetime.strptime(test_end, '%Y-%m-%d')

In [10]:
# DELETE OLD FILES

In [11]:
if os.path.isdir('p_data') == True:
    shutil.rmtree('p_data')
    
os.mkdir('p_data')


In [12]:
engine = sqlalchemy.create_engine(f'mysql+pymysql://{dbi.username}:{dbi.password}@{dbi.hostname}:{dbi.port}/{dbi.database}')

In [13]:
db_sqla_conn = engine.connect()

In [14]:
# GET ALL DATABASE TABLES

In [15]:
db_tables = engine.execute(" SHOW TABLES; ")
db_tables = db_tables.fetchall()

In [16]:
db_tables_list = []
for each in db_tables:
    db_tables_list.append(each[0])

In [17]:
# BREAK TABLES INTO MACRO AND ASSET LISTS

In [18]:
macro_list = ['fvx', 'tnx', 'vix']

In [19]:
asset_list = []

for each in db_tables_list:
    if each not in macro_list:
        asset_list.append(each)

In [20]:
# COMPARE USER INPUT TO EXISTING DB TABLES

In [21]:
for i in range (0, len(asset_pred_list)):
    asset_pred_list[i] = asset_pred_list[i].lower()

for each in asset_pred_list:
    if each not in asset_list:
        raise AssetNotExistInDB

In [22]:
# GET MACRO (MINUS FF) INFO FROM DB

In [23]:
tmp_macro_list = []
for each in macro_list:
    tmp_macro_list.append(pd.read_sql(each, engine))
        

In [24]:
# CLEAN AND MERGE MACRO DATA

In [25]:
inc = 0
for each in tmp_macro_list:
    each.drop(['index', 'High', 'Low', 'Open', 'Volume', 'Adj Close' ], axis=1, inplace=True)
    each.rename({'Close': macro_list[inc] + '_close' }, axis=1, inplace=True)
    inc = inc + 1

In [26]:
macro_merged = tmp_macro_list[0]
for i in range(0, len(tmp_macro_list) - 1):
    macro_merged = pd.merge(macro_merged, tmp_macro_list[i + 1], on='Date')

In [27]:
# GET FED FUNDS DATA

In [28]:
ff_df = pd.read_sql('fed_funds', engine)

In [29]:
ff_df.drop('index', axis=1, inplace=True)

In [30]:
# MERGE IN FED FUNDS DATA TO MACRO DATA

In [31]:
macro_merged = pd.merge(macro_merged, ff_df, on = 'Date')

In [32]:
macro_merged

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF
0,2000-06-01,6.435,6.195,22.360001,6.65
1,2000-06-02,6.368,6.148,21.480000,6.44
2,2000-06-05,6.323,6.101,22.709999,6.51
3,2000-06-06,6.345,6.127,23.049999,6.47
4,2000-06-07,6.338,6.114,22.480000,6.50
...,...,...,...,...,...
5418,2021-12-22,1.222,1.457,18.629999,0.08
5419,2021-12-23,1.243,1.493,17.959999,0.08
5420,2021-12-27,1.252,1.481,17.680000,0.08
5421,2021-12-28,1.245,1.481,17.540001,0.08


In [33]:
# GETTING ASSET INFO FROM DB

In [34]:
asset_pred_df_list = []
for each in asset_pred_list:
    asset_pred_df_list.append(pd.read_sql(each, engine))
    

In [35]:
# CLEAN AND MERGE ASSET DATA

In [36]:
inc = 0
for each in asset_pred_df_list:
    each.drop(['index', 'High', 'Low', 'Open', 'Adj Close' ], axis=1, inplace=True)
    each.rename({'Close': asset_pred_list[inc] + '_close' }, axis=1, inplace=True)
    each.rename({'Volume': asset_pred_list[inc] + '_volume' }, axis=1, inplace=True)
    inc = inc + 1

In [37]:
asset_pred_df_list[0]

Unnamed: 0,Date,spy_close,spy_volume
0,2000-06-01,145.312500,8961600.0
1,2000-06-02,147.843750,8962200.0
2,2000-06-05,147.125000,6998100.0
3,2000-06-06,146.468750,4858900.0
4,2000-06-07,147.484375,4919500.0
...,...,...,...
5418,2021-12-22,467.690002,58890200.0
5419,2021-12-23,470.600006,56384300.0
5420,2021-12-27,477.260010,56808600.0
5421,2021-12-28,476.869995,38944403.0


In [38]:
# MERGING ASSET DATA WITH MACRO DATA

In [39]:
base_data_list = []

for each in asset_pred_df_list:
    base_merged = pd.merge(macro_merged, each, on = 'Date')
    base_data_list.append(base_merged)

In [40]:
base_data_list[0]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,spy_close,spy_volume
0,2000-06-01,6.435,6.195,22.360001,6.65,145.312500,8961600.0
1,2000-06-02,6.368,6.148,21.480000,6.44,147.843750,8962200.0
2,2000-06-05,6.323,6.101,22.709999,6.51,147.125000,6998100.0
3,2000-06-06,6.345,6.127,23.049999,6.47,146.468750,4858900.0
4,2000-06-07,6.338,6.114,22.480000,6.50,147.484375,4919500.0
...,...,...,...,...,...,...,...
5418,2021-12-22,1.222,1.457,18.629999,0.08,467.690002,58890200.0
5419,2021-12-23,1.243,1.493,17.959999,0.08,470.600006,56384300.0
5420,2021-12-27,1.252,1.481,17.680000,0.08,477.260010,56808600.0
5421,2021-12-28,1.245,1.481,17.540001,0.08,476.869995,38944403.0


In [41]:
# FIXING FED FUNDS COLUMN
# Idea is to shift it down one row so yesterdays value is used to help with current day's prediciton. 
# This seems to make logical sense because the fed funds rate for the current day is not posted until 9 am the day of.
# In theory this could me that the previous days value will affect current day the most. 
# We will operate under this assumption

In [42]:
for each in base_data_list:
    each['DFF'] = each['DFF'].shift(1)

In [43]:
base_data_list[0]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,spy_close,spy_volume
0,2000-06-01,6.435,6.195,22.360001,,145.312500,8961600.0
1,2000-06-02,6.368,6.148,21.480000,6.65,147.843750,8962200.0
2,2000-06-05,6.323,6.101,22.709999,6.44,147.125000,6998100.0
3,2000-06-06,6.345,6.127,23.049999,6.51,146.468750,4858900.0
4,2000-06-07,6.338,6.114,22.480000,6.47,147.484375,4919500.0
...,...,...,...,...,...,...,...
5418,2021-12-22,1.222,1.457,18.629999,0.08,467.690002,58890200.0
5419,2021-12-23,1.243,1.493,17.959999,0.08,470.600006,56384300.0
5420,2021-12-27,1.252,1.481,17.680000,0.08,477.260010,56808600.0
5421,2021-12-28,1.245,1.481,17.540001,0.08,476.869995,38944403.0


In [44]:
# PROCESSING BASE DATA

In [45]:
p_data_list = base_data_list

In [46]:
#CREATING ZSCORE COLUMN

In [47]:
for each in p_data_list:
    
    closePriceList = each[each.columns[5]].tolist()
    std_dev_14 = []
    
    for i in range(0, len(closePriceList)):
        if (0 <= i <= 13):
            std_dev_14.append(None)
            continue
        temp_list = []
        for j in range(i - 14, i):
            temp_list.append(closePriceList[j])
            
        mean = np.mean(temp_list)
        std_dev = np.std(temp_list)
        obs_val = closePriceList[i]
        z = (obs_val - mean)/std_dev
        
        std_dev_14.append(z)
            
    each['Zscore'] = std_dev_14

In [48]:
# CREATING 100 SMA COLUMN

In [49]:
for each in p_data_list:
    
    sma100 = []
    for i in range(0, len(closePriceList)):
        if (0 <= i <= 99):
            sma100.append(None)
            continue
        temp_list = []
        for j in range(i - 100, i):
            temp_list.append(closePriceList[j])
            
        mean = np.mean(temp_list)
        
        sma100.append(mean)
            
    each['SMA100'] = sma100

In [50]:
# CREATING 200 SMA COLUMN

In [51]:
for each in p_data_list:
    
    sma200 = []
    for i in range(0, len(closePriceList)):
        if (0 <= i <= 199):
            sma200.append(None)
            continue
        temp_list = []
        for j in range(i - 200, i):
            temp_list.append(closePriceList[j])
            
        mean = np.mean(temp_list)
        
        sma200.append(mean)
            
    each['SMA200'] = sma200
    

In [52]:
#CREATING DELTA PRICE COLUMN

In [53]:
for each in p_data_list:
    
    each[ each.columns[5] + "_next" ] = each[each.columns[5]].shift(-1)
    each ['DeltaPrice'] = each[ each.columns[5] + "_next" ] - each[ each.columns[5] ]
    each.drop( each.columns[5] + "_next", axis=1, inplace=True )
    each.fillna(value=0, inplace=True)
    


In [54]:
# DROPPING FIRST 200 ROW
# Gets rid of all NaNs

In [55]:
for each in p_data_list:
    
    each.drop(each.head(200).index, axis=0, inplace=True)
    each.reset_index(drop=True, inplace=True)
    
    
   

In [56]:
# CONVERTING DATE COLUMN TO DATETIME TYPE
# This will allow us to more easily query by date later on

In [57]:
for each in p_data_list:
    for i in range (0, each.shape[0]):
        value = each.loc[i, 'Date']
        dt_value = dt.datetime.strptime(value, '%Y-%m-%d')
        each.loc[i, 'Date'] = dt_value
        

In [58]:
p_data_list[1]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,qqq_close,qqq_volume,Zscore,SMA100,SMA200,DeltaPrice
0,2001-03-19 00:00:00,4.530,4.815,29.780001,5.40,43.259998,53248200.0,-1.018243,47.961266,49.680359,-2.910000
1,2001-03-20 00:00:00,4.493,4.791,30.959999,5.38,40.349998,104796200.0,-1.927228,47.922141,49.662828,0.000000
2,2001-03-21 00:00:00,4.441,4.768,31.930000,5.14,40.349998,96200700.0,-1.590109,47.879703,49.627391,2.450001
3,2001-03-22 00:00:00,4.382,4.708,32.840000,5.05,42.799999,130208100.0,-0.578583,47.841297,49.589516,0.000000
4,2001-03-23 00:00:00,4.480,4.799,30.450001,5.04,42.799999,95277500.0,-0.493341,47.791297,49.547016,-0.989998
...,...,...,...,...,...,...,...,...,...,...,...
5218,2021-12-22 00:00:00,1.222,1.457,18.629999,0.08,393.950012,37042200.0,0.587962,224.409500,224.219900,2.970001
5219,2021-12-23 00:00:00,1.243,1.493,17.959999,0.08,396.920013,29574300.0,1.039448,224.412800,224.185200,6.559998
5220,2021-12-27 00:00:00,1.252,1.481,17.680000,0.08,403.480011,32820700.0,2.069535,224.424600,224.134300,-1.870026
5221,2021-12-28 00:00:00,1.245,1.481,17.540001,0.08,401.609985,30843855.0,1.393975,224.484700,224.086950,-1.089996


In [59]:
# CREATING TRAINING AND TEST SETS

In [60]:
train_list = []
test_list = []

for each in p_data_list:
    
    
    
    mask_train = (each['Date'] >= train_start_dt) & (each['Date'] <= train_end_dt)
    tmp = each.loc[mask_train]
    tmp.reset_index(drop=True, inplace=True)
    train_list.append(tmp)
    
    mask_test = (each['Date'] >= test_start_dt) & (each['Date'] <= test_end_dt)
    tmp = each.loc[mask_test]
    tmp.reset_index(drop=True, inplace=True)
    test_list.append(tmp)
    

In [61]:
inc = 0
for each in train_list:
    each.to_csv(f'./p_data/{asset_pred_list [inc]}_train.csv')
    inc = inc + 1

In [62]:
inc = 0
for each in test_list:
    each.to_csv(f'./p_data/{asset_pred_list [inc]}_test.csv')
    inc = inc + 1

In [63]:
train_list[0]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,spy_close,spy_volume,Zscore,SMA100,SMA200,DeltaPrice
0,2001-03-19 00:00:00,4.530,4.815,29.780001,5.40,117.349998,10067800.0,-1.329967,47.961266,49.680359,-3.150002
1,2001-03-20 00:00:00,4.493,4.791,30.959999,5.38,114.199997,15083900.0,-1.947728,47.922141,49.662828,-1.939995
2,2001-03-21 00:00:00,4.441,4.768,31.930000,5.14,112.260002,19004600.0,-2.049258,47.879703,49.627391,-1.139999
3,2001-03-22 00:00:00,4.382,4.708,32.840000,5.05,111.120003,28624800.0,-1.913244,47.841297,49.589516,3.360001
4,2001-03-23 00:00:00,4.480,4.799,30.450001,5.04,114.480003,12861700.0,-0.940018,47.791297,49.547016,1.459999
...,...,...,...,...,...,...,...,...,...,...,...
4212,2017-12-22 00:00:00,2.251,2.485,9.900000,1.42,267.510010,78720900.0,1.052659,145.829100,142.295950,-0.320007
4213,2017-12-26 00:00:00,2.237,2.467,10.250000,1.42,267.190002,45244400.0,0.704789,145.958700,142.382100,0.130005
4214,2017-12-27 00:00:00,2.199,2.414,10.470000,1.42,267.320007,57751000.0,0.700043,146.096200,142.467100,0.549988
4215,2017-12-28 00:00:00,2.234,2.432,10.180000,1.42,267.869995,45116100.0,1.174825,146.225400,142.554600,-1.010010


In [64]:
test_list[0]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,spy_close,spy_volume,Zscore,SMA100,SMA200,DeltaPrice
0,2020-01-02 00:00:00,1.671,1.882,12.470000,1.55,324.869995,59151200.0,2.122587,155.5764,154.853299,-2.459991
1,2020-01-03 00:00:00,1.586,1.788,14.020000,1.55,322.410004,77709700.0,0.810642,155.7280,154.907399,1.230011
2,2020-01-06 00:00:00,1.611,1.811,13.850000,1.55,323.640015,55653900.0,1.395338,155.8926,154.966749,-0.910004
3,2020-01-07 00:00:00,1.619,1.827,13.790000,1.55,322.730011,40496400.0,0.732357,156.0434,155.017150,1.720001
4,2020-01-08 00:00:00,1.665,1.874,13.450000,1.55,324.450012,68296000.0,1.805870,156.2307,155.093050,2.199982
...,...,...,...,...,...,...,...,...,...,...,...
498,2021-12-22 00:00:00,1.222,1.457,18.629999,0.08,467.690002,58890200.0,0.746567,224.4095,224.219900,2.910004
499,2021-12-23 00:00:00,1.243,1.493,17.959999,0.08,470.600006,56384300.0,1.174834,224.4128,224.185200,6.660004
500,2021-12-27 00:00:00,1.252,1.481,17.680000,0.08,477.260010,56808600.0,2.505897,224.4246,224.134300,-0.390015
501,2021-12-28 00:00:00,1.245,1.481,17.540001,0.08,476.869995,38944403.0,1.935848,224.4847,224.086950,-0.220001
