In [134]:
# IMPORT STATEMENTS

import sqlalchemy
# import pymysql
import os
import shutil
import model_config as mcfig
import pandas as pd
import numpy as np
import datetime as dt

import sys
sys.path.append('../')
import private_info.db_info as dbi

In [135]:
#CUSTOM ERROR CLASSES

In [136]:
class AssetNotExistInDB(Exception):
    """One of the assets requested does not exist in the database!"""
    # No extra behaviour: the class exists only so this failure has its own type.

In [137]:
#GET NECESSARY VARIABLES FROM MODEL CONFIGURATION FILE

In [138]:
# Assets to build datasets for, as listed in the model configuration module.
# Note: this binds the same list object that mcfig holds (no copy is made).
asset_pred_list = mcfig.asset_pred_list

In [139]:
# Training window boundaries, taken from the model configuration module
# (parsed as '%Y-%m-%d' strings in a later cell).
train_start, train_end = mcfig.train_start, mcfig.train_end

In [140]:
# Parse both training boundaries into datetime objects so they can be
# compared against the Date column when the train mask is built.
train_start_dt, train_end_dt = (
    dt.datetime.strptime(boundary, '%Y-%m-%d')
    for boundary in (train_start, train_end)
)

In [141]:
# Test window boundaries, taken from the model configuration module
# (parsed as '%Y-%m-%d' strings in a later cell).
test_start, test_end = mcfig.test_start, mcfig.test_end

In [142]:
# Parse both test boundaries into datetime objects so they can be
# compared against the Date column when the test mask is built.
test_start_dt, test_end_dt = (
    dt.datetime.strptime(boundary, '%Y-%m-%d')
    for boundary in (test_start, test_end)
)

In [143]:
# DELETE OLD FILES

In [144]:
# Start every run from an empty 'p_data' output directory: remove the one
# left by a previous run (with everything in it), then create it afresh.
if os.path.isdir('p_data'):
    shutil.rmtree('p_data')

os.mkdir('p_data')


In [145]:
# Assemble the MySQL (PyMySQL driver) connection URL from the private
# credentials module, then build the SQLAlchemy engine from it.
db_url = f'mysql+pymysql://{dbi.username}:{dbi.password}@{dbi.hostname}:{dbi.port}/{dbi.database}'
engine = sqlalchemy.create_engine(db_url)

In [146]:
# Open a connection on the engine.
# NOTE(review): db_sqla_conn is not referenced again in the cells visible here
# and is never closed — confirm whether it is still needed.
db_sqla_conn = engine.connect()

In [147]:
# GET ALL DATABASE TABLES

In [148]:
# List every table in the database. Each fetched row carries the table name
# in its first position.
# Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0, so
# the statement is wrapped in text() and run on a short-lived connection that
# is closed as soon as the rows have been fetched.
with engine.connect() as show_tables_conn:
    db_tables = show_tables_conn.execute(sqlalchemy.text(" SHOW TABLES; ")).fetchall()

In [149]:
# Flatten the fetched rows into a plain list of table names
# (the name is the first field of every row).
db_tables_list = [table_row[0] for table_row in db_tables]

In [150]:
# BREAK TABLES INTO MACRO AND ASSET LISTS

In [151]:
# Tables holding macro series rather than tradable assets; each one is read
# and reduced to a single '<name>_close' column further down.
macro_list = ['fvx', 'tnx', 'vix']

In [152]:
# Every table that is not macro data is treated as a tradable asset.
# 'fed_funds' is excluded explicitly as well: it is loaded separately as a
# macro input and does not go through the OHLC clean-up applied to assets, so
# it must not pass the "requested asset exists" check.
non_asset_tables = set(macro_list) | {'fed_funds'}
asset_list = [table for table in db_tables_list if table not in non_asset_tables]

In [153]:
# COMPARE USER INPUT TO EXISTING DB TABLES

In [154]:
# Table names are matched in lower case. Build a new list instead of editing
# the existing one in place, because that list object is shared with the
# configuration module.
asset_pred_list = [asset.lower() for asset in asset_pred_list]

# Collect every requested asset that has no table, so a single error reports
# all of them instead of stopping silently at the first one.
missing_assets = [asset for asset in asset_pred_list if asset not in asset_list]
if missing_assets:
    raise AssetNotExistInDB(
        f"Requested asset(s) not found in the database: {', '.join(missing_assets)}"
    )

In [155]:
# GET MACRO (MINUS FF) INFO FROM DB

In [156]:
# Load each macro table into its own DataFrame, in macro_list order.
tmp_macro_list = [pd.read_sql(macro_table, engine) for macro_table in macro_list]
        

In [157]:
# CLEAN AND MERGE MACRO DATA

In [158]:
# Keep only Date and Close for every macro series, and rename Close to
# '<series>_close' so the columns stay distinguishable after merging.
# The frames are edited in place; tmp_macro_list is parallel to macro_list.
for macro_name, macro_df in zip(macro_list, tmp_macro_list):
    macro_df.drop(columns=['index', 'High', 'Low', 'Open', 'Volume', 'Adj Close'], inplace=True)
    macro_df.rename(columns={'Close': macro_name + '_close'}, inplace=True)

In [159]:
# Fold all macro frames into one, joining on Date. merge defaults to an
# inner join, so only dates present in every series survive.
macro_merged = tmp_macro_list[0]
for next_macro_df in tmp_macro_list[1:]:
    macro_merged = macro_merged.merge(next_macro_df, on='Date')

In [160]:
# GET FED FUNDS DATA

In [161]:
# Load the whole 'fed_funds' table.
ff_df = pd.read_sql(sql='fed_funds', con=engine)

In [162]:
# Discard the stored 'index' column; it is not used as a feature.
ff_df = ff_df.drop(columns='index')

In [163]:
# MERGE IN FED FUNDS DATA TO MACRO DATA

In [164]:
# Attach the fed funds series to the macro frame (inner join on Date).
macro_merged = macro_merged.merge(ff_df, on='Date')

In [165]:
# Inspect the merged macro frame (Date, the three macro closes and DFF).
macro_merged

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF
0,2000-06-01,6.435,6.195,22.360001,6.65
1,2000-06-02,6.368,6.148,21.480000,6.44
2,2000-06-05,6.323,6.101,22.709999,6.51
3,2000-06-06,6.345,6.127,23.049999,6.47
4,2000-06-07,6.338,6.114,22.480000,6.50
...,...,...,...,...,...
5418,2021-12-22,1.222,1.457,18.629999,0.08
5419,2021-12-23,1.243,1.493,17.959999,0.08
5420,2021-12-27,1.252,1.481,17.680000,0.08
5421,2021-12-28,1.245,1.481,17.540001,0.08


In [166]:
# GETTING ASSET INFO FROM DB

In [167]:
# Load the table of every requested asset, in asset_pred_list order.
asset_pred_df_list = [pd.read_sql(asset_table, engine) for asset_table in asset_pred_list]
    

In [168]:
# CLEAN AND MERGE ASSET DATA

In [169]:
# Keep Date, Close and Volume for every asset and prefix the two value
# columns with the asset name. The frames are edited in place;
# asset_pred_df_list is parallel to asset_pred_list.
for asset_name, asset_df in zip(asset_pred_list, asset_pred_df_list):
    asset_df.drop(columns=['index', 'High', 'Low', 'Open', 'Adj Close'], inplace=True)
    asset_df.rename(
        columns={
            'Close': asset_name + '_close',
            'Volume': asset_name + '_volume',
        },
        inplace=True,
    )

In [170]:
# Inspect the cleaned frame of the first requested asset.
asset_pred_df_list[0]

Unnamed: 0,Date,spy_close,spy_volume
0,2000-06-01,145.312500,8961600.0
1,2000-06-02,147.843750,8962200.0
2,2000-06-05,147.125000,6998100.0
3,2000-06-06,146.468750,4858900.0
4,2000-06-07,147.484375,4919500.0
...,...,...,...
5418,2021-12-22,467.690002,58890200.0
5419,2021-12-23,470.600006,56384300.0
5420,2021-12-27,477.260010,56808600.0
5421,2021-12-28,476.869995,38944403.0


In [171]:
# MERGING ASSET DATA WITH MACRO DATA

In [172]:
# One base frame per asset: the shared macro columns joined with that asset's
# close and volume on Date (inner join).
base_data_list = [
    macro_merged.merge(asset_df, on='Date')
    for asset_df in asset_pred_df_list
]

In [173]:
# Inspect the first asset's frame after merging in the macro columns.
base_data_list[0]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,spy_close,spy_volume
0,2000-06-01,6.435,6.195,22.360001,6.65,145.312500,8961600.0
1,2000-06-02,6.368,6.148,21.480000,6.44,147.843750,8962200.0
2,2000-06-05,6.323,6.101,22.709999,6.51,147.125000,6998100.0
3,2000-06-06,6.345,6.127,23.049999,6.47,146.468750,4858900.0
4,2000-06-07,6.338,6.114,22.480000,6.50,147.484375,4919500.0
...,...,...,...,...,...,...,...
5418,2021-12-22,1.222,1.457,18.629999,0.08,467.690002,58890200.0
5419,2021-12-23,1.243,1.493,17.959999,0.08,470.600006,56384300.0
5420,2021-12-27,1.252,1.481,17.680000,0.08,477.260010,56808600.0
5421,2021-12-28,1.245,1.481,17.540001,0.08,476.869995,38944403.0


In [174]:
# FIXING FED FUNDS COLUMN
# The idea is to shift it down one row so that yesterday's value is used to help with the current day's prediction.
# This seems to make logical sense because the fed funds rate for the current day is not posted until 9 am the day of.
# In theory this could mean that the previous day's value will affect the current day the most.
# We will operate under this assumption.

In [175]:
# Lag DFF by one row so each date carries the previous row's rate; the first
# row therefore becomes NaN. Not idempotent: re-running this cell shifts again.
for merged_df in base_data_list:
    lagged_dff = merged_df['DFF'].shift(periods=1)
    merged_df['DFF'] = lagged_dff

In [176]:
# Inspect the first asset's frame again to check the DFF shift.
base_data_list[0]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,spy_close,spy_volume
0,2000-06-01,6.435,6.195,22.360001,,145.312500,8961600.0
1,2000-06-02,6.368,6.148,21.480000,6.65,147.843750,8962200.0
2,2000-06-05,6.323,6.101,22.709999,6.44,147.125000,6998100.0
3,2000-06-06,6.345,6.127,23.049999,6.51,146.468750,4858900.0
4,2000-06-07,6.338,6.114,22.480000,6.47,147.484375,4919500.0
...,...,...,...,...,...,...,...
5418,2021-12-22,1.222,1.457,18.629999,0.08,467.690002,58890200.0
5419,2021-12-23,1.243,1.493,17.959999,0.08,470.600006,56384300.0
5420,2021-12-27,1.252,1.481,17.680000,0.08,477.260010,56808600.0
5421,2021-12-28,1.245,1.481,17.540001,0.08,476.869995,38944403.0


In [177]:
# PROCESSING BASE DATA

In [178]:
# Work on copies of the base frames. A plain assignment would only alias
# base_data_list, so every feature column added and every row dropped below
# would silently change the base frames too.
p_data_list = [base_df.copy() for base_df in base_data_list]

In [179]:
#CREATING DELTA PRICE COLUMN

In [180]:
# DeltaPrice = this row's close minus the previous row's close (NaN on the
# first row). Column 5 is the asset's '<asset>_close' column, because Date,
# the three macro closes and DFF occupy positions 0-4 after the merges.
for frame in p_data_list:
    close_col = frame.columns[5]
    frame['DeltaPrice'] = frame[close_col].diff()
    


In [181]:
# Inspect the first asset's frame with the new DeltaPrice column.
p_data_list[0]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,spy_close,spy_volume,DeltaPrice
0,2000-06-01,6.435,6.195,22.360001,,145.312500,8961600.0,
1,2000-06-02,6.368,6.148,21.480000,6.65,147.843750,8962200.0,2.531250
2,2000-06-05,6.323,6.101,22.709999,6.44,147.125000,6998100.0,-0.718750
3,2000-06-06,6.345,6.127,23.049999,6.51,146.468750,4858900.0,-0.656250
4,2000-06-07,6.338,6.114,22.480000,6.47,147.484375,4919500.0,1.015625
...,...,...,...,...,...,...,...,...
5418,2021-12-22,1.222,1.457,18.629999,0.08,467.690002,58890200.0,4.630005
5419,2021-12-23,1.243,1.493,17.959999,0.08,470.600006,56384300.0,2.910004
5420,2021-12-27,1.252,1.481,17.680000,0.08,477.260010,56808600.0,6.660004
5421,2021-12-28,1.245,1.481,17.540001,0.08,476.869995,38944403.0,-0.390015


In [182]:
#CREATING ZSCORE COLUMN

In [183]:
# Zscore: how far each close sits from the mean of the 14 closes before it,
# measured in (population) standard deviations of that same window. The
# current close is not part of its own window, and the first 14 rows have no
# full window, so they get None.
for frame in p_data_list:
    # Column 5 is the asset's close (positions 0-4 are Date, macro closes, DFF).
    closePriceList = frame[frame.columns[5]].tolist()
    zscores = []

    for row_pos, obs_val in enumerate(closePriceList):
        if row_pos < 14:
            zscores.append(None)
            continue

        window = closePriceList[row_pos - 14:row_pos]
        zscores.append((obs_val - np.mean(window)) / np.std(window))

    frame['Zscore'] = zscores

In [184]:
# CREATING 100 SMA COLUMN

In [185]:
# SMA100: mean of the 100 closes that precede each row.
# The average is taken from the frame's own close column. Reading a price
# list left over from an earlier loop would give every asset the moving
# average of whichever asset was processed last.
for each in p_data_list:
    # Column 5 is the asset's close (positions 0-4 are Date, macro closes, DFF).
    close_col = each.columns[5]
    # rolling(100).mean() ends its window on the current row; shift(1) moves it
    # back so the window covers the 100 rows before it. Rows 0-99 stay NaN.
    each['SMA100'] = each[close_col].rolling(window=100).mean().shift(1)

In [186]:
# CREATING 200 SMA COLUMN

In [187]:
# SMA200: mean of the 200 closes that precede each row.
# The average is taken from the frame's own close column. Reading a price
# list left over from an earlier loop would give every asset the moving
# average of whichever asset was processed last.
for each in p_data_list:
    # Column 5 is the asset's close (positions 0-4 are Date, macro closes, DFF).
    close_col = each.columns[5]
    # rolling(200).mean() ends its window on the current row; shift(1) moves it
    # back so the window covers the 200 rows before it. Rows 0-199 stay NaN.
    each['SMA200'] = each[close_col].rolling(window=200).mean().shift(1)
    

In [188]:
# DROPPING FIRST 200 ROWS
# Gets rid of all NaNs

In [189]:
# Remove the first 200 rows of every frame (200 is the longest look-back used
# above, by SMA200) and renumber the remaining rows from 0. Done in place so
# p_data_list keeps pointing at the same frame objects.
for frame in p_data_list:
    warmup_labels = frame.index[:200]
    frame.drop(index=warmup_labels, inplace=True)
    frame.reset_index(drop=True, inplace=True)
   

In [190]:
# CONVERTING DATE COLUMN TO DATETIME TYPE
# This will allow us to more easily query by date later on

In [191]:
# Replace every 'YYYY-MM-DD' string in Date with a datetime object, one cell
# at a time. Row labels run 0..n-1 because the index was reset just above.
# NOTE(review): a vectorised pd.to_datetime would be faster, but it yields a
# datetime64 column, which may change how dates are written to the CSVs —
# confirm with downstream readers before switching.
for frame in p_data_list:
    for row_label in range(len(frame)):
        raw_date = frame.loc[row_label, 'Date']
        frame.loc[row_label, 'Date'] = dt.datetime.strptime(raw_date, '%Y-%m-%d')
        

In [192]:
# Inspect the fully processed frame of the second requested asset.
p_data_list[1]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,qqq_close,qqq_volume,DeltaPrice,Zscore,SMA100,SMA200
0,2001-03-19 00:00:00,4.530,4.815,29.780001,5.40,43.259998,53248200.0,2.059998,-1.018243,63.254150,77.781684
1,2001-03-20 00:00:00,4.493,4.791,30.959999,5.38,40.349998,104796200.0,-2.910000,-1.927228,62.824875,77.561109
2,2001-03-21 00:00:00,4.441,4.768,31.930000,5.14,40.349998,96200700.0,0.000000,-1.590109,62.403375,77.294734
3,2001-03-22 00:00:00,4.382,4.708,32.840000,5.05,42.799999,130208100.0,2.450001,-0.578583,62.028125,77.027188
4,2001-03-23 00:00:00,4.480,4.799,30.450001,5.04,42.799999,95277500.0,0.000000,-0.493341,61.657375,76.784938
...,...,...,...,...,...,...,...,...,...,...,...
5218,2021-12-22 00:00:00,1.222,1.457,18.629999,0.08,393.950012,37042200.0,4.740021,0.587962,378.846901,358.603701
5219,2021-12-23 00:00:00,1.243,1.493,17.959999,0.08,396.920013,29574300.0,2.970001,1.039448,379.140401,359.019051
5220,2021-12-27 00:00:00,1.252,1.481,17.680000,0.08,403.480011,32820700.0,6.559998,2.069535,379.441501,359.413451
5221,2021-12-28 00:00:00,1.245,1.481,17.540001,0.08,401.609985,30843855.0,-1.870026,1.393975,379.802901,359.853551


In [193]:
# CREATING TRAINING AND TEST SETS

In [194]:
# Slice every processed frame into a training and a test frame by date.
# Both windows include their start and end dates, and each slice is
# re-indexed from 0. The lists are parallel to p_data_list.
train_list = []
test_list = []

for frame in p_data_list:
    dates = frame['Date']

    in_train = (dates >= train_start_dt) & (dates <= train_end_dt)
    train_list.append(frame.loc[in_train].reset_index(drop=True))

    in_test = (dates >= test_start_dt) & (dates <= test_end_dt)
    test_list.append(frame.loc[in_test].reset_index(drop=True))
    

In [195]:
# Write one '<asset>_train.csv' per asset into p_data (row index included).
for asset_name, train_df in zip(asset_pred_list, train_list):
    train_df.to_csv(f'./p_data/{asset_name}_train.csv')

In [196]:
# Write one '<asset>_test.csv' per asset into p_data (row index included).
for asset_name, test_df in zip(asset_pred_list, test_list):
    test_df.to_csv(f'./p_data/{asset_name}_test.csv')

In [197]:
# Inspect the test split of the second requested asset.
test_list[1]

Unnamed: 0,Date,fvx_close,tnx_close,vix_close,DFF,qqq_close,qqq_volume,DeltaPrice,Zscore,SMA100,SMA200
0,2018-01-02 00:00:00,2.250,2.465,9.770000,1.33,158.490005,32573300.0,2.730011,1.876991,149.473100,143.749900
1,2018-01-03 00:00:00,2.247,2.447,9.150000,1.42,160.029999,29383600.0,1.539993,3.192729,149.616800,143.883900
2,2018-01-04 00:00:00,2.268,2.453,9.220000,1.42,160.309998,24776100.0,0.279999,2.594668,149.806800,144.025000
3,2018-01-05 00:00:00,2.285,2.476,9.220000,1.42,161.919998,26992300.0,1.610001,3.244123,149.988900,144.177500
4,2018-01-08 00:00:00,2.285,2.480,9.520000,1.42,162.550003,23159100.0,0.630005,2.750763,150.168800,144.333650
...,...,...,...,...,...,...,...,...,...,...,...
961,2021-10-26 00:00:00,1.183,1.619,15.980000,0.08,379.119995,47191300.0,1.190002,1.549889,363.326501,344.567051
962,2021-10-27 00:00:00,1.138,1.529,16.980000,0.08,380.000000,45760500.0,0.880005,1.443443,363.761701,344.890551
963,2021-10-28 00:00:00,1.188,1.568,16.530001,0.08,384.220001,38007000.0,4.220001,1.772426,364.195701,345.220951
964,2021-10-29 00:00:00,1.188,1.557,16.260000,0.08,386.109985,37225600.0,1.889984,1.731285,364.670301,345.561851
