In [20]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

## Functions

In [21]:
## Data construction function
def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    """Cut a series into look-back windows paired with their labels.

    For every position ``i`` in ``[start_index + history_size, end_index)`` one
    sample is produced:

    * the window is ``dataset`` indexed at ``range(i - history_size, i, step)``;
    * the label is ``target[i + target_size]`` when ``single_step`` is true,
      otherwise the slice ``target[i:i + target_size]``.

    Args:
        dataset: Object indexable with a ``range`` of positions
            (e.g. a NumPy array).
        target: Sequence of label values indexed by the same positions.
        start_index: Position of the first row that may appear in a window.
        end_index: Exclusive upper bound for ``i``; ``None`` means
            ``len(dataset) - target_size``.
        history_size: Number of positions spanned by each window.
        target_size: Label offset (single-step) or label length (multi-step).
        step: Stride between the rows taken inside a window.
        single_step: Selects single labels instead of label slices.

    Returns:
        Tuple ``(np.array(windows), np.array(labels))``.
    """
    first = start_index + history_size
    stop = len(dataset) - target_size if end_index is None else end_index
    positions = range(first, stop)

    windows = [dataset[range(pos - history_size, pos, step)] for pos in positions]

    if single_step:
        labels = [target[pos + target_size] for pos in positions]
    else:
        labels = [target[pos:pos + target_size] for pos in positions]

    return np.array(windows), np.array(labels)

## Data

In [22]:
## Load the engineered per-asset frames for exactly ONE feature-set variant.
##
## This used to be three consecutive cells (filtered, unfiltered, Google
## Trends) that each re-assigned the same df_* names. Under "Run All" that
## read 24 CSV files and silently kept only the last variant; running cells by
## hand made the active variant depend on hidden execution order. The variant
## is now an explicit switch. Its default reproduces the old "Run All" result.
##
##   'filtered'     -> data/engineered/edf_<TICKER>.csv
##   'unfiltered'   -> data/engineered/edf_<TICKER>_ori.csv
##   'google_trend' -> data/engineered/df_<TICKER>_gt.csv
DATA_VERSION = 'google_trend'

CSV_TEMPLATES = {
    'filtered': 'data/engineered/edf_{ticker}.csv',
    'unfiltered': 'data/engineered/edf_{ticker}_ori.csv',
    'google_trend': 'data/engineered/df_{ticker}_gt.csv',
}

if DATA_VERSION not in CSV_TEMPLATES:
    raise ValueError(
        f'Unknown DATA_VERSION {DATA_VERSION!r}; '
        f'expected one of {sorted(CSV_TEMPLATES)}'
    )


def load_asset_frame(ticker, version=None):
    """Read one asset's engineered CSV, using its first column as the index.

    Args:
        ticker: Asset symbol substituted into the path template, e.g. 'SPY'.
        version: Key of ``CSV_TEMPLATES``; ``None`` means ``DATA_VERSION``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(path, index_col=0)``.

    Raises:
        KeyError: If ``version`` is not a key of ``CSV_TEMPLATES``.
    """
    if version is None:
        version = DATA_VERSION
    csv_path = CSV_TEMPLATES[version].format(ticker=ticker)
    return pd.read_csv(csv_path, index_col=0)


df_SPY = load_asset_frame('SPY')
df_QQQ = load_asset_frame('QQQ')
df_VEA = load_asset_frame('VEA')
df_VWO = load_asset_frame('VWO')
df_TLT = load_asset_frame('TLT')
df_IEF = load_asset_frame('IEF')
df_SHY = load_asset_frame('SHY')
df_IAU = load_asset_frame('IAU')
## XLF was disabled in every variant (its commented-out loader pointed at
## 'data/edf_XLF.csv'), so it is intentionally not loaded here.

In [25]:
## Per-asset frames collected in one list. The export loop pairs entries by
## position with its ticker list, so the order here must stay
## SPY, QQQ, VEA, VWO, TLT, IEF, SHY, IAU.
df_ls_rb = [
    df_SPY, df_QQQ, df_VEA, df_VWO,
    df_TLT, df_IEF, df_SHY, df_IAU,
]

## Restructure

In [26]:
import os

## Build train / validation / test windows for every asset and write them to
## data/modeling as .npy arrays, plus one CSV of index labels per split.

assets = ['SPY', 'QQQ', 'VEA', 'VWO', 'TLT', 'IEF', 'SHY', 'IAU']
if len(assets) != len(df_ls_rb):
    ## The two lists are paired by position; a mismatch would mislabel files.
    raise ValueError(
        f'assets has {len(assets)} entries but df_ls_rb has {len(df_ls_rb)}'
    )

## Windowing configuration: identical for every asset, so it is set once.
history_size = 24    ## rows in each look-back window
target_size = 0      ## with single_step, the label is the row right after the window
step = 1             ## take every row inside the window
single_step = True   ## one label value per window

## Split sizes counted back from the end of each frame.
## Original note: validation is "most recent 5 years ~ 1 year (4 years)" and
## test is "most recent 1 year" -- presumably monthly rows; TODO confirm.
holdout_rows = 60    ## validation + test rows
test_rows = 12       ## test rows

output_dir = 'data/modeling'
## np.save / to_csv do not create missing directories.
os.makedirs(output_dir, exist_ok=True)

for asset, df in zip(assets, df_ls_rb):
    dataset = df.values
    n_rows = len(dataset)

    ## A shorter frame would give an empty training set or, worse, negative
    ## start indices that wrap around and silently build wrong windows.
    if n_rows <= holdout_rows + history_size:
        raise ValueError(
            f'{asset}: {n_rows} rows is too few; need more than '
            f'{holdout_rows + history_size} (holdout_rows + history_size)'
        )

    train_start = 0
    train_end = n_rows - holdout_rows

    ## Each later split starts history_size rows early so that its first
    ## window ends exactly where the previous split stopped.
    valid_start = train_end - history_size
    valid_end = n_rows - test_rows

    test_start = valid_end - history_size
    test_end = None  ## multivariate_data then runs to len(dataset) - target_size

    ## The label is the last column of the frame.
    target = dataset[:, -1]

    X_train, y_train = multivariate_data(dataset, target, train_start, train_end,
                                         history_size, target_size, step, single_step)
    X_valid, y_valid = multivariate_data(dataset, target, valid_start, valid_end,
                                         history_size, target_size, step, single_step)
    X_test, y_test = multivariate_data(dataset, target, test_start, test_end,
                                       history_size, target_size, step, single_step)

    ## Index labels of the rows that supplied each label value. Slicing
    ## df.index is always positional, matching the positions used above.
    train_date = pd.DataFrame(df.index[train_start + history_size:train_end])
    valid_date = pd.DataFrame(df.index[valid_start + history_size:valid_end])
    test_date = pd.DataFrame(df.index[test_start + history_size:test_end])

    np.save(f'{output_dir}/{asset}_train_X.npy', X_train)
    np.save(f'{output_dir}/{asset}_train_y.npy', y_train)
    np.save(f'{output_dir}/{asset}_valid_X.npy', X_valid)
    np.save(f'{output_dir}/{asset}_valid_y.npy', y_valid)
    np.save(f'{output_dir}/{asset}_test_X.npy', X_test)
    np.save(f'{output_dir}/{asset}_test_y.npy', y_test)

    train_date.to_csv(f'{output_dir}/{asset}_train_ans.csv')
    valid_date.to_csv(f'{output_dir}/{asset}_valid_ans.csv')
    test_date.to_csv(f'{output_dir}/{asset}_test_ans.csv')