In [1]:
import datetime as dt
import math
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

## Default size for every matplotlib figure created in this notebook.
plt.rcParams.update({"figure.figsize": (7, 5)})

In [3]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

## Funcs

In [4]:
## Outlier handling: winsorize at mean +/- n standard deviations (default 3).
def clean_outlier(srs, n = 3):
    """Return a copy of ``srs`` with extreme values capped.

    Values above ``mean + n * std`` are replaced by that upper bound and
    values below ``mean - n * std`` by that lower bound. Both bounds are
    computed from the unmodified input; the input itself is not changed.

    Parameters
    ----------
    srs : object supporting ``copy``, ``mean``, ``std`` and boolean-mask
        assignment (callers in this notebook pass DataFrame columns).
    n : width of the accepted band, in standard deviations.

    Returns
    -------
    A copy of ``srs`` with out-of-band values replaced by the bounds.
    """
    center = srs.mean()
    spread = srs.std()
    upper = center + n*spread
    lower = center - n*spread

    capped = srs.copy()
    # Cap the high side first, then the low side, one after the other.
    capped.loc[capped > upper] = upper
    capped.loc[capped < lower] = lower
    return capped

## Data

In [5]:
## Macro series; the first CSV column becomes the index and is parsed as dates.
macro = pd.read_csv('data/macro_encoded_sav.csv', index_col=0)
## macro = pd.read_csv('data/macro.csv', index_col=0)
macro.index = pd.to_datetime(macro.index)
## Drop rows whose date is missing. Calling .dropna() on the parsed index and
## assigning it back would fail with a length mismatch as soon as one date is
## missing, so the rows are filtered instead.
macro = macro[macro.index.notna()]

## Asset table, loaded and indexed the same way.
assets = pd.read_csv('data/assets_encoded_sav.csv', index_col=0)
## assets = pd.read_csv('data/assets.csv', index_col=0)
assets.index = pd.to_datetime(assets.index)
assets = assets[assets.index.notna()]

In [6]:
## Tickers analysed in this notebook; the asset table is narrowed to exactly
## these columns, in this order.
asss = [
    'SPY', 'QQQ', 'VEA', 'VWO',
    'TLT', 'IEF', 'SHY', 'IAU',
]
assets = assets.loc[:, asss]

### 변수선택

In [7]:
## Dataset used for variable selection: complete macro rows shifted down by one
## row, so each explanatory value comes from the row before the one it is
## aligned with.
dataset = macro.dropna().shift(1).dropna()
## Outlier treatment: winsorize every macro column at mean +/- 2 std.
dataset_v2 = dataset.dropna().apply(clean_outlier, n=2)

## Explanatory macro variables used from here on.
dataset_v3 = dataset_v2.dropna()
X_ = dataset_v3[['VIX', 'DG10', 'DG2', 'SPPE']]
## Asset returns (complete rows only).
df_rets = assets.dropna()
## Per-asset "12-1M" feature: 12-row rolling sum, shifted by one row, with the
## suffix '12-1M' appended to every column name.
df121M = assets.rolling(12).sum().shift(1).dropna().add_suffix('12-1M')


## For each target asset, build a frame with the OTHER assets' returns shifted
## by one row, plus the target's own unshifted return appended as the last
## column. Later steps locate the target by that last position.
## (A per-asset correlation used to be computed here and thrown away; the
## correlation screening actually happens on the combined candidate frames.)
df_ls = []
for col in df_rets.columns:
    temp = df_rets.drop(columns=col).shift(1)
    temp[col] = df_rets[col]
    df_ls.append(temp)

## Combine the candidate variables per target asset: macro variables, 12-1M
## features and the lagged-peer frame (target last), keeping only rows that
## are complete across all of them.
df_ls2 = [
    pd.concat([X_, df121M, lagged_peers], axis=1).dropna()
    for lagged_peers in df_ls
]
    
## Relevance filter: keep only the columns whose absolute correlation with the
## target (the last column) is at least 0.25. The target normally survives the
## filter because its correlation with itself is 1.
## NOTE(review): the previous comment on this step stated a 0.1 cutoff while
## the code applies 0.25 — confirm which threshold is intended.
df_ls3 = []
for candidates in df_ls2:
    target_corr = candidates.corr()[candidates.columns[-1]]
    selected = target_corr[target_corr.abs() >= .25]
    df_ls3.append(candidates[selected.index])
    
## Multicollinearity filter: keep only the features whose variance inflation
## factor is strictly below 10, then put the target back as the last column.
## NOTE(review): the VIFs are computed on the feature columns exactly as
## given, with no intercept column added — confirm this is intended.
df_ls4 = []
for df in df_ls3:
    target = df.columns[-1]
    X = df[df.columns[:-1]]
    vif = pd.DataFrame({
        "features": X.columns,
        "VIF": [variance_inflation_factor(X.values, j) for j in range(X.shape[1])],
    })
    kept = vif.loc[vif["VIF"] < 10, "features"].tolist()

    ## Select features and target in one indexing step instead of assigning a
    ## new column into a frame that was itself obtained by indexing (chained
    ## assignment, which pandas warns about).
    df_ls4.append(df[kept + [target]])

### Set 구성

In [8]:
## Dataset used for modelling: complete macro rows with NO shift applied.
dataset = macro.dropna()
## Outlier treatment: winsorize every macro column at mean +/- 2 std.
dataset_v2 = dataset.dropna().apply(clean_outlier, n=2)

## Explanatory macro variables.
dataset_v3 = dataset_v2.dropna()
X_ = dataset_v3[['VIX', 'DG10', 'DG2', 'SPPE']]
## Asset returns (complete rows only).
df_rets = assets.dropna()
## Per-asset "12-1M" feature: 12-row rolling sum, shifted by one row, with the
## suffix '12-1M' appended to every column name.
df121M = assets.rolling(12).sum().shift(1).dropna().add_suffix('12-1M')

## Unshifted counterpart of the selection frames: for each target asset, all
## asset returns from the same row, reordered so the target is the last column.
## Built with a single column selection; no column is assigned into an indexed
## frame and no unused correlation is computed.
df_ = []
for col in df_rets.columns:
    peers = [name for name in df_rets.columns if name != col]
    df_.append(df_rets[peers + [col]])

## Assemble the final per-asset sets: join the macro variables, the 12-1M
## features and the asset returns, drop incomplete rows, then keep only the
## columns that survived variable selection for that asset (same order).
dfs = []
for idx, selected in enumerate(df_ls4):
    combined = pd.concat([X_, df121M, df_[idx]], axis=1).dropna()
    dfs.append(combined[selected.columns])

In [9]:
## Give each per-asset frame its own name; `dfs` must hold exactly eight
## frames, in this order.
(
    df_SPY, df_QQQ, df_VEA, df_VWO,
    df_TLT, df_IEF, df_SHY, df_IAU,
) = dfs

In [10]:
## Write one engineered feature set per asset to data/engineered/edf_<TICKER>.csv.
OUTPUT_DIR = 'data/engineered'
## to_csv does not create missing directories, so make sure the target exists
## before writing (no effect when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)

engineered_frames = {
    'SPY': df_SPY,
    'QQQ': df_QQQ,
    'VEA': df_VEA,
    'VWO': df_VWO,
    'TLT': df_TLT,
    'IEF': df_IEF,
    'SHY': df_SHY,
    'IAU': df_IAU,
}
for ticker, frame in engineered_frames.items():
    frame.to_csv(f'{OUTPUT_DIR}/edf_{ticker}.csv')