### OLS 회귀분석

In [17453]:
### 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

from scipy import stats
from scipy.stats import shapiro

import statsmodels.api as sm
from statsmodels.stats.stattools import jarque_bera
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy.stats import shapiro

### 환경설정
np.random.seed(4)


### 함수 정의
def preprocess_transaction_flag(df):
    df['transaction_flag'] = df['transaction_count'].apply(lambda x: 1 if x > 0 else 0)
    return df

def trim_date(df, start, end):
    ### df date 형식 맞춰주기
    df['date'] = df['date'].astype(str)
    if len(df['date'][0]) == 8:
        df['date'] = df['date'].apply(lambda x: x[0:4]+'-'+x[4:6]+'-'+x[6:])


    return df[(df['date']>=start) & (df['date']<=end)]

In [17454]:
price_df = pd.read_csv('./data/price/bitcoin_price_coin_market_cap.csv').set_index('date')
price_df

Unnamed: 0_level_0,open,high,low,close,volume,marketCap
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01,320.434998,320.434998,314.002991,314.248993,8.036550e+06,4.297536e+09
2015-01-02,314.079010,315.838989,313.565002,315.032013,7.860650e+06,4.309551e+09
2015-01-03,314.846008,315.149994,281.082001,281.082001,3.305440e+07,3.846270e+09
2015-01-04,281.145996,287.230011,257.612000,264.195007,5.562910e+07,3.616321e+09
2015-01-05,265.084015,278.341003,265.084015,274.473999,4.396280e+07,3.758098e+09
...,...,...,...,...,...,...
2025-03-08,86742.652990,86847.267594,85247.484188,86154.593210,1.820612e+10,1.708771e+12
2025-03-09,86154.305891,86471.130163,80052.486979,80601.041311,3.089935e+10,1.598205e+12
2025-03-10,80597.149784,83955.926252,77420.592186,78532.001808,5.406110e+10,1.558444e+12
2025-03-11,78523.871544,83577.755959,76624.252490,82862.208181,5.470284e+10,1.643251e+12


### 데이터 세팅

In [17455]:
# ### 학습용 데이터 읽기
# test_df = pd.read_csv("./data/test/test_df.csv")
# test_df.set_index('date', inplace=True)
### 학습용 데이터 읽기
test_df = pd.read_csv("./data/test/test_df.csv")
test_df.set_index('date', inplace=True)
test_df

magazine_pro = pd.read_csv("./data/test/magazine_preprocessed_index.csv").set_index('date')

test_df = test_df.merge(magazine_pro, left_index=True, right_index=True, how='left')

# test_df['close'] = price_df['close']
# test_df['open'] = price_df['open']
# test_df['high'] = price_df['high']
# test_df['low'] = price_df['low']
# test_df['volume'] = price_df['volume']

test_df['returns'] = test_df['close'].pct_change()
test_df['returns_during_2'] = test_df['close'].pct_change(2)
test_df['returns_during_3'] = test_df['close'].pct_change(3)
test_df['returns_during_4'] = test_df['close'].pct_change(4)
test_df['returns_during_5'] = test_df['close'].pct_change(5)
test_df['returns_during_6'] = test_df['close'].pct_change(6)
test_df['returns_during_7'] = test_df['close'].pct_change(7)
test_df['returns_during_8'] = test_df['close'].pct_change(8)
test_df['returns_during_9'] = test_df['close'].pct_change(9)
test_df['returns_during_10'] = test_df['close'].pct_change(10)
test_df['returns_during_11'] = test_df['close'].pct_change(11)
test_df['returns_during_12'] = test_df['close'].pct_change(12)
test_df['returns_during_13'] = test_df['close'].pct_change(13)
test_df['returns_during_14'] = test_df['close'].pct_change(14)
test_df['returns_during_15'] = test_df['close'].pct_change(15)
test_df['returns_during_16'] = test_df['close'].pct_change(16)
test_df['returns_during_17'] = test_df['close'].pct_change(17)
test_df['returns_during_18'] = test_df['close'].pct_change(18)
test_df['returns_during_19'] = test_df['close'].pct_change(19)
test_df['returns_during_20'] = test_df['close'].pct_change(20)



### Realized Volatility 추가
test_df['rv'] = test_df['returns'].rolling(5).std()
test_df['rv'] = test_df['rv'].fillna(test_df['rv'].mean())

### 이벤트용 데이터 읽기
event_df = pd.read_csv("./data/event/final/bitcoin_event_details_sentiment_250324.csv")
event_df.index = event_df['Date'].astype(str).apply(lambda x: x[0:4]+'-'+x[4:6]+'-'+x[6:])
# event_df = event_df[event_df['predict']==1]

target_event_receive_df = pd.read_csv("./data/target/final/event_2_day_transactions_over_120_threshold_1000_receive.csv")
# target_event_receive_df = pd.read_csv("./data/target/final/without_conference_event_2_day_transactions_over_40_threshold_1000_from_2018_receive.csv")
# target_event_receive_df = pd.read_csv("./data/target/screened_event_2_day_transactions_over_20_threshold_500_from_2018_receive.csv")
# target_event_receive_df = pd.read_csv("./data/target/without_conference_event_2_day_transactions_over_40_threshold_500_from_2018_receive.csv")

target_event_receive_df = preprocess_transaction_flag(target_event_receive_df)
target_event_receive_df.columns = ['date', 'transaction_count', 'transaction_amount', 'transaction_flag']

target_event_receive_df = trim_date(target_event_receive_df, (test_df.index)[0], (test_df.index)[-1])


### Bitcoin Event Flag 추가 / 긍정, 부정, 애매 이벤트 필터링
# pos, neg, neu, all
event_sentiment = 'all'
if event_sentiment == 'all':
    pass
elif event_sentiment == 'pos':
    event_df = event_df[event_df['classification']==1]
elif event_sentiment == 'neg':
    event_df = event_df[event_df['classification']==0]
elif event_sentiment == 'neu':
    event_df = event_df[event_df['classification']==2]
event_df['classification'] = 1
event_df = event_df['classification']
event_df = event_df[~event_df.index.duplicated(keep='first')]

test_df['event_flag'] = event_df
test_df['event_flag'] = test_df['event_flag'].fillna(0)
event_flag_copy = test_df['event_flag'].values

test_df['big_returns_flag'] = test_df['abs_returns'].apply(lambda x: 1 if x > 0.05 else 0)
big_returns_flag_copy = test_df['big_returns_flag'].values

### 분석 데이터 선택
target_df = target_event_receive_df.copy().set_index('date')

### test_df에 target_df 데이터 적용
test_df['transaction_count'] = target_df['transaction_count']
test_df['transaction_amount'] = target_df['transaction_amount']
test_df['transaction_flag'] = target_df['transaction_flag']
test_df['transaction_amount_usd'] = test_df['transaction_amount'] * test_df['close']

### test_df의 target 선택
test_df['target'] = test_df['transaction_amount_usd']
test_df['target'] = np.where(test_df['target'] == 0, 1e-10, test_df['target'])

### target 관련 파생변수 생성
test_df['target_delta'] = test_df['target'].diff(1)
test_df['target_returns'] = test_df['target'].pct_change(1)
test_df['target_flag'] = test_df['target_delta'].apply(lambda x: 1 if x > 0 else 0)
target_flag_copy = test_df['target_flag'].values

### VPIN 관련 변수 추가
vpin = pd.read_csv('./data/vpin/vpin.csv')
vpin['ma_10'] = vpin['vpin'].rolling(10).mean()
vpin['ma_10'] = vpin['ma_10'].fillna(vpin['ma_10'].mean())
vpin = vpin.set_index('date')

test_df['vpin'] = vpin['vpin']
test_df['vpin_ma_10'] = vpin['ma_10']

### 회귀분석 돌릴때 Cheating 방지용으로 transaction 관련 target값 삭제
transaction_count = test_df['transaction_count'].copy()
transaction_amount = test_df['transaction_amount'].copy()
transaction_amount_usd = test_df['transaction_amount_usd'].copy()

### 수익률 관련 변수
returns_copy = test_df['returns'].copy()

##### 전처리 관련
### 변화율에서 inf값 나올텐데 그거 어떻게 처리할지
test_df = test_df.replace([np.inf, -np.inf], np.nan)

### NaN값 처리 방법
test_df = test_df.fillna(test_df.mean())

### 스케일링
scaler = StandardScaler()
test_df = pd.DataFrame(scaler.fit_transform(test_df), columns=test_df.columns, index=test_df.index)

### 스케일링 유의 변수
test_df['returns'] = returns_copy
test_df['event_flag'] = event_flag_copy
test_df['big_returns_flag'] = big_returns_flag_copy
test_df['target_flag'] = target_flag_copy

  test_df['returns'] = test_df['close'].pct_change()
  test_df['returns_during_2'] = test_df['close'].pct_change(2)
  test_df['returns_during_3'] = test_df['close'].pct_change(3)
  test_df['returns_during_4'] = test_df['close'].pct_change(4)
  test_df['returns_during_5'] = test_df['close'].pct_change(5)
  test_df['returns_during_6'] = test_df['close'].pct_change(6)
  test_df['returns_during_7'] = test_df['close'].pct_change(7)
  test_df['returns_during_8'] = test_df['close'].pct_change(8)
  test_df['returns_during_9'] = test_df['close'].pct_change(9)
  test_df['returns_during_10'] = test_df['close'].pct_change(10)
  test_df['returns_during_11'] = test_df['close'].pct_change(11)
  test_df['returns_during_12'] = test_df['close'].pct_change(12)
  test_df['returns_during_13'] = test_df['close'].pct_change(13)
  test_df['returns_during_14'] = test_df['close'].pct_change(14)
  test_df['returns_during_15'] = test_df['close'].pct_change(15)
  test_df['returns_during_16'] = test_df['close'].pct

In [17456]:
target_event_receive_df

Unnamed: 0,date,transaction_count,transaction_amount,transaction_flag
0,2017-01-01,5,10000.0,1
1,2017-01-02,0,0.0,0
2,2017-01-03,0,0.0,0
3,2017-01-04,5,7500.0,1
4,2017-01-05,0,0.0,0
...,...,...,...,...
2825,2024-09-26,0,0.0,0
2826,2024-09-27,0,0.0,0
2827,2024-09-28,2,6400.0,1
2828,2024-09-29,0,0.0,0


In [17457]:
pd.set_option('display.max_columns', None)
test_df

Unnamed: 0_level_0,transaction_count,transaction_amount,transaction_flag,returns,close,abs_returns,volume,open,high,low,high_low,daily_avg_trades,daily_sum_trades,daily_sum_taker_buy_base_asset_volume,daily_sum_taker_buy_quote_asset_volume,daily_avg_spread,daily_quote_asset_volume,daily_avg_quote_asset_volume,daily_low,daily_high,daily_avg_price,daily_vwap,daily_avg_volume,daily_morning_returns,daily_afternoon_returns,daily_night_returns,daily_dawn_returns,daily_avg_morning_returns,daily_avg_afternoon_returns,daily_avg_night_returns,daily_avg_dawn_returns,daily_avg_morning_volume,daily_avg_afternoon_volume,daily_avg_night_volume,daily_avg_dawn_volume,daily_morning_volatility,daily_afternoon_volatility,daily_night_volatility,daily_dawn_volatility,daily_volatility,avg-block-size,blocks-size,cost-per-transaction-percent,cost-per-transaction,difficulty,estimated-transaction-volume-usd,estimated-transaction-volume,fees-usd-per-transaction,hash-rate,market-price,median-confirmation-time,miners-revenue,n-transactions-excluding-popular,n-transactions-per-block,n-transactions-total,n-transactions,n-unique-addresses,output-volume,trade-volume,transaction-fees-usd,transaction-fees,n-payments-per-block,n-payments,avg-confirmation-time,10Yr+ HODL Wave,1Yr+ HODL Wave,5Yr+ HODL Wave,Addresses with > 0 BTC,Addresses with > 0.01 BTC,Addresses with > 0.1 BTC,Addresses with > 1 BTC,Addresses with > 10 BTC,Addresses with > 100 BTC,"Addresses with > 1,000 BTC","Addresses with > 10,000 BTC",Balanced Price,Active Addresses 7DMA,Hash Rate 7DMA,Miner Difficulty,Circulating Supply,CDD (90dma),CVDD,Delta Top,Long-Term Holder Realized Price,Miner Revenue - Block Rewards,Miner Revenue - Fees,Miner Fees %,Miner Revenue - Total,New Addresses,Addresses in Loss,Addresses in Profit,Realized Price,MVOCD,Short-Term Holder Realized Price,Supply Adjusted CDD (90dma),Terminal Price,Top Cap,VDD Multiple,Whale Shadows,returns_during_2,returns_during_3,returns_during_4,returns_during_5,returns_during_6,returns_during_7,returns_during_8,returns_during_9,returns_during_10,returns_during_11,returns_during_12,returns_during_13,returns_during_14,returns_during_15,returns_during_16,returns_during_17,returns_during_18,returns_during_19,returns_during_20,rv,event_flag,big_returns_flag,transaction_amount_usd,target,target_delta,target_returns,target_flag,vpin,vpin_ma_10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1
2017-01-01,-0.296743,-0.183598,0.894569,,-1.111536,0.000000,-0.880727,-1.112165,-1.107929,-1.107056,-0.752190,-1.669788e-16,0.000000,0.000000,0.000000,-6.677695e-17,0.000000,0.000000,0.000000,1.906936e-16,0.000000,0.000000,0.000000,-1.616865e-18,0.000000,6.472890e-18,-6.366045e-18,-1.438860e-18,4.067258e-18,2.330567e-18,0.000000,0.000000,0.000000,-1.219491e-16,-1.393576e-16,0.000000,-1.384638e-16,-1.385767e-16,0.000000,-1.318389e-16,6.953688e-16,0.000000,3.645692e-16,0.000000,0.000000,1.936683e-16,0.000000,0.000000,1.652036e-16,0.000000,-5.403927e-16,2.357973e-16,-5.153963e-16,0.000000,-4.867979e-16,-5.204247e-16,-1.099428e-15,-1.524501e-16,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1.391100,-0.004522,-1.283663,-1.827698,-1.924960,-1.896433,-2.052649,-1.119430,2.277760,-1.345102,0.596044,-1.329585,-1.582273,-1.011193,-1.010653,-2.076837,-0.621870,-1.235382,-1.360063,-1.203051,1.212060,-0.149535,-0.373813,1.080115,-2.249975,-1.468576,1.468576,-1.329661,-0.880017,-1.153263,-0.341108,-1.277422,-1.173342,0.442407,-0.092031,-1.736282e-17,0.000000,0.000000,0.000000,-1.942717e-17,0.000000,-1.647062e-17,0.000000,0.000000,0.000000,0.000000,0.000000,-4.740562e-17,2.269673e-17,2.180099e-17,-2.100669e-17,0.000000,3.916300e-17,0.000000,0.000000,0.0,0,-0.349973,-0.349973,-5.403487e-19,-1.521418e-17,0,1.169821,-0.006735
2017-01-02,-0.313983,-0.194081,-1.117856,-0.041238,-1.121003,0.653487,-0.875523,-1.110900,-1.113896,-1.121457,-0.640992,-1.669788e-16,0.000000,0.000000,0.000000,-6.677695e-17,0.000000,0.000000,0.000000,1.906936e-16,0.000000,0.000000,0.000000,-1.616865e-18,0.000000,6.472890e-18,-6.366045e-18,-1.438860e-18,4.067258e-18,2.330567e-18,0.000000,0.000000,0.000000,-1.219491e-16,-1.393576e-16,0.000000,-1.384638e-16,-1.385767e-16,0.000000,-1.318389e-16,-8.700713e-01,-1.572275,-8.474623e-01,-1.315835,-1.010971,-1.097035e+00,1.558431,-0.527752,-1.004574e+00,-1.187699,5.654632e-02,-1.371472e+00,-2.384392e-01,-0.469625,-1.648762e+00,-2.587986e-01,-7.066792e-01,-1.715789e-01,-0.862324,-0.450657,0.238978,0.000000,0.000000,0.000000,-1.391100,-0.013588,-1.282459,-1.827330,-1.931272,-1.908202,-2.071927,-1.183467,2.271355,-1.355049,0.596044,-1.329310,-1.473518,-1.010932,-1.010653,-2.074966,-0.612814,-1.235247,-1.359731,-1.203050,1.282554,0.237572,-0.029245,1.227230,-0.627523,-1.465874,1.465874,-1.329410,-0.880017,-1.152886,-0.331977,-1.277262,-1.173257,0.330135,-0.104884,-1.736282e-17,0.000000,0.000000,0.000000,-1.942717e-17,0.000000,-1.647062e-17,0.000000,0.000000,0.000000,0.000000,0.000000,-4.740562e-17,2.269673e-17,2.180099e-17,-2.100669e-17,0.000000,3.916300e-17,0.000000,0.000000,0.0,0,-0.358394,-0.358394,-8.721760e-03,-1.369370e-01,0,0.463199,-0.006735
2017-01-03,-0.313983,-0.194081,-1.117856,0.007694,-1.119310,-0.554939,-0.886047,-1.120372,-1.123688,-1.126337,-0.714619,-1.669788e-16,0.000000,0.000000,0.000000,-6.677695e-17,0.000000,0.000000,0.000000,1.906936e-16,0.000000,0.000000,0.000000,-1.616865e-18,0.000000,6.472890e-18,-6.366045e-18,-1.438860e-18,4.067258e-18,2.330567e-18,0.000000,0.000000,0.000000,-1.219491e-16,-1.393576e-16,0.000000,-1.384638e-16,-1.385767e-16,0.000000,-1.318389e-16,-9.900269e-01,-1.571226,-1.022963e+00,-1.314618,-1.010971,-1.046857e+00,2.376360,-0.527185,-1.004224e+00,-1.186651,8.272219e-01,-1.364727e+00,-1.502490e-01,-0.457000,-1.647571e+00,-1.630155e-01,-8.453647e-01,7.497004e-02,-0.858258,-0.448662,0.276395,0.000000,0.000000,0.000000,-1.391100,-0.019387,-1.281964,-1.823330,-1.927518,-1.909651,-2.085157,-1.225724,2.367422,-1.409756,0.690728,-1.329012,-1.467737,-1.010783,-1.010653,-2.073037,-0.595491,-1.235152,-1.359389,-1.203032,1.400043,0.275020,-0.021708,1.342993,-0.482287,-1.468453,1.468453,-1.329152,-0.877110,-1.152458,-0.313995,-1.277151,-1.173171,0.382106,-0.150281,-7.416925e-01,0.000000,0.000000,0.000000,-1.942717e-17,0.000000,-1.647062e-17,0.000000,0.000000,0.000000,0.000000,0.000000,-4.740562e-17,2.269673e-17,2.180099e-17,-2.100669e-17,0.000000,3.916300e-17,0.000000,0.000000,0.0,0,-0.358394,-0.358394,3.189024e-04,-1.369370e-01,0,0.073948,-0.006735
2017-01-04,-0.296743,-0.186219,0.894569,-0.012969,-1.122186,-0.364922,-0.884944,-1.118678,-1.122305,-1.116298,-0.831921,-1.669788e-16,0.000000,0.000000,0.000000,-6.677695e-17,0.000000,0.000000,0.000000,1.906936e-16,0.000000,0.000000,0.000000,-1.616865e-18,0.000000,6.472890e-18,-6.366045e-18,-1.438860e-18,4.067258e-18,2.330567e-18,0.000000,0.000000,0.000000,-1.219491e-16,-1.393576e-16,0.000000,-1.384638e-16,-1.385767e-16,0.000000,-1.318389e-16,-7.410921e-01,-1.570188,-1.218568e+00,-1.322409,-1.010971,-9.721868e-01,3.265989,-0.492497,-1.005100e+00,-1.185058,3.252687e-01,-1.359205e+00,9.462941e-02,-0.048387,-1.646335e+00,7.819014e-02,-5.565193e-01,4.229876e-01,-0.855507,-0.416767,0.889936,0.000000,0.000000,0.000000,-1.391100,-0.020930,-1.283164,-1.819461,-1.924488,-1.910285,-2.082915,-1.215647,2.276479,-1.369969,0.880097,-1.328312,-1.433424,-1.010888,-1.010653,-2.071238,-0.556247,-1.234955,-1.358539,-1.202801,1.141566,0.889058,0.582283,1.236635,-0.110535,-1.480191,1.480191,-1.328551,-0.875050,-1.150955,-0.273104,-1.276915,-1.173074,0.463781,-0.144840,-1.714349e-01,-0.830150,0.000000,0.000000,-1.942717e-17,0.000000,-1.647062e-17,0.000000,0.000000,0.000000,0.000000,0.000000,-4.740562e-17,2.269673e-17,2.180099e-17,-2.100669e-17,0.000000,3.916300e-17,0.000000,0.000000,0.0,0,-0.352371,-0.352371,6.784844e-03,-1.187232e-01,1,0.993515,-0.006735
2017-01-05,-0.313983,-0.194081,-1.117856,-0.017201,-1.125952,-0.212439,-0.882056,-1.121556,-1.127098,-1.122940,-0.809869,-1.669788e-16,0.000000,0.000000,0.000000,-6.677695e-17,0.000000,0.000000,0.000000,1.906936e-16,0.000000,0.000000,0.000000,-1.616865e-18,0.000000,6.472890e-18,-6.366045e-18,-1.438860e-18,4.067258e-18,2.330567e-18,0.000000,0.000000,0.000000,-1.219491e-16,-1.393576e-16,0.000000,-1.384638e-16,-1.385767e-16,0.000000,-1.318389e-16,-7.469367e-01,-1.569134,-1.518388e+00,-1.324045,-1.010971,-8.648799e-01,5.103564,-0.524272,-1.006325e+00,-1.180402,3.911818e-01,-1.379628e+00,-2.708111e-01,-0.163944,-1.644994e+00,-2.807036e-01,-1.025878e+00,7.609642e-01,-0.816171,-0.448256,0.256032,0.000000,0.000000,0.000000,-1.391100,-0.036431,-1.282638,-1.817074,-1.919086,-1.904376,-2.068707,-1.042067,2.326434,-1.444570,0.974782,-1.327951,-1.436833,-1.011016,-1.010653,-2.069591,-0.452536,-1.234550,-1.357990,-1.202967,0.836092,0.254640,0.097888,0.821275,-0.614468,-1.282200,1.282200,-1.328156,-0.874126,-1.150684,-0.164836,-1.276422,-1.172991,0.677911,-0.104685,-6.633287e-01,-0.442657,-0.965021,0.000000,-1.942717e-17,0.000000,-1.647062e-17,0.000000,0.000000,0.000000,0.000000,0.000000,-4.740562e-17,2.269673e-17,2.180099e-17,-2.100669e-17,0.000000,3.916300e-17,0.000000,0.000000,0.0,0,-0.358394,-0.358394,-6.147039e-03,-1.369370e-01,0,0.911420,-0.006735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-26,-0.313983,-0.194081,-1.117856,0.032018,2.150497,0.321312,-0.526193,2.044470,2.107518,2.107128,1.413474,1.561361e+00,1.562737,-0.500311,0.147195,-4.617230e-01,0.080017,0.078041,2.107235,2.107501e+00,2.105099,2.120871,-0.530120,7.381808e-01,0.594271,-1.363229e-02,4.622276e-01,1.374012e-01,1.291623e-01,-1.166222e-02,0.050923,-0.600225,-0.296437,-5.776052e-01,-6.303255e-01,-0.607295,-4.859555e-01,-5.975799e-01,-0.603801,-5.779171e-01,1.174461e+00,1.961814,3.645692e-16,-0.390542,2.547483,1.936683e-16,0.000000,-0.418565,2.148307e+00,0.000000,-2.457743e-02,2.341021e-01,1.417173e+00,1.943296,2.034930e+00,1.384660e+00,-8.364778e-01,-6.511815e-01,0.000000,-0.297113,-0.605359,1.159411,0.710658,-0.025713,1.384808,0.913852,1.797979,1.761196,1.572595,1.621818,1.527573,0.438896,-0.550449,0.350838,-0.350800,1.917542,-0.741455,2.608041,2.593517,1.387940,-0.327736,2.295459,1.856132,1.953521,-1.689938,-0.607449,-0.562242,-1.679745,-1.092596,-1.043063,1.043063,2.010051,0.995784,2.180409,-0.486305,2.248320,2.139824,-0.718825,-0.122478,2.200183e-01,0.391042,0.259143,0.256152,2.391900e-01,0.243517,3.986019e-01,0.584323,0.864610,0.664897,0.498057,0.396406,6.659008e-01,7.278520e-01,6.511664e-01,6.879697e-01,0.917010,9.636803e-01,0.946723,-0.487305,0.0,0,-0.358394,-0.358394,3.189024e-04,-1.369370e-01,0,-0.398210,0.319224
2024-09-27,-0.313983,-0.194081,-1.117856,0.009144,2.182425,-0.502700,-0.607498,2.152852,2.142055,2.225312,0.294058,1.092008e+00,1.093546,-0.602266,-0.107728,-4.636831e-01,-0.120118,-0.122112,2.225453,2.142044e+00,2.176676,2.183678,-0.611498,6.333016e-02,0.375524,-1.835551e-01,2.139784e-01,2.435170e-02,8.187776e-02,-3.507211e-02,0.016056,-0.554795,-0.468915,-6.793862e-01,-6.532535e-01,-0.579726,-5.813477e-01,-7.213229e-01,-0.601883,-6.289142e-01,1.609094e+00,1.963263,3.645692e-16,-0.252426,2.547483,1.936683e-16,0.000000,-0.357316,2.318832e+00,0.000000,1.598445e-02,3.476914e-01,1.213264e+00,1.473471,2.036868e+00,1.178636e+00,-5.240697e-01,-6.733623e-01,0.000000,-0.240698,-0.584803,1.032369,0.794480,-0.179267,1.384934,0.915172,1.800955,1.763956,1.573197,1.621713,1.527627,0.428819,-0.551730,0.350838,-0.350800,1.921471,-0.801556,2.554654,2.593517,1.388343,-0.313642,2.297193,1.859777,1.954731,-1.648816,-0.586876,-0.487681,-1.637638,-0.941291,-1.099215,1.099215,2.013498,1.126785,2.187203,-0.474328,2.249978,2.142648,-0.712859,-0.137226,7.659860e-01,0.301893,0.444026,0.326292,3.175813e-01,0.297647,2.993971e-01,0.444920,0.620080,0.884129,0.692909,0.532397,4.336910e-01,6.933136e-01,7.534060e-01,6.792808e-01,0.713903,9.372124e-01,0.982722,-0.487307,0.0,0,-0.358394,-0.358394,3.189024e-04,-1.369370e-01,0,-0.815464,0.272204
2024-09-28,-0.307087,-0.187372,0.894569,0.001339,2.187142,-0.783886,-0.773613,2.184797,2.129581,2.258424,-0.336890,-8.127392e-02,-0.079330,-0.772442,-0.563950,-4.690246e-01,-0.564372,-0.566407,2.258574,2.129569e+00,2.184302,2.185738,-0.777761,-1.531213e-01,-0.140431,1.528584e-01,2.154145e-01,-1.238699e-02,-3.010038e-02,1.079869e-02,0.016156,-0.768176,-0.735150,-6.948613e-01,-7.334819e-01,-0.712389,-8.197332e-01,-7.622053e-01,-0.690100,-7.584536e-01,1.731226e+00,1.964924,3.645692e-16,-0.443993,2.547483,1.936683e-16,0.000000,-0.400426,2.123946e+00,0.000000,1.478105e-01,2.434120e-01,1.708971e+00,2.318929,2.038708e+00,1.679180e+00,-1.201952e+00,-8.993764e-01,0.000000,-0.263422,-0.593005,1.374179,0.871994,-0.162453,1.386763,0.916065,1.801099,1.768177,1.574264,1.622055,1.528683,0.434020,-0.554291,0.350838,-0.256116,1.924432,-0.824597,2.510103,2.593517,1.388722,-0.311268,2.297894,1.861927,1.956769,-1.695812,-0.595085,-0.501601,-1.682501,-1.297631,-1.103033,1.103033,2.015955,0.995784,2.192139,-0.472324,2.250607,2.145483,-0.807315,-0.133548,1.462409e-01,0.616232,0.255438,0.391901,2.910660e-01,0.286995,2.729034e-01,0.278357,0.417518,0.583462,0.836261,0.657301,5.062339e-01,4.134152e-01,6.644367e-01,7.244866e-01,0.653433,6.883198e-01,0.905770,-0.515283,0.0,0,-0.275560,-0.275560,8.924512e-02,1.135569e-01,1,-0.264479,0.262092
2024-09-29,-0.313983,-0.194081,-1.117856,-0.003887,2.173427,-0.692086,-0.783762,2.189516,2.119945,2.258961,-0.482288,-4.224515e-02,-0.040315,-0.781950,-0.589656,-4.688558e-01,-0.591663,-0.593700,2.259111,2.119930e+00,2.182428,2.184285,-0.787920,7.682284e-02,0.092177,-2.475544e-01,-1.840814e-01,2.641700e-02,2.027563e-02,-4.384157e-02,-0.040267,-0.799916,-0.722662,-6.972926e-01,-7.629465e-01,-0.732631,-8.155229e-01,-7.269809e-01,-0.690520,-7.518142e-01,1.312444e+00,1.966525,3.645692e-16,-0.569006,2.547483,1.936683e-16,0.000000,-0.481672,2.757324e+00,0.000000,4.013223e-01,5.436407e-01,3.300397e+00,2.980105,2.040781e+00,3.286025e+00,-1.345076e+00,-8.817970e-01,0.000000,-0.321468,-0.613435,1.266036,1.550782,-0.100232,1.386769,0.917615,1.801783,1.767293,1.574059,1.622292,1.529238,0.443122,-0.559415,0.340891,-0.256116,1.925527,-0.745726,2.554366,2.593517,1.389178,-0.323791,2.298187,1.861753,1.958209,-1.543076,-0.615531,-0.646645,-1.546732,-1.335478,-1.054226,1.054226,2.016858,0.995784,2.192720,-0.483011,2.250813,2.148289,-0.811114,-0.139494,-1.149653e-01,0.028190,0.450037,0.157324,2.895348e-01,0.207255,2.114354e-01,0.204963,0.214231,0.347624,0.507955,0.751527,5.858629e-01,4.454436e-01,3.594783e-01,6.027983e-01,0.662040,5.960961e-01,0.631624,-0.506153,0.0,0,-0.358394,-0.358394,-8.860732e-02,-1.369370e-01,0,0.962259,0.461490


### Target의 Returns예측

In [17458]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-4).fillna(test_df['returns'].mean())
X = test_df[['target', 'target_returns']].fillna(test_df.mean())
X['target_returns'] = X['target_returns'].shift(-1).fillna(0)
# X = X[['target']]

# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())


=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     6.956
Date:                Mon, 07 Apr 2025   Prob (F-statistic):           0.000972
Time:                        16:58:05   Log-Likelihood:                 4696.4
No. Observations:                2465   AIC:                            -9387.
Df Residuals:                    2462   BIC:                            -9369.
Df Model:                           2                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------


### During 2 day returns 예측

In [17459]:
# Independent, Dependent 분리
y = test_df['returns_during_2'].shift(-2).fillna(test_df['returns_during_2'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_2   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1056
Date:                Mon, 07 Apr 2025   Prob (F-statistic):              0.745
Time:                        16:58:05   Log-Likelihood:                -3473.1
No. Observations:                2465   AIC:                             6950.
Df Residuals:                    2463   BIC:                             6962.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 3 day 예측

In [17460]:
# Independent, Dependent 분리
y = test_df['returns_during_3'].shift(-3).fillna(test_df['returns_during_3'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())


=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_3   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2193
Date:                Mon, 07 Apr 2025   Prob (F-statistic):              0.640
Time:                        16:58:05   Log-Likelihood:                -3469.8
No. Observations:                2465   AIC:                             6944.
Df Residuals:                    2463   BIC:                             6955.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 4 day 예측

In [17461]:
# Independent, Dependent 분리
y = test_df['returns_during_4'].shift(-4).fillna(test_df['returns_during_4'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_4   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.751
Date:                Mon, 07 Apr 2025   Prob (F-statistic):              0.186
Time:                        16:58:05   Log-Likelihood:                -3467.1
No. Observations:                2465   AIC:                             6938.
Df Residuals:                    2463   BIC:                             6950.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 5 day 예측

In [17462]:
# Independent, Dependent 분리
y = test_df['returns_during_5'].shift(-5).fillna(test_df['returns_during_5'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())


=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_5   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     8.780
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00307
Time:                        16:58:05   Log-Likelihood:                -3465.2
No. Observations:                2465   AIC:                             6934.
Df Residuals:                    2463   BIC:                             6946.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 6 day 예측

In [17463]:
# Independent, Dependent 분리
y = test_df['returns_during_6'].shift(-6).fillna(test_df['returns_during_6'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())


=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_6   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     14.22
Date:                Mon, 07 Apr 2025   Prob (F-statistic):           0.000166
Time:                        16:58:05   Log-Likelihood:                -3458.2
No. Observations:                2465   AIC:                             6920.
Df Residuals:                    2463   BIC:                             6932.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 7 day 예측

In [17464]:
# Independent, Dependent 분리
y = test_df['returns_during_7'].shift(-7).fillna(test_df['returns_during_7'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())


=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_7   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     13.79
Date:                Mon, 07 Apr 2025   Prob (F-statistic):           0.000209
Time:                        16:58:05   Log-Likelihood:                -3451.4
No. Observations:                2465   AIC:                             6907.
Df Residuals:                    2463   BIC:                             6918.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 8 day 예측

In [17465]:
# Independent, Dependent 분리
y = test_df['returns_during_8'].shift(-8).fillna(test_df['returns_during_8'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_8   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     10.34
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00132
Time:                        16:58:05   Log-Likelihood:                -3447.2
No. Observations:                2465   AIC:                             6898.
Df Residuals:                    2463   BIC:                             6910.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 9 Days

In [17466]:
# Independent, Dependent 분리
y = test_df['returns_during_9'].shift(-9).fillna(test_df['returns_during_9'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())


=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:       returns_during_9   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     11.46
Date:                Mon, 07 Apr 2025   Prob (F-statistic):           0.000721
Time:                        16:58:05   Log-Likelihood:                -3440.6
No. Observations:                2465   AIC:                             6885.
Df Residuals:                    2463   BIC:                             6897.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### During 10 Days

In [17467]:
# Independent, Dependent 분리
y = test_df['returns_during_10'].shift(-10).fillna(test_df['returns_during_10'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_10   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     9.935
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00164
Time:                        16:58:05   Log-Likelihood:                -3434.2
No. Observations:                2465   AIC:                             6872.
Df Residuals:                    2463   BIC:                             6884.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17468]:
# Independent, Dependent 분리
y = test_df['returns_during_11'].shift(-11).fillna(test_df['returns_during_11'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_11   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     7.363
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00670
Time:                        16:58:05   Log-Likelihood:                -3422.5
No. Observations:                2465   AIC:                             6849.
Df Residuals:                    2463   BIC:                             6861.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17469]:
# Independent, Dependent 분리
y = test_df['returns_during_12'].shift(-12).fillna(test_df['returns_during_12'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===


                            OLS Regression Results                            
Dep. Variable:      returns_during_12   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     6.455
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0111
Time:                        16:58:05   Log-Likelihood:                -3409.8
No. Observations:                2465   AIC:                             6824.
Df Residuals:                    2463   BIC:                             6835.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0346      0.054     -0.636      0.5

In [17470]:
# Independent, Dependent 분리
y = test_df['returns_during_13'].shift(-13).fillna(test_df['returns_during_13'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_13   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     6.222
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0127
Time:                        16:58:05   Log-Likelihood:                -3394.1
No. Observations:                2465   AIC:                             6792.
Df Residuals:                    2463   BIC:                             6804.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17471]:
# Independent, Dependent 분리
y = test_df['returns_during_14'].shift(-14).fillna(test_df['returns_during_14'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_14   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     5.982
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0145
Time:                        16:58:05   Log-Likelihood:                -3381.6
No. Observations:                2465   AIC:                             6767.
Df Residuals:                    2463   BIC:                             6779.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17472]:
# Independent, Dependent 분리
y = test_df['returns_during_15'].shift(-15).fillna(test_df['returns_during_15'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_15   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     6.109
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0135
Time:                        16:58:06   Log-Likelihood:                -3374.1
No. Observations:                2465   AIC:                             6752.
Df Residuals:                    2463   BIC:                             6764.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17473]:
# Independent, Dependent 분리
y = test_df['returns_during_16'].shift(-16).fillna(test_df['returns_during_16'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_16   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     7.789
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00530
Time:                        16:58:06   Log-Likelihood:                -3364.4
No. Observations:                2465   AIC:                             6733.
Df Residuals:                    2463   BIC:                             6744.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17474]:
# Independent, Dependent 분리
y = test_df['returns_during_17'].shift(-17).fillna(test_df['returns_during_17'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_17   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     8.046
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00460
Time:                        16:58:06   Log-Likelihood:                -3359.2
No. Observations:                2465   AIC:                             6722.
Df Residuals:                    2463   BIC:                             6734.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17475]:
# Independent, Dependent 분리
y = test_df['returns_during_18'].shift(-18).fillna(test_df['returns_during_18'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_18   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     7.356
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00673
Time:                        16:58:06   Log-Likelihood:                -3353.0
No. Observations:                2465   AIC:                             6710.
Df Residuals:                    2463   BIC:                             6722.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17476]:
# Independent, Dependent 분리
y = test_df['returns_during_19'].shift(-19).fillna(test_df['returns_during_19'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_19   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     7.570
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00598
Time:                        16:58:06   Log-Likelihood:                -3344.9
No. Observations:                2465   AIC:                             6694.
Df Residuals:                    2463   BIC:                             6705.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

In [17477]:
# Independent, Dependent 분리
y = test_df['returns_during_20'].shift(-20).fillna(test_df['returns_during_20'].mean())
X = test_df[['target']].fillna(test_df.target.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:      returns_during_20   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     9.433
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00215
Time:                        16:58:06   Log-Likelihood:                -3336.6
No. Observations:                2465   AIC:                             6677.
Df Residuals:                    2463   BIC:                             6689.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### Event Flag를 더미변수로 추가하면 기울기가 늘어나는지

In [17478]:
# 파생변수 추가
test_df['event_target'] = test_df['target'] * test_df['event_flag'].shift(0).fillna(0)

# Independent, Dependent 분리
y = test_df['returns'].shift(-3).fillna(test_df['returns'].mean())
X = test_df[['target', 'volume', 'event_flag']]
X['event_flag'] = X['event_flag'].shift(-2).fillna(0)
X['target_event'] = (X['target'] * X['event_flag']).shift(0).fillna(0)
X = X[['target_event']]

# train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 3})
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1959
Date:                Mon, 07 Apr 2025   Prob (F-statistic):              0.658
Time:                        16:58:06   Log-Likelihood:                 4691.3
No. Observations:                2465   AIC:                            -9379.
Df Residuals:                    2463   BIC:                            -9367.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
cons

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['event_flag'] = X['event_flag'].shift(-2).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = (X['target'] * X['event_flag']).shift(0).fillna(0)


### Big Returns Flag 추가 설명력 확인

In [17479]:
# 파생변수 추가
test_df['event_target'] = test_df['target'] * test_df['event_flag'].shift(0).fillna(0)

# Independent, Dependent 분리
y = test_df['returns'].shift(-4).fillna(test_df['returns'].mean())
X = test_df[['target', 'volume', 'vpin', 'big_returns_flag']]
X['big_returns_flag'] = X['big_returns_flag'].shift(-4).fillna(0)
X['target_big_returns'] = X['target'] * X['big_returns_flag']
X = X[['target_big_returns']]

# train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     7.591
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00591
Time:                        16:58:06   Log-Likelihood:                 4697.5
No. Observations:                2465   AIC:                            -9391.
Df Residuals:                    2463   BIC:                            -9379.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['big_returns_flag'] = X['big_returns_flag'].shift(-4).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_big_returns'] = X['target'] * X['big_returns_flag']


### 과거 변동성이 Target을 잘 설명하는지

In [17480]:
# test_df['volatility'] = test_df['returns']**2
# test_df['std_5'] = test_df['returns'].rolling(5).std()
# test_df['volume_5'] = test_df['volume'].rolling(5).std()

# # Independent, Dependent 분리
# y = test_df['target']
# X = test_df[['volume_5', 'std_5']].fillna(test_df.mean())

# # ## train, test 데이터 분리
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, shuffle=False)

# start = '2018-01-01'
# end = '2024-12-31'
# X_test = X[(X.index >= start)&(X.index <= end)]
# y_test = y[(y.index >= start)&(y.index <= end)]

# X_with_const = sm.add_constant(X_test)

# # OLS 회귀 적합
# # ---------------------------------------------
# lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 3})
# print("=== OLS with Newey-West Standard Errors ===")
# print(lr.summary())

### Target Flag가 Returns를 유의미하게 설명하는지

In [17481]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-3).fillna(test_df['returns'].mean())
X = test_df[['vpin', 'target_flag', 'returns']].fillna(test_df.mean())
# X['target_flag'] = X['target_delta'].apply(lambda x: 1 if x > 0.05 else 0)
# X['target_flag'] = X['target_flag'] * X['target_delta']
X = X[['target_flag']]

## train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     2.813
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0936
Time:                        16:58:06   Log-Likelihood:                 4692.6
No. Observations:                2465   AIC:                            -9381.
Df Residuals:                    2463   BIC:                            -9370.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const 

### Target의 변화율이 Returns를 유의미하게 설명하는지

In [17482]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-4).fillna(test_df['returns'].mean())
X = test_df[['target']].fillna(test_df.mean())


# # train, test 데이터 분리
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin'].shift(1)

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     2.150
Date:                Mon, 07 Apr 2025   Prob (F-statistic):              0.143
Time:                        16:58:06   Log-Likelihood:                 4693.0
No. Observations:                2465   AIC:                            -9382.
Df Residuals:                    2463   BIC:                            -9370.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

### Target 변화량, 변화율, 값으로 Returns 예측

In [17483]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-4).fillna(test_df['returns'].mean())
X = test_df[['target', 'target_returns']]
X['target_returns'] = X['target_returns'].shift(-1).fillna(X['target_returns'].mean())
X = X[['target']]


start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin']
# X_test = X_test['vpin']

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     2.150
Date:                Mon, 07 Apr 2025   Prob (F-statistic):              0.143
Time:                        16:58:06   Log-Likelihood:                 4693.0
No. Observations:                2465   AIC:                            -9382.
Df Residuals:                    2463   BIC:                            -9370.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_returns'] = X['target_returns'].shift(-1).fillna(X['target_returns'].mean())


In [17484]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-3).fillna(test_df['returns'].mean())
X = test_df[['target_returns', 'event_flag' ,'big_returns_flag']]
X['big_returns_flag'] = X['big_returns_flag'].shift(-4).fillna(0)
X['target_event'] = X['target_returns'] * X['big_returns_flag']
X['target_event'] = X['target_event'].shift(0).fillna(0)
X = X[['target_event']]


start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin']
# X_test = X_test['vpin']

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     6.508
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0108
Time:                        16:58:06   Log-Likelihood:                 4695.1
No. Observations:                2465   AIC:                            -9386.
Df Residuals:                    2463   BIC:                            -9374.
Df Model:                           1                                         
Covariance Type:                  HAC                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
cons

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['big_returns_flag'] = X['big_returns_flag'].shift(-4).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = X['target_returns'] * X['big_returns_flag']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = X['target_event'].shift(0).fillna(0)


### Control Not Rational

In [17485]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-4).fillna(test_df['returns'].mean())
X = test_df[['target_returns', 'target', 'event_flag', 'big_returns_flag', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]
X['event_flag'] = X['event_flag'].shift(-3).fillna(0)
X['target_event'] = X['target'] * X['event_flag']
X['target_event'] = X['target_event'].shift(-1).fillna(0)

X['big_returns_flag'] = X['big_returns_flag'].shift(-4).fillna(0)
X['target_big_returns'] = X['target'] * X['big_returns_flag']

X['target_returns'] = X['target_returns'].shift(-1).fillna(test_df['target_returns'].mean())

X['vpin'] = X['vpin'].shift(0).fillna(X['vpin'].mean())
X['returns'] = X['returns'].shift(0).fillna(X['returns'].mean())
X['volume'] = X['volume'].shift(0).fillna(X['volume'].mean())
X['rv'] = X['rv'].shift(0).fillna(X['rv'].mean())
X['n-transactions'] = X['n-transactions'].shift(0).fillna(X['n-transactions'].mean())
X = X[['target', 'target_returns', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin']
# X_test = X_test['vpin']

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     2.566
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0124
Time:                        16:58:06   Log-Likelihood:                 4700.1
No. Observations:                2465   AIC:                            -9384.
Df Residuals:                    2457   BIC:                            -9338.
Df Model:                           7                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['event_flag'] = X['event_flag'].shift(-3).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = X['target'] * X['event_flag']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = X['target_event'].shift(-1).fillna(0)
A value is trying to be set on a copy of a 

### Control Rational Lag

In [17486]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-4).fillna(test_df['returns'].mean())
X = test_df[['target_returns', 'target', 'event_flag', 'big_returns_flag', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]
X['event_flag'] = X['event_flag'].shift(-3).fillna(0)
X['target_event'] = X['target'] * X['event_flag']
X['target_event'] = X['target_event'].shift(-1).fillna(0)

X['big_returns_flag'] = X['big_returns_flag'].shift(-4).fillna(0)
X['target_big_returns'] = X['target'] * X['big_returns_flag']
X['target_big_returns'] = X['target_big_returns'].shift(0).fillna(0)
# X['target_event'] = X['target_event'].shift(0).fillna(0)

X['target_returns'] = X['target_returns'].shift(-1).fillna(test_df['target_returns'].mean())
# X['target'] = X['target'].shift(1).fillna(test_df['target'].mean())

X['vpin'] = X['vpin'].shift(-3).fillna(X['vpin'].mean())
X['returns'] = X['returns'].shift(-3).fillna(X['returns'].mean())
X['volume'] = X['volume'].shift(-3).fillna(X['volume'].mean())
X['rv'] = X['rv'].shift(-3).fillna(X['rv'].mean())
X['n-transactions'] = X['n-transactions'].shift(-3).fillna(X['n-transactions'].mean())
X = X[['target_returns', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]

start = '2018-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin']
# X_test = X_test['vpin']

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     2.558
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0180
Time:                        16:58:06   Log-Likelihood:                 4701.6
No. Observations:                2465   AIC:                            -9389.
Df Residuals:                    2458   BIC:                            -9349.
Df Model:                           6                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['event_flag'] = X['event_flag'].shift(-3).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = X['target'] * X['event_flag']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = X['target_event'].shift(-1).fillna(0)
A value is trying to be set on a copy of a 

In [17487]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-5).fillna(test_df['returns'].mean())
X = test_df[['target_returns', 'target', 'event_flag', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]
X['target_returns'] = X['target_returns'].shift(-2).fillna(method='ffill')
X['event_flag'] = X['event_flag'].shift(-3).fillna(0)
X['target_event'] = X['target'] * X['event_flag']
X['target_event'] = X['target_event'].shift(-2).fillna(0)
X['vpin'] = X['vpin'].shift(-1).fillna(X['vpin'].mean())
X['returns'] = X['returns'].shift(-4).fillna(X['returns'].mean())
X['volume'] = X['volume'].shift(-4).fillna(X['volume'].mean())
X['rv'] = X['rv'].shift(-1).fillna(X['rv'].mean())
X['target'] = X['target'].shift(0).fillna(X['target'].mean())
X['n-transactions'] = X['n-transactions'].shift(-2).fillna(X['n-transactions'].mean())
X = X[['target_event', 'target_returns', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]
# , , 'vpin', 'volume', 'returns', 'rv',
start = '2023-01-01'
end = '2023-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin']
# X_test = X_test['vpin']

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     2.309
Date:                Mon, 07 Apr 2025   Prob (F-statistic):             0.0259
Time:                        16:58:06   Log-Likelihood:                 859.30
No. Observations:                 365   AIC:                            -1703.
Df Residuals:                     357   BIC:                            -1671.
Df Model:                           7                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------


  X['target_returns'] = X['target_returns'].shift(-2).fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_returns'] = X['target_returns'].shift(-2).fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['event_flag'] = X['event_flag'].shift(-3).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [17488]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-3).fillna(test_df['returns'].mean())
X = test_df[['target_returns', 'target', 'big_returns_flag', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]
X['target_returns'] = X['target_returns'].shift(0).fillna(method='ffill')
X['big_returns_flag'] = X['big_returns_flag'].shift(-3).fillna(0)
X['target'] = X['target'].shift(0).fillna(X['target'].mean())
X['target_big_returns'] = X['target'] * X['big_returns_flag']
X['target_big_returns'] = X['target_big_returns'].shift(0).fillna(0)
X['vpin'] = X['vpin'].shift(-2).fillna(X['vpin'].mean())
X['returns'] = X['returns'].shift(-2).fillna(X['returns'].mean())
X['volume'] = X['volume'].shift(-2).fillna(X['volume'].mean())
X['rv'] = X['rv'].shift(-2).fillna(X['rv'].mean())
# X['target'] = X['target'].shift(2).fillna(X['target'].mean())
X['n-transactions'] = X['n-transactions'].shift(-1).fillna(X['n-transactions'].mean())
X = X[['target_returns', 'target_big_returns', 'vpin', 'volume', 'returns', 'rv', 'n-transactions']]
# , 'target_event', 'vpin', 'volume', 'returns', 'rv'
start = '2024-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

# X_test['vpin'] = test_df['vpin']
# X_test = X_test['vpin']

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
# lr = sm.OLS(y_test, X_with_const).fit()
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.039
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     3.461
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00145
Time:                        16:58:06   Log-Likelihood:                 595.18
No. Observations:                 274   AIC:                            -1174.
Df Residuals:                     266   BIC:                            -1145.
Df Model:                           7                                         
Covariance Type:                  HAC                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------

  X['target_returns'] = X['target_returns'].shift(0).fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_returns'] = X['target_returns'].shift(0).fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['big_returns_flag'] = X['big_returns_flag'].shift(-3).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

### Event Flag 추가 Target 3종세트 Returns 예측

In [17489]:
# Independent, Dependent 분리
y = test_df['returns'].shift(-3).fillna(test_df['returns'].mean())
X = test_df[['target_returns', 'target', 'event_flag']]
X['event_flag'] = X['event_flag'].shift(-3).fillna(0)
X['target_event'] = X['target'] * X['event_flag']
X = X[['target_event', 'target_returns']]

start = '2024-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]

X_with_const = sm.add_constant(X_test)

# OLS 회귀 적합
# ---------------------------------------------
lr = sm.OLS(y_test, X_with_const).fit(cov_type='HAC', cov_kwds={'maxlags': 10})
print("=== OLS with Newey-West Standard Errors ===")
print(lr.summary())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['event_flag'] = X['event_flag'].shift(-3).fillna(0)


=== OLS with Newey-West Standard Errors ===
                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                     4.693
Date:                Mon, 07 Apr 2025   Prob (F-statistic):            0.00992
Time:                        16:58:06   Log-Likelihood:                 590.74
No. Observations:                 274   AIC:                            -1175.
Df Residuals:                     271   BIC:                            -1165.
Df Model:                           2                                         
Covariance Type:                  HAC                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['target_event'] = X['target'] * X['event_flag']
