### 통계적 분석

In [5]:
### 라이브러리
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from scipy import stats
from scipy.stats import shapiro

import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.stats.stattools import jarque_bera
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import StandardScaler

import datetime

### 환경설정
np.random.seed(4)


### 함수 정의
def preprocess_transaction_flag(df):
    df['transaction_flag'] = df['transaction_count'].apply(lambda x: 1 if x > 0 else 0)
    return df


def trim_date(df, start, end):
    ### df date 형식 맞춰주기
    df['date'] = df['date'].astype(str)
    if len(df['date'][0]) == 8:
        df['date'] = df['date'].apply(lambda x: x[0:4]+'-'+x[4:6]+'-'+x[6:])


    return df[(df['date']>=start) & (df['date']<=end)]

### 데이터 세팅

In [6]:
### 학습용 데이터 읽기
test_df = pd.read_csv("data/test/test_df.csv")
test_df.set_index('date', inplace=True)

### 이벤트용 데이터 읽기
event_df = pd.read_csv("./data/event/bitcoin_event_details_sentiment.csv")
event_df.index = event_df['Date'].astype(str).apply(lambda x: x[0:4]+'-'+x[4:6]+'-'+x[6:])

target_event_receive_df = pd.read_csv("./data/target/event_2_day_transactions_over_100_receive.csv")
target_event_receive_df = preprocess_transaction_flag(target_event_receive_df)
target_event_receive_df.columns = ['date', 'transaction_count', 'transaction_amount', 'transaction_flag']

target_event_receive_df = trim_date(target_event_receive_df, (test_df.index)[0], (test_df.index)[-1])



### Bitcoin Event Flag 추가 / 긍정, 부정, 애매 이벤트 필터링
# pos, neg, neu, all
event_sentiment = 'neg'
if event_sentiment == 'all':
    pass
elif event_sentiment == 'pos':
    event_df = event_df[event_df['classification']==1]
elif event_sentiment == 'neg':
    event_df = event_df[event_df['classification']==0]
elif event_sentiment == 'neu':
    event_df = event_df[event_df['classification']==2]
event_df['classification'] = 1
event_df = event_df['classification']
event_df = event_df[~event_df.index.duplicated(keep='first')]

test_df['event_flag'] = event_df
test_df['event_flag'] = test_df['event_flag'].fillna(0)
event_flag_copy = test_df['event_flag'].values

### 분석 데이터 선택
target_df = target_event_receive_df.copy().set_index('date')

### test_df에 target_df 데이터 적용
test_df['transaction_count'] = target_df['transaction_count']
test_df['transaction_amount'] = target_df['transaction_amount']
test_df['transaction_flag'] = target_df['transaction_flag']
test_df['transaction_amount_usd'] = test_df['transaction_amount'] * test_df['close']

### test_df의 target 선택
test_df['target'] = test_df['transaction_amount_usd']
test_df['target'] = np.where(test_df['target'] == 0, 1e-10, test_df['target'])

### target 관련 파생변수 생성
test_df['target_delta'] = test_df['target'].diff(1)
test_df['target_returns'] = test_df['target'].pct_change(1)

### VPIN 관련 변수 추가
vpin = pd.read_csv('./data/vpin/vpin.csv')
vpin['ma_10'] = vpin['vpin'].rolling(10).mean()
vpin['ma_10'] = vpin['ma_10'].fillna(vpin['ma_10'].mean())
vpin = vpin.set_index('date')

test_df['vpin'] = vpin['vpin']
test_df['vpin_ma_10'] = vpin['ma_10']

##### 전처리 관련
test_df = test_df.replace([np.inf, -np.inf], np.nan)
test_df = test_df.fillna(test_df.mean())

  test_df['target_returns'] = test_df['target'].pct_change(1)


In [7]:
test_df[['returns', 'transaction_amount', 'target', 'target_delta', 'target_returns', 'vpin']].describe(percentiles=[.01, .25, .5, .75, .99]).T

Unnamed: 0,count,mean,std,min,1%,25%,50%,75%,99%,max
returns,2830.0,0.00161209,0.0360799,-0.3950485,-0.09999244,-0.01227509,0.0,0.01529362,0.1063411,0.2250139
transaction_amount,2830.0,1090343.0,1710551.0,3459.729,25411.47,145784.8,647809.4,1278148.0,7860000.0,26156410.0
target,2830.0,14101900000.0,20979980000.0,219170200.0,949465200.0,3796013000.0,6888759000.0,14101900000.0,102272200000.0,240205800000.0
target_delta,2830.0,-23630360.0,9905321000.0,-90128700000.0,-34641320000.0,-1560776000.0,-23630360.0,1436089000.0,31902070000.0,101813000000.0
target_returns,2830.0,0.1749027,1.254728,-0.9824762,-0.7492699,-0.2259464,0.0,0.2751919,3.093613,34.72773
vpin,2830.0,0.645347,0.06112185,0.0009,0.4836492,0.6098612,0.6482219,0.6859938,0.7729164,0.847044


### 기본적인 통계, 시계열 분석

In [8]:
stats_df = test_df.copy()

print(stats_df[['target', 'target_delta', 'target_returns']].describe())

stats_df.index = pd.to_datetime(stats_df.index)

# 연도별 event_flag 개수 확인
event_count_per_year = stats_df.groupby(stats_df.index.year)['event_flag'].sum()

# 결과 출력
print(event_count_per_year)


             target  target_delta  target_returns
count  2.830000e+03  2.830000e+03     2830.000000
mean   1.410190e+10 -2.363036e+07        0.174903
std    2.097998e+10  9.905321e+09        1.254728
min    2.191702e+08 -9.012870e+10       -0.982476
25%    3.796013e+09 -1.560776e+09       -0.225946
50%    6.888759e+09 -2.363036e+07        0.000000
75%    1.410190e+10  1.436089e+09        0.275192
max    2.402058e+11  1.018130e+11       34.727727
date
2017    2.0
2018    8.0
2019    1.0
2020    0.0
2021    6.0
2022    9.0
2023    1.0
2024    0.0
Name: event_flag, dtype: float64


In [9]:
lagged_event_df = test_df.copy()
lagged_event_df['transaction_flag'].value_counts()

transaction_flag
1    2830
Name: count, dtype: int64

In [10]:
lagged_event_df = test_df.copy()
lagged_event_df['lagged_event_flag'] = lagged_event_df['event_flag'].shift(-1).fillna(0)
lagged_event_df[lagged_event_df['lagged_event_flag']==1]['transaction_flag'].value_counts()

transaction_flag
1    27
Name: count, dtype: int64

### VIF: Volume과 Target Delta 사이 설명력 중복 확인

In [11]:

# Target Delta를 적절히 Lagging
test_df['lag_target_delta'] = test_df['target_delta'].shift(1)

# Independent, Dependent 분리
y = test_df['returns'].shift(-3).fillna(test_df['returns'].mean())
X = test_df[['lag_target_delta', 'target', 'volume']].fillna(test_df.mean())


X['returns'] = test_df['returns'].shift(-3).fillna(test_df['returns'].mean())

## train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, shuffle=False)

start = '2023-01-01'
end = '2024-12-31'
X_test = X[(X.index >= start)&(X.index <= end)]
y_test = y[(y.index >= start)&(y.index <= end)]


print(X_test[['returns', 'lag_target_delta', 'target', 'volume']].corr())  # 상관관계 확인

# VIF 계산
X = X[['lag_target_delta', 'target', 'volume']]  # OLS 모델에서 사용한 독립변수들
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
vif_data = pd.DataFrame()
vif_data["Feature"] = X_test.columns
vif_data["VIF"] = [variance_inflation_factor(X_test.values, i) for i in range(X_test.shape[1])]

print(vif_data)

                   returns  lag_target_delta    target    volume
returns           1.000000          0.043301 -0.089531  0.101290
lag_target_delta  0.043301          1.000000  0.325697  0.023594
target           -0.089531          0.325697  1.000000 -0.048433
volume            0.101290          0.023594 -0.048433  1.000000
            Feature       VIF
0  lag_target_delta  1.024990
1            target  1.142056
2            volume  1.115669
