# Bitcoin Volatility Prediction

## Importing Libraries

In [1]:
!pip install -r requirements.txt



#### 파이썬 3.11.4 ver로 작성하였습니다.

In [2]:
import sys
sys.version

'3.11.4 (main, Jul  5 2023, 09:00:44) [Clang 14.0.6 ]'

In [3]:
import dask
import pandas as pd
import numpy as np
import dask.dataframe as dd
import pandas as pd

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

import BayesianOptimization
from bayes_opt import BayesianOptimization

import pmdarima as pm #auto_arima
from pmdarima.arima.utils import ndiffs
from pmdarima import auto_arima
import statsmodels.api as sm

# Baseline model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit #Timeseires Split

import os
import warnings #경고 무시

In [4]:
# Print version
print("dask version:", dask.__version__)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("scikit-learn version:", sklearn.__version__)  
print("pmdarima version:", pm.__version__)
print("statsmodels version:", sm.__version__)

dask version: 2023.6.0
pandas version: 1.5.3
numpy version: 1.24.3
scikit-learn version: 1.3.0
pmdarima version: 2.0.4
statsmodels version: 0.14.0


In [5]:
## 기본 설정 값

# fig
figsize=(15,5)

# 결과창 출력의 최대 행 및 열 수 설정
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# 경고 무시 설정
# warnings.filterwarnings('ignore')
# KPSS에서 경고가 하나 떴는데 큰 영향을 미치지 않는다고 하여 작성
# 경고 무시 설정은 다른 코드들 작성할 때 주석 처리해 가며 확인 필요

- 리샘플링 함수

In [6]:
def convert_tick_to_ohlcv(data):
    """
    Converts given Binance tick data into 1-hour interval OHLCV (Open, High, Low, Close, Volume) data.
    :param data: DataFrame with Tick data
    :return: DataFrame with the Open, High, Low, Close, Volume values
    """

    data['time'] = pd.to_datetime(data['time'], unit='ms')
    ohlcv = data.resample('1H', on='time').agg({
        'price': ['first', 'max', 'min', 'last'],
        'qty': 'sum',
        'quote_qty': 'std', # 추가
        'is_buyer_maker': 'sum'}) #추가
        

    ohlcv.columns = ['Open', 'High', 'Low', 'Close', 'Volume','quote_qty','is_buyer_maker']
    return ohlcv

def calculate_volatility(data, window=20):
    """
    Calculate the rolling volatility using the standard deviation of returns.
    :param data: DataFrame with OHLCV data
    :param window: The number of periods to use for calculating the standard deviation
    :return: DataFrame with the volatility values
    """

    # Calculate daily returns
    data['returns'] = data['Close'].pct_change()

    # Calculate the rolling standard deviation of returns
    data['volatility'] = data['returns'].rolling(window=window).std()

    return data

### Loading Data

- 제공된 원본 데이터 파일 리스트(경로 수정 필요)

In [30]:
file_list = [
    # '../data/BTCUSDT-trades-2023-01.csv',
    # '../data/BTCUSDT-trades-2023-02.csv',
    # '../data/BTCUSDT-trades-2023-03.csv',
    # '../data/BTCUSDT-trades-2023-04.csv',
    # '../data/BTCUSDT-trades-2023-05.csv',
    # '../data/BTCUSDT-trades-2023-06.csv',
    # '../data/BTCUSDT-trades-2023-07.csv',
    # '../data/BTCUSDT-trades-2023-08.csv',
    # '../data/BTCUSDT-trades-2023-09.csv',
    # '../data/BTCUSDT-trades-2023-10.csv',
    # '../data/BTCUSDT-trades-2023-11.csv',
    # '../data/BTCUSDT-trades-2023-12.csv',
    # '../data/BTCUSDT-trades-2024-01-01.csv',
    # '../data/BTCUSDT-trades-2024-01-02.csv',
    # '../data/BTCUSDT-trades-2024-01-03.csv',
    # '../data/BTCUSDT-trades-2024-01-04.csv',
    # '../data/BTCUSDT-trades-2024-01-05.csv',
    # '../data/BTCUSDT-trades-2024-01-06.csv',
    # '../data/BTCUSDT-trades-2024-01-07.csv',
    # '../data/BTCUSDT-trades-2024-01-08.csv',
    # '../data/BTCUSDT-trades-2024-01-09.csv',
    # '../data/BTCUSDT-trades-2024-01-10.csv',
    # '../data/BTCUSDT-trades-2024-01-11.csv',
    # '../data/BTCUSDT-trades-2024-01-12.csv',
    # '../data/BTCUSDT-trades-2024-01-13.csv',
    # '../data/BTCUSDT-trades-2024-01-14.csv',
    # '../data/BTCUSDT-trades-2024-01-15.csv',
    '../data/BTCUSDT-trades-2024-01-16.csv',
    '../data/BTCUSDT-trades-2024-01-17.csv',
    '../data/BTCUSDT-trades-2024-01-18.csv',
    '../data/BTCUSDT-trades-2024-01-19.csv',
    '../data/BTCUSDT-trades-2024-01-20.csv',
    '../data/BTCUSDT-trades-2024-01-21.csv',
    '../data/BTCUSDT-trades-2024-01-22.csv',
    '../data/BTCUSDT-trades-2024-01-23.csv',
    '../data/BTCUSDT-trades-2024-01-24.csv',
    '../data/BTCUSDT-trades-2024-01-25.csv',
    '../data/BTCUSDT-trades-2024-01-26.csv',
    '../data/BTCUSDT-trades-2024-01-27.csv'   

]

- 대용량 데이터 처리 > dask 라이브러리 이용
    - 처리 순서 : 개별 파일 dask.read_csv > pandas dataframe으로 변경 > 변환함수 적용 > append > 전체 데이터 concat

In [31]:
# 데이터를 담을 빈 리스트 생성
combined_dfs = []

# 각 파일에 대해 처리
for file in file_list:
    print(f"Reading and processing file: {file}")
    
    # 파일을 Dask DataFrame으로 읽기
    dask_df = dd.read_csv(file, usecols=['price', 'qty', 'quote_qty', 'time', 'is_buyer_maker'], dtype={'price': float, 'qty': float, 'quote_qty': float, 'time': float})
    
    try:
        # 변환 함수 적용
        computed_df = dask_df.compute()
        # 예외 처리
        try:
            processed_df = convert_tick_to_ohlcv(computed_df)
            combined_dfs.append(processed_df)
        except Exception as e:
            print(f"Error processing file {file}: {e}")
    except Exception as e:
        print(f"Error reading file {file}: {e}")

# 모든 파일 처리가 끝나면 리스트에 있는 모든 DataFrame을 concat하여 하나의 DataFrame으로 만듦
combined_df = pd.concat(combined_dfs, ignore_index=False)


Reading and processing file: ../data/BTCUSDT-trades-2024-01-16.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-17.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-18.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-19.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-20.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-21.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-22.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-23.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-24.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-25.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-26.csv
Reading and processing file: ../data/BTCUSDT-trades-2024-01-27.csv


- 변동성 계산

In [32]:
volatility_data = calculate_volatility(combined_df)

In [33]:
df = volatility_data.copy()

In [34]:
#df.set_index('time', inplace=True)
df.index = pd.to_datetime(df.index)
df.isnull().sum()
dfc = df.copy()

### Sloving Caution

In [35]:
# 경고 해결용(returns ARIMA) - 인덱스에 주파수 정보 추가

# 기존 데이터프레임의 인덱스를 활용하여 새로운 인덱스 생성
new_date_rng = pd.date_range(start=df.index.min(), end=df.index.max(), freq='h')

# 새로운 인덱스를 기존 데이터프레임의 인덱스로 설정
df = df.reindex(new_date_rng)

## 1. Preprocessing

### 1-1. Checking Missing Value

#### - price(Open, High, Low, Close, price), quote_qty

In [36]:
display(df[df['Open'].isnull()].index)

DatetimeIndex([], dtype='datetime64[ns]', freq='H')

#### - returns

In [37]:
df[df['returns'].isnull()].index # 매달 1일 1시간

DatetimeIndex(['2024-01-16'], dtype='datetime64[ns]', freq='H')

#### - volatility

In [38]:
df[df['volatility'].isnull()].index

DatetimeIndex(['2024-01-16 00:00:00', '2024-01-16 01:00:00',
               '2024-01-16 02:00:00', '2024-01-16 03:00:00',
               '2024-01-16 04:00:00', '2024-01-16 05:00:00',
               '2024-01-16 06:00:00', '2024-01-16 07:00:00',
               '2024-01-16 08:00:00', '2024-01-16 09:00:00',
               '2024-01-16 10:00:00', '2024-01-16 11:00:00',
               '2024-01-16 12:00:00', '2024-01-16 13:00:00',
               '2024-01-16 14:00:00', '2024-01-16 15:00:00',
               '2024-01-16 16:00:00', '2024-01-16 17:00:00',
               '2024-01-16 18:00:00', '2024-01-16 19:00:00'],
              dtype='datetime64[ns]', freq='H')

### 1-1-2. Interpolate MIssing Value - functions

In [39]:
# spline 보간
def spline_interpolate_column(df, column_name, order=3):
    df[column_name] = df[column_name].interpolate(method='spline', order=order)

In [40]:
# ARIMA 보간
def ARIMA_Interpolate(df, column_list):
    
    for column in column_list:
        df_column = df[column]  # 해당 컬럼 선택

        # 최적의 ARIMA 모델 선택
        autoarima_model = pm.auto_arima(df_column.dropna(), suppress_warnings=True)
        order = autoarima_model.order
        print(order)
        
        # 결측치 보간 전 ARIMA 모델 훈련
        model = sm.tsa.ARIMA(df_column.dropna(), order=order)  # p, d, q는 ARIMA 모델의 차수
        results = model.fit()

        # 보간할 결측치 선택
        missing_indices = df_column[df_column.isnull()].index

        # 결측치 보간
        for idx in missing_indices:
            # ARIMA 모델을 사용하여 결측치 예측
            predicted_value = results.get_forecast(steps=1).predicted_mean.iloc[0]  # 예측 결과의 첫 번째 값을 사용

            # 결측치 보간
            df_column[idx] = predicted_value

### 1-1-3. Interpolate Missing Value - Interpolate

#### - price(Open, High, Low, Close, price), quote_qty

In [41]:
columns_to_interpolate = ['Open', 'High', 'Low', 'Close']

for column in columns_to_interpolate:
    spline_interpolate_column(df, column, order=3)

In [42]:
columns_to_interpolate_volume = ['quote_qty']

for column in columns_to_interpolate_volume:
    spline_interpolate_column(df, column, order=3)

In [43]:
df.isna().sum()

Open               0
High               0
Low                0
Close              0
Volume             0
quote_qty          0
is_buyer_maker     0
returns            1
volatility        20
dtype: int64

#### - returns

In [44]:
returns_list=['returns']
ARIMA_Interpolate(df, returns_list)

(0, 0, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


#### - volatility

In [45]:
volatility_list=['volatility']
ARIMA_Interpolate(df, volatility_list)

(2, 0, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


In [46]:
df.isna().sum()

Open              0
High              0
Low               0
Close             0
Volume            0
quote_qty         0
is_buyer_maker    0
returns           0
volatility        0
dtype: int64

### 1-2. Technical Indicators

#### MACD Function (Moving Average Convergence Divergence)
Calculates the difference between short-term and long-term moving averages to indicate trends.
Quickly detects trends but may be sensitive to noise.
#### Stochastic Oscillator Function
An indicator that reflects the trend of price fluctuations in financial markets such as stocks.
Calculates the relative strength of the price based on the highest (High) and lowest (Low) prices over the last N days.
#### ATR Function (Average True Range)
An indicator that represents the average range of price movements.
Calculates volatility using True Range.
#### Bollinger Bands Function
An indicator that uses moving averages and standard deviation to assess the relative height of prices.
Forms upper and lower bands around the moving average, indicating deviations in stock prices.
#### ROC Function (Rate of Change)
Measures the relative price change over a certain period.
Analyzes the flow of stock prices by comparing the current price with the price a certain period ago.
#### RSI Function (Relative Strength Index)
Relative Strength Index indicating the relative strength of price movements over a specific period.
Generally, values above 70 indicate an uptrend, while values below 30 indicate a downtrend.
#### Ultimate Oscillator Function
A comprehensive indicator that combines relative strength for various periods.
Provides a richer analysis of the stock price trend.

###  1-2-1. Technical Indicators - functions

In [47]:
# MACD 함수
def MACD(data, short_window, long_window, signal_window):
    
    data[f'Short_MA_{short_window}'] = data['Close'].ewm(span=short_window, adjust=False).mean()
    data[f'Long_MA_{long_window}'] = data['Close'].ewm(span=long_window, adjust=False).mean()
    data[f'MACD_{short_window}{long_window}'] = data[f'Short_MA_{short_window}'] - data[f'Long_MA_{long_window}']
    data[f'Signal_Line_{signal_window}'] = data[f'MACD_{short_window}{long_window}'].ewm(span=signal_window, adjust=False).mean()
    data[f'MACD_Oscillator_{short_window}{long_window}'] = data[f'MACD_{short_window}{long_window}'] - data[f'Signal_Line_{signal_window}']
    
    data.drop([f'Short_MA_{short_window}', f'Long_MA_{long_window}'], axis=1, inplace=True)
    
    return data
    
    
# Stochastic Oscillator 함수
def cal_stoc_os(df, period, m):
    
    # 최근 N일간의 최고가(H)와 최저가(L)
    df[f'SO high_{period}{m}'] = df['High'].rolling(window=period).max()
    df[f'SO low_{period}{m}'] = df['Low'].rolling(window=period).min()
    
    # %K 계산
    df[f'SO %K_{period}{m}'] = (df['Close'] - df[f'SO low_{period}{m}']) / (df[f'SO high_{period}{m}'] - df[f'SO low_{period}{m}']) * 100
    
    # %D 계산
    df[f'SO %D_{period}{m}'] = df[f'SO %K_{period}{m}'].rolling(m).mean()
    
    return df
    
    
# ATR 함수
def calculate_atr(df, period):
    # True Range 계산
    df['high-low'] = df['High'] - df['Low']
    df['high-close'] = abs(df['High'] - df['Close'].shift())
    df['low-close'] = abs(df['Low'] - df['Close'].shift())
    
    df['true_range'] = df[['high-low', 'high-close', 'low-close']].max(axis=1)
    
    # ATR 계산
    df[f'atr_{period}'] = df['true_range'].rolling(window=period, min_periods=1).mean()
    
    # 임시 열 삭제
    df.drop(['high-low', 'high-close', 'low-close', 'true_range'], axis=1, inplace=True)
    
    return df


# Bollinger_band 함수
def bollinger_bands(df, window, num_std_dev):
    
    # 이동평균 계산
    df['rolling_mean'] = df['Close'].rolling(window=window).mean()

    # 표준편차 계산
    df['rolling_std'] = df['Close'].rolling(window=window).std()

    # 볼린저 밴드 상단 및 하단 계산
    df[f'upper_band_{window}'] = df['rolling_mean'] + (num_std_dev * df['rolling_std'])
    df[f'lower_band_{window}'] = df['rolling_mean'] - (num_std_dev * df['rolling_std'])

    # 필요없는 컬럼 삭제
    df.drop(['rolling_mean', 'rolling_std'], axis=1, inplace=True)

    return df


# ROC 계산 함수
def calculate_roc(df, close_column, timeperiod):
    prev_close = df[close_column].shift(timeperiod)
    df[f'roc_{timeperiod}'] = ((df[close_column] - prev_close) / prev_close) * 100
    
    return df


# RSI 계산 함수
def calculate_rsi(df, close_column, timeperiod):
    delta = df[close_column].diff(1)
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    avg_gain = gain.rolling(window=timeperiod, min_periods=1).mean()
    avg_loss = loss.rolling(window=timeperiod, min_periods=1).mean()

    rs = avg_gain / avg_loss
    df[f'rsi_{timeperiod}'] = 100 - (100 / (1 + rs))
    
    return df


# Ultimate Oscillator 함수
def calculate_uo(df, period1, period2, period3):
    # True Range 계산
    df['true_range'] = df['High'].combine(df['Close'].shift(1), max) - df['Low'].combine(df['Close'].shift(1), min)
    
    # Average True Range (ATR) 계산
    df['avg_true_range1'] = df['true_range'].rolling(window=period1, min_periods=1).mean()
    df['avg_true_range2'] = df['true_range'].rolling(window=period2, min_periods=1).mean()
    df['avg_true_range3'] = df['true_range'].rolling(window=period3, min_periods=1).mean()

    # Buying Pressure 계산
    df['buying_pressure'] = df['Close'] - df[['Open', 'Close']].min(axis=1)

    # True Buying Pressure (TBP) 계산
    df['true_buying_pressure1'] = df['buying_pressure'].rolling(window=period1, min_periods=1).sum()
    df['true_buying_pressure2'] = df['buying_pressure'].rolling(window=period2, min_periods=1).sum()
    df['true_buying_pressure3'] = df['buying_pressure'].rolling(window=period3, min_periods=1).sum()

    # UO 계산
    df[f'UO_{period1}{period2}{period3}'] = (
        100
        * (4 * df['true_buying_pressure1'] / df['avg_true_range1'])
        + 2 * (4 * df['true_buying_pressure2'] / df['avg_true_range2'])
        + (df['true_buying_pressure3'] / df['avg_true_range3'])
    ) / (4 + 2 + 1)

    # 필요 없는 열 제거
    df = df.drop(['true_range', 'avg_true_range1', 'avg_true_range2', 'avg_true_range3',
                  'buying_pressure', 'true_buying_pressure1', 'true_buying_pressure2', 'true_buying_pressure3'], axis=1)

    return df

### 1-2-2. Technical Indicators - Generate

In [48]:
# df와 df_idc의 구분을 위한 copy
df_copy=df.copy()

In [49]:
df_idc = MACD(df_copy, 6, 13, 4) # 단기추세 감지 but 노이즈에 민감할 가능성 있음
df_idc = MACD(df_idc, 12, 26, 9) # 일반적인 window 계수 but 변동성이 큰 시장에선 느리게 반응할 수 있음
df_idc = MACD(df_idc, 5, 26, 9) 

df_idc = calculate_atr(df_idc, period=14) 
df_idc = calculate_atr(df_idc, period=20)

df_idc = cal_stoc_os(df_idc, period=20, m=2)
df_idc = cal_stoc_os(df_idc, period=14, m=2)
df_idc = cal_stoc_os(df_idc, period=30, m=2)

df_idc = cal_stoc_os(df_idc, period=20, m=3)
df_idc = cal_stoc_os(df_idc, period=14, m=3)
df_idc = cal_stoc_os(df_idc, period=30, m=3)

df_idc = cal_stoc_os(df_idc, period=20, m=5)
df_idc = cal_stoc_os(df_idc, period=14, m=5)
df_idc = cal_stoc_os(df_idc, period=30, m=5)

df_idc = cal_stoc_os(df_idc, period=20, m=10)
df_idc = cal_stoc_os(df_idc, period=14, m=10)
df_idc = cal_stoc_os(df_idc, period=30, m=10)

df_idc = bollinger_bands(df_idc, window=20, num_std_dev=1.5)
df_idc = bollinger_bands(df_idc, window=15, num_std_dev=1.5)
df_idc = bollinger_bands(df_idc, window=10, num_std_dev=1.5)
df_idc = bollinger_bands(df_idc, window=20, num_std_dev=2)
df_idc = bollinger_bands(df_idc, window=15, num_std_dev=2)
df_idc = bollinger_bands(df_idc, window=10, num_std_dev=2)

df_idc = calculate_roc(df_idc, close_column='Close', timeperiod=1)
df_idc = calculate_roc(df_idc, close_column='Close', timeperiod=2)

df_idc = calculate_rsi(df_idc, close_column='Close', timeperiod=7)
df_idc = calculate_rsi(df_idc, close_column='Close', timeperiod=9)
df_idc = calculate_rsi(df_idc, close_column='Close', timeperiod=14)

df_idc_all = calculate_uo(df_idc, 7, 10, 14)
df_idc_all = calculate_uo(df_idc, 7, 9, 11)
df_idc_all = calculate_uo(df_idc, 7, 10, 12)

In [50]:
df_idc_all.shape

(288, 81)

### 1-3. Checking Missing Value

In [51]:
df_idc_all.isna().sum()

Open                     0
High                     0
Low                      0
Close                    0
Volume                   0
quote_qty                0
is_buyer_maker           0
returns                  0
volatility               0
MACD_613                 0
Signal_Line_4            0
MACD_Oscillator_613      0
MACD_1226                0
Signal_Line_9            0
MACD_Oscillator_1226     0
MACD_526                 0
MACD_Oscillator_526      0
atr_14                   0
atr_20                   0
SO high_202             19
SO low_202              19
SO %K_202               19
SO %D_202               20
SO high_142             13
SO low_142              13
SO %K_142               13
SO %D_142               14
SO high_302             29
SO low_302              29
SO %K_302               29
SO %D_302               30
SO high_203             19
SO low_203              19
SO %K_203               19
SO %D_203               21
SO high_143             13
SO low_143              13
S

#### - SO

In [52]:
SO_list = ['SO high_202', 'SO low_202', 'SO %K_202', 'SO %D_202',
       'SO high_142', 'SO low_142', 'SO %K_142', 'SO %D_142', 'SO high_302',
       'SO low_302', 'SO %K_302', 'SO %D_302', 'SO high_203', 'SO low_203',
       'SO %K_203', 'SO %D_203', 'SO high_143', 'SO low_143', 'SO %K_143',
       'SO %D_143', 'SO high_303', 'SO low_303', 'SO %K_303', 'SO %D_303',
       'SO high_205', 'SO low_205', 'SO %K_205', 'SO %D_205', 'SO high_145',
       'SO low_145', 'SO %K_145', 'SO %D_145', 'SO high_305', 'SO low_305',
       'SO %K_305', 'SO %D_305', 'SO high_2010', 'SO low_2010', 'SO %K_2010',
       'SO %D_2010', 'SO high_1410', 'SO low_1410', 'SO %K_1410', 'SO %D_1410',
       'SO high_3010', 'SO low_3010', 'SO %K_3010', 'SO %D_3010']

ARIMA_Interpolate(df_idc_all, SO_list)

(1, 1, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(4, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 2, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 2, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 2, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(5, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 2, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(2, 1, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


#### - BB

In [53]:
BB_list = ['upper_band_20', 'lower_band_20', 'upper_band_15','lower_band_15','upper_band_10','lower_band_10']  

ARIMA_Interpolate(df_idc_all, BB_list)

(1, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


(1, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_column[idx] = predicted_value


#### - roc, rsi

In [54]:
columns_to_bfill = ['roc_1', 'roc_2', 'rsi_7', 'rsi_9', 'rsi_14']

for column in columns_to_bfill:
    df_idc_all[column] = df_idc_all[column].bfill()

In [55]:
df_idc_all.isna().sum()

Open                    0
High                    0
Low                     0
Close                   0
Volume                  0
quote_qty               0
is_buyer_maker          0
returns                 0
volatility              0
MACD_613                0
Signal_Line_4           0
MACD_Oscillator_613     0
MACD_1226               0
Signal_Line_9           0
MACD_Oscillator_1226    0
MACD_526                0
MACD_Oscillator_526     0
atr_14                  0
atr_20                  0
SO high_202             0
SO low_202              0
SO %K_202               0
SO %D_202               0
SO high_142             0
SO low_142              0
SO %K_142               0
SO %D_142               0
SO high_302             0
SO low_302              0
SO %K_302               0
SO %D_302               0
SO high_203             0
SO low_203              0
SO %K_203               0
SO %D_203               0
SO high_143             0
SO low_143              0
SO %K_143               0
SO %D_143   

In [56]:
df_idc_all.head()

Unnamed: 0,Open,High,Low,Close,Volume,quote_qty,is_buyer_maker,returns,volatility,MACD_613,Signal_Line_4,MACD_Oscillator_613,MACD_1226,Signal_Line_9,MACD_Oscillator_1226,MACD_526,MACD_Oscillator_526,atr_14,atr_20,SO high_202,SO low_202,SO %K_202,SO %D_202,SO high_142,SO low_142,SO %K_142,SO %D_142,SO high_302,SO low_302,SO %K_302,SO %D_302,SO high_203,SO low_203,SO %K_203,SO %D_203,SO high_143,SO low_143,SO %K_143,SO %D_143,SO high_303,SO low_303,SO %K_303,SO %D_303,SO high_205,SO low_205,SO %K_205,SO %D_205,SO high_145,SO low_145,SO %K_145,SO %D_145,SO high_305,SO low_305,SO %K_305,SO %D_305,SO high_2010,SO low_2010,SO %K_2010,SO %D_2010,SO high_1410,SO low_1410,SO %K_1410,SO %D_1410,SO high_3010,SO low_3010,SO %K_3010,SO %D_3010,upper_band_20,lower_band_20,upper_band_15,lower_band_15,upper_band_10,lower_band_10,roc_1,roc_2,rsi_7,rsi_9,rsi_14,UO_71014,UO_7911,UO_71012
2024-01-16 00:00:00,42515.0,42679.0,42466.1,42604.7,4807.989,8579.550704,39667,-3.6e-05,0.002261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212.9,212.9,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.97846,42249.822736,41379.363116,82.252041,85.17078,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.85525,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,-0.030513,0.228613,0.0,0.0,0.0,24.617392,24.617392,24.617392
2024-01-16 01:00:00,42604.6,42733.7,42556.6,42591.7,4728.842,9032.908754,37545,-0.000305,0.002261,-1.857143,-0.742857,-1.114286,-1.037037,-0.674074,-0.82963,-3.37037,-2.696296,195.0,195.0,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.97846,42249.822736,41379.363116,82.252041,85.17078,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.85525,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,-0.030513,0.228613,0.0,0.0,0.0,26.877143,26.877143,26.877143
2024-01-16 02:00:00,42591.7,42732.8,42552.0,42702.1,5519.024,14268.124236,36255,0.002592,0.002261,12.853061,4.69551,8.157551,6.969125,4.11166,5.741226,23.254595,19.142936,190.266667,190.266667,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.97846,42249.822736,41379.363116,82.252041,85.17078,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.85525,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,0.259205,0.228613,89.465154,89.465154,89.465154,61.448268,61.448268,61.448268
2024-01-16 03:00:00,42702.1,42942.0,42695.7,42919.5,7032.131,15921.078311,43162,0.005091,0.002261,52.391837,23.774041,28.617796,30.504789,22.385035,23.421512,95.478535,73.0935,204.275,204.275,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.97846,42249.822736,41379.363116,82.252041,85.17078,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.85525,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,0.509108,0.769634,96.185446,96.185446,96.185446,119.417102,119.417102,119.417102
2024-01-16 04:00:00,42919.3,42945.0,42759.9,42805.2,5516.792,13639.276794,41625,-0.002663,0.002261,58.132237,37.517319,20.614917,39.478831,39.522105,25.916443,108.070385,68.54828,200.44,200.44,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.97846,42249.822736,41379.363116,82.252041,85.17078,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.85525,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,-0.266313,0.24144,72.028126,72.028126,72.028126,121.701899,121.701899,121.701899


In [57]:
df_idc_all.tail()

Unnamed: 0,Open,High,Low,Close,Volume,quote_qty,is_buyer_maker,returns,volatility,MACD_613,Signal_Line_4,MACD_Oscillator_613,MACD_1226,Signal_Line_9,MACD_Oscillator_1226,MACD_526,MACD_Oscillator_526,atr_14,atr_20,SO high_202,SO low_202,SO %K_202,SO %D_202,SO high_142,SO low_142,SO %K_142,SO %D_142,SO high_302,SO low_302,SO %K_302,SO %D_302,SO high_203,SO low_203,SO %K_203,SO %D_203,SO high_143,SO low_143,SO %K_143,SO %D_143,SO high_303,SO low_303,SO %K_303,SO %D_303,SO high_205,SO low_205,SO %K_205,SO %D_205,SO high_145,SO low_145,SO %K_145,SO %D_145,SO high_305,SO low_305,SO %K_305,SO %D_305,SO high_2010,SO low_2010,SO %K_2010,SO %D_2010,SO high_1410,SO low_1410,SO %K_1410,SO %D_1410,SO high_3010,SO low_3010,SO %K_3010,SO %D_3010,upper_band_20,lower_band_20,upper_band_15,lower_band_15,upper_band_10,lower_band_10,roc_1,roc_2,rsi_7,rsi_9,rsi_14,UO_71014,UO_7911,UO_71012
2024-01-27 19:00:00,41785.0,41977.0,41775.1,41896.4,7765.565,18797.517191,38285,0.002668,0.002019,40.403682,30.996484,9.407198,146.529443,210.556287,-34.805681,189.87565,-20.680636,152.521429,145.215,41977.0,41360.1,86.934673,78.656203,41977.0,41360.1,86.934673,84.282095,42239.3,41002.1,72.28419,69.303734,41977.0,41360.1,86.934673,73.936576,41977.0,41360.1,86.934673,82.405076,42239.3,41002.1,72.28419,67.927335,41977.0,41360.1,86.934673,67.082359,41977.0,41360.1,86.934673,82.564744,42239.3,41002.1,72.28419,70.994755,41977.0,41360.1,86.934673,55.794468,41977.0,41360.1,86.934673,72.340528,42239.3,41002.1,72.28419,74.633851,41926.282235,41574.097765,41919.616474,41543.050193,41873.682804,41669.537196,0.266843,0.30405,86.169284,82.137285,58.884892,98.254035,97.631804,98.239248
2024-01-27 20:00:00,41896.5,42070.0,41896.4,42049.6,5408.243,16324.490996,35317,0.003657,0.002154,69.666208,46.464374,23.201834,157.992808,215.865915,-18.673852,237.104426,21.238511,155.557143,145.49,42070.0,41360.1,97.126356,92.030515,42070.0,41360.1,97.126356,92.030515,42239.3,41200.0,81.74733,77.01576,42070.0,41360.1,97.126356,84.812921,42070.0,41360.1,97.126356,88.563515,42239.3,41200.0,81.74733,73.4516,42070.0,41360.1,97.126356,76.559951,42070.0,41360.1,97.126356,85.359477,42239.3,41200.0,81.74733,71.387407,42070.0,41360.1,97.126356,60.392123,42070.0,41360.1,97.126356,76.524067,42239.3,41200.0,81.74733,74.961314,41985.31547,41542.74453,42001.586587,41501.266747,42002.304354,41604.515646,0.365664,0.633482,92.606285,88.448928,68.997644,157.603487,157.318754,157.646731
2024-01-27 21:00:00,42049.6,42165.6,42001.8,42137.8,5859.277,12457.410066,42103,0.002098,0.002189,97.338521,66.814032,30.524488,172.209487,229.346777,-3.565739,283.270228,53.92345,157.992857,145.19,42165.6,41360.1,96.548727,96.837542,42165.6,41360.1,96.548727,96.837542,42239.3,41360.1,88.455414,85.101372,42165.6,41360.1,96.548727,93.536586,42165.6,41360.1,96.548727,93.536586,42239.3,41360.1,88.455414,80.828978,42165.6,41360.1,96.548727,83.096962,42165.6,41360.1,96.548727,88.178062,42239.3,41360.1,88.455414,74.79695,42165.6,41360.1,96.548727,65.672564,42165.6,41360.1,96.548727,80.118429,42239.3,41360.1,88.455414,75.90404,42057.912127,41503.437873,42098.101003,41463.818997,42127.070891,41558.369109,0.209752,0.576183,93.761141,93.263918,71.553503,177.524225,177.31981,177.457956
2024-01-27 22:00:00,42137.8,42187.1,42057.6,42135.3,5271.863,16107.925688,39498,-5.9e-05,0.002191,109.95061,84.068663,25.881946,181.185979,244.302454,4.328603,304.125161,59.822707,144.371429,147.45,42187.1,41360.1,93.736397,95.142562,42187.1,41360.1,93.736397,95.142562,42239.3,41360.1,88.171065,88.313239,42187.1,41360.1,93.736397,95.803827,42187.1,41360.1,93.736397,95.803827,42239.3,41360.1,88.171065,86.124603,42187.1,41360.1,93.736397,88.944777,42187.1,41360.1,93.736397,91.195134,42239.3,41360.1,88.171065,79.396255,42187.1,41360.1,93.736397,70.832146,42187.1,41360.1,93.736397,83.353898,42239.3,41360.1,88.171065,76.878118,42115.61209,41477.04791,42171.85903,41448.22097,42208.820103,41557.559897,-0.005933,0.203807,92.353973,93.502343,93.779781,162.103538,161.749787,161.898956
2024-01-27 23:00:00,42135.4,42156.7,42055.4,42102.2,4399.8,14341.036645,33102,-0.000786,0.0022,108.455945,93.823576,14.632369,183.51359,255.62353,5.324971,300.907832,45.284303,125.471429,147.565,42187.1,41360.1,89.733978,91.735187,42187.1,41632.0,84.705458,89.220928,42239.3,41360.1,84.406278,86.288672,42187.1,41360.1,89.733978,93.339701,42187.1,41632.0,84.705458,91.663528,42239.3,41360.1,84.406278,87.010919,42187.1,41360.1,89.733978,92.816026,42187.1,41632.0,84.705458,91.810322,42239.3,41360.1,84.406278,83.012855,42187.1,41360.1,89.733978,75.633569,42187.1,41632.0,84.705458,85.319744,42239.3,41360.1,84.406278,77.551814,42158.587642,41463.772358,42186.923584,41519.396416,42252.73906,41588.70094,-0.078556,-0.084485,86.927796,86.870355,86.625041,163.501062,163.153819,163.400034


In [58]:
#df_idc_all.to_csv('../data/final_dask.csv', index_label='time')

## 2. Feature Engineering

### 2-1. Feature combination optimization

In [59]:
# Test model

def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rf_base(df, df_name, n_splits): 
    
    # Feature와 Target 나누기
    X = df.drop(columns=['volatility'])
    y = df['volatility']

    tscv = TimeSeriesSplit(n_splits)
    
    mape_list = []
    rmse_list = []

    # TimeSeriesSplit을 사용하여 데이터 분할
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        

        # RandomForestRegressor(베이지안옵티마이저 )
        rf_model = RandomForestRegressor(max_depth=None,
                                         max_features='sqrt',
                                         n_estimators=200,
                                         min_samples_split=5, 
                                         min_samples_leaf=2, 
                                         max_leaf_nodes=None,
                                         random_state=42)  # random_state 추가


        # 모델 학습
        rf_model.fit(X_train, y_train)

        # 예측
        y_pred = rf_model.predict(X_test)

        # 평가지표(MAPE, RMSE)
        test_mape = mean_absolute_percentage_error(y_test, y_pred)
        test_rmse = calculate_rmse(y_test, y_pred)
        
        # 결과 저장
        mape_list.append(test_mape)
        rmse_list.append(test_rmse)
        
        print(f'{df_name} : MAPE: {test_mape}, RMSE: {test_rmse}')
    
    # 평균 결과
    print(f'{df_name} : Average MAPE: {np.mean(mape_list)}, Average RMSE: {np.mean(rmse_list)}')

In [60]:
df_idc_all.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'quote_qty', 'is_buyer_maker',
       'returns', 'volatility', 'MACD_613', 'Signal_Line_4',
       'MACD_Oscillator_613', 'MACD_1226', 'Signal_Line_9',
       'MACD_Oscillator_1226', 'MACD_526', 'MACD_Oscillator_526', 'atr_14',
       'atr_20', 'SO high_202', 'SO low_202', 'SO %K_202', 'SO %D_202',
       'SO high_142', 'SO low_142', 'SO %K_142', 'SO %D_142', 'SO high_302',
       'SO low_302', 'SO %K_302', 'SO %D_302', 'SO high_203', 'SO low_203',
       'SO %K_203', 'SO %D_203', 'SO high_143', 'SO low_143', 'SO %K_143',
       'SO %D_143', 'SO high_303', 'SO low_303', 'SO %K_303', 'SO %D_303',
       'SO high_205', 'SO low_205', 'SO %K_205', 'SO %D_205', 'SO high_145',
       'SO low_145', 'SO %K_145', 'SO %D_145', 'SO high_305', 'SO low_305',
       'SO %K_305', 'SO %D_305', 'SO high_2010', 'SO low_2010', 'SO %K_2010',
       'SO %D_2010', 'SO high_1410', 'SO low_1410', 'SO %K_1410', 'SO %D_1410',
       'SO high_3010', 'SO low_3010',

In [61]:
df_idc_all

Unnamed: 0,Open,High,Low,Close,Volume,quote_qty,is_buyer_maker,returns,volatility,MACD_613,Signal_Line_4,MACD_Oscillator_613,MACD_1226,Signal_Line_9,MACD_Oscillator_1226,MACD_526,MACD_Oscillator_526,atr_14,atr_20,SO high_202,SO low_202,SO %K_202,SO %D_202,SO high_142,SO low_142,SO %K_142,SO %D_142,SO high_302,SO low_302,SO %K_302,SO %D_302,SO high_203,SO low_203,SO %K_203,SO %D_203,SO high_143,SO low_143,SO %K_143,SO %D_143,SO high_303,SO low_303,SO %K_303,SO %D_303,SO high_205,SO low_205,SO %K_205,SO %D_205,SO high_145,SO low_145,SO %K_145,SO %D_145,SO high_305,SO low_305,SO %K_305,SO %D_305,SO high_2010,SO low_2010,SO %K_2010,SO %D_2010,SO high_1410,SO low_1410,SO %K_1410,SO %D_1410,SO high_3010,SO low_3010,SO %K_3010,SO %D_3010,upper_band_20,lower_band_20,upper_band_15,lower_band_15,upper_band_10,lower_band_10,roc_1,roc_2,rsi_7,rsi_9,rsi_14,UO_71014,UO_7911,UO_71012
2024-01-16 00:00:00,42515.0,42679.0,42466.1,42604.7,4807.989,8579.550704,39667,-0.000036,0.002261,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,212.900000,212.900000,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.978460,42249.822736,41379.363116,82.252041,85.170780,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.855250,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,-0.030513,0.228613,0.000000,0.000000,0.000000,24.617392,24.617392,24.617392
2024-01-16 01:00:00,42604.6,42733.7,42556.6,42591.7,4728.842,9032.908754,37545,-0.000305,0.002261,-1.857143,-0.742857,-1.114286,-1.037037,-0.674074,-0.829630,-3.370370,-2.696296,195.000000,195.000000,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.978460,42249.822736,41379.363116,82.252041,85.170780,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.855250,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,-0.030513,0.228613,0.000000,0.000000,0.000000,26.877143,26.877143,26.877143
2024-01-16 02:00:00,42591.7,42732.8,42552.0,42702.1,5519.024,14268.124236,36255,0.002592,0.002261,12.853061,4.695510,8.157551,6.969125,4.111660,5.741226,23.254595,19.142936,190.266667,190.266667,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.978460,42249.822736,41379.363116,82.252041,85.170780,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.855250,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,0.259205,0.228613,89.465154,89.465154,89.465154,61.448268,61.448268,61.448268
2024-01-16 03:00:00,42702.1,42942.0,42695.7,42919.5,7032.131,15921.078311,43162,0.005091,0.002261,52.391837,23.774041,28.617796,30.504789,22.385035,23.421512,95.478535,73.093500,204.275000,204.275000,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.978460,42249.822736,41379.363116,82.252041,85.170780,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.855250,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,0.509108,0.769634,96.185446,96.185446,96.185446,119.417102,119.417102,119.417102
2024-01-16 04:00:00,42919.3,42945.0,42759.9,42805.2,5516.792,13639.276794,41625,-0.002663,0.002261,58.132237,37.517319,20.614917,39.478831,39.522105,25.916443,108.070385,68.548280,200.440000,200.440000,42165.447309,41360.337721,89.733978,88.668532,42189.580003,41726.94223,84.705458,85.116387,42249.822736,41379.363116,82.252041,83.247189,42165.447309,41360.337721,89.733978,89.555258,42189.580003,41726.94223,84.705458,87.455097,42249.822736,41379.363116,82.252041,85.112845,42165.447309,41360.337721,89.733978,93.277473,42189.580003,41726.94223,84.705458,90.978460,42249.822736,41379.363116,82.252041,85.170780,42165.447309,41360.337721,89.733978,79.365508,42189.580003,41726.94223,84.705458,86.855250,42249.822736,41379.363116,82.252041,78.053779,42195.250345,41452.178714,42197.458529,41567.750244,42280.709474,41608.070371,-0.266313,0.241440,72.028126,72.028126,72.028126,121.701899,121.701899,121.701899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-27 19:00:00,41785.0,41977.0,41775.1,41896.4,7765.565,18797.517191,38285,0.002668,0.002019,40.403682,30.996484,9.407198,146.529443,210.556287,-34.805681,189.875650,-20.680636,152.521429,145.215000,41977.000000,41360.100000,86.934673,78.656203,41977.000000,41360.10000,86.934673,84.282095,42239.300000,41002.100000,72.284190,69.303734,41977.000000,41360.100000,86.934673,73.936576,41977.000000,41360.10000,86.934673,82.405076,42239.300000,41002.100000,72.284190,67.927335,41977.000000,41360.100000,86.934673,67.082359,41977.000000,41360.10000,86.934673,82.564744,42239.300000,41002.100000,72.284190,70.994755,41977.000000,41360.100000,86.934673,55.794468,41977.000000,41360.10000,86.934673,72.340528,42239.300000,41002.100000,72.284190,74.633851,41926.282235,41574.097765,41919.616474,41543.050193,41873.682804,41669.537196,0.266843,0.304050,86.169284,82.137285,58.884892,98.254035,97.631804,98.239248
2024-01-27 20:00:00,41896.5,42070.0,41896.4,42049.6,5408.243,16324.490996,35317,0.003657,0.002154,69.666208,46.464374,23.201834,157.992808,215.865915,-18.673852,237.104426,21.238511,155.557143,145.490000,42070.000000,41360.100000,97.126356,92.030515,42070.000000,41360.10000,97.126356,92.030515,42239.300000,41200.000000,81.747330,77.015760,42070.000000,41360.100000,97.126356,84.812921,42070.000000,41360.10000,97.126356,88.563515,42239.300000,41200.000000,81.747330,73.451600,42070.000000,41360.100000,97.126356,76.559951,42070.000000,41360.10000,97.126356,85.359477,42239.300000,41200.000000,81.747330,71.387407,42070.000000,41360.100000,97.126356,60.392123,42070.000000,41360.10000,97.126356,76.524067,42239.300000,41200.000000,81.747330,74.961314,41985.315470,41542.744530,42001.586587,41501.266747,42002.304354,41604.515646,0.365664,0.633482,92.606285,88.448928,68.997644,157.603487,157.318754,157.646731
2024-01-27 21:00:00,42049.6,42165.6,42001.8,42137.8,5859.277,12457.410066,42103,0.002098,0.002189,97.338521,66.814032,30.524488,172.209487,229.346777,-3.565739,283.270228,53.923450,157.992857,145.190000,42165.600000,41360.100000,96.548727,96.837542,42165.600000,41360.10000,96.548727,96.837542,42239.300000,41360.100000,88.455414,85.101372,42165.600000,41360.100000,96.548727,93.536586,42165.600000,41360.10000,96.548727,93.536586,42239.300000,41360.100000,88.455414,80.828978,42165.600000,41360.100000,96.548727,83.096962,42165.600000,41360.10000,96.548727,88.178062,42239.300000,41360.100000,88.455414,74.796950,42165.600000,41360.100000,96.548727,65.672564,42165.600000,41360.10000,96.548727,80.118429,42239.300000,41360.100000,88.455414,75.904040,42057.912127,41503.437873,42098.101003,41463.818997,42127.070891,41558.369109,0.209752,0.576183,93.761141,93.263918,71.553503,177.524225,177.319810,177.457956
2024-01-27 22:00:00,42137.8,42187.1,42057.6,42135.3,5271.863,16107.925688,39498,-0.000059,0.002191,109.950610,84.068663,25.881946,181.185979,244.302454,4.328603,304.125161,59.822707,144.371429,147.450000,42187.100000,41360.100000,93.736397,95.142562,42187.100000,41360.10000,93.736397,95.142562,42239.300000,41360.100000,88.171065,88.313239,42187.100000,41360.100000,93.736397,95.803827,42187.100000,41360.10000,93.736397,95.803827,42239.300000,41360.100000,88.171065,86.124603,42187.100000,41360.100000,93.736397,88.944777,42187.100000,41360.10000,93.736397,91.195134,42239.300000,41360.100000,88.171065,79.396255,42187.100000,41360.100000,93.736397,70.832146,42187.100000,41360.10000,93.736397,83.353898,42239.300000,41360.100000,88.171065,76.878118,42115.612090,41477.047910,42171.859030,41448.220970,42208.820103,41557.559897,-0.005933,0.203807,92.353973,93.502343,93.779781,162.103538,161.749787,161.898956


#### **기본 칼럼 및 파생변수 1차 선별**
* 가격 관련 칼럼 Open, High, Low, Close 중 1개만 선별

> **시작 버전 : 기본 칼럼 + SO 관련 모두 + ROC 관련 모두 + ATR 관련 모두 + MACD 관련 모두**
>
> 이유 : 조합이 가장 많은 SO와 가장 단순한 ROC 그리고 타깃 변수와 관련성이 가장 높은 ATR, MACD부터 시작

In [62]:
# df1 = df_idc_all.loc[:,['Open', 'High', 'Low', 'Close', 'Volume', 'quote_qty', 'is_buyer_maker',
#        'returns', 'volatility', 'MACD_613', 'Signal_Line_4',
#        'MACD_Oscillator_613', 'MACD_1226', 'Signal_Line_9',
#        'MACD_Oscillator_1226', 'MACD_526', 'MACD_Oscillator_526', 'atr_14',
#        'atr_20', 'SO high_202', 'SO low_202', 'SO %K_202', 'SO %D_202',
#        'SO high_142', 'SO low_142', 'SO %K_142', 'SO %D_142', 'SO high_302',
#        'SO low_302', 'SO %K_302', 'SO %D_302', 'SO high_203', 'SO low_203',
#        'SO %K_203', 'SO %D_203', 'SO high_143', 'SO low_143', 'SO %K_143',
#        'SO %D_143', 'SO high_303', 'SO low_303', 'SO %K_303', 'SO %D_303',
#        'SO high_205', 'SO low_205', 'SO %K_205', 'SO %D_205', 'SO high_145',
#        'SO low_145', 'SO %K_145', 'SO %D_145', 'SO high_305', 'SO low_305',
#        'SO %K_305', 'SO %D_305', 'SO high_2010', 'SO low_2010', 'SO %K_2010',
#        'SO %D_2010', 'SO high_1410', 'SO low_1410', 'SO %K_1410', 'SO %D_1410',
#        'SO high_3010', 'SO low_3010', 'SO %K_3010', 'SO %D_3010', 'roc_1', 'roc_2']]

# # test_
# rf_base(df1, df_name='df1', n_splits=5)

In [63]:
# df2 = df_idc_all.loc[:,['Open', 'High', 'Low', 'Close', 'Volume', 'quote_qty', 'is_buyer_maker',
#        'returns', 'volatility', 'MACD_613', 'Signal_Line_4',
#        'MACD_Oscillator_613', 'MACD_1226', 'Signal_Line_9',
#        'MACD_Oscillator_1226', 'MACD_526', 'MACD_Oscillator_526', 'atr_14', 'atr_20', 
#        'SO low_202', 'SO low_142', 'SO low_302', 'SO low_203',
#        'SO low_143','SO low_303','SO low_205', 'SO low_145', 
#        'SO low_305','SO low_2010', 'SO low_1410', 'SO low_3010',
#        'SO high_202','SO high_142','SO high_302','SO high_203', 
#        'SO high_143', 'SO high_303','SO high_205', 'SO high_145',
#        'SO high_305','SO high_2010','SO high_1410','SO high_3010',
#        'roc_1', 'roc_2']]

# # test_
# rf_base(df2, df_name='df2', n_splits=5)

> **SO %K, %D 관련 모두 제거**

In [64]:
# df3 = df_idc_all.loc[:,['volatility', 'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#        'MACD_613', 'Signal_Line_4',
#        'MACD_Oscillator_613', 'MACD_1226', 'Signal_Line_9',
#        'MACD_Oscillator_1226', 'MACD_526', 'MACD_Oscillator_526', 'atr_14', 'atr_20', 
#        'SO low_202', 'SO low_142', 'SO low_302', 'SO low_203',
#        'SO low_143','SO low_303','SO low_205', 'SO low_145', 
#        'SO low_305','SO low_2010', 'SO low_1410', 'SO low_3010',
#        'SO high_202','SO high_142','SO high_302','SO high_203', 
#        'SO high_143', 'SO high_303','SO high_205', 'SO high_145',
#        'SO high_305','SO high_2010','SO high_1410','SO high_3010',
#        'roc_1', 'roc_2']]

# # test_
# rf_base(df3, df_name='df3', n_splits=5)

> **기본 칼럼 'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty' 선별 완료**

In [65]:
# df4 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                           'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                           'SO low_202', 'SO low_142', 'SO low_302', 'SO low_203',
#                           'SO low_143','SO low_303','SO low_205', 'SO low_145', 
#                           'SO low_305','SO low_2010', 'SO low_1410', 'SO low_3010',
#                           'SO high_202','SO high_142','SO high_302','SO high_203', 
#                           'SO high_143', 'SO high_303','SO high_205', 'SO high_145',
#                           'SO high_305','SO high_2010','SO high_1410','SO high_3010',
#                           'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4', 'roc_1', 'roc_2']]

# # test_
# rf_base(df4, df_name='df4', n_splits=5)

> **MACD 칼럼 'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4' 1차 선별 완료**

In [66]:
# df5 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                           'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                           'SO low_202', 'SO low_142', 'SO low_302', 'SO low_203',
#                           'SO low_143','SO low_303','SO low_205', 'SO low_145', 
#                           'SO low_305','SO low_2010', 'SO low_1410', 'SO low_3010',
#                           'SO high_202','SO high_142','SO high_302','SO high_203', 
#                           'SO high_143', 'SO high_303','SO high_205', 'SO high_145',
#                           'SO high_305','SO high_2010','SO high_1410','SO high_3010',
#                           'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4', 'roc_2']]

# # test_
# rf_base(df5, df_name='df5', n_splits=5)

> **ROC 칼럼 'roc_2' 선별 완료**

In [67]:
# df6 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_305', 'SO low_3010', 'SO high_305','SO high_3010',
#                          'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4',
#                          'roc_2']]

# # test_
# rf_base(df6, df_name='df6', n_splits=5)

> **SO 칼럼 'SO low_305', 'SO low_3010', 'SO high_305','SO high_3010' 만 선별**

#### **MACD 2차 선별**

> **df6(1) : MACD 세트**
> MAPE: 0.24491970782156525
> 개수 : 16개

> **df6(2) : MACD_1226 단독**
> MAPE: 0.24058672694187538
> 개수 : 13개

In [68]:
# # df6(1)
# # 개수 : 16개

# df6_1 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_305', 'SO low_3010', 'SO high_305','SO high_3010',
#                          'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4',
#                          'roc_2']]

# # test_
# rf_base(df6_1, df_name='df6_1', n_splits=5)

In [69]:
# # df6 (2)
# # 개수 : 13개

# df6_2 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_305', 'SO low_3010', 'SO high_305','SO high_3010',
#                          'MACD_1226',
#                          'roc_2']]

# # test_
# rf_base(df6_2, df_name='df6_2', n_splits=5)

#### **SO 2차 선별**

##### **SO 305 > 205, 3010**

> 결론 : **df76(2)버전 : MACD1226단독 + SO low_205, SO low_3010** 에서 시작하는 것이 좋다!

In [70]:
# # df7 (1) : df6(1) + 305 > 205
# # 개수 : 16개

# df7 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 'SO high_205','SO high_3010',
#                          'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4',
#                          'roc_2']]

# # test_
# rf_base(df7, df_name='df7', n_splits=5)

In [71]:
# # df8 : df6(2) + 305 > 205
# # 개수 : 13개

# df8 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 'SO high_205','SO high_3010',
#                          'MACD_1226', 
#                          'roc_2']]

# # test_
# rf_base(df8, df_name='df8', n_splits=5)

In [72]:
# # df9: df6 + 305 > 205 > low_205만
# # 개수 : 16개

# df9 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 'SO high_3010',
#                          'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4',
#                          'roc_2']]

# # test_
# rf_base(df9, df_name='df9', n_splits=5)

In [73]:
# # df10 : df6 + 305 > 205 > low_205만
# # 개수 : 13개

# df10 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 'SO high_3010',
#                          'MACD_1226',
#                          'roc_2']]

# # test_
# rf_base(df10, df_name='df10', n_splits=5)

In [74]:
# # df11: df6 > low_3010, low_205
# # 개수 : 15개

# df11 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4',
#                          'roc_2']]

# # test_
# rf_base(df11, df_name='df11', n_splits=5)

In [75]:
# # df12: df6 > low_3010, low_205
# # 개수 : 12개

# df12 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 
#                          'roc_2']]

# # test_
# rf_base(df12, df_name='df12', n_splits=5)

> 결론 : **df12버전 : MACD1226단독 + SO low_205, SO low_3010** 에서 시작하는 것이 좋다!

##### **SO 2010**

* SO 2010 추가

> **df14 버전 : df12버전 + SO low 2010**
> 
> MAPE: 0.2168803450494526


> **df14**(MACD1226, SO low 205, SO low 3010, SO low 2010) 에서 **MAPE 0.2168803450494526** 로 최저이다.
> 
> 위의 테스트 (2) 선별 과정을 보면 대체로 SO 변수는 추가될수록 MAPE 값이 상승하는 추세를 보인다.
> 
> 따라서 SO 변수는 더이상 추가하지 않고 다른 변수들을 추가하여 변수를 다양하게 활용해보고자 한다.

In [76]:
# # df13 : df12 + SO low/high 2010 
# # 개수 : 14개

# df13 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 'SO high_2010',
#                          'roc_2']]

# # test_
# rf_base(df13, df_name='df13', n_splits=5)

In [77]:
# # df14 : df12 + SO low 2010 
# # 개수 : 13개

# df14 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'roc_2']]

# # test_
# rf_base(df14, df_name='df14', n_splits=5)

In [78]:
# # df15 : df12 + SO high 2010 
# # 개수 : 13개

# df15 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO high_2010', 
#                          'roc_2']]

# # test_
# rf_base(df15, df_name='df15', n_splits=5)

#### **Bollinger Band 추가 선별**
* BB 관련 변수 lower/upper_band 추가

> **df17 버전: df14버전 + lower_band_20**
>
> MAPE: 0.22175690632707684
>
> 개수 : 14개

In [79]:
# # df16 : df14 + BB UP/LOW 20
# # 개수 : 15개

# df16 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'upper_band_20', 'lower_band_20',
#                          'roc_2']]

# # test_
# rf_base(df16, df_name='df16', n_splits=5)

In [80]:
# # df17 : df14 + BB LOW 20
# # 개수 : 14개

# df17 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20',
#                          'roc_2']]

# # test_
# rf_base(df17, df_name='df17', n_splits=5)

In [81]:
# # df18 : df14 + BB LOW 20
# # 개수 : 14개

# df18 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'upper_band_20',
#                          'roc_2']]

# # test_
# rf_base(df18, df_name='df18', n_splits=5)

In [82]:
# # df19 : df14 + BB LOW 20 + BB LOW 10
# # 개수 : 14개

# df19 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20', 'lower_band_10',
#                          'roc_2']]

# # test_
# rf_base(df19, df_name='df19', n_splits=5)

In [83]:
# # df20 : df14 + BB LOW 10
# # 개수 : 13개

# df20 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20', 'lower_band_10',
#                          'roc_2']]

# # test_
# rf_base(df20, df_name='df20', n_splits=5)

#### **RSI, UO 추가 선별**
* RSI, UO 관련 변수 추가
 
* 해당 변수들은 성능 좋았던 칼럼 목록에 포함되지 않지만, 다양한 변수를 최대한 활용해보고자 조합을 확인해보고자 한다.

#### **RSI 추가 선별**
* rsi_7, 9, 14 비교

> **df23버전 : df17버전 + 'rsi_14'**
>
> MAPE: 0.2246215630761636
>
> 개수 : 15개

In [84]:
# # df21 : df17 + 'rsi_7'
# # 개수 : 15개

# df21 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20',
#                          'rsi_7',
#                          'roc_2']]

# # test_
# rf_base(df21, df_name='df21', n_splits=5)

In [85]:
# # df22 : df17 + 'rsi_9'
# # 개수 : 15개

# df22 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20',
#                          'rsi_9',
#                          'roc_2']]

# # test_
# rf_base(df22, df_name='df22', n_splits=5)

In [86]:
# # df23 : df17 + 'rsi_14'
# # 개수 : 15개

# df23 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20',
#                          'rsi_14',
#                          'roc_2']]

# # test_
# rf_base(df23, df_name='df23', n_splits=5)

#### **UO 추가 선별**
* UO 관련 변수 추가

> df26버전 : df23버전 + 'UO_71012'
>
> MAPE: 0.22359679750755346
>
> 개수 : 16개

In [87]:
# # df24 : df23 + 'UO_71014'
# # 개수 : 16개

# df24 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20',
#                          'rsi_14',
#                          'UO_71014',
#                          'roc_2']]

# # test_
# rf_base(df24, df_name='df24', n_splits=5)

In [88]:
# # df25 : df23 + 'UO_7911'
# # 개수 : 16개

# df25 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20',
#                          'rsi_14',
#                          'UO_7911',
#                          'roc_2']]

# # test_
# rf_base(df25, df_name='df25', n_splits=5)

In [89]:
# # df26 : df23 + 'UO_71012'
# # 개수 : 16개

# df26 = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
#                          'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
#                          'SO low_205', 'SO low_3010', 
#                          'MACD_1226', 'SO low_2010', 
#                          'lower_band_20',
#                          'rsi_14',
#                          'UO_71012',
#                          'roc_2']]

# # test_
# rf_base(df26, df_name='df26', n_splits=5)

#### **최종 변수 조합 출력**

In [90]:
df_final = df_idc_all.loc[:,[ 'volatility', 'atr_14','atr_20',
                         'Close', 'Volume', 'returns', 'is_buyer_maker', 'quote_qty',
                         'SO low_205', 'SO low_3010', 
                         'MACD_1226', 'SO low_2010', 
                         'lower_band_20',
                         'rsi_14',
                         'UO_71012',
                         'roc_2']]

# MAPE 성능 확인
rf_base(df_final, df_name='df_final', n_splits=5)

df_final : MAPE: 0.3483532670769551, RMSE: 0.0023537327035657316
df_final : MAPE: 1.0708011930867407, RMSE: 0.0017154472205014027
df_final : MAPE: 0.16211490350989796, RMSE: 0.0008455914143904588
df_final : MAPE: 0.29490836248001134, RMSE: 0.0012778753130279214
df_final : MAPE: 0.17982546449696776, RMSE: 0.0008589370092773616
df_final : Average MAPE: 0.41120063813011454, Average RMSE: 0.0014103167321525753


### 2-2. Model Efficiency Optimization

In [91]:
def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features='log2',  # log2로 설정하여 과적합 예방
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100)
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, max_features='log2', random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

### 2-3. Model Evaluation

In [92]:
predicted_values, best_params = predict_next_period_volatility(df_final, prediction_range_hours=73)
print("Predicted values:", predicted_values)
print("Best hyperparameters:", best_params)

|   iter    |  target   | max_depth | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.004126[0m | [0m8.745    [0m | [0m95.56    [0m | [0m7.588    [0m | [0m12.78    [0m | [0m89.0     [0m |
| [0m2        [0m | [0m-0.004132[0m | [0m6.56     [0m | [0m15.23    [0m | [0m8.796    [0m | [0m12.82    [0m | [0m227.0    [0m |
| [0m3        [0m | [0m-0.004148[0m | [0m5.206    [0m | [0m97.29    [0m | [0m8.492    [0m | [0m5.822    [0m | [0m95.46    [0m |
| [0m4        [0m | [0m-0.004129[0m | [0m6.834    [0m | [0m37.38    [0m | [0m5.723    [0m | [0m9.775    [0m | [0m122.8    [0m |
| [0m5        [0m | [0m-0.004137[0m | [0m11.12    [0m | [0m22.55    [0m | [0m3.629    [0m | [0m8.595    [0m | [0m164.0    [0m |
| [0m6        [0m | [0m-0.004131[0m | [0m8.871    [0m | [0m72.2     [0m | [0m3.322    [0m | [0m16.98    [0

In [93]:
predicted_values

array([0.00439966, 0.00439974, 0.00448327, 0.00444331, 0.00449999,
       0.00438074, 0.00427852, 0.00412861, 0.0040046 , 0.00402297,
       0.00394501, 0.00388744, 0.0039227 , 0.00402199, 0.00397515,
       0.00361472, 0.00381649, 0.00357822, 0.00330255, 0.00343388,
       0.0031923 , 0.00338518, 0.00327472, 0.00320112, 0.00317336,
       0.00321454, 0.00315227, 0.00333931, 0.00314512, 0.00316655,
       0.00327063, 0.00327663, 0.00320276, 0.00322522, 0.00329057,
       0.00425073, 0.00472067, 0.00494235, 0.00509832, 0.00572815,
       0.00582923, 0.00624676, 0.00625031, 0.00635747, 0.00640986,
       0.00650057, 0.00651191, 0.00652629, 0.00650933, 0.00644634,
       0.00648967, 0.00628457, 0.00642692, 0.00634702, 0.00620181,
       0.00545703, 0.00486097, 0.0047418 , 0.00438189, 0.00350868,
       0.0029525 , 0.00239184, 0.00213267, 0.00208801, 0.00205868,
       0.00207004, 0.00200207, 0.00200489, 0.00211651, 0.00217043,
       0.00219201, 0.0021399 , 0.00213145])

- 예측값 삽입

In [94]:
#submission = pd.read_csv('[ASCENDxBDA] submission.csv')

In [95]:
#submission.head()

In [96]:
# 예측값을 submission DataFrame에 넣기
#submission['pred_volatility'] = np.array(predicted_values)

In [97]:
#submission.to_csv('[ASCENDxBDA] submission.csv', index = False, encoding = 'utf-8-sig')