# Model Optimization
- 결측치 보간 후 파일로 진행

In [18]:
import pandas as pd

df = pd.read_csv('../data/final_dask.csv')
df.head(3)

Unnamed: 0,time,Open,High,Low,Close,Volume,quote_qty,is_buyer_maker,returns,volatility,...,upper_band_10,lower_band_10,roc_1,roc_2,rsi_7,rsi_9,rsi_14,UO_71014,UO_7911,UO_71012
0,2023-01-01 00:00:00,16537.5,16540.9,16504.0,16527.0,5381.399,7264.129209,16494,0.000138,0.002201,...,42286.896445,41602.459445,0.141586,0.093181,100.0,100.0,100.0,0.0,0.0,0.0
1,2023-01-01 01:00:00,16527.1,16554.3,16524.1,16550.4,3210.826,6819.889969,8705,0.001416,0.002201,...,42286.896445,41602.459445,0.141586,0.093181,100.0,100.0,100.0,40.577816,40.577816,40.577816
2,2023-01-01 02:00:00,16550.5,16557.1,16534.8,16542.4,2399.668,6030.420093,8468,-0.000483,0.002201,...,42286.896445,41602.459445,-0.048337,0.093181,74.522293,74.522293,74.522293,45.684084,45.684084,45.684084


In [103]:
pip install BayesianOptimization

Note: you may need to restart the kernel to use updated packages.


In [104]:
pip install bayesian-optimization

Note: you may need to restart the kernel to use updated packages.


In [21]:
# from bayes_opt import BayesianOptimization

In [22]:
df = df.set_index('time')
df.index=pd.to_datetime(df.index)

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit #Timeseires Split

In [24]:
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [25]:
def rf_base(df, df_name, n_splits): 
    
    # Feature와 Target 나누기
    X = df.drop(columns=['volatility'])
    y = df['volatility']

    tscv = TimeSeriesSplit(n_splits)
    
    mape_list = []
    rmse_list = []

    # TimeSeriesSplit을 사용하여 데이터 분할
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        

        # RandomForestRegressor
        rf_model = RandomForestRegressor(random_state=42)  # random_state 추가

        # 모델 학습
        rf_model.fit(X_train, y_train)

        # 예측
        y_pred = rf_model.predict(X_test)

        # 평가지표(MAPE, RMSE)
        test_mape = mean_absolute_percentage_error(y_test, y_pred)
        test_rmse = calculate_rmse(y_test, y_pred)
        
        # 결과 저장
        mape_list.append(test_mape)
        rmse_list.append(test_rmse)
        
        print(f'{df_name} : MAPE: {test_mape}, RMSE: {test_rmse}')
    
    # 평균 결과a
    print(f'{df_name} : Average MAPE: {np.mean(mape_list)}, Average RMSE: {np.mean(rmse_list)}')

### 1.Optimazation

In [161]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd

def predict_next_period_volatility(df, prediction_range_hours, max_features_choice):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features_choice,  # max_features를 선택된 값으로 설정
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 200),
        'max_depth': (5, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100)   # 기본값은 None
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, max_features=max_features_choice, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 모델 학습 및 예측값 생성
    predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73, max_features_choice='log2')

    # 예측값과 실제값 가져오기
    actual_values = df91[-73:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)

    return predicted_values, best_params


In [159]:
predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=72, max_features_choice='sqrt')
print("Predicted values:", predicted_values)
print("Best hyperparameters:", best_params)

|   iter    |  target   | max_depth | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.004176[0m | [0m8.745    [0m | [0m95.56    [0m | [0m7.588    [0m | [0m12.78    [0m | [0m73.4     [0m |
| [0m2        [0m | [0m-0.004333[0m | [0m6.56     [0m | [0m15.23    [0m | [0m8.796    [0m | [0m12.82    [0m | [0m156.2    [0m |
| [0m3        [0m | [0m-0.004364[0m | [0m5.206    [0m | [0m97.29    [0m | [0m8.492    [0m | [0m5.822    [0m | [0m77.27    [0m |
| [0m4        [0m | [0m-0.004291[0m | [0m6.834    [0m | [0m37.38    [0m | [0m5.723    [0m | [0m9.775    [0m | [0m93.68    [0m |
| [0m5        [0m | [0m-0.004284[0m | [0m11.12    [0m | [0m22.55    [0m | [0m3.629    [0m | [0m8.595    [0m | [0m118.4    [0m |
| [0m6        [0m | [0m-0.004177[0m | [0m8.871    [0m | [0m72.2     [0m | [0m3.322    [0m | [0m16.98    [0

### 2. Market data(ohlcv)

In [111]:
test = pd.read_csv('../data/output5.csv')

In [112]:
test

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume
0,1706400000000,42122.13,42148.34,41933.99,41995.81,98.03302
1,1706403600000,41987.63,42240.84,41987.54,42102.15,92.51710
2,1706407200000,42097.97,42180.23,42091.81,42107.45,89.52966
3,1706410800000,42095.18,42240.99,42094.50,42230.48,92.44950
4,1706414400000,42230.28,42768.16,42220.78,42724.83,92.96013
...,...,...,...,...,...,...
68,1706644800000,43541.47,43764.07,43481.74,43554.90,93.59847
69,1706648400000,43563.41,43696.51,43481.08,43548.06,93.77550
70,1706652000000,43547.92,43582.35,43321.31,43331.38,87.96784
71,1706655600000,43316.58,43357.93,42683.71,42941.17,95.87972


In [113]:
# timestamp 컬럼을 datetime으로 변환
pd.to_datetime(test['Timestamp'], unit='ms')

0    2024-01-28 00:00:00
1    2024-01-28 01:00:00
2    2024-01-28 02:00:00
3    2024-01-28 03:00:00
4    2024-01-28 04:00:00
             ...        
68   2024-01-30 20:00:00
69   2024-01-30 21:00:00
70   2024-01-30 22:00:00
71   2024-01-30 23:00:00
72   2024-01-31 00:00:00
Name: Timestamp, Length: 73, dtype: datetime64[ns]

In [114]:
def calculate_volatility(data, window=20):
    """
    Calculate the rolling volatility using the standard deviation of returns.
    :param data: DataFrame with OHLCV data
    :param window: The number of periods to use for calculating the standard deviation
    :return: DataFrame with the volatility values
    """

    # Calculate daily returns
    data['returns'] = data['Close'].pct_change()

    # Calculate the rolling standard deviation of returns
    data['volatility'] = data['returns'].rolling(window=window).std()

    return data

In [115]:
calculate_volatility(test)

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume,returns,volatility
0,1706400000000,42122.13,42148.34,41933.99,41995.81,98.03302,,
1,1706403600000,41987.63,42240.84,41987.54,42102.15,92.51710,0.002532,
2,1706407200000,42097.97,42180.23,42091.81,42107.45,89.52966,0.000126,
3,1706410800000,42095.18,42240.99,42094.50,42230.48,92.44950,0.002922,
4,1706414400000,42230.28,42768.16,42220.78,42724.83,92.96013,0.011706,
...,...,...,...,...,...,...,...,...
68,1706644800000,43541.47,43764.07,43481.74,43554.90,93.59847,0.000014,0.002838
69,1706648400000,43563.41,43696.51,43481.08,43548.06,93.77550,-0.000157,0.002474
70,1706652000000,43547.92,43582.35,43321.31,43331.38,87.96784,-0.004976,0.002556
71,1706655600000,43316.58,43357.93,42683.71,42941.17,95.87972,-0.009005,0.003197


In [116]:
volatility_values = test['volatility'].values

In [117]:
volatility_values

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
       0.00383486, 0.00379335, 0.00386   , 0.00395661, 0.00279125,
       0.0029479 , 0.00290073, 0.00296158, 0.00294961, 0.00291741,
       0.00291595, 0.00290467, 0.00293984, 0.00310177, 0.00306164,
       0.00307716, 0.00315799, 0.00310818, 0.00293988, 0.00459708,
       0.00529905, 0.00548493, 0.00537989, 0.00534376, 0.00532565,
       0.00527156, 0.00530888, 0.0053522 , 0.00536684, 0.0054403 ,
       0.00546002, 0.00543433, 0.00550819, 0.00550584, 0.00551804,
       0.00548311, 0.00536375, 0.00535579, 0.00532997, 0.00401113,
       0.00296199, 0.00279106, 0.00290377, 0.00294731, 0.00292832,
       0.00298262, 0.00295929, 0.00294658, 0.00283768, 0.00247445,
       0.00255642, 0.00319703, 0.00317506])

In [118]:
predicted_values_test = predicted_values[20:]

- max_features : sqrt

In [119]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(volatility_values[20:72], predicted_values_test)

0.15541190929949303

- max_features : log2 > log2 채택

In [160]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(volatility_values[20:], predicted_values_2[20:])

0.14142103943250775

### 3.test code_1
- n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes
- max_features 입력값 비교

In [166]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours, max_features_choice):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features_choice,  # max_features를 선택된 값으로 설정
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 200),
        'max_depth': (5, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100)   # 기본값은 None
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, max_features=max_features_choice, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73, max_features_choice='log2')
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [164]:
predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73, max_features_choice='log2')
print("Predicted values:", predicted_values)
print("Best hyperparameters:", best_params)

|   iter    |  target   | max_depth | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.004185[0m | [0m8.745    [0m | [0m95.56    [0m | [0m7.588    [0m | [0m12.78    [0m | [0m73.4     [0m |
| [0m2        [0m | [0m-0.004341[0m | [0m6.56     [0m | [0m15.23    [0m | [0m8.796    [0m | [0m12.82    [0m | [0m156.2    [0m |
| [0m3        [0m | [0m-0.004371[0m | [0m5.206    [0m | [0m97.29    [0m | [0m8.492    [0m | [0m5.822    [0m | [0m77.27    [0m |
| [0m4        [0m | [0m-0.004299[0m | [0m6.834    [0m | [0m37.38    [0m | [0m5.723    [0m | [0m9.775    [0m | [0m93.68    [0m |
| [0m5        [0m | [0m-0.004291[0m | [0m11.12    [0m | [0m22.55    [0m | [0m3.629    [0m | [0m8.595    [0m | [0m118.4    [0m |
| [0m6        [0m | [0m-0.004186[0m | [0m8.871    [0m | [0m72.2     [0m | [0m3.322    [0m | [0m16.98    [0

In [165]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values[20:])

0.14142103943250775

- n estimators 300 범위 확장

In [171]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours, max_features_choice):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features_choice,  # max_features를 선택된 값으로 설정
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100)   # 기본값은 None
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, max_features=max_features_choice, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73, max_features_choice='log2')
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [172]:
predicted_values_2, best_params_2 = predict_next_period_volatility(df91, prediction_range_hours=73, max_features_choice='log2')
print("Predicted values:", predicted_values)
print("Best hyperparameters:", best_params)

|   iter    |  target   | max_depth | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.004181[0m | [0m8.745    [0m | [0m95.56    [0m | [0m7.588    [0m | [0m12.78    [0m | [0m89.0     [0m |
| [0m2        [0m | [0m-0.004334[0m | [0m6.56     [0m | [0m15.23    [0m | [0m8.796    [0m | [0m12.82    [0m | [0m227.0    [0m |
| [0m3        [0m | [0m-0.004376[0m | [0m5.206    [0m | [0m97.29    [0m | [0m8.492    [0m | [0m5.822    [0m | [0m95.46    [0m |
| [0m4        [0m | [0m-0.004287[0m | [0m6.834    [0m | [0m37.38    [0m | [0m5.723    [0m | [0m9.775    [0m | [0m122.8    [0m |
| [0m5        [0m | [0m-0.004296[0m | [0m11.12    [0m | [0m22.55    [0m | [0m3.629    [0m | [0m8.595    [0m | [0m164.0    [0m |
| [95m6        [0m | [95m-0.004178[0m | [95m8.871    [0m | [95m72.2     [0m | [95m3.322    [0m | [95m16.98 

In [173]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_2[20:])

0.13999434663687726

- 300 , max_features 추천

In [174]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)
        max_features = int(max_features)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features,
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100),
        'max_features': (1, X.shape[1]) # 특징의 개수에 따라 max_features의 범위 설정
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [175]:
# 사용 예시
predicted_values_3, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
print("Predicted values:", predicted_values)
print("Best hyperparameters:", best_params)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.003955[0m | [0m8.745    [0m | [0m14.31    [0m | [0m75.88    [0m | [0m6.388    [0m | [0m4.808    [0m | [0m89.0     [0m |
| [95m2        [0m | [95m-0.003832[0m | [95m5.581    [0m | [95m13.13    [0m | [95m64.1     [0m | [95m7.373    [0m | [95m2.371    [0m | [95m292.5    [0m |
| [0m3        [0m | [0m-0.004268[0m | [0m13.32    [0m | [0m3.973    [0m | [0m26.36    [0m | [0m2.651    [0m | [0m7.476    [0m | [0m181.2    [0m |
| [0m4        [0m | [0m-0.004063[0m | [0m9.319    [0m | [0m5.077    [0m | [0m65.07    [0m | [0m2.255    [0m | [0m7.259    [0m | [0m141.6    [0m |
| [0m5        [0m | [0m-0.00386 [0m | [0m9.561    [0m | [0m11.99    [0m | [0m27.97    [0m | [0m5.628    [0m | [0m12.66    [0m | [0m61

In [176]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_3[20:])

0.13060176647182728

- max_depth 20

In [179]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)
        max_features = int(max_features)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features,
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100),
        'max_features': (1, X.shape[1]) # 특징의 개수에 따라 max_features의 범위 설정
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [180]:
predicted_values_4, best_params_4 = predict_next_period_volatility(df91, prediction_range_hours=73)
print("Predicted values:", predicted_values_4)
print("Best hyperparameters:", best_params_4)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.003961[0m | [0m10.62    [0m | [0m14.31    [0m | [0m75.88    [0m | [0m6.388    [0m | [0m4.808    [0m | [0m89.0     [0m |
| [95m2        [0m | [95m-0.003832[0m | [95m5.871    [0m | [95m13.13    [0m | [95m64.1     [0m | [95m7.373    [0m | [95m2.371    [0m | [95m292.5    [0m |
| [0m3        [0m | [0m-0.004268[0m | [0m17.49    [0m | [0m3.973    [0m | [0m26.36    [0m | [0m2.651    [0m | [0m7.476    [0m | [0m181.2    [0m |
| [0m4        [0m | [0m-0.004058[0m | [0m11.48    [0m | [0m5.077    [0m | [0m65.07    [0m | [0m2.255    [0m | [0m7.259    [0m | [0m141.6    [0m |
| [0m5        [0m | [0m-0.00386 [0m | [0m11.84    [0m | [0m11.99    [0m | [0m27.97    [0m | [0m5.628    [0m | [0m12.66    [0m | [0m61

In [181]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_4[20:])

0.13009153371714885

- min_samples_spli 15 줄임 > 버림

In [182]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)
        max_features = int(max_features)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features,
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 20),
        'min_samples_split': (2, 15),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100),
        'max_features': (1, X.shape[1]) # 특징의 개수에 따라 max_features의 범위 설정
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [183]:
predicted_values_5, best_params_5 = predict_next_period_volatility(df91, prediction_range_hours=73)
print("Predicted values:", predicted_values_5)
print("Best hyperparameters:", best_params_5)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.003961[0m | [0m10.62    [0m | [0m14.31    [0m | [0m75.88    [0m | [0m6.388    [0m | [0m4.028    [0m | [0m89.0     [0m |
| [95m2        [0m | [95m-0.003832[0m | [95m5.871    [0m | [95m13.13    [0m | [95m64.1     [0m | [95m7.373    [0m | [95m2.268    [0m | [95m292.5    [0m |
| [0m3        [0m | [0m-0.004268[0m | [0m17.49    [0m | [0m3.973    [0m | [0m26.36    [0m | [0m2.651    [0m | [0m5.955    [0m | [0m181.2    [0m |
| [0m4        [0m | [0m-0.004052[0m | [0m11.48    [0m | [0m5.077    [0m | [0m65.07    [0m | [0m2.255    [0m | [0m5.798    [0m | [0m141.6    [0m |
| [0m5        [0m | [0m-0.003859[0m | [0m11.84    [0m | [0m11.99    [0m | [0m27.97    [0m | [0m5.628    [0m | [0m9.701    [0m | [0m61

In [184]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_5[20:])

0.1304477490212385

- min sample leaf 15

In [186]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)
        max_features = int(max_features)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features,
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 15),
        'max_leaf_nodes': (10, 100),
        'max_features': (1, X.shape[1]) # 특징의 개수에 따라 max_features의 범위 설정
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [187]:
predicted_values_6, best_params_6 = predict_next_period_volatility(df91, prediction_range_hours=73)
print("Predicted values:", predicted_values_6)
print("Best hyperparameters:", best_params_6)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.003958[0m | [0m10.62    [0m | [0m14.31    [0m | [0m75.88    [0m | [0m9.381    [0m | [0m4.808    [0m | [0m89.0     [0m |
| [95m2        [0m | [95m-0.003833[0m | [95m5.871    [0m | [95m13.13    [0m | [95m64.1     [0m | [95m10.91    [0m | [95m2.371    [0m | [95m292.5    [0m |
| [0m3        [0m | [0m-0.004266[0m | [0m17.49    [0m | [0m3.973    [0m | [0m26.36    [0m | [0m3.568    [0m | [0m7.476    [0m | [0m181.2    [0m |
| [0m4        [0m | [0m-0.004058[0m | [0m11.48    [0m | [0m5.077    [0m | [0m65.07    [0m | [0m2.953    [0m | [0m7.259    [0m | [0m141.6    [0m |
| [0m5        [0m | [0m-0.003856[0m | [0m11.84    [0m | [0m11.99    [0m | [0m27.97    [0m | [0m8.199    [0m | [0m12.66    [0m | [0m61

In [188]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_6[20:])

0.12982412739487115

- mean_sample_leaf 20 > 버려

In [189]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)
        max_features = int(max_features)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features,
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 20),
        'max_leaf_nodes': (10, 100),
        'max_features': (1, X.shape[1]) # 특징의 개수에 따라 max_features의 범위 설정
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [190]:
predicted_values_7, best_params_7 = predict_next_period_volatility(df91, prediction_range_hours=73)
print("Predicted values:", predicted_values_7)
print("Best hyperparameters:", best_params_7)

|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.003957[0m | [0m10.62    [0m | [0m14.31    [0m | [0m75.88    [0m | [0m12.37    [0m | [0m4.808    [0m | [0m89.0     [0m |
| [95m2        [0m | [95m-0.003832[0m | [95m5.871    [0m | [95m13.13    [0m | [95m64.1     [0m | [95m14.45    [0m | [95m2.371    [0m | [95m292.5    [0m |
| [0m3        [0m | [0m-0.004272[0m | [0m17.49    [0m | [0m3.973    [0m | [0m26.36    [0m | [0m4.485    [0m | [0m7.476    [0m | [0m181.2    [0m |
| [0m4        [0m | [0m-0.004051[0m | [0m11.48    [0m | [0m5.077    [0m | [0m65.07    [0m | [0m3.65     [0m | [0m7.259    [0m | [0m141.6    [0m |
| [0m5        [0m | [0m-0.00386 [0m | [0m11.84    [0m | [0m11.99    [0m | [0m27.97    [0m | [0m10.77    [0m | [0m12.66    [0m | [0m61

In [191]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_7[20:])

0.13115401747004235

- max_leaf 80으로 축소

In [192]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, max_features):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)
        max_features = int(max_features)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features=max_features,
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 15),
        'max_leaf_nodes': (10, 80),
        'max_features': (1, X.shape[1]) # 특징의 개수에 따라 max_features의 범위 설정
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [193]:
predicted_values_8, best_params_8 = predict_next_period_volatility(df91, prediction_range_hours=73)
print("Predicted values:", predicted_values_8)
print("Best hyperparameters:", best_params_8)


|   iter    |  target   | max_depth | max_fe... | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.003931[0m | [0m10.62    [0m | [0m14.31    [0m | [0m61.24    [0m | [0m9.381    [0m | [0m4.808    [0m | [0m89.0     [0m |
| [95m2        [0m | [95m-0.003833[0m | [95m5.871    [0m | [95m13.13    [0m | [95m52.08    [0m | [95m10.91    [0m | [95m2.371    [0m | [95m292.5    [0m |
| [0m3        [0m | [0m-0.004287[0m | [0m17.49    [0m | [0m3.973    [0m | [0m22.73    [0m | [0m3.568    [0m | [0m7.476    [0m | [0m181.2    [0m |
| [0m4        [0m | [0m-0.00408 [0m | [0m11.48    [0m | [0m5.077    [0m | [0m52.83    [0m | [0m2.953    [0m | [0m7.259    [0m | [0m141.6    [0m |
| [0m5        [0m | [0m-0.003888[0m | [0m11.84    [0m | [0m11.99    [0m | [0m23.98    [0m | [0m8.199    [0m | [0m12.66    [0m | [0m61

In [194]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_8[20:])

0.13144050266892257

- max_features log2 > 폐기

In [195]:
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

def predict_next_period_volatility(df, prediction_range_hours):
    # 데이터셋 복사
    data = df.copy()
    
    # 시간 인덱스 설정
    data.index = pd.to_datetime(data.index)
    
    # 특징과 타겟 설정
    X = data.drop(columns=['volatility']) 
    y = data['volatility']  # 타겟
    
    # 베이지안 옵티마이제이션을 위한 목적 함수 정의
    def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes):
        # Ensure the parameters are integers
        n_estimators = int(n_estimators)
        max_depth = int(max_depth)
        min_samples_split = int(min_samples_split)
        min_samples_leaf = int(min_samples_leaf)
        max_leaf_nodes = int(max_leaf_nodes)

        # 랜덤 포레스트 모델 생성
        rf_model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            max_features='log2',  # log2로 설정하여 과적합 예방
            random_state=42
        )

        # 모델 학습
        rf_model.fit(X, y)

        # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측
        X_predict = X.iloc[-prediction_range_hours:]
        predicted_values = rf_model.predict(X_predict)

        return -predicted_values.mean()  # 목적 함수 최소화

    # 베이지안 옵티마이제이션을 위한 탐색 공간 설정
    pbounds = {
        'n_estimators': (50, 300),
        'max_depth': (5, 15),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10),
        'max_leaf_nodes': (10, 100)
    }

    # 베이지안 옵티마이제이션 수행
    optimizer = BayesianOptimization(
        f=rf_cv,
        pbounds=pbounds,
        random_state=42,
        verbose=2
    )

    optimizer.maximize(init_points=5, n_iter=10)

    # 최적의 하이퍼파라미터
    best_params = optimizer.max['params']
    best_params = {key: int(value) for key, value in best_params.items()}  # 정수형으로 변환

    # 최적의 하이퍼파라미터로 랜덤 포레스트 모델 생성 및 학습
    rf_model = RandomForestRegressor(**best_params, max_features='log2', random_state=42)
    rf_model.fit(X, y)

    # 마지막 prediction_range_hours만큼의 데이터를 사용하여 예측합니다.
    X_predict = X.iloc[-prediction_range_hours:]
    predicted_values = rf_model.predict(X_predict)
    
    # 예측값과 실제값 가져오기
    actual_values = df[-prediction_range_hours:]['volatility'].values

    # MAPE 계산
    mape = mean_absolute_percentage_error(actual_values, predicted_values)

    # 출력
    print("Best MAPE:", mape)
    
    return predicted_values, best_params

# 사용 예시
# predicted_values, best_params = predict_next_period_volatility(df91, prediction_range_hours=73)
# print("Predicted values:", predicted_values)
# print("Best hyperparameters:", best_params)


In [196]:
predicted_values_9, best_params_9 = predict_next_period_volatility(df91, prediction_range_hours=73)
print("Predicted values:", predicted_values_9)
print("Best hyperparameters:", best_params_9)


|   iter    |  target   | max_depth | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.004181[0m | [0m8.745    [0m | [0m95.56    [0m | [0m7.588    [0m | [0m12.78    [0m | [0m89.0     [0m |
| [0m2        [0m | [0m-0.004334[0m | [0m6.56     [0m | [0m15.23    [0m | [0m8.796    [0m | [0m12.82    [0m | [0m227.0    [0m |
| [0m3        [0m | [0m-0.004376[0m | [0m5.206    [0m | [0m97.29    [0m | [0m8.492    [0m | [0m5.822    [0m | [0m95.46    [0m |
| [0m4        [0m | [0m-0.004287[0m | [0m6.834    [0m | [0m37.38    [0m | [0m5.723    [0m | [0m9.775    [0m | [0m122.8    [0m |
| [0m5        [0m | [0m-0.004296[0m | [0m11.12    [0m | [0m22.55    [0m | [0m3.629    [0m | [0m8.595    [0m | [0m164.0    [0m |
| [95m6        [0m | [95m-0.004178[0m | [95m8.871    [0m | [95m72.2     [0m | [95m3.322    [0m | [95m16.98 

In [197]:
mean_absolute_percentage_error(volatility_values[20:], predicted_values_9[20:])

0.13999434663687726

In [3]:
import pandas as pd
import numpy as np

submission = pd.read_csv('[ASCENDxBDA] submission.csv')

In [4]:
submission

Unnamed: 0.1,Unnamed: 0,pred_volatility
0,2024-01-28 0:00,0
1,2024-01-28 1:00,0
2,2024-01-28 2:00,0
3,2024-01-28 3:00,0
4,2024-01-28 4:00,0
...,...,...
68,2024-01-30 20:00,0
69,2024-01-30 21:00,0
70,2024-01-30 22:00,0
71,2024-01-30 23:00,0


In [7]:
Predicted_values = [0.00481383 0.00467156 0.00470126 0.00437417 0.0042471  0.0041031
 0.00416906 0.00392622 0.00389217 0.00392079 0.0036308  0.003479
 0.00357902 0.00368112 0.00368309 0.00353545 0.00386433 0.00372889
 0.00378182 0.00396721 0.00377975 0.00402857 0.00381187 0.00372462
 0.00368273 0.00366913 0.00367533 0.0039051  0.00369203 0.00365227
 0.00334952 0.00338581 0.0033953  0.00338052 0.00343597 0.00401183
 0.00481923 0.00477377 0.00480801 0.00509282 0.0051798  0.00552395
 0.00543991 0.00562369 0.00573581 0.00583565 0.00592695 0.00588524
 0.00592501 0.00569789 0.0056525  0.00548558 0.00537874 0.00531251
 0.00493141 0.00459605 0.00456761 0.00460955 0.00447128 0.00390828
 0.00349457 0.00339633 0.00318922 0.00285389 0.00293126 0.00278226
 0.00266508 0.00258925 0.00277192 0.00281272 0.002863   0.00283697
 0.00276158]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3527840333.py, line 1)