In [16]:
import sys
from pathlib import Path
notebook_dir = Path().resolve()
src_path = notebook_dir.parent / 'src'
sys.path.insert(0, str(src_path))
import pandas as pd
from utils.timesplit import RollingSplit

# Load the processed dataset (the file name suggests hourly BTC/USDT data)
crypto = pd.read_csv("../data/processed_data/BTCUSDT_1h_processed.csv")

# Configuration: model input columns and the train/validation share of each fold
input_features = ['open', 'high', 'low', 'close']
train_size = 0.7
val_size = 0.15

# Feature matrix and regression target
X = crypto[input_features]
y = crypto['log_return']

# Rolling-splitter sizing: each fold covers 10% of the rows in the dataset
population = len(crypto)
sample = round(population * 0.1)

# Truncate the train/validation shares to whole rows; the test set takes the remainder
train_count, val_count = (int(share * sample) for share in (train_size, val_size))
test_count = sample - train_count - val_count

# Advance by one test window per fold (chosen, per the original note, so that
# consecutive test sets do not overlap -- depends on RollingSplit's semantics)
step_size = test_count

splitter = RollingSplit(
    train_size=train_count,
    val_size=val_count,
    test_size=test_count,
    step_size=step_size,
)

print(f"Each fold will have: \nTotal sample points: {sample}, \nTraining points: {train_count} \nValidation points: {val_count},  \nTesting points: {test_count},  \nStepping size: {step_size}")

Each fold will have: 
Total sample points: 4382, 
Training points: 3067 
Validation points: 657,  
Testing points: 658,  
Stepping size: 658


In [None]:
import random

import numpy as np
import optuna
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

def random_seed(seed):
    """Seed the global random number generators used in this notebook.

    Parameters
    ----------
    seed : int or None
        Value passed to both ``random.seed`` and ``np.random.seed``.
        NumPy requires an integer in ``[0, 2**32 - 1]`` (or ``None``).
    """
    random.seed(seed)
    # NumPy keeps its own global RNG state, separate from the stdlib `random` module,
    # so seeding only `random` leaves NumPy-based randomness unseeded.
    np.random.seed(seed)


def huber_loss(true, pred, delta=1.0):
    """Mean Huber loss between targets and predictions.

    Errors with magnitude up to ``delta`` are penalized quadratically
    (``0.5 * e**2``); the part of each error beyond ``delta`` is penalized
    linearly (``delta * (|e| - delta)``).

    Parameters
    ----------
    true : array-like supporting elementwise subtraction (e.g. ndarray, Series)
        Ground-truth values.
    pred : array-like of the same shape
        Predicted values.
    delta : float, default 1.0
        Threshold between the quadratic and linear regimes. The previous
        default (``delta=delta``) referenced an undefined global and raised
        ``NameError`` when the function was defined.

    Returns
    -------
    float
        Mean of the per-element losses.
    """
    error = true - pred
    abs_error = np.abs(error)
    # Portion of each error that falls inside the quadratic region (capped at delta)
    quadratic = np.minimum(abs_error, delta)
    # Remainder beyond delta, penalized linearly
    linear = abs_error - quadratic
    loss = 0.5 * quadratic**2 + delta * linear

    return np.mean(loss)

# -> Optuna objective function for hyperparameter tuning
def objective(trial):
    """Return the mean validation MSE over all rolling folds for one trial.

    Reads the module-level ``splitter``, ``X`` and ``y``. For every fold an
    ``XGBRegressor`` is fitted on the training indices with early stopping
    monitored on the validation indices, then scored on that validation set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold validation mean squared errors.
    """
    # => Search space. A 'delta' entry was previously sampled and splatted into
    #    XGBRegressor, but 'reg:squarederror' has no such parameter, so it only
    #    added a no-op dimension to the search; it is intentionally omitted.
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # => MSE for each fold (maybe add huber loss later)
    fold_loss = []

    # The splitter yields index arrays; .iloc selects the matching rows.
    # Test indices are not used during tuning.
    for train_idx, val_idx, _ in splitter.split(X):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val,   y_val   = X.iloc[val_idx],   y.iloc[val_idx]

        # => Model setup. early_stopping_rounds is a constructor argument:
        #    XGBoost >= 2.0 (required for device='cuda') no longer accepts it in fit().
        XGB_reg = XGBRegressor(
            objective='reg:squarederror',
            random_state=69,            # same seed as the grid-search cell
            device='cuda',
            early_stopping_rounds=10,
            **params,
        )

        # => Fitting the XGB model (quiet: per-round logs for every trial/fold flood the output)
        XGB_reg.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    verbose=False)

        XGB_reg_val_preds = XGB_reg.predict(X_val)
        val_loss = mean_squared_error(y_val, XGB_reg_val_preds)
        fold_loss.append(val_loss)

    return np.mean(fold_loss)


# Seed the sampler so that repeated runs propose the same sequence of trials
# (TPESampler is Optuna's default sampler; only the seed is new here).
sampler = optuna.samplers.TPESampler(seed=69)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=50, n_jobs=1, show_progress_bar=True)

print("Best Parameters: ", study.best_params)
print("Best Loss: ", study.best_value)

In [None]:
"""
Baseline grid search over the rolling folds (not a state-of-the-art approach).
TODO: plug in the hyperparameters found by the Optuna study later.
"""


from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, r2_score

# -> Param grid (each key maps to the candidate values to try)
# The duplicated "n_estimators" key was removed: in a dict literal the later
# entry silently replaces the earlier one, so only one definition is kept.
params = {
    "max_depth": [3, 5, 7],
    "n_estimators": [50, 100, 250, 500],
    "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
}
# Expand into the full list of hyperparameter combinations
grid = list(ParameterGrid(params))

# -> Initialize results storage
results = []
all_test_preds = []
all_test_idx = []

# -> For each fold within the rolling split
for fold, (train_idx, val_idx, test_idx) in enumerate(splitter.split(X)):
    # Instead of splitting data, we split the indices and use .iloc to get the data
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val,   y_val   = X.iloc[val_idx],   y.iloc[val_idx]
    X_test,  y_test  = X.iloc[test_idx],  y.iloc[test_idx]

    # => Grid search on validation set
    best_score = np.inf
    best_params = None

    # `candidate` (not `params`) so the module-level grid definition is not overwritten
    for candidate in grid:
        model = XGBRegressor(
            objective = 'reg:squarederror',
            random_state = 69,
            device = 'cuda',
            **candidate
        )
        # No eval_set here: without early stopping it only adds per-round
        # evaluation work and does not change the fitted model.
        model.fit(X_train, y_train, verbose=False)
        val_pred = model.predict(X_val)
        val_mse = mean_squared_error(y_val, val_pred)
        if val_mse < best_score:
            best_score = val_mse
            best_params = candidate

    # Fail loudly instead of crashing on `**None` below when no candidate
    # produced a comparable score (empty grid or NaN validation errors).
    if best_params is None:
        raise RuntimeError(
            f"Fold {fold + 1}: no parameter combination produced a finite validation MSE"
        )

    # => Retraining on train/val set with the best combination for this fold
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])

    best_model = XGBRegressor(
        objective = 'reg:squarederror',
        random_state = 69,
        device = 'cuda',
        **best_params
    )

    best_model.fit(X_train_val, y_train_val,  verbose=False)
    test_preds = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_preds)
    test_r2 = r2_score(y_test, test_preds)

    # => Appending results for all folds
    results.append({
        "fold": fold + 1,
        "val_mse": best_score,
        "test_mse": test_mse,
        "test r2": test_r2,
    })

    # Keep the out-of-sample predictions together with the row positions they belong to
    all_test_preds.extend(test_preds)
    all_test_idx.extend(test_idx)

# -> Results summary
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,fold,val_mse,test_mse,test r2
0,1,1.3e-05,2e-06,0.883342
1,2,2e-06,8e-06,0.462335
2,3,1e-05,4.1e-05,-0.432679
3,4,4.1e-05,2.7e-05,0.071878
4,5,2.7e-05,4e-06,0.702118
5,6,4e-06,3.5e-05,-0.018483
6,7,3.5e-05,5.8e-05,-0.009857
7,8,5.8e-05,0.000109,-0.005543
8,9,0.000109,0.000111,0.375038
9,10,0.000105,0.000118,0.07004
