In [1]:
import pandas as pd
import numpy as np
import ta
from sklearn.model_selection import train_test_split
import time
import optuna
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_ds (df):
    for i in range(1, 4):  
        df[f'X_t-{i}'] = df['Close'].shift(i)

    # Shift Close Column up by 5 rows
    df['Pt_5'] = df['Close'].shift(-5)

    #Agregamos RSI
    rsi_data = ta.momentum.RSIIndicator(close= df['Close'], window=28)
    df['RSI'] = rsi_data.rsi()

    # La Y
    df['Y_BUY'] = df['Close'] * (1 + 0.02) < df['Pt_5']
    df['Y_SELL'] = df['Close'] * (1 - 0.02) > df['Pt_5']
    
    df['Y_BUY'] = df['Y_BUY'].astype(int)
    df['Y_SELL'] = df['Y_SELL'].astype(int)

    return df

In [3]:
data_1 = pd.read_csv('../data/aapl_1h_train.csv')
data_1

Unnamed: 0,Timestamp,Gmtoffset,Datetime,Open,High,Low,Close,Volume
0,1602509400,0,2020-10-12 13:30:00,120.059997,121.330101,119.284500,120.919998,48082766.0
1,1602513000,0,2020-10-12 14:30:00,120.919998,122.639999,120.769996,122.529998,27028067.0
2,1602516600,0,2020-10-12 15:30:00,122.525001,123.629997,122.290000,123.261001,35722193.0
3,1602520200,0,2020-10-12 16:30:00,123.260002,124.000000,123.040000,123.930000,22891159.0
4,1602523800,0,2020-10-12 17:30:00,124.089996,125.180000,123.910003,125.050003,31443470.0
...,...,...,...,...,...,...,...,...
390,1609432200,0,2020-12-31 16:30:00,132.529998,132.800003,131.720001,132.619995,13538948.0
391,1609435800,0,2020-12-31 17:30:00,132.619995,132.717605,132.310104,132.560806,7474176.0
392,1609439400,0,2020-12-31 18:30:00,132.565002,132.889999,132.009994,132.764999,9408857.0
393,1609443000,0,2020-12-31 19:30:00,132.770004,133.179992,132.565994,132.800003,10147999.0


In [4]:
data_ejem = clean_ds(data_1)
data_ejem

Unnamed: 0,Timestamp,Gmtoffset,Datetime,Open,High,Low,Close,Volume,X_t-1,X_t-2,X_t-3,Pt_5,RSI,Y_BUY,Y_SELL
0,1602509400,0,2020-10-12 13:30:00,120.059997,121.330101,119.284500,120.919998,48082766.0,,,,124.314102,,1,0
1,1602513000,0,2020-10-12 14:30:00,120.919998,122.639999,120.769996,122.529998,27028067.0,120.919998,,,124.419998,,0,0
2,1602516600,0,2020-10-12 15:30:00,122.525001,123.629997,122.290000,123.261001,35722193.0,122.529998,120.919998,,123.757202,,0,0
3,1602520200,0,2020-10-12 16:30:00,123.260002,124.000000,123.040000,123.930000,22891159.0,123.261001,122.529998,120.919998,124.430000,,0,0
4,1602523800,0,2020-10-12 17:30:00,124.089996,125.180000,123.910003,125.050003,31443470.0,123.930000,123.261001,122.529998,123.080001,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1609432200,0,2020-12-31 16:30:00,132.529998,132.800003,131.720001,132.619995,13538948.0,132.529998,132.884994,133.679992,,50.983206,0,0
391,1609435800,0,2020-12-31 17:30:00,132.619995,132.717605,132.310104,132.560806,7474176.0,132.619995,132.529998,132.884994,,50.783652,0,0
392,1609439400,0,2020-12-31 18:30:00,132.565002,132.889999,132.009994,132.764999,9408857.0,132.560806,132.619995,132.529998,,51.463321,0,0
393,1609443000,0,2020-12-31 19:30:00,132.770004,133.179992,132.565994,132.800003,10147999.0,132.764999,132.560806,132.619995,,51.582189,0,0


In [5]:
close_data = data_ejem[['Close', 'X_t-1', 'X_t-2', 'X_t-3', 'RSI', 'Y_BUY']]
close_data


Unnamed: 0,Close,X_t-1,X_t-2,X_t-3,RSI,Y_BUY
0,120.919998,,,,,1
1,122.529998,120.919998,,,,0
2,123.261001,122.529998,120.919998,,,0
3,123.930000,123.261001,122.529998,120.919998,,0
4,125.050003,123.930000,123.261001,122.529998,,0
...,...,...,...,...,...,...
390,132.619995,132.529998,132.884994,133.679992,50.983206,0
391,132.560806,132.619995,132.529998,132.884994,50.783652,0
392,132.764999,132.560806,132.619995,132.529998,51.463321,0
393,132.800003,132.764999,132.560806,132.619995,51.582189,0


In [6]:
close_data['Y_BUY'].value_counts()


Y_BUY
0    359
1     36
Name: count, dtype: int64

In [7]:
x = close_data.drop(columns=['Y_BUY'])
y = close_data[['Y_BUY']]

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [9]:
class XGBOptimizer:
    def __init__(self, x_train, y_train, x_test, y_test):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def opt_xgb(self, trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'max_leaves': trial.suggest_int('max_leaves', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
            'gamma': trial.suggest_float('gamma', 0.1, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 1),
            'eval_metric': 'logloss',  
            'use_label_encoder': False,
            'device': 'cuda' # Si no tienen GPU y linux comentar esta linea
        }
        model = xgb.XGBClassifier(**params)
        model.fit(self.x_train, self.y_train)
        y_pred = model.predict(self.x_test)
        f1 = f1_score(self.y_test, y_pred)
        return f1

    def xgb_optuna(self):
        start_time = time.time()
        study = optuna.create_study(direction='maximize')
        study.optimize(lambda trial: self.opt_xgb(trial), n_trials=100)
        trial = study.best_trial
        print('Accuracy: {}'.format(trial.value))
        print("Best hyperparameters: {}".format(trial.params))
        end_time = time.time()
        execution_time_minutes = (end_time - start_time) / 60
        print("Execution time: {} minutes".format(execution_time_minutes))
        return trial.params


In [10]:
XGBOptimizer(x_train, y_train, x_test, y_test).xgb_optuna()

[I 2024-03-06 15:19:33,646] A new study created in memory with name: no-name-5239042c-2b68-4703-a773-f53248eec68f
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2024-03-06 15:19:34,999] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 148, 'max_depth': 3, 'max_leaves': 8, 'learning_rate': 0.28498267958251744, 'booster': 'gbtree', 'gamma': 0.723346858731904, 'reg_alpha': 0.4600949495356732, 'reg_lambda': 0.9423172064644489}. Best is trial 0 with value: 0.0.
Parameters: { "gamma", "max_depth", "max_leaves" } are not used.

[I 2024-03-06 15:19:35,386] Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 460, 'max_depth': 7, 'max_leaves': 6, 'learning_rate': 0.22977942062029078, 'booster': 'gblinear', 'gamma': 0.9497601680195725, 'reg_alpha': 0.8410427714731121, 'reg_lambda': 0.8250342388879868}. Best is trial 0 with value: 0.0.
Parameters: { "gamma",

KeyboardInterrupt: 

In [None]:
import xgboost as xgb

# Define the hyperparameters
hyperparameters = {
    'n_estimators': 200,
    'max_depth': 4,
    'max_leaves': 8,
    'learning_rate': 0.19907630294635822,
    'booster': 'gbtree',
    'gamma': 0.23192419129293812,
    'reg_alpha': 0.29521621673453086,
    'reg_lambda': 0.5262324218052595
}

# Create the XGBoost model
model = xgb.XGBClassifier(**hyperparameters)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(report)
