In [27]:
import numpy as np 
import pandas as pd 
import pandas_ta as ta
import ccxt
from datetime import datetime
from xgboost import XGBRegressor
import optuna 
from tqdm.auto import tqdm
from sklearn.metrics import mean_absolute_error

In [28]:
# Load the 4h BTC/USDT candles and make the timestamps timezone-naive.
chart_df = pd.read_feather('BTC_USDT-4h.feather')
chart_df['date'] = pd.to_datetime(chart_df['date'])
chart_df['date'] = chart_df['date'].dt.tz_localize(None)

# Calendar features come straight from the datetime accessor; no per-row
# Python loop (or re-parsing of already-parsed timestamps) is needed.
# The int64 cast keeps the dtype the same on every pandas version.
chart_df["hours"] = chart_df["date"].dt.hour.astype("int64")
chart_df["days"] = chart_df["date"].dt.day.astype("int64")
chart_df["months"] = chart_df["date"].dt.month.astype("int64")

# define targets
# The regression target of each candle is the close of the NEXT candle.
# The last row has no successor and therefore gets NaN.
chart_df["targets"] = chart_df["close"].shift(-1)

  0%|          | 0/13719 [00:00<?, ?it/s]

In [29]:
# Show the first two rows to check the calendar columns and the next-close targets.
chart_df.head(2)

Unnamed: 0,date,open,high,low,close,volume,hours,days,months,targets
0,2017-08-17 04:00:00,4261.48,4349.99,4261.32,4349.99,82.088865,4,17,8,4427.3
1,2017-08-17 08:00:00,4333.32,4485.39,4333.32,4427.3,63.619882,8,17,8,4352.34


In [30]:
# add some technical indicators
# Index the frame by candle time; the "date" column itself stays in place
# until it is dropped at the end of this cell.
chart_df = chart_df.set_index(pd.DatetimeIndex(chart_df["date"]))

# feature engineering
chart_df["bop"] = chart_df.ta.bop(lookahead=False)
chart_df["ebsw"] = chart_df.ta.ebsw(lookahead=False)
chart_df["cmf"] = chart_df.ta.cmf(lookahead=False)
chart_df["vwap"] = chart_df.ta.vwap(lookahead=False)
chart_df["rsi/100"] = chart_df.ta.rsi(lookahead=False) / 100.0
chart_df["high/low"] = chart_df["high"] / chart_df["low"]
chart_df["low/open"] = chart_df["low"] / chart_df["open"]
chart_df["hwma"] = chart_df.ta.hwma(lookahead=False)
chart_df["hwma/close"] = chart_df["hwma"] / chart_df["close"]

# Ratio of each series to its own value `lag` candles earlier, for lags 1..11.
# Vectorised with shift() instead of an element-by-element Python loop.
for lag in range(1, 12):
    for col in ["open", "high", "low", "close", "volume", "vwap"]:
        previous = chart_df[col].shift(lag)
        ratio = chart_df[col] / previous
        # A zero denominator falls back to the neutral ratio 1; the first
        # `lag` rows have no earlier value and stay NaN.
        chart_df["{}_change_{}".format(col, lag)] = ratio.mask(previous == 0, 1.0)

# The timestamp now lives in the index. Dropping rows with any NaN removes
# the indicator/lag warm-up rows and the final row whose target is missing.
chart_df = chart_df.drop(columns=["date"]).dropna()

  chart_df["ebsw"] = chart_df.ta.ebsw(lookahead=False)


In [31]:
# Show the first two rows that survive the NaN drop, now with the engineered feature columns.
chart_df.head(2)

Unnamed: 0_level_0,open,high,low,close,volume,hours,days,months,targets,bop,...,low_change_10,close_change_10,volume_change_10,vwap_change_10,open_change_11,high_change_11,low_change_11,close_change_11,volume_change_11,vwap_change_11
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-23 16:00:00,4226.94,4259.58,4103.51,4117.07,174.637585,16,23,8,4114.01,-0.703979,...,1.112373,1.065938,1.522754,1.075381,1.049637,1.046454,1.038863,1.025167,1.725297,1.025898
2017-08-23 20:00:00,4136.48,4178.65,4069.8,4114.01,152.616402,20,23,8,4113.58,-0.206431,...,1.197,1.083182,0.91266,1.102681,1.03,1.0405,1.103235,1.065146,1.33074,1.074368


In [4]:
# Chronological split: the first 80% of rows train the model and the
# remaining rows are held out for validation (no shuffling).
train_size = int(len(chart_df) * 0.8)
train_df, val_df = chart_df.iloc[:train_size], chart_df.iloc[train_size:]

train_df.shape, val_df.shape

((10943, 84), (2736, 84))

In [5]:
# Every column except the regression target is a model input.
train_columns = [col for col in chart_df.columns if col != "targets"]

In [6]:
# Separate model inputs from the next-close target for both partitions.
X_train, y_train = train_df[train_columns], train_df["targets"]
X_val, y_val = val_df[train_columns], val_df["targets"]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((10943, 83), (10943,), (2736, 83), (2736,))

In [9]:
def objective(trial):
    """Optuna objective: train one XGBoost regressor and score it on the validation split.

    Args:
        trial: the Optuna trial used to sample the hyperparameters below.

    Returns:
        Mean absolute error of the fitted model on ``(X_val, y_val)``;
        the study minimises this value.
    """
    param = {
        "objective": "reg:squarederror",  # Objective for regression
        "n_estimators": 100,
        "random_state": 42,
        "tree_method": "gpu_hist",  # Assuming you have a compatible GPU
        # Early stopping is configured on the estimator: passing it to fit()
        # has been deprecated since XGBoost 1.6 and newer releases reject it.
        "early_stopping_rounds": 30,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "gamma": trial.suggest_float("gamma", 0, 0.4),
        "alpha": trial.suggest_float("alpha", 0, 10),
        "lambda": trial.suggest_float("lambda", 1, 10),
    }

    reg_xgb = XGBRegressor(**param)
    # The validation split doubles as the early-stopping monitor.
    reg_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    predictions = reg_xgb.predict(X_val)
    return mean_absolute_error(y_val, predictions)

In [11]:
# Seed the sampler so the hyperparameter search proposes the same trials on
# every Restart & Run All. TPE is Optuna's default sampler, so only the seed
# is new here.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)

[I 2023-12-22 14:03:44,964] A new study created in memory with name: no-name-77cde2dd-3aa5-4022-b30e-3d38578f9e7d
[I 2023-12-22 14:03:45,225] Trial 0 finished with value: 568.3288510113989 and parameters: {'learning_rate': 0.2739501995373563, 'subsample': 0.7699438812163704, 'colsample_bytree': 0.9895865032675099, 'max_depth': 7, 'min_child_weight': 238, 'gamma': 0.38779645962512266, 'alpha': 9.93639178334509, 'lambda': 7.781000155108699}. Best is trial 0 with value: 568.3288510113989.
[I 2023-12-22 14:03:45,481] Trial 1 finished with value: 398.0204708773072 and parameters: {'learning_rate': 0.0475914296576656, 'subsample': 0.9703189018221157, 'colsample_bytree': 0.780629073765465, 'max_depth': 3, 'min_child_weight': 117, 'gamma': 0.2527596491826379, 'alpha': 8.286644946376443, 'lambda': 3.4848134323342363}. Best is trial 1 with value: 398.0204708773072.
[I 2023-12-22 14:03:45,825] Trial 2 finished with value: 1147.730620202851 and parameters: {'learning_rate': 0.029878895731479168, '

In [12]:
# Plain-text marker printed by this cell; it runs after the hyperparameter search cell.
print("done!") 

done!


In [13]:
# Start from the tuned hyperparameters and add back the fixed settings that
# were not part of the search space.
best_params = {
    **study.best_params,
    "n_estimators": 100,
    "random_state": 42,
    "objective": "reg:squarederror",
    "tree_method": "gpu_hist",
}
print(f"best params = {best_params}")

best params = {'learning_rate': 0.07574900358755156, 'subsample': 0.8487453149256451, 'colsample_bytree': 0.6242657667111873, 'max_depth': 5, 'min_child_weight': 17, 'gamma': 0.23279180432621427, 'alpha': 2.018985796203534, 'lambda': 4.959323678120456, 'n_estimators': 100, 'random_state': 42, 'objective': 'reg:squarederror', 'tree_method': 'gpu_hist'}


In [15]:
# refit full
# Stack the train and validation partitions so the final model sees all rows.
full_x = np.concatenate([X_train, X_val], axis=0)
full_y = np.concatenate([y_train, y_val], axis=0)

# NOTE: `clf_xgb` holds a regressor despite the classifier-style name; the
# name is kept because the model-saving cell refers to it.
clf_xgb = XGBRegressor(**best_params)

# The eval_set is the training data itself, so it is only useful for logging
# the training RMSE every 20 rounds. Early stopping is deliberately not used:
# there is no held-out data left to monitor, and the fit()-level
# `early_stopping_rounds` argument is rejected by newer XGBoost releases.
clf_xgb.fit(full_x, full_y, eval_set=[(full_x, full_y)], verbose=20)

[0]	validation_0-rmse:24022.08516
[20]	validation_0-rmse:5083.88360
[40]	validation_0-rmse:1151.97175
[60]	validation_0-rmse:457.55739




[80]	validation_0-rmse:380.26977
[99]	validation_0-rmse:365.23183


In [17]:
# Persist the refitted model; the inference section loads it back from this same path.
# NOTE(review): the path has no file extension - confirm the serialization format
# XGBoost picks for it is the one you want.
clf_xgb.save_model("XGBoost_regression_optuna") 

print("done saving!") 

done saving!


In [32]:
# inference 
# get inference data 
# Network call: fetch 4h candles for the "BTC/USDT:USDT" market from Bitget through ccxt.
# No `since`/`limit` is passed, so the number of candles is whatever the call returns by default.
bitget = ccxt.bitget()
ohlcv = bitget.fetch_ohlcv("BTC/USDT:USDT", "4h")
# Name the six fields of each returned row; this REPLACES the training `chart_df`.
chart_df = pd.DataFrame(ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])

def preprocess(df):
    """Convert exchange millisecond timestamps into a UTC ``date`` string column.

    Args:
        df: DataFrame with a ``timestamp`` column holding epoch milliseconds.

    Returns:
        A new DataFrame without ``timestamp`` and with a trailing ``date``
        column formatted as ``"%Y-%m-%d %H:%M:%S"`` (UTC). The input frame
        is left unmodified.
    """
    # Epoch milliseconds are UTC-based, so one vectorised conversion yields
    # the same strings as formatting each value individually, without
    # creating an exchange client just for timestamp formatting.
    # The int64 cast mirrors the original per-value int() truncation.
    utc_times = pd.to_datetime(df["timestamp"].astype("int64"), unit="ms")
    out = df.drop(columns=["timestamp"])
    out["date"] = utc_times.dt.strftime("%Y-%m-%d %H:%M:%S")
    return out

# Swap the raw `timestamp` column for a formatted UTC `date` string column.
chart_df = preprocess(chart_df)

In [33]:
# Calendar features for the inference candles. The `date` column is parsed
# once for the whole column instead of once per row, and the parts are read
# through the datetime accessor. The int64 cast keeps the dtype the same on
# every pandas version.
candle_times = pd.to_datetime(chart_df["date"])
chart_df["hours"] = candle_times.dt.hour.astype("int64")
chart_df["days"] = candle_times.dt.day.astype("int64")
chart_df["months"] = candle_times.dt.month.astype("int64")

  0%|          | 0/100 [00:00<?, ?it/s]

In [35]:
# add some technical indicators
# Index the frame by candle time; the "date" column itself stays in place
# until it is dropped at the end of this cell.
chart_df = chart_df.set_index(pd.DatetimeIndex(chart_df["date"]))
# feature engineering
# These must stay in the same order as the training features, because the
# model input is later taken positionally from this frame.
chart_df["bop"] = chart_df.ta.bop(lookahead=False)
chart_df["ebsw"] = chart_df.ta.ebsw(lookahead=False)
chart_df["cmf"] = chart_df.ta.cmf(lookahead=False)
chart_df["vwap"] = chart_df.ta.vwap(lookahead=False)
chart_df["rsi/100"] = chart_df.ta.rsi(lookahead=False) / 100.0
chart_df["high/low"] = chart_df["high"] / chart_df["low"]
chart_df["low/open"] = chart_df["low"] / chart_df["open"]
chart_df["hwma"] = chart_df.ta.hwma(lookahead=False)
chart_df["hwma/close"] = chart_df["hwma"] / chart_df["close"]

# Ratio of each series to its own value `lag` candles earlier, for lags 1..11.
# Vectorised with shift() instead of an element-by-element Python loop.
for lag in range(1, 12):
    for col in ["open", "high", "low", "close", "volume", "vwap"]:
        previous = chart_df[col].shift(lag)
        ratio = chart_df[col] / previous
        # A zero denominator falls back to the neutral ratio 1; the first
        # `lag` rows have no earlier value and stay NaN.
        chart_df["{}_change_{}".format(col, lag)] = ratio.mask(previous == 0, 1.0)

# The timestamp now lives in the index. Dropping rows with any NaN removes
# the indicator/lag warm-up rows.
chart_df = chart_df.drop(columns=["date"]).dropna()

  chart_df["ebsw"] = chart_df.ta.ebsw(lookahead=False)


In [36]:
# Show the first two inference rows that remain after the NaN drop.
chart_df.head(2)

Unnamed: 0_level_0,open,high,low,close,volume,hours,days,months,bop,ebsw,...,low_change_10,close_change_10,volume_change_10,vwap_change_10,open_change_11,high_change_11,low_change_11,close_change_11,volume_change_11,vwap_change_11
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-12-12 12:00:00,41592.0,42070.4,41157.2,41357.6,26161.019,12,12,12,-0.25668,0.0,...,0.944894,0.94422,3.676342,0.950104,0.946822,0.957103,0.940807,0.944024,8.250046,0.949874
2023-12-12 16:00:00,41357.6,41464.3,40687.8,41157.5,36940.481,16,12,12,-0.257695,0.57735,...,1.010172,0.972581,0.894687,0.983842,0.944024,0.940617,0.934117,0.939652,5.191152,0.945592


In [38]:
# Show the four most recent inference rows.
chart_df.tail(4) 

Unnamed: 0_level_0,open,high,low,close,volume,hours,days,months,bop,ebsw,...,low_change_10,close_change_10,volume_change_10,vwap_change_10,open_change_11,high_change_11,low_change_11,close_change_11,volume_change_11,vwap_change_11
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-12-22 00:00:00,43868.9,44288.5,43746.2,44030.0,31238.118,0,22,12,0.297068,0.434185,...,1.024789,1.026881,1.465271,1.031595,1.033915,1.029051,1.032066,1.025286,0.942547,1.03322
2023-12-22 04:00:00,44030.0,44444.0,43556.9,43605.5,34385.447,4,22,12,-0.478526,0.349391,...,1.01837,0.989631,0.74432,1.020613,1.025286,1.033094,1.020355,1.016981,1.612901,1.029719
2023-12-22 08:00:00,43605.5,43859.3,43482.0,43775.1,29259.987,8,22,12,0.44951,0.020348,...,0.999435,1.000828,0.864698,1.015029,1.016981,0.989248,1.016619,0.99348,0.633373,1.018922
2023-12-22 12:00:00,43775.1,43829.2,43435.0,43435.0,18554.701,12,22,12,-0.86276,-0.872611,...,1.004579,0.994207,0.748255,1.012748,0.99348,0.989889,0.998354,0.993052,0.548333,1.013885


In [43]:
# Build a single-sample, 2-D model input from one feature row.
# NOTE(review): index -4 selects the fourth-newest candle, not the latest
# one - confirm this offset is intentional.
x_input = chart_df.iloc[-4].to_numpy().reshape(1, -1)
x_input.shape

(1, 83)

In [50]:
# load model 
# Create an empty regressor and restore the saved model from the same path the training section wrote.
test_xgb = XGBRegressor()
test_xgb.load_model("XGBoost_regression_optuna")