In [16]:
import pandas as pd
import pandas_ta as ta
import datetime
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, log_loss
import joblib

In [17]:
df = pd.read_csv('../data/btc_15m_data_2018_to_2025.csv')
df.drop(columns=['Close time', 'Quote asset volume', 'Number of trades',  'Quote asset volume', 'Number of trades', 'Taker buy base asset volume', 'Taker buy quote asset volume', 'Ignore'], inplace=True)
df.head()
df.set_index("Open time", inplace=True)
df.index = pd.to_datetime(df.index)



In [18]:
df.ta.sma(length=10, append=True)  # Short-term SMA
df.ta.ema(length=10, append=True)  # Short-term EMA
df.ta.wma(length=10, append=True)  # Short-term WMA
df.ta.macd(fast=5, slow=8, append=True)  # Faster MACD signals
df.ta.rsi(length=7, append=True)  # Shorter RSI for quicker overbought/oversold signals
df.ta.bbands(length=10, append=True)  # Shorter Bollinger Bands for volatility
df.ta.adx(length=7, append=True)  # Shorter ADX for quicker trend strength analysis
df.ta.stoch(length=7, append=True)  # Shorter Stochastic Oscillator
df.ta.willr(length=7, append=True)  # Shorter Williams %R
df.ta.roc(length=5, append=True)  # Shorter ROC for quicker momentum analysis
df.ta.cci(length=10, append=True)  # Shorter CCI for quicker overbought/oversold signals
df.ta.atr(length=7, append=True)  # Shorter ATR for quicker volatility analysis

# Additional indicators
df.ta.tsi(length=10, append=True)       # Shorter True Strength Index
df.ta.ichimoku(append=True)             # Ichimoku Cloud (default settings)
df.ta.obv(append=True)                  # On-Balance Volume (no window size)
df.ta.vwap(append=True)                 # Volume Weighted Average Price (no window size)
df.ta.donchian(length=10, append=True)  # Shorter Donchian Channels
df.ta.ema(length=5, append=True)        # Very short-term EMA


# Drop rows with NaN values after adding indicators
# df.dropna(inplace=True)
df.drop(columns=['ISB_26'])
# Display the first few rows of the updated DataFrame
df.dropna(inplace=True)


In [19]:
def triple_barrier_labels(df, upper_pct=0.01, lower_pct=-0.01, max_hold=5):
    labels = []
    for i in range(len(df)):
        max_target = df['Close'].iloc[i] * (1 + upper_pct)
        min_target = df['Close'].iloc[i] * (1 + lower_pct)
        
        future_prices = df['Close'].iloc[i+1:i+max_hold+1]
        touch_upper = (future_prices > max_target).any()
        touch_lower = (future_prices < min_target).any()
        
        if touch_upper and not touch_lower:
            labels.append('Long')
        elif touch_lower and not touch_upper:
            labels.append('Short')
        else:
            labels.append('Neutral')
    return labels

df['Signal'] = triple_barrier_labels(df)
# Drop OHLC and Adj Close cause we need only indicators (If we need it cause for now we need Close price for backtesting our strategy)
df.drop(['Open', 'High', 'Low' ], axis=1, inplace=True)
df.dropna(inplace=True)

In [20]:
df['Signal'].value_counts()

Signal
Neutral    211436
Short       20943
Long        20427
Name: count, dtype: int64

In [21]:
df.dropna(inplace=True)
df.to_csv('../data/btc_15m_data_2018_to_2025_with_indicatorsGrid.csv')
# Label encode the target column``
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['Signal'])

# Prepare data
X = df.drop(['Signal'], axis=1)
y = y_encoded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Split the data
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, shuffle=False)

In [23]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import joblib

# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [200, 500, 1000],       # Number of trees
    'max_depth': [6, 8, 10, 12],              # Maximum depth of a tree
    'learning_rate': [0.01, 0.1, 0.2],   # Learning rate
    'subsample': [0.6, 0.8, 1.0],             # Subsample ratio of the training data
    'colsample_bytree': [0.8, 1.0],      # Subsample ratio of columns when constructing each tree
    'gamma': [0, 1, 5],                  # Minimum loss reduction required to make a split
}

# Initialize the XGBoost classifier
xgb_model = XGBClassifier( eval_metric='logloss')

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',  # Metric to optimize
    cv=3,                # 3-fold cross-validation
    verbose=2,           # Print progress
    n_jobs=-1            # Use all available cores
)

# Train the model using GridSearchCV
print("Starting grid search...")
grid_search.fit(X_train, y_train)

# Classification report
y_pred = grid_search.predict(X_val)
print("Classification Report:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))


# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the best model on the test set
accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", accuracy)

# Save the best model
joblib.dump(best_model, '../models/xgboost_model_gridsearch.joblib')
print("Best model saved as '../models/xgboost_model_gridsearch.joblib'")

Starting grid search...
Fitting 3 folds for each of 648 candidates, totalling 1944 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=1.0; total time=  31.9s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=0.8; total time=  32.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=0.8; total time=  32.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=1.0; total time=  32.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=0.6; total time=  32.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=0.6; total time=  32.4s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=6, n_estimators=200, subsample=0.8; total time=  32.5s
[CV] END colsample_bytree=0.8, gamma=



[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.6; total time= 4.0min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.8; total time= 3.8min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.6; total time= 4.1min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.8; total time= 3.8min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=1000, subsample=0.8; total time= 3.9min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=8, n_estimators=200, subsample=0.6; total time= 1.3min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=8, n_estimators=200, subsample=0.6; total time= 1.3min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=8, n_estimators=200, subsample=0.6; total time= 1.3min
[CV