This notebook contains the dataset and the model(s) used to predict the high and low for the next day. So far, a dataset has been created, and various technical indicators for bitcoin price movement have been added to it; the aggregated sentiment scores, generated using FinBERT, are also a feature in my final dataset. The main focus of this notebook is to predict the high and low for the next day without any further normalization or preprocessing.

## 1. Loading the Data

In [None]:
### Importing the necessary libraries
import pandas as pd
import numpy as np
import pickle
import os

In [None]:
### Getting the Complete Data
from btc_data_pipeline import BitcoinDataPipeline
import pandas as pd

# Fetch the latest BTC data through the project pipeline.
bdp = BitcoinDataPipeline()
btc_data = bdp.getLatestBitcoinData()
# Drop the hard-coded 2024-07-10 row.
# NOTE(review): presumably the most recent (incomplete) day when this was written — confirm.
# A fixed label raises KeyError if that date is not in the index.
btc_data = btc_data.drop(index='2024-07-10' , axis=0)

# Load the sentiment scores, using the parsed 'Date' column as the index.
sentiment_data = pd.read_csv('data/sentiment_scores.csv', parse_dates=['Date'], index_col='Date')
sentiment_data.index = pd.to_datetime(sentiment_data.index)
# Reset every timestamp to midnight so the dates can match the price data on 'Date'.
sentiment_data.index = sentiment_data.index.normalize()

In [None]:
# Left join: keep every row of btc_data; dates without a sentiment row get NaN.
data = btc_data.merge(sentiment_data, on='Date', how='left')

In [None]:
# Drop the hard-coded 2024-07-09 row.
# NOTE(review): reason for this specific date is not visible here — confirm.
data = data.drop(index=pd.to_datetime('2024-07-09'), axis=0)
# Fill missing sentiment scores with the most recent earlier value.
data['aggregated_sentiment'] = data['aggregated_sentiment'].ffill()

To get the target variables, I can simply shift the High and Low columns by -1; this gives the target columns, i.e. the high and low for the next day.


In [None]:
# Targets: the next row's High/Low aligned onto the current row.
# The final row has no successor, so its targets are NaN.
data['y_high'] = data['High'].shift(-1)
data['y_low'] = data['Low'].shift(-1)

In [None]:
# Preview the first three rows of the merged dataset, including the new target columns.
data.head(3)

In [None]:
# Drop the hard-coded 2024-07-08 row, then preview the tail.
# NOTE(review): presumably this is the last row, whose shifted targets are NaN — confirm.
# A fixed date will not track the last row if the pipeline returns newer data.
data = data.drop(index=pd.to_datetime('2024-07-08'), axis=0)
data.tail(3) ### Preview the last three rows after the drop above

## 2. Setting up Model Evaluation Functions

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def calculate_mae(y_true, y_pred):
    """
    Compute the mean absolute error (MAE) of a set of predictions.

    Parameters:
    y_true (array-like): Ground-truth target values.
    y_pred (array-like): Predictions to compare against ``y_true``.

    Returns:
    float: The value returned by ``sklearn.metrics.mean_absolute_error``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    return mae


def calculate_rmse(y_true, y_pred):
    """
    Compute the Root Mean Squared Error (RMSE) of a set of predictions.

    Parameters:
    - y_true: array-like, ground-truth target values
    - y_pred: array-like, predictions to compare against y_true

    Returns:
    - rmse: float, square root of sklearn's mean squared error
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return rmse

def calculate_mape(y_true, y_pred):
    """
    Calculate the Mean Absolute Percentage Error (MAPE) between the true and predicted values.

    MAPE = mean(|y_true - y_pred| / |y_true|) * 100

    Parameters:
    - y_true (array-like): The true values. A zero entry yields an inf/nan term,
      because every error is divided by its true value.
    - y_pred (array-like): The predicted values, with as many elements as y_true.

    Returns:
    - mape (float): The calculated MAPE, expressed as a percentage.

    """
    # Flatten both inputs so an (n,) target and an (n, 1) prediction are compared
    # element-wise instead of being broadcast into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # The numerator is the prediction error (difference), not the pair of arrays.
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def calculate_r2(y_true, y_pred):
    """
    Compute the R-squared (coefficient of determination) score.

    Parameters:
    - y_true: array-like of shape (n_samples,)
        Ground-truth target values.
    - y_pred: array-like of shape (n_samples,)
        Predictions to compare against y_true.

    Returns:
    - r2_score: float
        The value returned by ``sklearn.metrics.r2_score``.

    """
    r_squared = r2_score(y_true, y_pred)
    return r_squared


def evaluate_model(y_true_high, y_pred_high, y_true_low, y_pred_low):
    """
    Score high-price and low-price predictions with MAE, RMSE, MAPE and R2.

    Parameters:
    - y_true_high (array-like): True high values.
    - y_pred_high (array-like): Predicted high values.
    - y_true_low (array-like): True low values.
    - y_pred_low (array-like): Predicted low values.

    Returns:
    - metrics (dict): Maps '<Side>_<Metric>' to its score, in this order:
        'High_MAE', 'Low_MAE', 'High_RMSE', 'Low_RMSE',
        'High_MAPE', 'Low_MAPE', 'High_R2', 'Low_R2'.
    """
    # Metric suffix -> scoring function, in the order the keys are emitted.
    scorers = (
        ('MAE', calculate_mae),
        ('RMSE', calculate_rmse),
        ('MAPE', calculate_mape),
        ('R2', calculate_r2),
    )
    # Key prefix with its matching (true, predicted) pair; High precedes Low.
    sides = (
        ('High', y_true_high, y_pred_high),
        ('Low', y_true_low, y_pred_low),
    )
    return {
        f'{side}_{suffix}': scorer(truth, prediction)
        for suffix, scorer in scorers
        for side, truth, prediction in sides
    }

## 3. Creating Multiple Models

### Model 1: XGBoost

In [None]:
from sklearn.model_selection import train_test_split

### Preparing the data for XGBoost
# Targets are the two next-day columns; every remaining column is a feature.
y = data[['y_high', 'y_low']]
X = data.drop(columns=['y_high', 'y_low'])

# shuffle=False keeps row order: the first 75% of rows train, the last 25% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, shuffle=False)

In [None]:
### Train separate XGBoost models for y_high and y_low.
import xgboost as xgb

### Converting data into DMatrix, optimized for XGBoost
# One training DMatrix per target: same features, different label column.
dtrain_high = xgb.DMatrix(X_train, label=y_train['y_high'])
dtrain_low = xgb.DMatrix(X_train, label=y_train['y_low'])
# Test matrices carry no labels; both wrap the same X_test.
dtest_high = xgb.DMatrix(X_test)
dtest_low = xgb.DMatrix(X_test)

In [None]:
### Parameters for the XGBoost Model
# Squared-error regression objective with RMSE as the evaluation metric;
# all other XGBoost settings are left at their defaults.
params = {
    'objective' : 'reg:squarederror',
    'eval_metric' : 'rmse'
}

In [None]:
# Train one booster per target with the shared params, 100 boosting rounds each.
xgb_model_high = xgb.train(params, dtrain_high, num_boost_round=100)
xgb_model_low = xgb.train(params, dtrain_low, num_boost_round=100)

In [None]:
# Predict each target on the test features with its own booster.
y_pred_high = xgb_model_high.predict(dtest_high)
y_pred_low = xgb_model_low.predict(dtest_low)

# Combine predictions into a DataFrame (default integer index, not the test dates)
y_pred = pd.DataFrame({
    'y_pred_high': y_pred_high,
    'y_pred_low': y_pred_low
})


In [None]:
# Split the test targets into one Series per price.
y_test_high = y_test['y_high']
y_test_low = y_test['y_low']

In [None]:
# Evaluate the XGBoost predictions; the result is reused later as metrics_xgb
# when the models are compared.
metrics_xgb = evaluate_model(y_test_high.values, y_pred_high, y_test_low.values, y_pred_low)
print(metrics_xgb)

In [None]:
import plotly.graph_objects as go

def plot_predictions_vs_actuals(dates, actuals, predictions, title):
    """
    Build a line chart comparing actual and predicted values over time.

    Parameters:
    - dates: x-axis values shared by both lines.
    - actuals: y values for the blue 'Actual' line.
    - predictions: y values for the red 'Predicted' line.
    - title: chart title.

    Returns:
    - fig: the plotly Figure (not shown; the caller decides when to display it).
    """
    # (y values, legend label, line colour) for each trace, actuals first.
    series = (
        (actuals, 'Actual', 'blue'),
        (predictions, 'Predicted', 'red'),
    )

    fig = go.Figure()
    for values, label, colour in series:
        trace = go.Scatter(
            x=dates,
            y=values,
            mode='lines',
            name=label,
            line=dict(color=colour)
        )
        fig.add_trace(trace)

    layout = dict(
        title=title,
        xaxis_title='Date',
        yaxis_title='Price (USD)',
        template='plotly_dark'
    )
    fig.update_layout(**layout)

    return fig


# x-axis for all prediction plots: the index of the test targets.
dates = y_test.index

# Plot for High Prices (XGBoost predictions against the actual next-day highs)
fig_high = plot_predictions_vs_actuals(dates, y_test['y_high'], y_pred['y_pred_high'], 'Actual vs Predicted High Prices')
fig_high.show()


In [None]:
# Plot for Low Prices (XGBoost). Bound to its own name so the high-price figure
# created earlier is not overwritten.
fig_low = plot_predictions_vs_actuals(dates, y_test['y_low'], y_pred['y_pred_low'], 'Actual vs Predicted Low Prices')
fig_low.show()

### Model 2: Gradient Boosting Machine

In [None]:
from sklearn.model_selection import train_test_split

### Preparing the data for Gradient Boosting Machine
# Same targets/features and the same ordered 75/25 split as the XGBoost section.
y = data[['y_high', 'y_low']]
X = data.drop(columns=['y_high', 'y_low'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, shuffle=False)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# One regressor per target, 100 estimators each, with a fixed seed for repeatable fits.
gbm_high = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbm_low = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Both models see the same features; only the target column differs.
gbm_high.fit(X_train, y_train['y_high'])
gbm_low.fit(X_train, y_train['y_low'])

In [None]:
# Predict each target with the model that was trained for it.
y_pred_high = gbm_high.predict(X_test)
# Must be gbm_low: predicting both targets with gbm_high scores and plots
# high-price predictions against the actual lows.
y_pred_low = gbm_low.predict(X_test)

# Combine predictions into a DataFrame (default integer index, not the test dates)
y_pred_gbm = pd.DataFrame({
    'y_pred_high': y_pred_high,
    'y_pred_low': y_pred_low
})

In [None]:
# Split the test targets into one Series per price.
y_test_high = y_test['y_high']
y_test_low = y_test['y_low']


# Score the GBM predictions; the dict is displayed as the cell output and
# reused later as metrics_gbm when the models are compared.
metrics_gbm = evaluate_model(y_test_high.values, y_pred_high, y_test_low.values, y_pred_low)
metrics_gbm

In [None]:
# Plot for High Prices (GBM); `dates` is the test index set in the XGBoost section
fig_high_gbm = plot_predictions_vs_actuals(dates, y_test['y_high'], y_pred_gbm['y_pred_high'], 'GBM: Actual vs Predicted High Prices')
fig_high_gbm.show()

In [None]:
# Plot for Low Prices (GBM); `dates` is the test index set in the XGBoost section
fig_low_gbm = plot_predictions_vs_actuals(dates, y_test['y_low'], y_pred_gbm['y_pred_low'], 'GBM: Actual vs Predicted Low Prices')
fig_low_gbm.show()

### Model 3: LSTM Neural Network

In [None]:
from sklearn.model_selection import train_test_split

# Same targets/features and the same ordered 75/25 split as the earlier models.
y = data[['y_high', 'y_low']]
X = data.drop(columns=['y_high', 'y_low'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, shuffle=False)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the feature scaler on the training split only, then apply it to the test
# split, so no test statistics leak into the scaling.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scale the target variables separately (one scaler per target, so each can be
# inverted on its own); reshape(-1, 1) gives the 2D column the scaler expects.
y_scaler_high = MinMaxScaler()
y_train_high_scaled = y_scaler_high.fit_transform(y_train['y_high'].values.reshape(-1, 1))
y_test_high_scaled = y_scaler_high.transform(y_test['y_high'].values.reshape(-1, 1))

y_scaler_low = MinMaxScaler()
y_train_low_scaled = y_scaler_low.fit_transform(y_train['y_low'].values.reshape(-1, 1))
y_test_low_scaled = y_scaler_low.transform(y_test['y_low'].values.reshape(-1, 1))

In [None]:
### Saving the Scalers
os.makedirs('models/scalers/', exist_ok=True)

# pickle.dump takes the object first and the file second; each file stores the
# fitted scaler that its name refers to.
with open('models/scalers/x_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('models/scalers/y_high_scaler.pkl', 'wb') as f:
    pickle.dump(y_scaler_high, f)

with open('models/scalers/y_low_scaler.pkl', 'wb') as f:
    pickle.dump(y_scaler_low, f)

In [None]:
# Reshape input to be 3D (samples, timesteps, features); each sample is a
# sequence of length 1, i.e. a single row of features.
X_train_scaled_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_scaled_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, GRU, Input

def build_model(input_shape):
    """
    Build and compile the recurrent regressor used for one price target.

    Stack: Input -> Bidirectional LSTM (50 units, returns sequences) -> GRU (50)
    -> Dense (25, relu) -> Dense (1). Compiled with the Adam optimizer and MSE loss.

    Parameters:
    - input_shape: shape of one sample, passed straight to the Input layer.

    Returns:
    - model: the compiled Sequential model.
    """
    layer_stack = (
        Input(input_shape),
        Bidirectional(LSTM(50, return_sequences=True)),
        GRU(50),
        Dense(25, activation='relu'),
        Dense(1),  # Output layer for high or low price prediction
    )

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='mse')
    return model

# Build models for high and low predictions: two independent networks with the
# same architecture, taking (timesteps, features) from the reshaped training data.
input_shape = (X_train_scaled_lstm.shape[1], X_train_scaled_lstm.shape[2])
lstm_model_high = build_model(input_shape)
lstm_model_low = build_model(input_shape)


In [None]:
# Train the model for high price prediction on the scaled targets
# (100 epochs, batches of 32, 20% of the training data held out for validation).
history_high = lstm_model_high.fit(X_train_scaled_lstm, y_train_high_scaled, epochs=100, batch_size=32, validation_split=0.2)

In [None]:
# Train the model for low price prediction on the scaled targets
# (100 epochs, batches of 32, 20% of the training data held out for validation).
history_low = lstm_model_low.fit(X_train_scaled_lstm, y_train_low_scaled, epochs=100, batch_size=32, validation_split=0.2)

In [None]:
# Predict high and low prices; outputs are still in the scaled target space.
y_pred_high_scaled = lstm_model_high.predict(X_test_scaled_lstm)
y_pred_low_scaled = lstm_model_low.predict(X_test_scaled_lstm)

In [None]:
# Inverse transform the predictions back to price units with the matching
# target scaler, then flatten to 1D for the metric functions.
y_pred_high = y_scaler_high.inverse_transform(y_pred_high_scaled).flatten()
y_pred_low = y_scaler_low.inverse_transform(y_pred_low_scaled).flatten()


In [None]:
# Extract true values for evaluation (unscaled, in price units)
y_test_high = y_test['y_high']
y_test_low = y_test['y_low']

# Evaluate the model; the result is reused later as metrics_lstm when the
# models are compared.
metrics_lstm = evaluate_model(y_test_high.values, y_pred_high, y_test_low.values, y_pred_low)
print(metrics_lstm)


In [None]:
# Plot for High Prices (recurrent model); `dates` is the test index set in the XGBoost section
fig_high_dl = plot_predictions_vs_actuals(dates, y_test['y_high'], y_pred_high, 'BiDirectional LSTM + GRU: Actual vs Predicted High Prices')
fig_high_dl.show()

In [None]:
# Plot for Low Prices (recurrent model); `dates` is the test index set in the XGBoost section
fig_low_dl = plot_predictions_vs_actuals(dates, y_test['y_low'], y_pred_low, 'BiDirectional LSTM + GRU: Actual vs Predicted Low Prices')
fig_low_dl.show()

### Model 4: TabNet Architecture

In [None]:
### Using the TabNet implementation from the pytorch-tabnet package

from pytorch_tabnet.tab_model import TabNetRegressor

### Initializing the models: one default-configured regressor per target
tabnet_model_high = TabNetRegressor()
tabnet_model_low = TabNetRegressor()

In [None]:
### Training the model for predicting high (scaled features and scaled target)
# NOTE(review): eval_set is the test split. If early stopping (patience=30) is
# driven by it, the test metrics reported afterwards are optimistically biased —
# consider a validation split carved out of the training data instead.
tabnet_model_high.fit(
    X_train_scaled, y_train_high_scaled,
    eval_set=[(X_test_scaled, y_test_high_scaled)],
    max_epochs=300,
    patience=30,
    batch_size=32,
    virtual_batch_size=32,
    num_workers=0,
    drop_last=False
)

In [None]:
### Training the model for predicting Low (scaled features and scaled target)
# NOTE(review): eval_set is the test split. If early stopping (patience=30) is
# driven by it, the test metrics reported afterwards are optimistically biased —
# consider a validation split carved out of the training data instead.
tabnet_model_low.fit(
    X_train_scaled, y_train_low_scaled,
    eval_set=[(X_test_scaled, y_test_low_scaled)],
    max_epochs=300,
    patience=30,
    batch_size=32,
    virtual_batch_size=32,
    num_workers=0,
    drop_last=False
)

In [None]:
# Predict high and low prices; outputs are still in the scaled target space.
y_pred_high_scaled = tabnet_model_high.predict(X_test_scaled)
y_pred_low_scaled = tabnet_model_low.predict(X_test_scaled)

# Inverse transform the predictions back to price units with the matching
# target scaler, then flatten to 1D for the metric functions.
y_pred_high = y_scaler_high.inverse_transform(y_pred_high_scaled).flatten()
y_pred_low = y_scaler_low.inverse_transform(y_pred_low_scaled).flatten()

In [None]:
# Extract true values for evaluation (unscaled, in price units)
y_test_high = y_test['y_high']
y_test_low = y_test['y_low']

# Evaluate the model; the result is reused later as metrics_tabnet when the
# models are compared.
metrics_tabnet = evaluate_model(y_test_high.values, y_pred_high, y_test_low.values, y_pred_low)
print(metrics_tabnet)


In [None]:
# Plot for High Prices (TabNet); `dates` is the test index set in the XGBoost section
fig_high_tabnet = plot_predictions_vs_actuals(dates, y_test['y_high'], y_pred_high, 'TabNet: Actual vs Predicted High Prices')
fig_high_tabnet.show()

In [None]:
# Plot for Low Prices (TabNet); `dates` is the test index set in the XGBoost section
fig_low_tabnet = plot_predictions_vs_actuals(dates, y_test['y_low'], y_pred_low, 'TabNet: Actual vs Predicted Low Prices')
fig_low_tabnet.show()

## 4. Comparing all 4 models

In [None]:
### Using plotly to visualise and compare the 4 models
import plotly.graph_objects as go
import pandas as pd

# Row order of the comparison table.
metric_names = ['High_MAE', 'Low_MAE', 'High_RMSE', 'Low_RMSE', 'High_MAPE', 'Low_MAPE', 'High_R2', 'Low_R2']

# Column label -> metrics dict produced by evaluate_model for that model.
metrics_by_model = {
    'XGBoost': metrics_xgb,
    'GBM': metrics_gbm,
    'LSTM': metrics_lstm,
    'TabNet': metrics_tabnet,
}

# One 'Metric' column followed by one score column per model.
table_columns = {'Metric': metric_names}
for model_label, model_scores in metrics_by_model.items():
    table_columns[model_label] = [model_scores[metric_name] for metric_name in metric_names]

metrics_df = pd.DataFrame(table_columns)

metrics_df

In [None]:
### Ranking the models for each metric
# Work on a copy so metrics_df keeps the raw scores; each row's scores are
# replaced by ranks across the four model columns (rank 1 = best, ties share
# the lowest rank because method='min').
rankings = metrics_df.copy()
for metric in rankings['Metric']:
    # For MAE, RMSE, MAPE: lower is better, so the smallest value gets rank 1
    if 'MAE' in metric or 'RMSE' in metric or 'MAPE' in metric:
        rankings.loc[rankings['Metric'] == metric, ['XGBoost', 'GBM', 'LSTM', 'TabNet']] = rankings.loc[rankings['Metric'] == metric, ['XGBoost', 'GBM', 'LSTM', 'TabNet']].rank(axis=1, method='min')
    # For R2: higher is better, so rank in descending order
    else:
        rankings.loc[rankings['Metric'] == metric, ['XGBoost', 'GBM', 'LSTM', 'TabNet']] = rankings.loc[rankings['Metric'] == metric, ['XGBoost', 'GBM', 'LSTM', 'TabNet']].rank(axis=1, method='min', ascending=False)

# Plotting function for combined High and Low metrics
def plot_ranking_chart(rankings_df, title):
    """
    Show a grouped bar chart of per-metric model ranks.

    Parameters:
    - rankings_df: DataFrame with a 'Metric' column and one rank column for each
      of 'XGBoost', 'GBM', 'LSTM' and 'TabNet'.
    - title: chart title.

    Returns:
    - None; the figure is displayed with fig.show().
    """
    model_names = ('XGBoost', 'GBM', 'LSTM', 'TabNet')

    fig = go.Figure()
    # One bar series per model, grouped side by side under each metric.
    for model_name in model_names:
        bars = go.Bar(
            x=rankings_df['Metric'],
            y=rankings_df[model_name],
            name=model_name
        )
        fig.add_trace(bars)

    layout = dict(
        title=title,
        xaxis_title='Metric',
        yaxis_title='Rank',
        template='plotly_dark',
        barmode='group'
    )
    fig.update_layout(**layout)

    fig.show()

# Plot the ranking chart (lower bar = better rank for that metric)
plot_ranking_chart(rankings, 'Model Rankings by Metric')


This concludes the model development. The models I chose for predicting the high and low are as follows:
1. For Predicting High: LSTM Model
2. For Predicting Low: TabNet Model

The next step is to train these two selected models on the complete dataset before deploying.

In [None]:
# Full dataset (no train/test split) for the final training runs.
y = data[['y_high', 'y_low']]
X = data.drop(columns=['y_high', 'y_low'])

In [None]:
### Loading the saved scalers used to normalize the data for the models
import pickle
# These files are written by this notebook; pickle should only be used to load
# trusted files.
with open('models/scalers/x_scaler.pkl', 'rb') as f:
    x_scaler = pickle.load(f)

with open('models/scalers/y_high_scaler.pkl', 'rb') as f:
    y_high_scaler = pickle.load(f)

with open('models/scalers/y_low_scaler.pkl', 'rb') as f:
    y_low_scaler = pickle.load(f)

In [None]:
# Apply the loaded scalers (transform only, no refit) to the full dataset.
# NOTE(review): presumably these are the scalers fit on the training split
# earlier in the notebook, so later rows may scale outside [0, 1] — confirm.
X_scaled = x_scaler.transform(X)
y_high_scaled = y_high_scaler.transform(y['y_high'].values.reshape(-1, 1))
y_low_scaled = y_low_scaler.transform(y['y_low'].values.reshape(-1, 1))

#### LSTM Model

In [None]:
# Reshape to 3D (samples, 1 timestep, features) for the recurrent layers.
X_scaled_lstm = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

In [None]:
# Fresh, untrained network with the same architecture as the evaluated one.
input_shape = (X_scaled_lstm.shape[1], X_scaled_lstm.shape[2])
lstm_model = build_model(input_shape)
lstm_model.summary()

In [None]:

# Final high-price model: trained on every row, with no validation split.
history_lstm = lstm_model.fit(X_scaled_lstm, y_high_scaled, epochs=100, batch_size=32)

In [None]:
### Saving the Model (final high-price network, in .keras format)
os.makedirs('models/high/', exist_ok=True)
lstm_model.save('models/high/high.keras')

#### TabNet

In [None]:
# Final low-price model: a fresh TabNet regressor trained on every row.
tabnet_model = TabNetRegressor()

# NOTE(review): no eval_set is passed here, so `patience` presumably has no
# effect and training runs for the full max_epochs — confirm.
tabnet_model.fit(
    X_scaled, y_low_scaled,
    max_epochs=300,
    patience=30,
    batch_size=32,
    virtual_batch_size=32,
    num_workers=0,
    drop_last=False
)

In [None]:
### Saving the Model (final low-price TabNet)
os.makedirs('models/low/', exist_ok=True)
# Save into the directory created above; the path previously pointed at
# 'models/high/', leaving 'models/low/' empty.
tabnet_model.save_model('models/low/low')