In [1]:
# add autoreload magic
%load_ext autoreload
%autoreload 2

In [2]:
# Global parameters
feature_view_name = 'ohlc_feature_view'
feature_view_version = 1
ohlc_window_sec = 60
product_id = 'BTC/USD'
last_n_days_to_fetch_from_store = 90
last_n_days_to_test_model = 7
discretization_thresholds = [-0.0001, 0.0001]
prediction_window_sec = 60*5

In [None]:
import os
os.environ['HOPSWORKS_API_KEY'] ='API_KEY'
os.environ['HOPSWORKS_PROJECT_NAME'] = 'project_name'

In [4]:
import pandas as pd
from loguru import logger
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report   

In [5]:
# Step 1    
# Fetch the data from the feature store
from tools.ohlc_data_reader import OhlcDataReader

ohlc_data_reader = OhlcDataReader(
    ohlc_window_sec=ohlc_window_sec,
    feature_view_name=feature_view_name,
    feature_view_version=feature_view_version,
)

logger.info('Fetching OHLC data from the feature store')

ohlc_data = ohlc_data_reader.read_from_offline_store(
    product_id=product_id,
    last_n_days=last_n_days_to_fetch_from_store,
)

2025-04-27 12:39:30,751 INFO: Initializing external client
2025-04-27 12:39:30,751 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-04-27 12:39:36,186 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1174676


[32m2025-04-27 12:39:38.986[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mFetching OHLC data from the feature store[0m


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (160.27s) 


In [6]:
ohlc_data

Unnamed: 0,timestamp,open,high,low,close,product_id
0,1711330020000,66684.9,66699.0,66684.9,66699.0,BTC/USD
1,1711330080000,66699.0,66699.0,66699.0,66699.0,BTC/USD
2,1711330140000,66699.0,66699.0,66699.0,66699.0,BTC/USD
3,1711330200000,66699.0,66699.0,66699.0,66699.0,BTC/USD
4,1711330260000,66699.0,66745.0,66699.0,66739.4,BTC/USD
...,...,...,...,...,...,...
126148,1719093960000,64281.0,64281.0,64281.0,64281.0,BTC/USD
126149,1719094020000,64281.0,64281.0,64281.0,64281.0,BTC/USD
126150,1719094680000,64257.9,64258.0,64257.9,64258.0,BTC/USD
126151,1719094740000,64257.9,64257.9,64257.9,64257.9,BTC/USD


In [7]:
# add a column to ohlc_data with a human-readable data, using
# the ohlc_data['timestamp'] column in milliseconds
ohlc_data['datetime'] = pd.to_datetime(ohlc_data['timestamp'], unit='ms')

In [8]:
from src.training import split_train_test

# Step 2
# Split the data into training and testing using a cutoff date
logger.info('Splitting the data into training and testing')
ohlc_train, ohlc_test = split_train_test(
    ohlc_data=ohlc_data,
    last_n_days_to_test_model=last_n_days_to_test_model,
)

# print(ohlc_train.head())
# print(ohlc_test.head())

[32m2025-04-27 12:42:26.454[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSplitting the data into training and testing[0m


In [9]:
from src.training import interpolate_missing_candles

# Step 3
# Preprocess the data for training and for testing
# Interpolate missing candles
logger.info('Interpolating missing candles for training data')
ohlc_train = interpolate_missing_candles(ohlc_train, ohlc_window_sec)
logger.info('Interpolating missing candles for testing data')
ohlc_test = interpolate_missing_candles(ohlc_test, ohlc_window_sec)

[32m2025-04-27 12:42:26.799[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mInterpolating missing candles for training data[0m
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work b

In [10]:
from src.training import create_target_metric

# Step 4
# Create the target metric as a new column in our dataframe for training and testing
logger.info('Creating the target metric')
ohlc_train = create_target_metric(
    ohlc_train,
    ohlc_window_sec,
    discretization_thresholds,
    prediction_window_sec,
)
ohlc_test = create_target_metric(
    ohlc_test,
    ohlc_window_sec,
    discretization_thresholds,
    prediction_window_sec,
)

[32m2025-04-27 12:42:27.411[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mCreating the target metric[0m


In [11]:
# Plot distribution of the target
logger.info('Distribution of the target in the training data')
logger.debug(ohlc_train['target'].value_counts())
logger.info('Distribution of the target in the testing data')
logger.debug(ohlc_test['target'].value_counts())

[32m2025-04-27 12:42:27.713[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mDistribution of the target in the training data[0m
[32m2025-04-27 12:42:27.721[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1mtarget
2.0    50572
0.0    48222
1.0    20541
Name: count, dtype: int64[0m
[32m2025-04-27 12:42:27.728[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mDistribution of the target in the testing data[0m
[32m2025-04-27 12:42:27.732[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mtarget
0.0    3519
2.0    3507
1.0    3049
Name: count, dtype: int64[0m


In [12]:
# Before training, let's split the features and the target
X_train = ohlc_train.drop(columns=['target'])
y_train = ohlc_train['target']
X_test = ohlc_test.drop(columns=['target'])
y_test = ohlc_test['target']

In [13]:
from src.baseline_model import BaselineModel

# create model
model = BaselineModel(
    n_candles_into_future=prediction_window_sec // ohlc_window_sec,
    discretization_thresholds=discretization_thresholds,
)

# generate predictions
y_test_predictions = model.predict(X_test)

# evalute our dummy model
# Let's evaluate the model. It is a classifier with 3 classes

print('****** TEST DATA ******')
# Compute accuracy using scikit-learn
accuracy = accuracy_score(y_test, y_test_predictions)
print(f'Accuracy of the model on test data: {accuracy}')

print(f'Classification report of the model:')
print(classification_report(y_test, y_test_predictions))

# generate predictions
print('****** TRAINING DATA ******')
y_train_predictions = model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_predictions)
print(f'Accuracy of the model: {accuracy}')

print(f'Classification report of the model:')
print(classification_report(y_train, y_train_predictions))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




****** TEST DATA ******
Accuracy of the model on test data: 0.5105707196029776
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.46      0.46      0.46      3519
         1.0       0.63      0.63      0.63      3049
         2.0       0.46      0.46      0.46      3507

    accuracy                           0.51     10075
   macro avg       0.52      0.52      0.52     10075
weighted avg       0.51      0.51      0.51     10075

****** TRAINING DATA ******
Accuracy of the model: 0.4291197050320526
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.44      0.44      0.44     48222
         1.0       0.33      0.33      0.33     20541
         2.0       0.46      0.46      0.46     50572

    accuracy                           0.43    119335
   macro avg       0.41      0.41      0.41    119335
weighted avg       0.43      0.43      0.43    119335



In [14]:
from src.feature_engineering import (add_momentum_indicators,
add_volatility_indicators)
X_train = add_momentum_indicators(X_train)
X_train = add_volatility_indicators(X_train)
X_train

Unnamed: 0,timestamp,open,high,low,close,product_id,datetime,rsi,momentum,std
0,1711330020000,66684.9,66699.0,66684.9,66699.0,BTC/USD,2024-03-25 01:27:00,0.000000,0.0,0.000000
1,1711330080000,66699.0,66699.0,66699.0,66699.0,BTC/USD,2024-03-25 01:28:00,0.000000,0.0,0.000000
2,1711330140000,66699.0,66699.0,66699.0,66699.0,BTC/USD,2024-03-25 01:29:00,0.000000,0.0,0.000000
3,1711330200000,66699.0,66699.0,66699.0,66699.0,BTC/USD,2024-03-25 01:30:00,0.000000,0.0,0.000000
4,1711330260000,66699.0,66745.0,66699.0,66739.4,BTC/USD,2024-03-25 01:31:00,0.000000,0.0,16.160000
...,...,...,...,...,...,...,...,...,...,...
119330,1718489820000,66042.0,66042.0,66035.1,66035.1,BTC/USD,2024-06-15 22:17:00,10.958241,-19.8,8.333791
119331,1718489880000,66035.2,66035.2,66035.2,66035.2,BTC/USD,2024-06-15 22:18:00,11.341698,-19.7,8.956744
119332,1718489940000,66035.1,66053.8,66035.1,66047.9,BTC/USD,2024-06-15 22:19:00,44.204812,-7.0,7.642779
119333,1718490000000,66047.9,66047.9,66047.9,66047.9,BTC/USD,2024-06-15 22:20:00,44.204812,-7.0,5.706844


In [22]:
from src.feature_engineering import add_features

X_train = add_features(
    X_train,
    n_candles_into_future=prediction_window_sec // ohlc_window_sec,
    discretization_thresholds=discretization_thresholds,
)

X_test = add_features(
    X_test,
    n_candles_into_future=prediction_window_sec // ohlc_window_sec,
    discretization_thresholds=discretization_thresholds,
)

features_to_use = [
    'rsi',
    'momentum',
    'std',
    'last_observed_target',
    'days_of_week',
    'hour_of_day',
    'minute_of_hour',
]

X_train_ = X_train[features_to_use]
X_test_ = X_test[features_to_use]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




In [16]:
X_train_

Unnamed: 0,rsi,momentum,std,last_observed_target,days_of_week,hour_of_day,minute_of_hour
0,0.000000,0.0,0.000000,1.0,0,1,27
1,0.000000,0.0,0.000000,1.0,0,1,28
2,0.000000,0.0,0.000000,1.0,0,1,29
3,0.000000,0.0,0.000000,1.0,0,1,30
4,0.000000,0.0,16.160000,1.0,0,1,31
...,...,...,...,...,...,...,...
119330,10.958241,-19.8,8.333791,0.0,5,22,17
119331,11.341698,-19.7,8.956744,0.0,5,22,18
119332,44.204812,-7.0,7.642779,0.0,5,22,19
119333,44.204812,-7.0,5.706844,0.0,5,22,20


In [23]:
X_test

Unnamed: 0,timestamp,open,high,low,close,product_id,datetime,rsi,momentum,std,last_observed_target,days_of_week,hour_of_day,minute_of_hour
0,1718490480000,66055.0,66055.0,66054.9,66054.9,BTC/USD,2024-06-15 22:28:00,0.00000,0.0,0.000000,1.0,5,22,28
1,1718490540000,66054.9,66054.9,66054.9,66054.9,BTC/USD,2024-06-15 22:29:00,0.00000,0.0,0.000000,1.0,5,22,29
2,1718490600000,66055.0,66055.0,66055.0,66055.0,BTC/USD,2024-06-15 22:30:00,0.00000,0.0,0.000000,1.0,5,22,30
3,1718490660000,66055.0,66055.3,66054.9,66055.2,BTC/USD,2024-06-15 22:31:00,0.00000,0.0,0.000000,1.0,5,22,31
4,1718490720000,66055.2,66055.2,66055.2,66055.2,BTC/USD,2024-06-15 22:32:00,0.00000,0.0,0.135644,1.0,5,22,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10070,1719094680000,64257.9,64258.0,64257.9,64258.0,BTC/USD,2024-06-22 22:18:00,78.21121,177.9,9.199997,0.0,5,22,18
10071,1719094740000,64257.9,64257.9,64257.9,64257.9,BTC/USD,2024-06-22 22:19:00,78.13150,177.8,11.292189,0.0,5,22,19
10072,1719094800000,64257.9,64257.9,64257.9,64257.9,BTC/USD,2024-06-22 22:20:00,78.13150,-23.1,11.300369,0.0,5,22,20
10073,1719094860000,64257.9,64257.9,64257.9,64257.9,BTC/USD,2024-06-22 22:21:00,78.13150,-23.1,9.230078,0.0,5,22,21


Training a boosting tree algorithm -->> XGBoost

In [24]:
import xgboost as xgb

In [27]:
# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_, label=y_train)
dtest = xgb.DMatrix(X_test_, label=y_test)

# Set parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Specify the objective for classification
    'num_class': 3                 # Number of classes in the dataset,

    # Add other parameters here
    # These are things you can tune to optimize the model (aka hyperparameters)
    # 'eta': 0.1,                    # Learning rate
    # 'max_depth': 6,                # Maximum depth of a tree
    # 'subsample': 0.8,              # Subsample ratio of the training instances
    # 'colsample_bytree': 0.8,       # Subsample ratio of columns when constructing each tree
    # 'gamma': 1,                    # Minimum loss reduction required to make a further partition
    # 'alpha': 0,                    # L1 regularization term on weights
    # 'lambda': 1,                   # L2 regularization term on weights
    # 'scale_pos_weight': 1          # Balancing of positive and negative weights
}

# Train the model
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

# Predict on the test set
y_test_predictions = model.predict(dtest)

print('TEST DATA')
# Calculate accuracy both on the training and test set
accuracy = accuracy_score(y_test, y_test_predictions)
print(f"Accuracy on test data: {accuracy * 100:.2f}%")

# Classifcation report
print(f'Classification report of the model:')
print(classification_report(y_test, y_test_predictions))

print(' TRAINING DATA ')
y_train_predictions = model.predict(dtrain)
accuracy = accuracy_score(y_train, y_train_predictions)
print("Accuracy: %.2f%%" % (accuracy * 100))

# Classifcation report
print(f'Classification report of the model:')
print(classification_report(y_train, y_train_predictions))

TEST DATA
Accuracy on test data: 48.41%
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.42      0.47      0.44      3519
         1.0       0.79      0.42      0.55      3049
         2.0       0.43      0.55      0.48      3507

    accuracy                           0.48     10075
   macro avg       0.55      0.48      0.49     10075
weighted avg       0.53      0.48      0.49     10075

 TRAINING DATA 
Accuracy: 60.78%
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.61      0.62      0.61     48222
         1.0       0.68      0.33      0.45     20541
         2.0       0.59      0.71      0.65     50572

    accuracy                           0.61    119335
   macro avg       0.63      0.55      0.57    119335
weighted avg       0.62      0.61      0.60    119335



In [26]:
from sklearn.linear_model import LogisticRegression

# Create the model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train_, y_train)

# Predict on the test set
preds = model.predict(X_test_)

# Calculate accuracy both on the training and test set
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy on test data: {accuracy * 100:.2f}%')

accuracy = accuracy_score(y_train, model.predict(X_train_))
print(f'Accuracy on training data: {accuracy * 100:.2f}%')

Accuracy on test data: 43.79%
Accuracy on training data: 44.90%


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
