In [1]:
import hopsworks

# Log in to Hopsworks; the returned project handle is reused later for the model registry.
project = hopsworks.login()

# Feature store handle used by the following cells to fetch the feature group and feature view.
fs = project.get_feature_store()

  from .autonotebook import tqdm as notebook_tqdm


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/556181
Connected. Call `.close()` to terminate connection gracefully.


In [2]:
# Fetch version 2 of the 'bitcoin_price_movement' feature group from the feature store.
bitcoin_fg = fs.get_feature_group(
    name='bitcoin_price_movement',
    version=2
)

In [3]:
# Result of select_all() on the feature group; it is passed as the `query` of the feature view in the next cell.
training_data = bitcoin_fg.select_all()

In [4]:
# Feature-view version requested below.
version = 2
# Look up (or, per the method name, create) the training feature view backed by the
# select_all() query on the bitcoin feature group.
feature_view_training = fs.get_or_create_feature_view(
    name='bitcoin_price_movement_training_fv',
    version=version,
    query=training_data
)

In [5]:
# Read training data through the feature view; the second returned value is discarded.
# NOTE(review): X is rebound in a later cell (In [13]) from feature_engineering.prepare_data,
# so this result appears unused by the rest of the notebook — confirm which source is intended.
X, _ = feature_view_training.training_data(
    description='Training data for the prediction system from the historically available data.'
)

Finished: Reading data from Hopsworks, using ArrowFlight (1.30s) 




In [13]:
import datetime as dt
from features import feature_engineering

# Date range handed to the feature pipeline: 2016-01-01 up to and including yesterday.
start_date = dt.date(2016,1,1)
end_date = dt.date.today() - dt.timedelta(days=1)
# NOTE(review): this rebinds X, replacing the value read from the feature view in an earlier
# cell — confirm that recomputing the features locally is the intended data source.
X = feature_engineering.prepare_data(start_date=start_date, end_date=end_date)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
  df_merged[f'close{column_suffix}'].fillna(method='ffill', inplace=True)
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
  df_merged[f'close{column_suffix}'].fillna(method='ffill', inplace=True)
  df_merged[f'close{column_suffix}'].fillna(method='ffill', inplace=True)
  df_merged['hash_rate'].fillna(method='ffill', inplace=True)


In [14]:
import pandas as pd
# Render floats with four decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.4f}'.format

# Order the rows by 'id' and parse 'date' into datetimes; X itself is left untouched.
sorted_X = (
    X.sort_values(by='id')
     .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [15]:
# Drop the 'id' and 'date' columns; the remaining columns feed the feature construction below.
final_X = sorted_X.drop(columns=['id', 'date'])

In [16]:
import numpy as np
def _percent_change(values):
    """Return the row-over-row percentage change of `values` along axis 0.

    A zero entry (or zero row) is inserted at position 0 so the result has as
    many rows as the input.
    NOTE(review): a zero in the previous row makes the division yield inf/nan —
    confirm the inputs never contain zeros.
    """
    change = np.diff(values, axis=0) / values[:-1] * 100
    return np.insert(change, 0, 0, axis=0)


# Columns converted to percentage changes, in the order they appear in the model input.
feature_columns = [
    'volume', 'ma7', 'ma21', 'bollinger_upper', 'bollinger_lower',
    'volatility', 'close_usd_index', 'close_oil', 'close_gold', 'hash_rate',
]

# Mid price is the average of each row's high and low.
high_prices = final_X['high'].values
low_prices = final_X['low'].values
mid_prices = (high_prices + low_prices) / 2.0
mid_price_changes = _percent_change(mid_prices)

features = final_X[feature_columns].values
feature_changes = _percent_change(features)

# Column 0 holds the mid-price change, followed by one column per entry of feature_columns.
combined_features = np.hstack((mid_price_changes[:, np.newaxis], feature_changes))

In [17]:
sequence_length = 100

# Every start index at which a full window fits and one more row remains to supply its label.
window_starts = range(len(combined_features) - sequence_length)

# Each sample is `sequence_length` consecutive rows of combined_features.
sequence_data = np.array([
    combined_features[start:start + sequence_length] for start in window_starts
])

# Label is 1 when the mid-price change on the row right after the window is positive,
# otherwise 0 (a change of exactly zero is labelled 0).
sequence_labels = np.array([
    1 if mid_price_changes[start + sequence_length] > 0 else 0 for start in window_starts
])

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report

In [19]:
# Ordered 80/20 split: the first 80% of the sequences are used for training and the
# remainder for testing (nothing is shuffled before the cut).
split_index = int(len(sequence_data) * 0.8)
train_labels, test_labels = sequence_labels[:split_index], sequence_labels[split_index:]

# Wrap the NumPy arrays as PyTorch tensors and pair each sequence with its label.
train_data = TensorDataset(
    torch.from_numpy(sequence_data[:split_index]),
    torch.from_numpy(train_labels),
)
test_data = TensorDataset(
    torch.from_numpy(sequence_data[split_index:]),
    torch.from_numpy(test_labels),
)

# Training batches are reshuffled on every pass; test batches keep their original order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=32)
test_loader = DataLoader(test_data, shuffle=False, batch_size=32)

In [34]:
# Use the MPS backend when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")


class LSTMModel(nn.Module):
    """LSTM followed by dropout and a linear output layer.

    The LSTM is built with batch_first=True, so inputs are laid out as
    (batch, time step, feature). Only the LSTM output at the last time step
    is passed on to the dropout and linear layers.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        # Dropout (p=0.2) was added to try to counter overfitting.
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the linear layer's output for the last time step of each sequence in `x`."""
        sequence_output, _ = self.lstm(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(self.dropout(last_step))

# Model dimensions: one input per column of combined_features, 30 hidden LSTM units,
# and a single output value (whether the price increases or decreases).
input_size = combined_features.shape[1]
hidden_size = 30
output_size = 1

# Hyperparameters
epochs = 500
learning_rate = 0.0002

# Early stopping state. Training stops once the validation loss has failed to improve for
# `patience` consecutive epochs, which reduces the risk of overfitting. `best_loss` holds the
# lowest validation loss seen so far and `epochs_no_improve` counts the epochs since then.
patience = 10
best_loss = float('inf')
epochs_no_improve = 0

# Per-epoch loss history for the training and validation passes.
train_losses, test_losses = [], []

model = LSTMModel(input_size, hidden_size, output_size).to(device)

# BCEWithLogitsLoss combines a sigmoid layer with binary cross entropy in one function, which
# suits this binary classification problem. Adam adjusts the model parameters to minimise it.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
# Training loop with early stopping; the best checkpoint is restored once training ends.
import os

# One checkpoint location shared by the save inside the loop and the reload after it, so the
# weights restored at the end are exactly the ones written during this run. The directory is
# created up front because torch.save does not create missing parent directories.
best_model_path = './lstm_model/best_model.pth'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for epoch in range(epochs):
    # --- Training pass ---
    model.train()
    train_loss = 0

    for data, label in train_loader:
        data = data.float().to(device)
        # Labels arrive as shape (batch,); add a trailing axis to match the (batch, 1) model output.
        label = label.unsqueeze(-1).float().to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Average of the per-batch losses for this epoch.
    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # --- Validation pass (the test loader doubles as the validation set) ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for data, label in test_loader:
            data = data.float().to(device)
            label = label.unsqueeze(-1).float().to(device)
            output = model(data)
            loss = criterion(output, label)
            val_loss += loss.item()

    val_loss /= len(test_loader)
    test_losses.append(val_loss)

    # --- Early stopping: checkpoint on improvement, stop after `patience` epochs without one ---
    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), best_model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f'Early stopping at epoch {epoch + 1}')
            break

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

# Restore the best checkpoint saved above. The file holds only a state dict, so
# weights_only=True is sufficient and avoids unpickling arbitrary objects.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))

Epoch [10/500], Train Loss: 0.6800, Validation Loss: 0.6857
Epoch [20/500], Train Loss: 0.6639, Validation Loss: 0.6810
Epoch [30/500], Train Loss: 0.6493, Validation Loss: 0.6747
Epoch [40/500], Train Loss: 0.6286, Validation Loss: 0.6689
Epoch [50/500], Train Loss: 0.6126, Validation Loss: 0.6635
Epoch [60/500], Train Loss: 0.5971, Validation Loss: 0.6600
Epoch [70/500], Train Loss: 0.5740, Validation Loss: 0.6588
Epoch [80/500], Train Loss: 0.5610, Validation Loss: 0.6572
Epoch [90/500], Train Loss: 0.5476, Validation Loss: 0.6565
Epoch [100/500], Train Loss: 0.5393, Validation Loss: 0.6577
Early stopping at epoch 105


  model.load_state_dict(torch.load('best_model.pth'))


<All keys matched successfully>

In [15]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

In [16]:
mr = project.get_model_registry()

# Tensor shapes recorded in the model schema; -1 is given for the first (batch) dimension.
sequence_shape = [-1, sequence_length, combined_features.shape[1]]
prediction_shape = [-1, output_size]

input_schema = Schema([
    {"name": "sequence_data", "type": "tensor", "shape": sequence_shape},
])
output_schema = Schema([
    {"name": "price_movement_prediction", "type": "tensor", "shape": prediction_shape},
])
model_schema = ModelSchema(input_schema, output_schema)

Connected. Call `.close()` to terminate connection gracefully.


In [17]:
import os
# Local directory that is uploaded to the model registry in the next cell.
model_dir = "model"
os.makedirs(model_dir, exist_ok=True)

# Export the current model weights (state dict only) into that directory.
weights_file = model_dir + "/bitcoin_price_movement_prediction_lstm.pth"
torch.save(model.state_dict(), weights_file)

In [19]:
# Report the losses of the epoch with the lowest validation loss. That epoch's checkpoint is
# the one restored after training and exported, whereas the last entries of the loss lists
# belong to the final epoch that ran, which can be later than the best one when early
# stopping triggers. min() returns the first minimum, matching the strict `<` used when
# checkpointing.
best_epoch = min(range(len(test_losses)), key=test_losses.__getitem__)

lstm_model_bitcoin = mr.python.create_model(
    name="bitcoin_price_movement_prediction_model_lstm",
    metrics={
        "train_loss": train_losses[best_epoch],
        "val_loss": test_losses[best_epoch]
    },
    model_schema=model_schema,
    # Random tensor with the same layout as one input sequence, stored as a nested list.
    input_example={"sequence_data": torch.randn(1, sequence_length, combined_features.shape[1]).tolist()}
)
# Upload the contents of model_dir as this model's artifacts.
lstm_model_bitcoin.save(model_dir)

Uploading: 100.000%|██████████| 23678/23678 elapsed<00:09 remaining<00:001,  3.06it/s]
Uploading: 100.000%|██████████| 27275825/27275825 elapsed<00:58 remaining<00:0006it/s]
Uploading: 100.000%|██████████| 235427/235427 elapsed<00:01 remaining<00:00  3.06it/s]
Uploading: 100.000%|██████████| 22912/22912 elapsed<00:01 remaining<00:0045, 41.47s/it]
Uploading: 100.000%|██████████| 352/352 elapsed<00:01 remaining<00:00
Model export complete: 100%|██████████| 6/6 [01:19<00:00, 13.23s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/556181/models/bitcoin_price_movement_prediction_model_lstm/1





Model(name: 'bitcoin_price_movement_prediction_model_lstm', version: 1)