## BTC/USD Forecasting with LSTMs

##### Short dataset description
The dataset was sourced from https://www.cryptodatadownload.com/data/bitfinex/. It contains hourly historical data for the BTC/USD pair, retrieved from Bitfinex (a centralized crypto exchange). The columns of the data are:

1) **open** price of the hourly candle
2) **close** price of the hourly candle
3) **low** price (min) of the hourly candle
4) **high** price (max) of the hourly candle
5) **volume** volume during the candle denominated in the respective currency (**btc/usd**)
6) **symbol** --> btc/usd
7) **unix timestamp**
8) **date**

In [24]:
import os
import pandas as pd
import numpy as np
import math
import datetime as dt
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM

from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import RobustScaler

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder
from stockstats import wrap
from stockstats import StockDataFrame
import pickle
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer

#### Read the data

In [5]:
# Read the pickled candle DataFrame from the working directory.
# NOTE(review): unpickling executes arbitrary code -- only load files from a trusted source.
with open('data_trading.pkl', 'rb') as candles_file:
    df = pickle.load(candles_file)

# Parse the 'date' column into datetimes, then order the rows chronologically.
df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

In [6]:
# Preview the first rows of the date-sorted candle data.
df.head()

Unnamed: 0,date,open,high,low,close,Volume USD,Volume BTC,volume usd,volume btc,volume,...,ker,mfi,ndi,pdi,pgo,psl,rsi,rsv,stochrsi,vwma
44624,2018-05-15 06:00:00,8723.8,8793.0,8714.9,8739.0,8988053.53,1026.35,8988053.53,1026.35,1026.35,...,0.0,0.5,-0.0,0.0,0.0,0.0,100.0,30.857875,0.0,8748.966667
44623,2018-05-15 07:00:00,8739.0,8754.8,8719.3,8743.0,2288904.12,261.97,2288904.12,261.97,261.97,...,1.0,0.5,-0.0,0.0,0.03618,8.333333,100.0,35.979513,0.0,8746.946799
44622,2018-05-15 08:00:00,8743.0,8743.1,8653.2,8723.7,8891773.14,1023.31,8891773.14,1023.31,1023.31,...,0.656652,0.5,34.751866,0.0,-0.16836,8.333333,16.139044,50.429185,0.0,8729.115632
44621,2018-05-15 09:00:00,8723.7,8737.8,8701.2,8708.1,2054868.28,235.55,2054868.28,235.55,235.55,...,0.794344,0.5,28.786566,0.0,-0.34642,8.333333,9.329028,39.270386,0.0,8727.875023
44620,2018-05-15 10:00:00,8708.1,8855.7,8695.8,8784.4,17309722.58,1969.08,17309722.58,1969.08,1969.08,...,0.394097,0.5,15.925134,32.943126,0.525752,16.666667,71.863748,64.790123,68.968843,8750.005541


In [7]:
# Read the pickled tweet-sentiment DataFrame from the working directory.
# NOTE(review): unpickling executes arbitrary code -- only load files from a trusted source.
with open('sentiment.pkl', 'rb') as sentiment_file:
    df_tweets = pickle.load(sentiment_file)

# Parse the 'date' column into datetimes so it can be grouped by hour later.
df_tweets = df_tweets.assign(date=pd.to_datetime(df_tweets['date']))

In [8]:
# Encode the textual sentiment labels as signed integers.
# Labels that are not keys of the mapping become NaN in 'sentiment_numeric'.
sentiment_mapping = dict(positive=1, neutral=0, negative=-1)
df_tweets['sentiment_numeric'] = df_tweets['sentiment_label'].map(sentiment_mapping)

In [9]:
def count_sentiments(arr):
    """Summarise a group of numeric sentiment codes.

    Parameters
    ----------
    arr : pandas.Series
        Sentiment codes, where 1 = positive, 0 = neutral and -1 = negative.

    Returns
    -------
    pandas.Series
        Indexed by ``mean_sentiment``, ``count_pos``, ``count_neu`` and
        ``count_neg`` (in that order): the mean of ``arr`` followed by the
        number of entries equal to 1, 0 and -1 respectively.
    """
    label_counts = {
        f'count_{tag}': (arr == code).sum()
        for tag, code in (('pos', 1), ('neu', 0), ('neg', -1))
    }
    return pd.Series({'mean_sentiment': arr.mean(), **label_counts})

In [10]:
# Aggregate tweet-level sentiment into one row per calendar hour.
# Split each timestamp into a calendar date and an hour-of-day to use as group keys.
df_tweets['date2'] = df_tweets['date'].dt.date
df_tweets['hour'] = df_tweets['date'].dt.hour

# count_sentiments returns a Series per (date2, hour) group; after reset_index the
# statistic names sit in a column called 'level_2', and the pivot spreads them back
# into one column per statistic.
hourly_mean_sentiment = df_tweets.groupby(['date2', 'hour'])['sentiment_numeric'].apply(count_sentiments).reset_index()
hourly_mean_sentiment = hourly_mean_sentiment.pivot(index=['date2', 'hour'], columns='level_2', values='sentiment_numeric').reset_index()

# Rebuild an hourly 'date' timestamp from the two group keys:
# zero-pad the hour so the combined string matches the '%Y-%m-%d %H' format.
hourly_mean_sentiment['hour'] = hourly_mean_sentiment['hour'].apply(lambda x: f'{x:02d}')
hourly_mean_sentiment['date'] = pd.to_datetime(hourly_mean_sentiment['date2'].astype(str) + ' ' + hourly_mean_sentiment['hour'].astype(str), format='%Y-%m-%d %H')

# 'sent' is the sign of the hourly mean sentiment: 1 if positive, -1 if negative, 0 otherwise.
hourly_mean_sentiment['sent'] = hourly_mean_sentiment['mean_sentiment'].apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
# Drop the helper 'date2' and 'hour' columns now that 'date' has been rebuilt.
hourly_mean_sentiment = hourly_mean_sentiment.drop(columns=['date2', 'hour'], axis = 1)
# NOTE(review): this rebinds df_tweets from the tweet-level frame to the hourly
# aggregate, so re-running this cell without reloading the pickle will fail
# (the 'sentiment_numeric' column no longer exists).
df_tweets = hourly_mean_sentiment
df_tweets = df_tweets.sort_values(by='date')

In [11]:
# Preview the hourly sentiment aggregates.
df_tweets.head()

level_2,count_neg,count_neu,count_pos,mean_sentiment,date,sent
0,0.0,1.0,0.0,0.0,2021-02-05 10:00:00,0
1,0.0,14.0,1.0,0.066667,2021-02-05 11:00:00,1
2,1.0,12.0,2.0,0.066667,2021-02-05 12:00:00,1
3,0.0,21.0,1.0,0.045455,2021-02-05 13:00:00,1
4,0.0,22.0,4.0,0.153846,2021-02-05 14:00:00,1


### Merge dataframes

In [12]:
# Inner-join candles and hourly sentiment on the shared 'date' timestamp;
# hours that are missing from either frame are dropped.
merged_df = df.merge(df_tweets, how='inner', on='date')

In [13]:
# Preview the merged price + sentiment frame.
merged_df.head()

Unnamed: 0,date,open,high,low,close,Volume USD,Volume BTC,volume usd,volume btc,volume,...,psl,rsi,rsv,stochrsi,vwma,count_neg,count_neu,count_pos,mean_sentiment,sent
0,2021-02-05 10:00:00,37280.0,37666.0,37219.21804,37435.0,3309602.0,88.409306,3309602.0,88.409306,88.409306,...,58.333333,52.59099,70.287704,68.676027,37320.809898,0.0,1.0,0.0,0.0,0
1,2021-02-05 11:00:00,37437.0,37741.0,37437.0,37729.0,3298023.0,87.413473,3298023.0,87.413473,87.413473,...,66.666667,56.575264,91.835182,93.861798,37314.125092,0.0,14.0,1.0,0.066667,1
2,2021-02-05 12:00:00,37723.0,38195.0,37576.0,37957.0,21928550.0,577.720849,21928550.0,577.720849,577.720849,...,66.666667,59.42325,79.949452,100.0,37477.120659,1.0,12.0,2.0,0.066667,1
3,2021-02-05 13:00:00,37939.0,38366.0,37870.0,38355.0,12360130.0,322.256108,12360130.0,322.256108,322.256108,...,75.0,63.876923,99.189985,100.0,37601.506846,0.0,21.0,1.0,0.045455,1
4,2021-02-05 14:00:00,38354.0,38366.0,38022.0,38128.0,6023729.0,157.987029,6023729.0,157.987029,157.987029,...,66.666667,59.842528,82.278481,81.786165,37659.328929,0.0,22.0,4.0,0.153846,1


In [14]:
# From here on `df` refers to the merged price + sentiment frame.
# NOTE(review): this rebinds `df`, so the price-only frame loaded earlier is no
# longer reachable under that name.
df = merged_df

## Task 1 -- Trend Prediction
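For this task we consider the trend to be an uptrend (y = 1) if close[t] < open[t+1] and a downtrend (y = 0) if close[t] >= open[t+1]. Note that the code below does not build this binary label: `generate_trend_prediction_set` uses the next candle's close price, close[t+1], as a regression target.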

In [17]:
def generate_trend_prediction_set(dataset):
    """Build the supervised (features, target) arrays for next-candle prediction.

    The target for row ``t`` is the ``close`` value of row ``t + 1``, with rows
    taken in their existing order. The final row has no successor and is
    dropped from both arrays.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``'close'`` and ``'date'``. The frame is not
        modified.

    Returns
    -------
    x : numpy.ndarray
        Every column except ``'date'`` (and ``'price'``, if present), for all
        rows but the last.
    y : numpy.ndarray
        Float array holding, for each row of ``x``, the next row's ``close``.

    Raises
    ------
    KeyError
        If ``dataset`` has no ``'close'`` or no ``'date'`` column.
    """
    # shift(-1) pairs each row with its successor by position, so the result
    # does not depend on the frame carrying a 0..n-1 integer index.
    next_close = dataset['close'].shift(-1)

    # 'price' was the name of the temporary target column; never let a column
    # of that name leak into the features.
    features = dataset.drop(columns=['date']).drop(columns=['price'], errors='ignore')

    x = features.values[:-1]
    y = next_close.to_numpy(dtype=float)[:-1]

    # pd.isna also copes with object-dtype arrays, where np.isnan would raise.
    print("X has nones:", np.any(pd.isna(x)))
    print("y has nones:", np.any(pd.isna(y)))

    return x, y

In [67]:
# Build the feature matrix and the next-close target from the merged frame.
x, y = generate_trend_prediction_set(df)

X has nones: False
y has nones: False


### Normalization

In [68]:
# Scale every feature column with RobustScaler.
# NOTE(review): the scaler is fitted on all rows before any train/test split, so
# the held-out folds used later influence the scaling statistics. Consider
# fitting it inside each fold instead.
scaler = RobustScaler()
x = scaler.fit(x).transform(x)

In [69]:
# The merge appended five sentiment columns (count_neg, count_neu, count_pos,
# mean_sentiment, sent) at the end of the frame, so dropping the last five
# columns yields a price/indicator-only feature set.
x2 = x[:, :-5] # remove sentiment

#### Hist Gradient Boosting Regressor

In [28]:
def mse_scorer(estimator, X, y):
    """Score a fitted estimator by its root-mean-squared error on ``(X, y)``.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)``.
    X : array-like
        Feature matrix to predict on.
    y : array-like
        True target values for ``X``.

    Returns
    -------
    float
        Square root of the mean squared error between ``y`` and the
        predictions; lower is better.

    Notes
    -----
    NOTE(review): scikit-learn's model-selection tools treat scorer values as
    "higher is better"; negate the result before using it for model selection.
    """
    y_pred = estimator.predict(X)
    mse = mean_squared_error(y, y_pred)
    # This previously returned ``mse ** 2`` (the MSE squared), which is not a
    # meaningful error measure. The rest of the notebook reports the square
    # root of the MSE, so return that.
    return mse ** 0.5

In [32]:
# Candidate feature matrices: x (all columns) and x2 (sentiment columns removed).
feature_sets = [x, x2]

# Contiguous, unshuffled folds.
# NOTE(review): the rows appear to be time-ordered, so most folds train on data
# that comes after the test fold; consider TimeSeriesSplit for a forecasting setup.
kf = KFold(n_splits=5, shuffle=False)
for i, X_feature_set in enumerate(feature_sets):
    # Root-mean-squared error of each fold for this feature set.
    fold_rmse_scores = []
    for train_index, test_index in kf.split(X_feature_set):
        X_train, X_test = X_feature_set[train_index], X_feature_set[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit a fresh HistGradientBoostingRegressor on the training fold.
        model = HistGradientBoostingRegressor()
        model.fit(X_train, y_train)

        # Predict the held-out fold and record its RMSE.
        y_pred = model.predict(X_test)
        fold_rmse_scores.append(math.sqrt(mean_squared_error(y_test, y_pred)))

    # Average over the folds actually run, rather than a hard-coded 5.
    average_mse_score = sum(fold_rmse_scores) / len(fold_rmse_scores)
    print(f"Feature Set {i+1} - Average Square root mse Score:", average_mse_score)

Feature Set 1 - Average Square root mse Score: 1365.7183334515462
Feature Set 2 - Average Square root mse Score: 1365.0203458396577


#### DNN

In [53]:
# Inspect the scaled feature matrix (NumPy abbreviates the repr of large arrays).
x

array([[-0.07150282, -0.06587251, -0.06552257, ..., -0.75      ,
        -0.46740759, -1.        ],
       [-0.06478566, -0.06268807, -0.05611473, ..., -0.70833333,
         0.47068765,  0.        ],
       [-0.05254932, -0.04341157, -0.05011016, ..., -0.66666667,
         0.47068765,  0.        ],
       ...,
       [-0.70813483, -0.71337591, -0.70577563, ..., -0.08333333,
        -0.25500867,  0.        ],
       [-0.70770698, -0.70781375, -0.70854033, ...,  0.04166667,
        -0.081889  ,  0.        ],
       [-0.70428423, -0.71019147, -0.70594842, ..., -0.29166667,
        -0.53409683, -2.        ]])

In [54]:
def create_mlp(shape):
    """Build an uncompiled fully connected network with a single linear output.

    Architecture: three Dense(128, relu) hidden layers, with Dropout(0.3)
    after the first and second, followed by a Dense(1) output layer.

    Parameters
    ----------
    shape : int
        Number of input features; the first layer expects inputs of shape
        ``(shape,)``.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The assembled model. The caller is responsible for compiling it.
    """
    return Sequential([
        layers.Dense(128, activation='relu', input_shape=(shape,)),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dense(1),
    ])

In [74]:
# Candidate feature matrices: x (all columns) and x2 (sentiment columns removed).
feature_sets = [x, x2]

# Contiguous, unshuffled folds.
# NOTE(review): the rows appear to be time-ordered, so most folds train on data
# that comes after the test fold; consider TimeSeriesSplit for a forecasting setup.
kf = KFold(n_splits=5, shuffle=False)

for i, X_feature_set in enumerate(feature_sets):
    # Root-mean-squared error of each fold for this feature set.
    fold_rmse_scores = []

    for train_index, test_index in kf.split(X_feature_set):
        X_train, X_test = X_feature_set[train_index], X_feature_set[test_index]
        y_train_tt, y_test_tt = y[train_index], y[test_index]

        # Keras is fed float64 tensors for the targets.
        y_train_t = tf.convert_to_tensor(y_train_tt, dtype=tf.float64)
        y_test_t = tf.convert_to_tensor(y_test_tt, dtype=tf.float64)

        # Build and compile a fresh MLP for this fold.
        model = create_mlp(X_train.shape[1])
        early_stopping = EarlyStopping(monitor='val_loss', patience=100)
        checkpoint = ModelCheckpoint('best_weights2.h5', save_best_only=True)
        model.compile(optimizer='Adam', loss='mean_squared_error')

        # NOTE(review): the test fold doubles as the validation set, so early
        # stopping and checkpoint selection both see the data the score is
        # computed on. The reported RMSE is therefore optimistic; consider
        # carving a validation split out of the training fold instead.
        model.fit(X_train, y_train_t, epochs=250, batch_size=256, validation_data=(X_test,y_test_t), callbacks=[early_stopping, checkpoint], verbose=0)

        # Restore the checkpointed weights before scoring.
        model.load_weights('best_weights2.h5')

        y_pred = model.predict(X_test)

        # predict() returns one column per output unit; take the single column as a 1-D array.
        y_pred_vals = y_pred[:, 0]

        fold_rmse_scores.append(math.sqrt(mean_squared_error(y_test_tt, y_pred_vals)))

    # Average over the folds actually run, rather than a hard-coded 5.
    average_mse_score = sum(fold_rmse_scores) / len(fold_rmse_scores)
    print(f"Feature Set {i+1} - Average Square root mse Score:", average_mse_score)

Feature Set 1 - Average Square root mse Score: 2078.7085820965667
Feature Set 2 - Average Square root mse Score: 2000.4500314979025
