In [22]:
import numpy as np
import pandas as pd
from pathlib import Path
import talib as ta
from sklearn.model_selection import train_test_split
from pycaret.regression import *
import hvplot.pandas
import matplotlib.pyplot as plt

In [2]:
#read the btc csv
df = pd.read_csv(
    Path("./Resources/BTC_USD.csv"), 
    index_col='Date', 
    infer_datetime_format=True, 
    parse_dates=True
)
df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,50 SMA,200 SMA,10 EMA,20 EMA,RSI,value,value_classification
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-02-01,10288.799805,8812.280273,10237.299805,9170.540039,9959400448,9170.540039,14036.866816,7856.521897,10769.754729,11614.719069,32.50563,30,Fear
2018-02-02,9142.280273,7796.490234,9142.280273,8830.75,12726899712,8830.75,13882.201816,7889.533597,10417.208414,11349.579157,31.21476,15,Extreme Fear
2018-02-03,9430.75,8251.629883,8852.120117,9174.910156,7263790080,9174.910156,13711.562012,7923.813749,10191.336004,11142.467824,34.070623,40,Fear
2018-02-04,9334.870117,8031.220215,9175.700195,8277.009766,7073549824,8277.009766,13487.154199,7953.831648,9843.276688,10869.567056,30.511391,24,Extreme Fear
2018-02-05,8364.839844,6756.680176,8270.540039,6955.27002,9285289984,6955.27002,13243.443584,7974.519998,9318.184566,10496.776862,26.176396,11,Extreme Fear


In [3]:
#add our target, next days close
future_close = 1
df['Future Price'] = df['Close'].shift(future_close)
df.drop(['High', 'Low', 'Open', 'Adj Close', 'value_classification'], axis=1, inplace=True)

In [4]:
#test df
df.tail()

Unnamed: 0_level_0,Close,Volume,50 SMA,200 SMA,10 EMA,20 EMA,RSI,value,Future Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-11-10,17586.771484,83202283721,19584.08373,23547.594272,19118.173558,19554.70252,38.1313,22,15880.780273
2022-11-11,17034.292969,55871616488,19536.498574,23430.474194,18739.286178,19314.663515,35.836612,25,17586.771484
2022-11-12,16799.185547,29717699419,19486.529512,23323.882817,18386.540609,19075.094185,34.874802,21,17034.292969
2022-11-13,16353.365234,27209183682,19434.856582,23209.444038,18016.872359,18815.881904,33.062715,22,16799.185547
2022-11-14,16412.138672,48737193984,19387.057402,23092.635591,17725.102598,18586.953977,33.552888,24,16353.365234


In [5]:
#copy df
future_df = df.copy()
#create data set
X = np.array(future_df[df.columns])
#create target
y = np.array(df['Future Price'])
#split data train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0, shuffle=False)


In [6]:
#create train dataframe
train_df = pd.DataFrame(X_train, columns = df.columns).dropna()
#show train
train_df.head()

Unnamed: 0,Close,Volume,50 SMA,200 SMA,10 EMA,20 EMA,RSI,value,Future Price
1,8830.75,12726900000.0,13882.201816,7889.533597,10417.208414,11349.579157,31.21476,15.0,9170.540039
2,9174.910156,7263790000.0,13711.562012,7923.813749,10191.336004,11142.467824,34.070623,40.0,8830.75
3,8277.009766,7073550000.0,13487.154199,7953.831648,9843.276688,10869.567056,30.511391,24.0,9174.910156
4,6955.27002,9285290000.0,13243.443584,7974.519998,9318.184566,10496.776862,26.176396,11.0,8277.009766
5,7754.0,13999800000.0,13016.2396,7999.951198,9033.787372,10235.560018,32.424583,8.0,6955.27002


In [7]:
#create test dataframe
test_df = pd.DataFrame(X_test, columns = future_df.columns).dropna()
#show data
test_df.head()

Unnamed: 0,Close,Volume,50 SMA,200 SMA,10 EMA,20 EMA,RSI,value,Future Price
0,39105.148438,17467550000.0,40386.915781,49251.008496,39238.437777,39930.829271,45.356934,26.0,39214.21875
1,37709.785156,23450130000.0,40306.432656,49211.589238,38960.500937,39719.30126,40.702034,26.0,39105.148438
2,43193.234375,35690010000.0,40332.065313,49205.413965,39730.088835,40050.152033,58.658017,20.0,37709.785156
3,44354.636719,32479050000.0,40382.732812,49188.220547,40570.915723,40460.102956,61.328984,51.0,43193.234375
4,43924.117188,29183110000.0,40406.498047,49172.356406,41180.588716,40790.009073,59.787004,52.0,44354.636719


In [8]:
#setup the model
regression_setup = setup(data = train_df, target = 'Future Price', session_id = 1, use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,1
1,Target,Future Price
2,Original Data,"(1482, 9)"
3,Missing Values,False
4,Numeric Features,8
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1037, 6)"


In [9]:
#test models by r2, sorted by r2 initially but many tied at .9969, filtered by MAE to pick the best of those
best_model = compare_models(sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,373.7291,451352.8312,660.0113,0.9986,0.0269,0.019,0.775
catboost,CatBoost Regressor,421.8657,498449.1464,697.4736,0.9985,0.0325,0.0238,1.212
rf,Random Forest Regressor,436.2182,640475.3924,788.8882,0.998,0.0309,0.0216,0.802
gbr,Gradient Boosting Regressor,439.6511,563600.8306,740.9148,0.9983,0.0325,0.0238,0.135
lightgbm,Light Gradient Boosting Machine,440.6026,604264.1583,765.5269,0.9981,0.0305,0.0218,0.078
xgboost,Extreme Gradient Boosting,460.202,691705.3781,817.1279,0.9979,0.0325,0.0233,0.473
dt,Decision Tree Regressor,599.1259,1286704.5583,1115.4131,0.9961,0.0395,0.0282,0.009
br,Bayesian Ridge,610.9045,901342.8232,943.0753,0.9972,0.0509,0.0383,0.005
en,Elastic Net,614.9384,900594.5688,942.7317,0.9972,0.0523,0.0391,0.007
lasso,Lasso Regression,615.1765,900574.8125,942.7239,0.9972,0.0524,0.0392,0.007


In [10]:
#create model
model = create_model(best_model)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,359.3372,414440.7807,643.7708,0.9987,0.0251,0.0176
1,464.235,609770.6726,780.8781,0.9983,0.0246,0.0193
2,313.7566,444879.4525,666.9928,0.9985,0.0266,0.0189
3,451.1158,611024.6785,781.6807,0.9981,0.0318,0.0228
4,415.7165,468530.3756,684.4928,0.9988,0.0324,0.0218
5,288.7149,272560.0605,522.0728,0.9989,0.026,0.0169
6,290.6409,212814.0001,461.3177,0.9993,0.0224,0.016
7,293.3794,259348.9594,509.2632,0.9993,0.0231,0.0157
8,375.6449,457217.4505,676.1786,0.9984,0.0248,0.0185
9,484.7496,762941.8818,873.4654,0.9979,0.032,0.0225


In [11]:
evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [12]:
#test the predictions
btc_predictions = predict_model(model, data=test_df)
#print
btc_predictions

Unnamed: 0,Close,Volume,50 SMA,200 SMA,10 EMA,20 EMA,RSI,value,Future Price,Label
0,39105.148438,1.746755e+10,40386.915781,49251.008496,39238.437777,39930.829271,45.356934,26.0,39214.218750,38362.597891
1,37709.785156,2.345013e+10,40306.432656,49211.589238,38960.500937,39719.301260,40.702034,26.0,39105.148438,38315.275664
2,43193.234375,3.569001e+10,40332.065313,49205.413965,39730.088835,40050.152033,58.658017,20.0,37709.785156,38181.906523
3,44354.636719,3.247905e+10,40382.732812,49188.220547,40570.915723,40460.102956,61.328984,51.0,43193.234375,43084.474414
4,43924.117188,2.918311e+10,40406.498047,49172.356406,41180.588716,40790.009073,59.787004,52.0,44354.636719,43497.629375
...,...,...,...,...,...,...,...,...,...,...
257,17586.771484,8.320228e+10,19584.083730,23547.594272,19118.173558,19554.702520,38.131300,22.0,15880.780273,24222.057764
258,17034.292969,5.587162e+10,19536.498574,23430.474194,18739.286178,19314.663515,35.836612,25.0,17586.771484,23636.161992
259,16799.185547,2.971770e+10,19486.529512,23323.882817,18386.540609,19075.094185,34.874802,21.0,17034.292969,23150.862432
260,16353.365234,2.720918e+10,19434.856582,23209.444038,18016.872359,18815.881904,33.062715,22.0,16799.185547,22732.017783


In [13]:
#create a new column shownig the price difference
btc_predictions['Difference'] = btc_predictions['Future Price']-btc_predictions['Label']
btc_predictions.tail()

Unnamed: 0,Close,Volume,50 SMA,200 SMA,10 EMA,20 EMA,RSI,value,Future Price,Label,Difference
257,17586.771484,83202280000.0,19584.08373,23547.594272,19118.173558,19554.70252,38.1313,22.0,15880.780273,24222.057764,-8341.27749
258,17034.292969,55871620000.0,19536.498574,23430.474194,18739.286178,19314.663515,35.836612,25.0,17586.771484,23636.161992,-6049.390508
259,16799.185547,29717700000.0,19486.529512,23323.882817,18386.540609,19075.094185,34.874802,21.0,17034.292969,23150.862432,-6116.569463
260,16353.365234,27209180000.0,19434.856582,23209.444038,18016.872359,18815.881904,33.062715,22.0,16799.185547,22732.017783,-5932.832236
261,16412.138672,48737190000.0,19387.057402,23092.635591,17725.102598,18586.953977,33.552888,24.0,16353.365234,22424.740918,-6071.375684


In [14]:
#change the column name label to Prediction
btc_predictions.rename(columns = {"Label": "Prediction"}, inplace = True)

In [15]:
#plot the price difference
btc_predictions['Difference'].hvplot(
    xlabel = 'Days',
    ylabel = 'Dollars',
    title = 'Price Difference - Real vs Prediction')

In [16]:
#get a weird error when trying to plot, need to remove value
btc_predictions.drop(['value'], axis=1, inplace=True)

In [17]:
#plot future price, prediction and difference together
btc_predictions.hvplot(
    y = ['Future Price', 'Prediction', 'Difference'],
    x = 'index',
    xlabel = 'Days',
    ylabel = 'Dollars',
    title = 'Dollar Values - Real vs Prediction')