# Data Ingestion

In [1]:
# necessary dependencies
from pyforest import *
import datetime, pickle, copy, warnings
import cryptocompare
import requests
import plotly.express as px
import plotly.graph_objects as go
from time import time
from pandas import DataFrame, concat
from math import sqrt

# Schikit-learn
from sklearn import metrics
from sklearn.linear_model import ElasticNet
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import r2_score
import sklearn.externals
import joblib

# ONNX run-tine
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as rt


apiKey = "43b01c420b66888ce4c91b364647600814578c186e8604322152f44c641ebbc1"
url = "https://min-api.cryptocompare.com/data/histohour"

# BTC 1st 2000 datapoints
payload = {
    "api_key": apiKey, 
    "fsym": "BTC", 
    "tsym": "USD", 
    "limit": 2000
}

result = requests.get(url, params=payload).json()

BitCoin1 = DataFrame(result["Data"])

print('Raw data view (Unix timestamp):')
print(BitCoin1.head(2))

BitCoin1["time"] = pd.to_datetime(BitCoin1["time"], unit="s")

BitCoin1.set_index("time", inplace=True)

# 2nd 2000 datapoints
payload = {
    "api_key": apiKey,
    "fsym": "BTC",
    "tsym": "USD",
    "limit": 2000,
    "toTs": (1601632800),
}

result = requests.get(url, params=payload).json()

BitCoin2 = DataFrame(result["Data"])

BitCoin2["time"] = pd.to_datetime(BitCoin2["time"], unit="s")

BitCoin2.set_index("time", inplace=True)

# 3rd 2000 datapoints
payload = {
    "api_key": apiKey,
    "fsym": "BTC",
    "tsym": "USD",
    "limit": 2000,
    "toTs": (1593572400), # original time in unix format
}

result = requests.get(url, params=payload).json()

BitCoin3 = DataFrame(result["Data"])

BitCoin3["time"] = pd.to_datetime(BitCoin3["time"], unit="s")

BitCoin3.set_index("time", inplace=True)

# 4th 2000 datapoints
payload = {
    "api_key": apiKey,
    "fsym": "BTC",
    "tsym": "USD",
    "limit": 2000,
    "toTs": (1596571200),
}

result = requests.get(url, params=payload).json()

BitCoin4 = DataFrame(result["Data"])

BitCoin4["time"] = pd.to_datetime(BitCoin4["time"], unit="s")

BitCoin4.set_index("time", inplace=True)

# combining all bitcoin data (8000 data points)
combineData1 = BitCoin2.append(BitCoin1)

combineData2 = BitCoin3.append(combineData1)

BitCoin = BitCoin4.append(combineData2)  # final BitCoin dataset

print('Changed timestamp (readable view)')
print(BitCoin.tail(2))
# saving btc data set
#BitCoin.to_csv("BitCoinRaw.csv")

Raw data view (Unix timestamp):
         time      high       low      open  volumefrom     volumeto  \
0  1607119200  18877.60  18603.79  18821.67     3163.72  59255159.46   
1  1607122800  18847.04  18630.30  18678.83     2667.11  49984348.67   

      close conversionType conversionSymbol  
0  18678.83         direct                   
1  18665.00         direct                   


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Changed timestamp (readable view)
                         high       low      open  volumefrom      volumeto  \
time                                                                          
2021-02-26 05:00:00  47166.75  45504.45  46995.03     4894.15  2.259130e+08   
2021-02-26 06:00:00  46463.29  45509.65  45534.74     2271.51  1.044999e+08   

                        close conversionType conversionSymbol  
time                                                           
2021-02-26 05:00:00  45534.74         direct                   
2021-02-26 06:00:00  46049.94         direct                   


# Feature engineering

In [2]:
df = pd.read_csv("BitCoinRaw.csv")
df.set_index("time", inplace=True)

df.drop(
    columns=[
    "conversionType", 
    "conversionSymbol"
    ], 
        axis=1, inplace=True
)

values = DataFrame(df.close.values)
lags = 8
columns = [values]
for i in range(1, (lags + 1)):
    columns.append(values.shift(i))

dt = concat(columns, axis=1)

columns = ["Lag"]
for i in range(1, (lags + 1)):
    columns.append("Lag" + str(i))
dt.columns = columns
dt.index = df.index

finalDataSet = concat([df, dt], axis=1)

finalDataSet.dropna(inplace=True)

finalDataSet["S_10"] = finalDataSet["close"].rolling(window=10).mean()

finalDataSet["Corr"] = (
    finalDataSet["close"].rolling(window=10).corr(finalDataSet["S_10"])
)

finalDataSet["d_20"] = finalDataSet["close"].shift(480)

finalDataSet["5EMA"] = (
    finalDataSet["close"].ewm(span=5, adjust=True, ignore_na=True).mean()
)

finalDataSet["10EMA"] = (
    finalDataSet["close"].ewm(span=10, adjust=True, ignore_na=True).mean()
)

finalDataSet["20EMA"] = (
    finalDataSet["close"].ewm(span=20, adjust=True, ignore_na=True).mean()
)

finalDataSet["mean"] = (finalDataSet["low"] + finalDataSet["high"]) / 2

finalDataSet["returns"] = (
    (finalDataSet["close"] - finalDataSet["open"]) / finalDataSet["open"] * 100.0
)

finalDataSet["volume"] = finalDataSet["volumeto"] - finalDataSet["volumefrom"]

finalDataSet.drop(["volumefrom", "volumeto"], 1, inplace=True)

finalDataSet.dropna(inplace=True)

finalDataSet = finalDataSet.drop(["Lag"], axis=1)

finalDataSet = finalDataSet.astype(float)

finalDataSet = finalDataSet.sort_index(ascending=True)
# dataframe.head(2)

# save data
finalDataSet.to_csv("finalDataSet.csv", header=True)

print(finalDataSet.tail())

<IPython.core.display.Javascript object>

                         high       low      open     close      Lag1  \
time                                                                    
2021-02-26 02:00:00  47767.06  46166.00  46470.54  47362.70  46470.54   
2021-02-26 03:00:00  47593.68  46754.40  47362.70  47339.76  47362.70   
2021-02-26 04:00:00  47750.84  46823.35  47339.76  46995.03  47339.76   
2021-02-26 05:00:00  47166.75  45504.45  46995.03  45534.74  46995.03   
2021-02-26 06:00:00  46463.29  45509.65  45534.74  46238.90  45534.74   

                         Lag2      Lag3      Lag4      Lag5      Lag6  ...  \
time                                                                   ...   
2021-02-26 02:00:00  46818.45  47082.87  48197.61  48077.07  49174.36  ...   
2021-02-26 03:00:00  46470.54  46818.45  47082.87  48197.61  48077.07  ...   
2021-02-26 04:00:00  47362.70  46470.54  46818.45  47082.87  48197.61  ...   
2021-02-26 05:00:00  47339.76  47362.70  46470.54  46818.45  47082.87  ...   
2021-02-26 06:00:00 

## Train-Test data set

In [3]:
finalDataSet = pd.read_csv("finalDataSet.csv")
finalDataSet.set_index("time", inplace=True)
# print(df.tail())


foreCastColumn = "close"  # creating label

foreCastOut = int(12)  # prediction for next 12 hrs

finalDataSet["label"] = finalDataSet[foreCastColumn].shift(-foreCastOut)

X = np.array(finalDataSet.drop(["label"], axis=1))

# normalize data
X = preprocessing.scale(X)

X_foreCastOut = X[-foreCastOut:]

X = X[:-foreCastOut]

finalDataSet.dropna(inplace=True)

y = np.array(finalDataSet["label"])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Time-series cross validation

In [4]:
# Split the data into train and test data set
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

## Model development

In [5]:
# regression model
Model = ElasticNet(alpha=0.0001, l1_ratio=0.5, random_state=0).fit(X_train, y_train)

# cross validated accucary on train set
scores = cross_val_score(Model, X_train, y_train, cv=tscv)

print("Training Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Intercept:", Model.intercept_)
print("Slope:", Model.coef_[0])

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Training Accuracy: 0.85 (+/- 0.28)
Intercept: 16360.019185850626
Slope: 8531.185865653344


  model = cd_fast.enet_coordinate_descent(


## Accuracy metrics

In [6]:
# prediction on training
trainPredict = Model.predict(X_train)
r_squared = r2_score(y_train, trainPredict)
mae = np.mean(abs(trainPredict - y_train))
rmse = np.sqrt(np.mean((trainPredict - y_train) ** 2))
rae = np.mean(abs(trainPredict - y_train)) / np.mean(abs(y_train - np.mean(y_train)))
rse = np.mean((trainPredict - y_train) ** 2) / np.mean(
    (y_train - np.mean(y_train)) ** 2
)
sumOfDf = DataFrame(
    index=[
        "R-squared",
        "Mean Absolute Error",
        "Root Mean Squared Error",
        "Relative Absolute Error",
        "Relative Squared Error",
    ]
)
sumOfDf["Training metrics"] = [r_squared, mae, rmse, rae, rse]

# prediction of test
testPredict = Model.predict(X_test)
r_squared = r2_score(y_test, testPredict)
mae = np.mean(abs(testPredict - y_test))
rmse = np.sqrt(np.mean((testPredict - y_test) ** 2))
rae = np.mean(abs(testPredict - y_test)) / np.mean(abs(y_test - np.mean(y_test)))
rse = np.mean((testPredict - y_test) ** 2) / np.mean((y_test - np.mean(y_test)) ** 2)

sumOfDf["Validation metrics"] = [r_squared, mae, rmse, rae, rse]
sumOfDf = sumOfDf.round(decimals=3)

print(sumOfDf)  # accuracy check

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

                         Training metrics  Validation metrics
R-squared                           0.991               0.945
Mean Absolute Error               164.399            1315.308
Root Mean Squared Error           461.798            1752.268
Relative Absolute Error             0.055               0.203
Relative Squared Error              0.009               0.055


## Saving model

In [7]:
# Save model to file in the current working directory
fileName = "ElasticModel.pkl"
joblib.dump(Model, fileName)

# Load from file
ElasticModel = joblib.load(fileName)
# ElasticModel.predict(X_test)
# print(r2_score(y_test, ElasticModel.predict(X_test)))

# Train, convert and predict with ONNX Runtime
## Backend
sklearn-onnx converts models in ONNX format which can be then used to compute predictions with the backend of our choice. However, there exists a way to automatically check every converter with onnxruntime, onnxruntime-gpu. Every converter is tested with this backend.

In [8]:
# Convert into ONNX format
# input tensors of model: list of ('<wanted name of tensor>', DataType('<shape>'))
initialType = [('float_input', FloatTensorType([None, 21]))]

onx = convert_sklearn(Model, initial_types=initialType)

with open("ElasticModel.onnx", "wb") as f:
    f.write(onx.SerializeToString())
    

# Compute the prediction with ONNX Runtime
session = rt.InferenceSession("ElasticModel.onnx")

inputName = session.get_inputs()[0].name

labelName = session.get_outputs()[0].name

predictionOnx = session.run([labelName], {inputName: X_test.astype(np.float32)})[0]

r2_score(y_test, predictionOnx)

<IPython.core.display.Javascript object>

0.9449481324156145

## Future forecast

In [9]:
# forecast future 12 hrs values
foreCastFutureValues = DataFrame(ElasticModel.predict(X_foreCastOut))
# print(foreCastFutureValues)

# assigning names to columns
foreCastFutureValues.rename(columns={0: "Forecast"}, inplace=True)

newDataframe = finalDataSet.tail(foreCastOut)

newDataframe.reset_index(inplace=True)

# Fixing future datetime
newDataframe = newDataframe.append(
    DataFrame(
        {
            "time": pd.date_range(
                start=newDataframe.time.iloc[-1],
                periods=(len(newDataframe) + 1),
                freq="H",
                closed="right",
            )
        }
    )
)

newDataframe.set_index("time", inplace=True)

newDataframe = newDataframe.tail(foreCastOut)

foreCastFutureValues.index = newDataframe.index

print("12 hours forecast (hourly):")
foreCastFutureValues.reset_index(inplace=True)

print(foreCastFutureValues)

<IPython.core.display.Javascript object>

12 hours forecast (hourly):
                  time      Forecast
0  2021-02-25 19:00:00  50456.296213
1  2021-02-25 20:00:00  49796.020158
2  2021-02-25 21:00:00  49394.342405
3  2021-02-25 22:00:00  49018.624804
4  2021-02-25 23:00:00  48377.972112
5  2021-02-26 00:00:00  47671.983455
6  2021-02-26 01:00:00  47673.263692
7  2021-02-26 02:00:00  48002.043169
8  2021-02-26 03:00:00  47946.976100
9  2021-02-26 04:00:00  48056.064736
10 2021-02-26 05:00:00  47100.796569
11 2021-02-26 06:00:00  46971.529322
