- Configuration

In [None]:
!pip install darts

In [None]:
import warnings
# Blanket filter: ignore warnings of every category from here on.
warnings.filterwarnings('ignore')
# FutureWarning is already matched by the blanket filter above; this line only
# makes that intent explicit.
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from datetime import datetime
from functools import reduce

from darts import TimeSeries
from darts.models import (
    NaiveSeasonal,
    NaiveDrift,
    Prophet,
    ExponentialSmoothing,
    ARIMA,
    AutoARIMA,
    RegressionEnsembleModel,
    RegressionModel,
    FFT
)

from darts.metrics import mape, mase
from darts.utils.statistics import check_seasonality, plot_acf, plot_residuals_analysis, plot_hist

# NOTE(review): `warnings` is already imported and the same blanket filter is
# already installed at the top of this cell, so the next two lines are redundant.
import warnings
warnings.filterwarnings("ignore")
import logging
# Suppress all log records of severity CRITICAL and below, for every logger.
logging.disable(logging.CRITICAL)

- Utils

In [None]:
def group_trading_timeseries(df, time_step_size):
    """Resample trading data into coarser OHLCV bars.

    Adapted from
    https://www.kaggle.com/alexisalvarez/eda-resampling-the-power-of-technical-analysis

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by datetime that contains the columns ``Count``,
        ``Open``, ``High``, ``Low``, ``Close``, ``Volume``, ``VWAP`` and
        ``Target``. The index does not have to be sorted.
    time_step_size : str
        Resampling rule passed to ``DataFrame.resample`` (e.g. ``"1D"``).

    Returns
    -------
    pandas.DataFrame
        One row per time bin, with the columns listed above aggregated as:
        sum (``Count``, ``Volume``), first (``Open``), max (``High``),
        min (``Low``), last (``Close``) and mean (``VWAP``, ``Target``).

    Notes
    -----
    Built-in aggregation names are used instead of Python lambdas. They are
    much faster on large frames, and they tolerate empty bins (gaps in the
    data): such bins yield NaN for first/last/max/min/mean and 0 for sum,
    whereas ``iloc[0]`` / ``iloc[-1]`` fail on an empty group.
    ``first`` / ``last`` return the first / last non-null value of the bin.
    """
    aggregations = {
        'Count': 'sum',
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum',
        'VWAP': 'mean',
        'Target': 'mean',
    }
    # Sort first so that "first" and "last" follow chronological order.
    ordered = df.sort_index()
    return ordered.resample('{}'.format(time_step_size)).agg(aggregations)

# Preprocessing of the data

In [None]:
# Load the market data and the per-asset metadata.
df = pd.read_csv("../input/g-research-crypto-forecasting/train.csv")
asset = pd.read_csv("../input/g-research-crypto-forecasting/asset_details.csv")
# `timestamp` is a Unix epoch in seconds. Convert the whole column in one
# vectorised call: the result is timezone-naive UTC, so it does not depend on
# the machine's local time zone (as a per-row `datetime.fromtimestamp` does)
# and it avoids a slow Python-level loop over every row.
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
df["date"] = df["timestamp"].astype('datetime64[s]')
df.head()

In [None]:
# Attach the asset metadata, index the rows by their datetime and keep only
# the columns used in the rest of the notebook.
kept_columns = [
    "Asset_Name", "Weight", "Count",
    "Open", "High", "Low",
    "Close", "Volume", "VWAP",
    "Target",
]
df = (
    pd.merge(df, asset, on="Asset_ID")
    .set_index("date")
    .loc[:, kept_columns]
)
df.head()

In [None]:
# Percentage of missing values per column.
df.isna().mean().mul(100)

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

## First trials with Bitcoin

In [None]:
# Restrict the data to Bitcoin and aggregate it into daily bars.
# "1D" is the canonical pandas alias for one calendar day; the lowercase "d"
# spelling is deprecated in recent pandas releases.
df_btc = df.loc[df["Asset_Name"] == "Bitcoin"]
df_btc = group_trading_timeseries(df_btc, time_step_size="1D")
df_btc.head()

In [None]:
# Daily candlestick chart with a vertical marker and a label on 2020-03-11.
candles = go.Candlestick(
    x=df_btc.index,
    open=df_btc['Open'],
    high=df_btc['High'],
    low=df_btc['Low'],
    close=df_btc['Close'],
)
covid_marker = dict(
    x0='2020-03-11', x1='2020-03-11', y0=0, y1=1,
    xref='x', yref='paper', line_width=2,
)
covid_label = dict(
    x='2020-03-11', y=0.05, xref='x', yref='paper',
    showarrow=False, xanchor='left', text='Covid Sanitary Crisis Begin',
)

fig = go.Figure(data=[candles])
fig.update_layout(
    xaxis_rangeslider_visible=False,
    title='Bitcoin from Janv. 2018 to Sep. 2021',
    yaxis_title='USD',
    shapes=[covid_marker],
    annotations=[covid_label],
)
fig.show()

In [None]:
# Print basic descriptive statistics for every column of the daily frame.
summary_stats = (
    ("Mean: ", "mean"),
    ("Median: ", "median"),
    ("Standard Deviation: ", "std"),
    ("Variance: ", "var"),
)
for column in df_btc.columns:
    # ANSI escape codes render the column name in bold.
    print(f'\033[1m{column}\033[0m')
    for caption, stat_name in summary_stats:
        print(caption, round(getattr(df_btc[column], stat_name)(), 3))
    print("-------------------")

In [None]:
# One histogram with a KDE overlay per column.
for column in df_btc.columns:
    sns.displot(data=df_btc, x=column, kde=True)
    plt.title(f"Distribution of the variable: {column}")
    plt.show()

## Modelling the opening price

In [None]:
# Keep only the opening price, as a single-column DataFrame.
df_btc_open = df_btc.loc[:, ["Open"]]

In [None]:
# Build the darts TimeSeries directly from `df_btc` rather than from
# `df_btc_open` itself: the original cell overwrote its own input with a
# different type, so running it a second time failed.
df_btc_open = TimeSeries.from_dataframe(df_btc[["Open"]])

In [None]:
# Quick visual check of the opening-price series; the trailing semicolon
# hides the text repr of the call's return value in the cell output.
df_btc_open.plot();

### Creating train/val set

In [None]:
# Everything strictly before the split point is training data; the rest is
# kept for validation.
split_point = pd.Timestamp('20200921')
train_btc, val_btc = df_btc_open.split_before(split_point)
for part, part_label in ((train_btc, 'training'), (val_btc, 'validation')):
    part.plot(label=part_label)
plt.legend();

### Naive Models

In [None]:
# Baseline: NaiveSeasonal with K=1, forecast 365 steps past the training set.
naive_model = NaiveSeasonal(K=1)
naive_model.fit(train_btc)
naive_forecast = naive_model.predict(n=365)

# Overlay the forecast on the full actual series.
df_btc_open.plot(label='actual')
naive_forecast.plot(label='naive forecast (K=1)')
plt.legend();

In [None]:
# Same baseline with a seasonal period of K=12 steps.
seasonal_model = NaiveSeasonal(K=12)
seasonal_model.fit(train_btc)
seasonal_forecast = seasonal_model.predict(n=365)

# Overlay the forecast on the full actual series.
df_btc_open.plot(label='actual')
seasonal_forecast.plot(label='naive forecast (K=12)')
plt.legend();

In [None]:
# Linear-drift baseline over the same 365-step horizon.
drift_model = NaiveDrift()
drift_model.fit(train_btc)
drift_forecast = drift_model.predict(n=365)

# Add the seasonal forecast to the drift and subtract the last training value
# once.
last_train_value = train_btc.last_value()
combined_forecast = drift_forecast + seasonal_forecast - last_train_value

df_btc_open.plot()
combined_forecast.plot(label='combined')
drift_forecast.plot(label='drift')
plt.legend();

In [None]:
# Score the combined forecast against the actual series.
combined_mape = mape(df_btc_open, combined_forecast)
print("Mean absolute percentage error for the combined naive drift + seasonal: {:.2f}%.".format(combined_mape))

### Probabilistic Methods

In [None]:
# Exponential smoothing with default settings; draw 500 sample paths over the
# length of the validation set to get a probabilistic forecast.
model_es = ExponentialSmoothing()
model_es.fit(train_btc)
probabilistic_forecast = model_es.predict(n=len(val_btc), num_samples=500)

df_btc_open.plot(label='actual')
probabilistic_forecast.plot(label='probabilistic forecast')
plt.legend()
plt.show()

In [None]:
# Plot the same probabilistic forecast twice, with a wide and a narrow
# quantile band.
wide_band = dict(low_quantile=0.01, high_quantile=0.99, label='1-99th percentiles')
narrow_band = dict(low_quantile=0.2, high_quantile=0.8, label='20-80th percentiles')
probabilistic_forecast.plot(**wide_band)
probabilistic_forecast.plot(**narrow_band)

### Regression approaches

In [None]:
models = [NaiveSeasonal(6), NaiveSeasonal(12), NaiveDrift()]

# Backtest settings shared by every model: 12-step forecasts produced every
# 12 steps from the split date on, keeping each full forecast.
backtest_settings = dict(
    start=pd.Timestamp('20200921'),
    forecast_horizon=12,
    stride=12,
    last_points_only=False,
    verbose=True,
)

model_predictions = []
for candidate in models:
    forecast_chunks = candidate.historical_forecasts(df_btc_open, **backtest_settings)
    # Join the successive forecasts of this model into one continuous series.
    model_predictions.append(reduce(lambda a, b: a.append(b), forecast_chunks))

In [None]:
# Stack the per-model predictions into a single multi-component series.
model_predictions_stacked = reduce(
    lambda stacked, nxt: stacked.stack(nxt),
    model_predictions,
)

In [None]:
""" We build the regression model, and tell it to use the current predictions
"""
# No lags of the target are used; the only inputs are the future covariates
# at lag 0, i.e. the stacked model predictions for the step being forecast.
regr_model = RegressionModel(lags=None, lags_future_covariates=[0])

# The target is the actual series, restricted to the time span covered by the
# first model's predictions so that it lines up with the features.
series_target = df_btc_open.slice_intersect(model_predictions[0])

# Backtest the regression model.
ensemble_pred = regr_model.historical_forecasts(
    series=series_target,
    future_covariates=model_predictions_stacked,
    start=pd.Timestamp('20200922'),
    forecast_horizon=3,
    verbose=True,
)

In [None]:
# 2x2 grid: one panel per individual model and a fourth for the ensemble.
fig, ax_grid = plt.subplots(2, 2, figsize=(12, 6))
ax = ax_grid.ravel()

for panel, candidate, prediction in zip(ax, models, model_predictions):
    df_btc_open.plot(label='actual', ax=panel)
    prediction.plot(label=str(candidate), ax=panel)

    # Score each model only on the span covered by the ensemble forecast so
    # that all the MAPE values are comparable.
    overlap = prediction.slice_intersect(ensemble_pred)
    panel.set_title('\nMAPE: {:.2f}%'.format(mape(df_btc_open, overlap)))
    panel.legend()

ensemble_panel = ax[3]
df_btc_open.plot(label='actual', ax=ensemble_panel)
ensemble_pred.plot(label='Ensemble', ax=ensemble_panel)
ensemble_mape = mape(df_btc_open, ensemble_pred)
ensemble_panel.set_title('\nMAPE, ensemble: {:.2f}%'.format(ensemble_mape))
ensemble_panel.legend()

print('\nRegression coefficients for the individual models:')
for position, candidate in enumerate(models):
    weight = regr_model.model.coef_[position]
    print('Learned coefficient for {}: {:.2f}'.format(candidate, weight))
plt.tight_layout();

In [None]:
# Ensemble of the three naive models, combined by a regression trained on the
# last 12 points of the training series.
ensemble_model = RegressionEnsembleModel(
    forecasting_models=[NaiveSeasonal(6), NaiveSeasonal(12), NaiveDrift()],
    regression_train_n_points=12)

ensemble_model.fit(train_btc)
ensemble_pred = ensemble_model.predict(365)

df_btc_open.plot(label='actual')
ensemble_pred.plot(label='Ensemble forecast')
# darts metrics take (actual_series, pred_series). MAPE divides by the actual
# values, so the actual series must come first; the original call had the
# arguments swapped.
plt.title('MAPE = {:.2f}%'.format(mape(df_btc_open, ensemble_pred)))
plt.legend();

In [None]:
# Backtest the ensemble: 3-step-ahead forecasts starting on 2020-09-22.
ensemble_pred_hist = ensemble_model.historical_forecasts(df_btc_open,
                                                    start=pd.Timestamp('20200922'),
                                                    forecast_horizon=3,
                                                    verbose=True)
df_btc_open.plot(label='actual')
ensemble_pred_hist.plot(label='Ensemble forecast')
# darts metrics take (actual_series, pred_series). MAPE divides by the actual
# values, so the actual series must come first; the original call had the
# arguments swapped.
plt.title('Historical forecast: MAPE = {:.2f}%'.format(mape(df_btc_open, ensemble_pred_hist)))
plt.legend();