# EDA and Holt-Winters forecasting of Bitcoin over a 365-day horizon

- Utils

In [None]:
# system libraries
import warnings
warnings.filterwarnings('ignore')

# data manipulation libraries
import pandas as pd
import numpy as np

# graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# modelisation libraries
from datetime import datetime
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose

# metric evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
def group_trading_timeseries(df, time_step_size):
    """Resample trading data into coarser OHLCV bars.

    Adapted from
    https://www.kaggle.com/alexisalvarez/eda-resampling-the-power-of-technical-analysis

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index and the columns ``Count``, ``Open``,
        ``High``, ``Low``, ``Close``, ``Volume``, ``VWAP`` and ``Target``.
    time_step_size : str
        Bar size; it is converted to a string and passed to
        ``DataFrame.resample``.

    Returns
    -------
    pandas.DataFrame
        One row per bar: summed ``Count``/``Volume``, first ``Open``, highest
        ``High``, lowest ``Low``, last ``Close`` and mean ``VWAP``/``Target``.
    """
    # Built-in aggregation names instead of Python lambdas: they run in
    # compiled code and yield NaN (0 for sums) for bars that contain no rows,
    # where ``s.iloc[0]`` / ``s.iloc[-1]`` would raise IndexError.
    # Note that 'first'/'last' pick the first/last non-null value of a bar.
    aggregations = {
        'Count': 'sum',
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum',
        'VWAP': 'mean',
        'Target': 'mean',
    }
    # Sort first so that 'first'/'last' follow chronological order.
    return df.sort_index().resample(str(time_step_size)).agg(aggregations)

def check_stationarity(series, asset_id):
    """Run an Augmented Dickey-Fuller test on ``series`` and print the result.

    Adapted from
    https://machinelearningmastery.com/time-series-data-stationary-python/

    The series is reported as stationary when the p-value is at most 0.05 and
    the test statistic lies below the 5% critical value; otherwise it is
    reported as non-stationary. The statistic, p-value and all critical
    values are printed afterwards.

    Parameters
    ----------
    series : object exposing ``.values`` (e.g. a pandas Series)
        Observations passed to ``adfuller``.
    asset_id :
        Value matched against ``Asset_ID`` in the module-level ``asset`` frame
        to obtain the name used as the label of the printed verdict.
    """
    result = adfuller(series.values)
    adf_stat, p_value, critical_values = result[0], result[1], result[4]

    # Look the label up once; fall back to the raw id instead of raising
    # IndexError when the id is absent from the asset table.
    matches = asset.loc[asset.Asset_ID == asset_id, "Asset_Name"]
    asset_name = matches.iloc[0] if len(matches) else f"Asset_ID {asset_id}"

    # Both operands are scalar booleans, so use logical `and` rather than `&`.
    if p_value <= 0.05 and critical_values['5%'] > adf_stat:
        print(f"{asset_name}: \u001b[32mStationary\u001b[0m")
    else:
        print(f"{asset_name}: \x1b[31mNon-stationary\x1b[0m")

    print('ADF Statistic: %f' % adf_stat)
    print('p-value: %f' % p_value)
    print('Critical Values:')
    for key, value in critical_values.items():
        print('\t%s: %.3f' % (key, value))
    print('\n')

def metric_evaluation(y_true, y_pred):
    """Print MAE, MSE, RMSE and MAPE of a forecast and return them.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'MAPE'`` (MAPE in percent).
        MAPE is averaged over the observations whose true value is non-zero
        and is NaN when there are none.
    """
    # Work on plain arrays so every metric compares values positionally;
    # subtracting two pandas Series would align them on their index instead.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # A zero target makes the percentage error undefined (division by zero
    # would turn the whole mean into inf/NaN), so those points are left out.
    nonzero = y_true != 0
    if nonzero.any():
        relative_errors = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(relative_errors)) * 100
    else:
        mape = float('nan')

    print('MAE = ', mae)
    print('MSE = ', mse)
    print('RMSE = ', rmse)
    print('MAPE = ', mape)
    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'MAPE': mape}

## Data Preprocessing

In [None]:
# Load the market data and the asset metadata table.
data = pd.read_csv("../input/g-research-crypto-forecasting/train.csv", encoding="utf-8")
asset = pd.read_csv("../input/g-research-crypto-forecasting/asset_details.csv", encoding="utf-8")

# Convert the epoch-second timestamps in one vectorised call. This replaces a
# per-row `datetime.fromtimestamp`, which is slow on a large frame and depends
# on the machine's local timezone; `pd.to_datetime(unit="s")` always yields
# naive UTC times.
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
data["date"] = data["timestamp"].astype('datetime64[s]')
data.head()

In [None]:
# Join the asset metadata onto every row, index the frame by date, and keep
# only the columns used in the rest of the analysis.
data = (
    data.merge(asset, on="Asset_ID")
        .set_index("date")
        [[
            "Asset_Name", "Weight", "Count",
            "Open", "High", "Low",
            "Close", "Volume", "VWAP",
            "Target",
        ]]
)
data.head()

Columns of the dataset:
* timestamp - A timestamp for the minute covered by the row.
* Asset_ID - An ID code for the cryptoasset.
* Count - The number of trades that took place this minute.
* Open - The USD price at the beginning of the minute.
* High - The highest USD price during the minute.
* Low - The lowest USD price during the minute.
* Close - The USD price at the end of the minute.
* Volume - The number of cryptoasset units traded during the minute.
* VWAP - The volume weighted average price for the minute.
* Target - 15 minute residualized returns.

In [None]:
# Percentage of missing values in each column.
data.isnull().mean() * 100

In [None]:
# Display the rows flagged as repeats of an earlier row.
# NOTE(review): `duplicated()` compares column values only, not the index, so
# rows with identical values at different dates would be flagged too — confirm
# this is intended.
data[data.duplicated()]

In [None]:
# Remove repeated rows in place, keeping the first occurrence.
# NOTE(review): the comparison ignores the index, so rows with identical
# column values at different dates are dropped as well — confirm this is
# intended.
data.drop_duplicates(inplace=True)

In [None]:
# Drop, in place, every row that has a missing value in any column.
data.dropna(inplace=True)

In [None]:
# Preview the first rows of the frame.
data.head()

# Exploratory Analysis

In this analysis, we only use Bitcoin, resampled to daily bars.

In [None]:
# Keep the Bitcoin rows only and aggregate them into daily bars.
# "1D" is the canonical daily offset alias; the lowercase "1d" spelling is
# deprecated in recent pandas releases.
df_btc = data.loc[data["Asset_Name"] == "Bitcoin"]
df_btc = group_trading_timeseries(df_btc, time_step_size="1D")
df_btc.head()

## Descriptive Analysis

In [None]:
# Candlestick chart of the Bitcoin bars with a vertical marker on 2020-03-11.
fig = go.Figure(
    data=[
        go.Candlestick(
            x=df_btc.index,
            open=df_btc['Open'],
            high=df_btc['High'],
            low=df_btc['Low'],
            close=df_btc['Close'],
        )
    ]
)
fig.update_layout(
    xaxis_rangeslider_visible=False,
    title='Bitcoin from Janv. 2018 to Sep. 2021',
    yaxis_title='USD',
    # Full-height vertical line: x in data coordinates, y in paper coordinates.
    shapes=[{
        'x0': '2020-03-11', 'x1': '2020-03-11',
        'y0': 0, 'y1': 1,
        'xref': 'x', 'yref': 'paper',
        'line_width': 2,
    }],
    annotations=[{
        'x': '2020-03-11', 'y': 0.05,
        'xref': 'x', 'yref': 'paper',
        'showarrow': False, 'xanchor': 'left',
        'text': 'Covid Sanitary Crisis Begin',
    }],
)
fig.show()

In [None]:
# Print mean, median, standard deviation and variance (rounded to 3 decimals)
# for every column, with the column name in bold.
for column in df_btc.columns:
    values = df_btc[column]
    print('\033[1m' + column + '\033[0m')
    for label, statistic in (
        ("Mean: ", values.mean),
        ("Median: ", values.median),
        ("Standard Deviation: ", values.std),
        ("Variance: ", values.var),
    ):
        print(label, round(statistic(), 3))
    print("-------------------")

In [None]:
# One histogram with a KDE overlay per column.
for column in df_btc.columns:
    sns.displot(data=df_btc, x=column, kde=True)
    plt.title(f"Distribution of the variable: {column}")
    plt.show()

## Stationarity

In [None]:
# Resolve Bitcoin's Asset_ID from the metadata table rather than hard-coding
# it, so the name printed by check_stationarity matches the series under test.
btc_asset_id = asset.loc[asset["Asset_Name"] == "Bitcoin", "Asset_ID"].iloc[0]

# ADF test on every column of the Bitcoin frame.
for i in df_btc.columns:
    print(i)
    check_stationarity(df_btc[i], btc_asset_id)

In [None]:
# Columns to be transformed into first differences of their logarithm.
non_stat = ["Count", "Open", "High",
            "Low", "Close", "VWAP"]

for i in non_stat:
    # Vectorised log instead of a per-element `apply(lambda ...)`.
    df_btc[i] = np.log(df_btc[i]).diff(1)
# Differencing leaves NaN in the first row; drop it.
df_btc.dropna(inplace=True)

In [None]:
# Preview the first rows of the Bitcoin frame.
df_btc.head()

In [None]:
# Resolve Bitcoin's Asset_ID from the metadata table rather than hard-coding
# it, so the name printed by check_stationarity matches the series under test.
btc_asset_id = asset.loc[asset["Asset_Name"] == "Bitcoin", "Asset_ID"].iloc[0]

# ADF test on every column of the Bitcoin frame.
for i in df_btc.columns:
    print(i)
    check_stationarity(df_btc[i], btc_asset_id)

In [None]:
# Candlestick chart of the Bitcoin frame with a vertical marker on 2020-03-11.
# NOTE(review): when the cells run in order, Open/High/Low/Close hold
# log-differences at this point, not USD prices; the title and y-axis label
# are worded accordingly.
fig = go.Figure(data=[go.Candlestick(x=df_btc.index,
                                     open=df_btc['Open'],
                                     high=df_btc['High'],
                                     low=df_btc['Low'],
                                     close=df_btc['Close'])])
fig.update_layout(xaxis_rangeslider_visible=False)
fig.update_layout(
    title='Log-differenced Bitcoin prices from Janv. 2018 to Sep. 2021',
    yaxis_title='Log difference',
    shapes = [dict(
        x0='2020-03-11', x1='2020-03-11', y0=0, y1=1, xref='x', yref='paper',
        line_width=2)],
    annotations=[dict(
        x='2020-03-11', y=0.05, xref='x', yref='paper',
        showarrow=False, xanchor='left', text='Covid Sanitary Crisis Begin')]
)
fig.show()

### Seasonal decomposition

In [None]:
# Additive seasonal decomposition, plotted column by column.
print("\033[1mSeasonal decomposition of each variables\033[0m")
for column in df_btc.columns:
    decomposition = seasonal_decompose(df_btc[column], model='add')
    decomposition.plot()
    plt.show()
    print("\033[1m---------------------------------------------------------\033[0m")

### Moving averages 

In [None]:
# Rolling-mean window length, in rows.
# NOTE(review): 362 looks like it may have been meant as 365 — confirm.
window_size = 362

# Plot every column together with its rolling mean.
for i in df_btc.columns:
    # The first `window_size - 1` values are NaN and are simply not drawn.
    moving_averages = df_btc[i].rolling(window_size).mean()

    plt.figure(figsize=(15,5))
    plt.plot(df_btc[i], label=i)
    plt.plot(moving_averages, label='Moving Average')
    plt.title(i+" in function of time")
    plt.xlabel('Time')
    plt.legend(loc='best')
    plt.show()
    print("\033[1m---------------------------------------------------------\033[0m")

## Predictions with forecasting models

### Exponential Smoothing

In [None]:
# The first forecast step is the day AFTER the last observation, so the
# forecast axis starts one day past the end of the index. It is identical for
# every column and is therefore built once, outside the loop.
forecast_index = pd.date_range(df_btc.index[-1] + pd.Timedelta(days=1),
                               periods=365, freq='D')

# Fit an additive Holt-Winters model (weekly seasonality) on each column and
# plot its 365-step forecast after the observed series.
for i in df_btc.columns:
    hw = ExponentialSmoothing(np.asarray(df_btc[i]),
                                         trend="add",
                                         seasonal="add",
                                         seasonal_periods=7).fit()
    hw_pred = hw.forecast(365)
    
    plt.figure(figsize=(14,6))
    plt.plot(df_btc[i], label=i, color='green')
    plt.plot(forecast_index, hw_pred, label='Prediction', color='red')
    plt.title(i + " and its prediction for the next year")
    plt.xlabel('Time')
    plt.legend(loc='best')
    plt.show()
    print("\033[1m--------------------------------------------------------------------------------------------------------------------\033[0m")

In [None]:
# Hold out the last 365 rows: `df_2` is the training part, `df_holdout` the
# part the forecast is scored against.
df_2 = df_btc.iloc[:-365]
df_holdout = df_btc.iloc[-365:]

print("\033[1mEvaluation of Exponential Smoothing prediction\033[0m")

for i in df_btc.columns:
    hw = ExponentialSmoothing(np.asarray(df_2[i]), seasonal_periods=7,
                              trend='add', seasonal='add').fit()
    hw_pred = hw.forecast(len(df_holdout))

    plt.figure(figsize=(14,6))
    plt.plot(df_btc[i], label=i, color='gray')
    # Draw the forecast on the held-out dates so that each prediction sits on
    # the day it refers to (starting at the last training day shifts it by one).
    plt.plot(df_holdout.index, hw_pred,
             label='Prediction', color='green')
    plt.xlabel('Time')
    plt.legend(loc='best')
    plt.show()
    print("*****************************\n")
    y_pred = hw_pred
    # Score against the held-out observations; the last rows of the training
    # frame are not the period that was forecast.
    y_true = np.asarray(df_holdout[i])
    
    metric_evaluation(y_true, y_pred)
    print("-----------------------------")

## Resources

- https://www.kaggle.com/iamleonie/to-the-moon-g-research-crypto-forecasting-eda