# ARIMA Model with Exponential Smoothing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
# https://www.kaggle.com/sohier/detailed-api-introduction

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load dependencies

In [None]:
import numpy as np
import pandas as pd
import warnings    # `do not disturb`
warnings.filterwarnings('ignore')
import pickle
import time

# plots
import matplotlib.pyplot as plt
%matplotlib inline

# statistics and econometrics
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.tsa.arima.model
import statsmodels.api as sm
import scipy.stats as scs
from scipy.stats import pearsonr
from scipy.stats import spearmanr

# model performance
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima_model import ARIMAResults

# Load data

In [None]:
# Bring in training data and asset information.
# Both CSV files are read from the competition's input directory.
data_folder = "/kaggle/input/g-research-crypto-forecasting/"
asset_details, train = (
    pd.read_csv(data_folder + csv_name)
    for csv_name in ("asset_details.csv", "train.csv")
)

# Define custom functions

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Mean absolute percentage error between y_true and y_pred, in percent.

    y_true - actual values (array-like supporting element-wise arithmetic)
    y_pred - predicted values, same shape as y_true
    """
    # Error of each prediction relative to the actual value
    relative_errors = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_errors)) * 100

def moving_average(series, n=15):
    """
    Average of the trailing window of a series.

    series - sequence of observations
    n - window length; only the last n observations are averaged
    """
    trailing_window = series[-n:]
    return np.average(trailing_window)

def weighted_average(series, weights):
    """
    Weighted sum of the most recent observations of a series.

    series - pandas object supporting positional access through .iloc
    weights - sequence of weights; weights[0] is applied to the latest
        observation, weights[1] to the one before it, and so on.

    The weights are not normalised here, so the result is a true average
    only when they sum to 1.
    """
    total = 0.0
    # steps_back = 1 addresses the last observation, 2 the one before, ...
    for steps_back, weight in enumerate(weights, start=1):
        total += series.iloc[-steps_back] * weight
    return float(total)

def exponential_smoothing(series, alpha):
    """
    Apply simple exponential smoothing to a series.

    series - dataset with timestamps (anything exposing `.values`,
        e.g. a pandas Series)
    alpha - float [0.0, 1.0], smoothing parameter; larger values give
        more weight to the newest observation

    Returns a list with one smoothed value per observation. The first
    smoothed value equals the first observation. An empty series yields
    an empty list.
    """
    values = series.values
    # Nothing to seed the recursion with: return early instead of failing
    # on values[0].
    if len(values) == 0:
        return []

    result = [values[0]]  # first value is same as series
    for value in values[1:]:
        # Blend the new observation with the previous smoothed value.
        result.append(alpha * value + (1 - alpha) * result[-1])
    return result

def plotExponentialSmoothing(series, alphas, plot_intervals = False, scale = 1.96, plot_anomalies=False):
    """
    Plots exponential smoothing with different alphas

    series - dataset with timestamps (pandas Series)
    alphas - list of floats, smoothing parameters
    plot_intervals - if True, draw upper/lower bounds around the smoothed
        curve of the LAST alpha in alphas
    scale - multiplier applied to the residual standard deviation when
        computing the bound width
    plot_anomalies - if True (and plot_intervals is True), mark the
        observations that fall outside the bounds and print their counts
    """
    plt.figure(figsize=(15, 5))

    # Keep the most recent smoothed curve so the interval section does not
    # have to recompute it (and does not depend on a leaked loop variable).
    smoothed = None
    for alpha in alphas:
        smoothed = np.asarray(exponential_smoothing(series, alpha))
        plt.plot(smoothed, c="orange", label="Alpha {}".format(alpha))
    plt.plot(series.values, "b", label = "Actual")
    plt.title("Moving Exponential Average\n Alpha = {}".format(alphas))
    plt.axis('tight')

    # Intervals are only meaningful when at least one alpha was plotted.
    if plot_intervals and smoothed is not None:
        mae = mean_absolute_error(series, smoothed)
        deviation = np.std(series - smoothed)

        # Half-width of the band: MAE plus `scale` residual std deviations.
        margin = mae + scale * deviation
        lower_bound = smoothed - margin
        upper_bound = smoothed + margin
        plt.plot(upper_bound, "r--", label="Upper Bound / Lower Bound")
        plt.plot(lower_bound, "r--")

        # Having the intervals, find abnormal values
        if plot_anomalies:
            below = series < lower_bound
            above = series > upper_bound

            # NaN everywhere except at the anomalous observations
            anomalies = pd.Series(np.nan, index=series.index, name=series.name, dtype=float)
            anomalies[below] = series[below]
            print(str(int(below.sum())) + str(" lower-bound anomalies found"))
            anomalies[above] = series[above]
            print(str(int(above.sum())) + str(" upper-bound anomalies found"))

            # Markers are required: anomalies are isolated points separated
            # by NaN, which a plain line would not render.
            plt.plot(anomalies, "ko", markersize=10)

    plt.legend(loc="best")
    # Positional argument works across Matplotlib versions (the `b`
    # keyword was renamed to `visible`).
    plt.grid(True)
    
def tsplot(y, lags=None, figsize=(12, 7), style='bmh'):
    """
    Plot time series, its ACF and PACF, calculate Dickey–Fuller test

    y - timeseries (converted to a pandas Series when it is not one)
    lags - how many lags to include in ACF, PACF calculation
    figsize - size of the figure passed to matplotlib
    style - matplotlib style applied while drawing
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)

        # 2x2 grid: the series spans the top row, ACF and PACF share the bottom
        grid_shape = (2, 2)
        series_axis = plt.subplot2grid(grid_shape, (0, 0), colspan=2)
        acf_axis = plt.subplot2grid(grid_shape, (1, 0))
        pacf_axis = plt.subplot2grid(grid_shape, (1, 1))

        series.plot(ax=series_axis)

        # Element 1 of the adfuller result is used as the p-value in the title
        adf_result = sm.tsa.stattools.adfuller(series)
        p_value = adf_result[1]
        series_axis.set_title(f'Time Series Analysis Plots\n Dickey-Fuller: p={p_value:.5f}')

        smt.graphics.plot_acf(series, lags=lags, ax=acf_axis)
        smt.graphics.plot_pacf(series, lags=lags, ax=pacf_axis)
        plt.tight_layout()

# Training Loop
- Exponential smoothing (alpha=0.005)
- ARIMA model with p=2, d=0, q=13

In [None]:
n_steps_to_train = 500
n_steps_to_plot = 100
n_steps_to_predict = 3

# ARIMA model order (p, d, q) and exponential-smoothing factor
p = 2
d = 0
q = 13
alpha = 0.005

# Number of trailing observations used for the sanity-check correlations
n_steps_to_correlate = 50000

# Fitted ARIMA results, keyed by Asset_ID
models = dict()

# Fit one model per asset listed in asset_details
for current_asset_id, current_asset_name in zip(asset_details.Asset_ID, asset_details.Asset_Name):
    # Raw rows of this asset, indexed by timestamp. Kept untouched as the
    # reference for the consistency check below.
    comparison = train[train["Asset_ID"] == current_asset_id].set_index("timestamp")

    # Drop rows containing NaN, then rebuild a regular grid with a step of 60
    # between the first and last timestamp; gaps are forward-filled ('pad').
    current_dataset = comparison.dropna(axis=0)
    current_dataset = current_dataset.reindex(
        range(
            current_dataset.index[0], current_dataset.index[-1] + 60, 60
        ), method='pad'
    )

    # Ensure consistency in distribution of newly created datasets
    # NOTE(review): the two tails are compared position by position; if rows
    # were dropped or padded inside the window they are not time-aligned.
    first_apple = current_dataset.Close.values[-n_steps_to_correlate:]
    second_apple = comparison.Close.values[-n_steps_to_correlate:]

    assert len(first_apple) == len(second_apple), "The input lengths do not match"

    print(f"Calculating correlations for: {current_asset_name}")

    # Keep both coefficients in separate names so each label reports its
    # own statistic.
    pearson_corr, _ = pearsonr(first_apple, second_apple)
    spearman_corr, _ = spearmanr(first_apple, second_apple)

    print("Pearsons: %.3f" % pearson_corr, "Spearmans: %.3f \n" % spearman_corr)

    # Calculate exponential smoothing on the last n_steps_to_train rows only
    # (slice first, then copy, to avoid duplicating the whole history).
    current_dataset_mini = current_dataset.iloc[-n_steps_to_train:].copy()
    current_dataset_mini["Smoothed"] = exponential_smoothing(current_dataset_mini.Target, alpha=alpha)
    data = current_dataset_mini["Smoothed"]

    # Train model
    model = statsmodels.tsa.arima.model.ARIMA(endog=data.values, exog=None, order=(p, d, q)).fit()

    # Kept for interactive inspection; they hold the values of the last asset
    aic = model.aic
    actual = np.array(data.values)

    # Save model
    models[current_asset_id] = model

# Prediction Loop

In [None]:
# for i, (test_df, sample_prediction_df) in enumerate(iter_test):
#     sample_prediction_df["Target"] = 0.0
#     env.predict(sample_prediction_df)

In [None]:
import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
# from tqdm import tqdm

# Number of batches already served; each batch moves the forecast one step
# further past the end of the training window.
prediction_step = 0
for (test_df, sample_prediction_df) in iter_test:

    # always sort by time, then by Asset_ID
    test_df = test_df.sort_values(by=['timestamp', 'Asset_ID'])

    # Out-of-sample index requested from every model for this batch
    forecast_step = n_steps_to_train + prediction_step

    # Forecast per row, keyed by row_id so the assignment below does not
    # depend on the row order of sample_prediction_df.
    preds_by_row_id = {}
    for asset_id, row_id in zip(test_df["Asset_ID"], test_df["row_id"]):
        forecast = models[asset_id].predict(start=forecast_step, end=forecast_step)
        # predict returns a one-element array here; index it explicitly
        # rather than calling float() on the array.
        preds_by_row_id[row_id] = float(forecast[0])

    # Increment prediction step
    prediction_step += 1

    # Update targets and submit
    # NOTE(review): assumes sample_prediction_df has a row_id column matching
    # test_df; a row_id absent from test_df would get NaN - confirm.
    sample_prediction_df["Target"] = sample_prediction_df["row_id"].map(preds_by_row_id)
    env.predict(sample_prediction_df)