### Forecasting with ARIMA model

Idea is to take previously explored ARIMA model and add a gaussian process to make some short term forecasts.

In [68]:
from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.graph_objects as go

import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))

import time

2.6.0+cu126
12.6
90501
CUDA available: True
GPU: NVIDIA GeForce RTX 4070 SUPER


In [69]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Path to the .env file inside 01 - Documentation
env_path = Path("01 - Documentation") / "keys.env"

# Load the .env file from that path
load_dotenv(dotenv_path=env_path)

# Now you can access your keys
API_KEY = os.getenv("ALPACA_API_KEY")
SECRET_KEY = os.getenv("ALPACA_SECRET_KEY")

print(API_KEY[:4] + "****")  # optional: verify it's working


PKD5****


In [70]:
# Config
eq_symbol = "AAPL"  
startYear = 2017
endYear = 2024
split_adjustment = True  # Toggle this to True/False as needed

# Create data client
client = StockHistoricalDataClient(API_KEY, SECRET_KEY)

# Define request
request_params = StockBarsRequest(
    symbol_or_symbols=[eq_symbol],
    timeframe=TimeFrame.Day,
    start=datetime(startYear, 1, 1),
    end=datetime(endYear, 12, 31)
)

# Fetch data
bars = client.get_stock_bars(request_params)
stockData = bars.df.reset_index()

# Optional: Adjust for known stock split
if split_adjustment and eq_symbol.upper() == "AAPL":
    split_date = pd.to_datetime("2020-08-31")
    split_ratio = 4.0
    price_cols = ['open', 'high', 'low', 'close']

    # Ensure timestamp is datetime and timezone-naive
    stockData['timestamp'] = pd.to_datetime(stockData['timestamp']).dt.tz_localize(None)

    # Apply backward adjustment
    mask = stockData['timestamp'] < split_date
    for col in price_cols:
        stockData.loc[mask, col] = stockData.loc[mask, col] / split_ratio
    print("Applied 4:1 split adjustment for AAPL before 2020-08-31")

# Inspect output
print(stockData.head())
print(stockData.columns)
print(stockData.shape)



Applied 4:1 split adjustment for AAPL before 2020-08-31
  symbol           timestamp     open      high      low    close      volume  \
0   AAPL 2017-01-03 05:00:00  28.9375  29.08250  28.6900  29.0375  31713319.0   
1   AAPL 2017-01-04 05:00:00  28.9750  29.12750  28.9375  29.0050  23234449.0   
2   AAPL 2017-01-05 05:00:00  28.9925  29.21605  28.9525  29.1525  23572537.0   
3   AAPL 2017-01-06 05:00:00  29.1875  29.54000  29.1175  29.4775  33913959.0   
4   AAPL 2017-01-09 05:00:00  29.5000  29.85750  29.4850  29.7475  36262611.0   

   trade_count        vwap  
0     158508.0  115.645505  
1     129948.0  116.122258  
2     136223.0  116.458208  
3     177799.0  117.732691  
4     184632.0  118.972293  
Index(['symbol', 'timestamp', 'open', 'high', 'low', 'close', 'volume',
       'trade_count', 'vwap'],
      dtype='object')
(2011, 9)


In [71]:
import plotly.express as px
# Plot AAPL closing price over time
fig = px.line(stockData, x='timestamp', y='close', title = eq_symbol + ' Closing Prices (' + str(startYear) + '-' + str(endYear) + ')')
fig.update_layout(xaxis_title='Date', yaxis_title='Close Price (USD)', width=1000, height=600)
fig.show()

### Time Series Split

In [72]:
# Use clean close price series
close_series = stockData['close'].dropna().reset_index(drop=True)

# Define split ratio
split_ratio = 0.9
split_index = int(len(close_series) * split_ratio)

train_series = close_series[:split_index]
test_series = close_series[split_index:]

print(f"Train size: {len(train_series)}")
print(f"Test size: {len(test_series)}")

Train size: 1809
Test size: 202


### ARIMA Model - Auto Regressive Integrated Moving Average

The **ARIMA (AutoRegressive Integrated Moving Average)** model is a powerful time series forecasting method that combines autoregressive terms, differencing to remove trends, and moving averages of past errors. It is well-suited for modeling non-stationary data with underlying patterns but no clear seasonality.

$$
\Delta^d y_t = c + \sum_{i=1}^{p} \phi_i \Delta^d y_{t-i} + \sum_{j=1}^{q} \theta_j \epsilon_{t-j} + \epsilon_t
$$

\begin{aligned}
\textbf{where:} \\
\Delta^d y_t &= \text{the } d\text{-th order differenced series} \\
c &= \text{constant term} \\
\phi_i &= \text{AR coefficient} \\
\theta_j &= \text{MA coefficient} \\
\epsilon_t &= \text{white noise error at time } t \\
p &= \text{number of autoregressive lags} \\
d &= \text{number of differences (to make the series stationary)} \\
q &= \text{number of moving average lags}
\end{aligned}

The Auto-ARIMA function implementes the Box-Jenkins framework to iterate through finding the best ARIMA model to fit this data.

In [105]:
from statsmodels.tsa.arima.model import ARIMA as ManualARIMA
from pmdarima import auto_arima

use_auto_arima = True  # Toggle this flag

start = time.time()

if use_auto_arima:
    ARIMA_model = auto_arima(train_series,
                             seasonal=False,
                             stepwise=True,
                             suppress_warnings=True,
                             trace=True)

    train_preds = ARIMA_model.predict_in_sample()
    train_preds = train_preds[1:]  # Adjust for d=1 lag

    forecast_horizon = len(test_series)
    test_preds = ARIMA_model.predict(n_periods=forecast_horizon)

    model_aic = ARIMA_model.aic()

else:
    order = (2, 0, 2)  # Adjust manually as needed
    model_fit = ManualARIMA(train_series, order=order).fit()

    # In-sample prediction (skip first value due to d=1)
    train_preds = model_fit.predict(start=1, end=len(train_series)-1)

    # Forecast on test period
    test_preds = model_fit.forecast(steps=len(test_series))

    model_aic = model_fit.aic

arima_time = time.time() - start


Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=7727.556, Time=1.42 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=7750.868, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=7751.662, Time=0.10 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=7751.611, Time=0.13 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=7751.572, Time=0.03 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=7754.623, Time=0.42 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=7754.618, Time=0.61 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=7752.238, Time=1.60 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=7752.657, Time=1.46 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=7752.727, Time=0.40 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=7750.029, Time=0.60 sec
 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=7756.538, Time=0.46 sec
 ARIMA(3,1,3)(0,0,0)[0] intercept   : AIC=7755.742, Time=2.22 sec
 ARIMA(2,1,2)(0,0,0)[0]             : AIC=7728.197, Time=0.90 sec

Best model:  ARIMA(2,1,2)(0,0,0)

In [106]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=stockData['timestamp'], y=close_series,
                         mode='lines', name='Actual'))
# Training predictions (aligned to train period)
train_timestamps = stockData['timestamp'][:len(train_preds)]
fig.add_trace(go.Scatter(x=train_timestamps, y=train_preds,
                         mode='lines', name='ARIMA Train Prediction'))
# Testing predictions (forecast beyond training)
# Get the last date from training
train_end_index = len(train_series)
test_timestamps = stockData['timestamp'][train_end_index - 1: train_end_index + len(test_preds)]

fig.add_trace(go.Scatter(x=test_timestamps, y=test_preds,
                         mode='lines', name='ARIMA Forecast (Test)'))

fig.update_layout(title='ARIMA Train/Test Forecast vs Actual',
                  xaxis_title='Date', yaxis_title='Price (USD)',
                  width=1000, height=600)

fig.show()

### Error reporting

In [107]:
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd

metrics = {}

test_true = test_series[:len(test_preds)]  # handles variable horizon automatically

metrics = {
    "ARIMA": {
        "AIC": ARIMA_model.aic(),
        "Train R2": r2_score(train_series[1:], train_preds),
        "Train MSE": mean_squared_error(train_series[1:], train_preds),
        "Test R2": r2_score(test_true, test_preds),
        "Test MSE": mean_squared_error(test_true, test_preds),
        "Time (s)": arima_time
    }
}


metrics_df = pd.DataFrame(metrics).T
metrics_df = metrics_df[["AIC", "Train R2", "Train MSE", "Test R2", "Test MSE", "Time (s)"]]
metrics_df


Unnamed: 0,AIC,Train R2,Train MSE,Test R2,Test MSE,Time (s)
ARIMA,7727.555984,0.998603,4.176211,-1.322272,1461.515453,10.410632
