# ISLP - Chapter 10 - Exercise 9
### Author: pzuehlke

As in the lab, using an $ AR(5) $ model we need to predict the log volume $ v_t $
on day $ t $ using a linear function of $ 5 $ lagged values of three features,
namely: `log_volume` itself, `DJ_return` and `log_volatility`. Thus, our model will
have a total of $ 15 $ predictors, plus an intercept.

Note that in this exercise no neural networks are involved, despite it being in
chapter $ 10 $.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from ISLP import load_data

In [None]:
# Load the NYSE data set through ISLP's `load_data`; later cells select the
# DJ_return, log_volume and log_volatility columns from it.
dow_jones = load_data("NYSE")
# Print a structural summary of the loaded frame.
dow_jones.info()

In [None]:
# Preview the first rows of the raw data (displayed as the cell's output).
dow_jones.head()

Let's create a dataframe with all needed features, add the lag structure and
extract month information:

In [None]:
# Base frame: the three series that serve both as response (log_volume) and as
# sources for the lagged predictors.
cols = ["DJ_return", "log_volume", "log_volatility"]
data = dow_jones[cols].copy()

# Build every lagged copy up front (ordered by lag, then by feature) and attach
# them in a single step; rows without a complete lag history are discarded.
lag = 5
lagged = {
    f"{name}_{k}": data[name].shift(k)
    for k in range(1, lag + 1)
    for name in cols
}
data = data.assign(**lagged).dropna()

# Switch to a datetime index so the calendar month can be read off directly.
data.index = pd.to_datetime(data.index)
data["month"] = data.index.month  # 1 = January, 12 = December
data.head()

Since these are time series data, we'll use a chronological split as recommended
in the text.  More precisely, we choose the split so that test data consists of
dates on or after January $ 2 $, $ 1980 $ (see Figure $ 10.14 $, p. $ 421 $).

In [None]:
# Chronological split: rows dated strictly before the cutoff are used for
# fitting; the cutoff date and everything after it is held out for testing.
cutoff_date = pd.to_datetime("1980-01-02")
train_data = data.loc[data.index < cutoff_date]
test_data = data.loc[data.index >= cutoff_date]

# Names of the lagged predictor columns, ordered by lag and then by feature.
lag_cols = []
for k in range(1, lag + 1):
    lag_cols.extend(f"{name}_{k}" for name in cols)

# Response vectors and lag-only feature matrices for both periods.
y_train, y_test = train_data["log_volume"], test_data["log_volume"]
X_train, X_test = train_data[lag_cols], test_data[lag_cols]

We are finally ready to build our first autoregressive model (not including the
month):

In [None]:
# Ordinary least squares on the lagged features only (no month information).
ar5_model = LinearRegression().fit(X_train, y_train)

# Fitted values on the training period and forecasts on the held-out period.
y_train_pred = ar5_model.predict(X_train)
y_test_pred = ar5_model.predict(X_test)

# Held-out fit quality first, then the error on both periods.
r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)

print(f"AR(5) with three predictors\nTrain MSE: {train_mse:.4e},\tTest MSE: {test_mse:.4e}")
print(f"Test R^2: {r2:.4f}")

Our base model has a test $ R^2 $ of $ 41.29\% $, exactly as in the book
(p. $ 461 $).  Now we do the same for the model including `month` to see if the
situation improves.

In [None]:
# One-hot encode the calendar month.  The first category (January, when it is
# present in the training period) is dropped so the dummies are not collinear
# with the intercept.  The encoder is fitted on the training period only and
# then reused unchanged on the test period.
month_col = ["month"]
encoder = OneHotEncoder(sparse_output=False, drop="first")
month_train = encoder.fit_transform(train_data[month_col])
month_test = encoder.transform(test_data[month_col])

# Ask the encoder for the dummy column names instead of hard-coding
# "month_2" .. "month_12": the labels then always match the columns the
# encoder actually produced, even if a month is missing from the training
# period or the `drop` strategy is changed.
month_names = encoder.get_feature_names_out(month_col).tolist()

# Plain arrays holding the lagged features followed by the month dummies.
X_train_month = np.hstack((X_train, month_train))
X_test_month = np.hstack((X_test, month_test))

# Labelled versions that keep the original date index.
X_train_month_df = pd.DataFrame(
    X_train_month,
    columns=X_train.columns.tolist() + month_names,
    index=X_train.index,
)

X_test_month_df = pd.DataFrame(
    X_test_month,
    columns=X_test.columns.tolist() + month_names,
    index=X_test.index,
)

In [None]:
# Refit the linear model on the lagged features plus the month dummies.
ar5_month_model = LinearRegression().fit(X_train_month_df, y_train)

# Fitted values on the training period and forecasts on the held-out period.
y_train_pred_month = ar5_month_model.predict(X_train_month_df)
y_test_pred_month = ar5_month_model.predict(X_test_month_df)

# Held-out fit quality first, then the error on both periods.
r2_month = r2_score(y_test, y_test_pred_month)
test_mse_month = mean_squared_error(y_test, y_test_pred_month)
train_mse_month = mean_squared_error(y_train, y_train_pred_month)

print(
    "AR(5) model with month dummy variables\n"
    f"Train MSE: {train_mse_month:.4e},\tTest MSE: {test_mse_month:.4e}"
)
print(f"Test R^2: {r2_month:.4f}")


This model's test $ R^2 $ has improved by only $ 0.4\% $
when compared to our base model.  Therefore (to answer the question in the
statement), adding a categorical variable representing the month to the
lag-$ 5 $ autoregressive model does not seem to meaningfully improve performance.

Let's visualize the actual and predicted log volumes using both models:

In [None]:
# Overlay the observed test-period series with the forecasts of both models.
fig, ax = plt.subplots(figsize=(12, 4))

# (values, legend label, line colour, opacity) for each curve, in draw order.
curves = (
    (y_test, "Actual", "black", 0.5),
    (y_test_pred, "$ AR(5) $ prediction", "blue", 0.7),
    (y_test_pred_month, "$ AR(5) $ with month prediction", "red", 0.7),
)
for values, legend_label, line_color, opacity in curves:
    ax.plot(test_data.index, values,
            label=legend_label, color=line_color, alpha=opacity)

ax.set(
    title="Dow Jones log volume: actual vs predicted",
    xlabel="Date",
    ylabel="Log volume",
)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

Finally, let's make a residual plot for each of the models:

In [None]:
# Test-period residuals (observed minus predicted) for each model.
residuals_base = y_test - y_test_pred
residuals_month = y_test - y_test_pred_month

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 5))

# One panel per model: residuals against that model's own predictions, with a
# dashed zero line as visual reference.
panels = (
    (ax1, y_test_pred, residuals_base, "$ AR(5) $"),
    (ax2, y_test_pred_month, residuals_month, "$ AR(5) $ with month"),
)
for axis, fitted, resid, model_label in panels:
    axis.scatter(fitted, resid, alpha=0.5, facecolor="none", edgecolor="royalblue")
    axis.axhline(y=0, color="red", linestyle="--")
    axis.set_xlabel(f"Predicted values - {model_label}")
    axis.set_ylabel("Residuals")
    axis.set_title(f"Residuals vs predicted values - {model_label}")
    axis.grid(True)

fig.tight_layout()
plt.show()

Incidentally, I also tried to predict _returns_ instead of volumes using
only the previous $ 5 $ days' returns as predictors, or these returns plus the
month, but both models result in negative test $ R^2 $, meaning that we could do
better simply by predicting the mean of the test data.  This illustrates how
hard it is to outperform the total stock market.