In [76]:
import yfinance as yf
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import xgboost as xgb
import shap

import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


In [77]:
dis_df = yf.download("DIS", start="2010-01-01", end="2025-05-30", auto_adjust=True)
dis_df.columns = dis_df.columns.get_level_values(0)

sp500_df = yf.download("^GSPC", start="2010-01-01", end="2025-05-30", auto_adjust=True)
sp500_df.columns = sp500_df.columns.get_level_values(0)

vix_df = yf.download("^VIX", start="2010-01-01", end="2025-05-30", auto_adjust=True)
vix_df.columns = vix_df.columns.get_level_values(0)

data = dis_df[["Open", "High", "Low", "Close", "Volume"]].copy()
data.rename(columns={"Close": "Adj Close"}, inplace=True)

sp500_adj = sp500_df["Close"].rename("SP500")
vix_adj   = vix_df["Close"].rename("VIX")

data = data.join(sp500_adj, how="inner").join(vix_adj, how="inner")
data.dropna(inplace=True)

data.tail()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Adj Close,Volume,SP500,VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-05-22,109.980003,111.800003,109.980003,111.129997,7903700,5842.009766,20.280001
2025-05-23,109.599998,110.349998,108.779999,109.720001,8086100,5802.819824,22.290001
2025-05-27,111.029999,112.889999,110.150002,112.360001,10201100,5921.540039,18.959999
2025-05-28,112.199997,112.470001,111.169998,111.519997,5787900,5888.549805,19.309999
2025-05-29,112.010002,112.260002,110.519997,112.019997,9682100,5912.169922,19.18


In [78]:
def compute_technical_indicators(df):
    """
    Enhanced feature‐engineering pipeline for DIS forecasting. Returns a DataFrame
    with original price/volume columns plus expanded technical/exogenous indicators.
    """
    df = df.copy()

    # 1. Basic log‐returns
    df["log_ret"] = np.log(df["Adj Close"] / df["Adj Close"].shift(1))

    # 2. Simple and exponential moving averages
    df["SMA_5"]   = df["Adj Close"].rolling(window=5).mean()
    df["SMA_10"]  = df["Adj Close"].rolling(window=10).mean()
    df["SMA_20"]  = df["Adj Close"].rolling(window=20).mean()
    df["EMA_10"]  = df["Adj Close"].ewm(span=10, adjust=False).mean()
    df["EMA_20"]  = df["Adj Close"].ewm(span=20, adjust=False).mean()

    # 3. Momentum (10‐day)
    df["MOM_10"]  = df["Adj Close"] / df["Adj Close"].shift(10) - 1

    # 4. Moving Average Convergence/Divergence (MACD: 12,26,9)
    ema12 = df["Adj Close"].ewm(span=12, adjust=False).mean()
    ema26 = df["Adj Close"].ewm(span=26, adjust=False).mean()
    df["MACD"]   = ema12 - ema26
    df["MACD_SIG"] = df["MACD"].ewm(span=9, adjust=False).mean()

    # 5. Bollinger Bands (20‐day, 2σ)
    rolling20 = df["Adj Close"].rolling(window=20)
    df["BB_MID"] = rolling20.mean()
    df["BB_UP"]  = rolling20.mean() + 2 * rolling20.std()
    df["BB_LOW"] = rolling20.mean() - 2 * rolling20.std()
    df["BB_WIDTH"] = (df["BB_UP"] - df["BB_LOW"]) / df["BB_MID"]

    # 6. RSI (14)
    delta     = df["Adj Close"].diff()
    up        = delta.clip(lower=0)
    down      = -delta.clip(upper=0)
    roll_up   = up.rolling(window=14).mean()
    roll_down = down.rolling(window=14).mean()
    rs        = roll_up / roll_down
    df["RSI_14"] = 100 - (100 / (1 + rs))

    # 7. Average True Range (ATR, 14)
    high_low         = df["High"] - df["Low"]
    high_close_prev  = (df["High"] - df["Adj Close"].shift(1)).abs()
    low_close_prev   = (df["Low"] - df["Adj Close"].shift(1)).abs()
    tr               = pd.concat([high_low, high_close_prev, low_close_prev], axis=1).max(axis=1)
    df["ATR_14"]     = tr.rolling(window=14).mean()

    # 8. On‐Balance Volume (OBV)
    direction = np.sign(df["Adj Close"].diff()).fillna(0)
    df["OBV"] = (direction * df["Volume"]).fillna(0).cumsum()

    # 9. Rolling volatility and higher moments (10‐day)
    rolling_ret = df["log_ret"].rolling(window=10)
    df["Vol_10"]     = rolling_ret.std()
    df["Skew_10"]    = rolling_ret.apply(lambda x: x.skew(), raw=False)
    df["Kurt_10"]    = rolling_ret.apply(lambda x: x.kurt(), raw=False)

    # 10. Volume-based indicators: Volume‐Weighted Average Price (VWAP, 14‐day)
    pv = df["Adj Close"] * df["Volume"]
    df["VWAP_14"] = pv.rolling(window=14).sum() / df["Volume"].rolling(window=14).sum()

    # 11. Exponential volume‐weighted moving average (EVWMA, 14)
    df["EVWMA_14"] = (pv.ewm(span=14, adjust=False).mean() /
                      df["Volume"].ewm(span=14, adjust=False).mean())

    # 12. Exogenous log‐returns (SP500 & VIX)
    df["SP500_ret"] = np.log(df["SP500"] / df["SP500"].shift(1))
    df["VIX_ret"]   = np.log(df["VIX"] / df["VIX"].shift(1))

    # 13. Interaction features
    df["SMA5_x_VOL10"]   = df["SMA_5"] * df["Vol_10"]
    df["RSI14_x_ATR14"]  = df["RSI_14"] * df["ATR_14"]
    df["MACD_x_MACD_SIG"] = df["MACD"] * df["MACD_SIG"]

    # 14. Drop intermediate NaNs (due to rolling/lookback windows)
    df.dropna(inplace=True)
    return df


In [79]:
features = compute_technical_indicators(data)
features.dropna(inplace=True)

In [80]:
features["target"] = features["log_ret"].shift(-1)
features.dropna(inplace=True)

In [81]:
feature_cols = [
    "SMA_5", "SMA_10", "SMA_20",
    "EMA_10", "EMA_20",
    "MOM_10",
    "MACD", "MACD_SIG",
    "BB_MID", "BB_UP", "BB_LOW", "BB_WIDTH",
    "RSI_14",
    "ATR_14",
    "OBV",
    "Vol_10", "Skew_10", "Kurt_10",
    "VWAP_14",
    "SP500_ret", "VIX_ret"
]


In [82]:
features[["Adj Close"] + feature_cols + ["target"]].head()

Unnamed: 0_level_0,Adj Close,SMA_5,SMA_10,SMA_20,EMA_10,EMA_20,MOM_10,MACD,MACD_SIG,BB_MID,...,RSI_14,ATR_14,OBV,Vol_10,Skew_10,Kurt_10,VWAP_14,SP500_ret,VIX_ret,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-01,25.399025,25.364608,25.821482,26.448284,25.764169,26.241193,-0.035294,-0.557844,-0.44169,26.448284,...,29.185662,0.552501,-69210500.0,0.011194,-0.230578,-0.908046,25.993952,0.014165,-0.086052,0.015463
2010-02-02,25.794811,25.419674,25.732861,26.358372,25.76974,26.19868,-0.033215,-0.523105,-0.457973,26.358372,...,40.322703,0.559261,-59109600.0,0.011569,-0.091154,-0.713922,25.93926,0.01289,-0.050385,0.025359
2010-02-03,26.457317,25.665749,25.695003,26.305027,25.894754,26.223312,-0.014107,-0.437076,-0.453794,26.305027,...,44.181008,0.591833,-43133400.0,0.014561,0.479918,-0.236411,25.932829,-0.005489,0.005571,-0.035754
2010-02-04,25.528084,25.720815,25.614125,26.212534,25.828087,26.1571,-0.030709,-0.438821,-0.450799,26.212534,...,37.614715,0.63854,-56540500.0,0.017502,-0.271576,0.369967,25.866522,-0.031636,0.188475,-0.004391
2010-02-05,25.416233,25.719094,25.576267,26.114018,25.753204,26.086541,-0.014677,-0.44411,-0.449461,26.114018,...,39.728725,0.645301,-68496600.0,0.016395,-0.532995,1.698868,25.802731,0.002893,0.00115,-0.002033


In [83]:
X = features[feature_cols].values
y = features["target"].values

tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(X))

train_idx, val_idx = splits[0]
print(f"Fold 1 → train: {train_idx.min()}–{train_idx.max()}, val: {val_idx.min()}–{val_idx.max()}")

scaler = StandardScaler()
X_train = X[train_idx]
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled   = scaler.transform(X[val_idx])

y_train = y[train_idx]
y_val   = y[val_idx]

Fold 1 → train: 0–644, val: 645–1286


In [None]:
# Cell 5: Manual hyperparameter search for XGBoost with TimeSeriesSplit and early stopping

import itertools
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Define parameter grid
param_grid = {
    "max_depth": [3,5,7,9],
    "learning_rate": [0.001, 0.003, 0.005],
    "subsample": [0.7,0.9],
    "colsample_bytree": [0.7,0.9],
    "reg_alpha": [0, 0.1, 1],
    "reg_lambda": [0, 0.1, 1, 5],
    "n_estimators": [1000, 2000, 3000]
}

# 2. Prepare X/y arrays from our precomputed 'features' DataFrame
X = features[feature_cols].values
y = features["target"].values

# 3. TimeSeriesSplit (same as before)
tscv = TimeSeriesSplit(n_splits=5)

# 4. Storage for results
search_results = []

# 5. Iterate over all combinations of hyperparameters
for max_depth, lr, subsample, colsample, reg_alpha, reg_lambda, n_estimators in itertools.product(
    param_grid["max_depth"],
    param_grid["learning_rate"],
    param_grid["subsample"],
    param_grid["colsample_bytree"],
    param_grid["reg_alpha"],
    param_grid["reg_lambda"],
    param_grid["n_estimators"]
):
    params = {
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "verbosity": 0,
        "max_depth": max_depth,
        "eta": lr,
        "subsample": subsample,
        "colsample_bytree": colsample,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
    }
    fold_rmse = []

    # 6. Perform time-series cross-validation
    for train_idx, val_idx in tscv.split(X):
        # Split into this fold's train/val
        X_train_fold = X[train_idx]
        y_train_fold = y[train_idx]
        X_val_fold   = X[val_idx]
        y_val_fold   = y[val_idx]

        # Scale using StandardScaler fitted on training fold only
        scaler_fold = StandardScaler()
        scaler_fold.fit(X_train_fold)
        X_train_sc = scaler_fold.transform(X_train_fold)
        X_val_sc   = scaler_fold.transform(X_val_fold)

        # Create DMatrix objects
        dtrain = xgb.DMatrix(X_train_sc, label=y_train_fold)
        dval   = xgb.DMatrix(X_val_sc,   label=y_val_fold)

        # Early-stopping settings
        evals = [(dtrain, "train"), (dval, "validation")]
        bst = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=n_estimators,
            evals=evals,
            early_stopping_rounds=50,
            verbose_eval=False
        )

        # Predict and compute RMSE on this fold's validation set
        preds = bst.predict(dval)
        rmse = np.sqrt(mean_squared_error(y_val_fold, preds))
        fold_rmse.append(rmse)

    # Average RMSE across folds
    avg_rmse = np.mean(fold_rmse)
    search_results.append({
        "max_depth": max_depth,
        "learning_rate": lr,
        "subsample": subsample,
        "colsample_bytree": colsample,
        "n_estimators": n_estimators,
        "avg_rmse": avg_rmse
    })
    print(f"Params: md={max_depth}, lr={lr}, ss={subsample}, cs={colsample}, reg_alpha={reg_alpha}, reg_lambda={reg_lambda}, ne={n_estimators} → Avg RMSE: {avg_rmse:.5f}")

best_result = min(search_results, key=lambda x: x["avg_rmse"])
print("\nBest hyperparameters:", best_result)


Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=0, ne=1000 → Avg RMSE: 0.01634
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=0, ne=2000 → Avg RMSE: 0.01633
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=0, ne=3000 → Avg RMSE: 0.01633
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=0.1, ne=1000 → Avg RMSE: 0.01634
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=0.1, ne=2000 → Avg RMSE: 0.01633
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=0.1, ne=3000 → Avg RMSE: 0.01633
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=1, ne=1000 → Avg RMSE: 0.01634
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=1, ne=2000 → Avg RMSE: 0.01633
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=1, ne=3000 → Avg RMSE: 0.01633
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_alpha=0, reg_lambda=5, ne=1000 → Avg RMSE: 0.01635
Params: md=3, lr=0.001, ss=0.7, cs=0.7, reg_

In [None]:
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# 1. Use the same best_params & best_n_estimators
params_full = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "verbosity": 0,
    "max_depth": best_result["max_depth"],
    "eta": best_result["learning_rate"],
    "subsample": best_result["subsample"],
    "colsample_bytree": best_result["colsample_bytree"],
    "reg_alpha": best_result["reg_alpha"],
    "reg_lambda": best_result["reg_lambda"],
}
num_boost_round = best_result["n_estimators"]

# 2. Build X_all / y_all from the entire features DataFrame
X_all = features[feature_cols].values
y_all = features["target"].values

# 3. Split last 10 % of time-ordered data for early stopping
split_idx = int(len(X_all) * 0.9)
X_train_full = X_all[:split_idx]
y_train_full = y_all[:split_idx]
X_val_full   = X_all[split_idx:]
y_val_full   = y_all[split_idx:]

# 4. Scale on full training portion, apply to both
scaler_full = StandardScaler()
scaler_full.fit(X_train_full)
X_train_full_sc = scaler_full.transform(X_train_full)
X_val_full_sc   = scaler_full.transform(X_val_full)

# 5. Create DMatrix objects
dtrain_full = xgb.DMatrix(X_train_full_sc, label=y_train_full)
dval_full   = xgb.DMatrix(X_val_full_sc,   label=y_val_full)

# 6. Train with early stopping
evals_full = [(dtrain_full, "train"), (dval_full, "validation")]
bst_full = xgb.train(
    params=params_full,
    dtrain=dtrain_full,
    num_boost_round=num_boost_round,
    evals=evals_full,
    early_stopping_rounds=50,
    verbose_eval=True
)

# 7. Evaluate on the held-out 10 % validation
pred_full_val = bst_full.predict(dval_full)
rmse_full = np.sqrt(mean_squared_error(y_val_full, pred_full_val))
mae_full  = mean_absolute_error(y_val_full, pred_full_val)
hit_full  = np.mean(np.sign(pred_full_val) == np.sign(y_val_full))

print(f"Full-data → RMSE: {rmse_full:.5f}, MAE: {mae_full:.5f}, Hit Ratio: {hit_full:.3f}")

[0]	train-rmse:0.01648	validation-rmse:0.01859
[1]	train-rmse:0.01647	validation-rmse:0.01858
[2]	train-rmse:0.01646	validation-rmse:0.01858
[3]	train-rmse:0.01645	validation-rmse:0.01858
[4]	train-rmse:0.01645	validation-rmse:0.01857
[5]	train-rmse:0.01644	validation-rmse:0.01857
[6]	train-rmse:0.01643	validation-rmse:0.01857
[7]	train-rmse:0.01643	validation-rmse:0.01856
[8]	train-rmse:0.01642	validation-rmse:0.01855
[9]	train-rmse:0.01641	validation-rmse:0.01855
[10]	train-rmse:0.01641	validation-rmse:0.01855
[11]	train-rmse:0.01640	validation-rmse:0.01855
[12]	train-rmse:0.01639	validation-rmse:0.01855
[13]	train-rmse:0.01638	validation-rmse:0.01854
[14]	train-rmse:0.01637	validation-rmse:0.01853
[15]	train-rmse:0.01636	validation-rmse:0.01853
[16]	train-rmse:0.01636	validation-rmse:0.01853
[17]	train-rmse:0.01635	validation-rmse:0.01854
[18]	train-rmse:0.01634	validation-rmse:0.01854
[19]	train-rmse:0.01634	validation-rmse:0.01854
[20]	train-rmse:0.01633	validation-rmse:0.01854
[2