In [7]:
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import logging
import warnings
# Silence every warning category for the rest of the session so that the
# backtest log is not flooded by library warnings.
warnings.simplefilter("ignore")

# Setup logging: root logger emits INFO and above, each record prefixed
# with its timestamp and level name.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Load datasets
# Every CSV is looked up under `data_path`; the dictionary key is the short
# dataset name used by the rest of the script.
data_path = "data"
file_names = {
    "bitcoin": "bitcoin.csv",
    "gold": "gold.csv",
    "google_trends": "google_trends.csv",
    "sp500": "sp500.csv",
    "treasury_3m": "treasury_3m.csv",
    "treasury_10y": "treasury_10y.csv",
    "copper": "copper.csv",
    "oil": "oil.csv",
    "unemployment": "unemployment.csv",
    "moex": "MOEX.csv",
    "sse": "SSE.csv",
    "stoxx": "STOXX_600.csv",
    "vix": "vix.csv",
    "spgsci": "spgsci.csv",
}

# dataset name -> cleaned DataFrame; files that are missing or that lack a
# "timestamp" column are logged and left out.
data = {}
for key, file in file_names.items():
    file_path = os.path.join(data_path, file)

    if not os.path.exists(file_path):
        logging.warning(f"{file} not found")
        continue

    logging.info(f"Loading {file}")
    df = pd.read_csv(file_path)

    if "timestamp" not in df.columns:
        logging.warning(f"{file} skipped: no 'timestamp' column")
        continue

    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # Cells holding a lone "." become NaN before the numeric conversion.
    df.replace({'.': np.nan}, inplace=True)

    # Every non-timestamp column is forced to numeric; anything that cannot
    # be parsed turns into NaN instead of raising.
    value_columns = [name for name in df.columns if name != "timestamp"]
    for name in value_columns:
        df[name] = pd.to_numeric(df[name], errors='coerce')

    data[key] = df

# Rename columns
# Per-dataset mapping from the raw CSV column name to the name used in the
# merged frame. Columns not listed here keep their original name.
rename_map = {
    "bitcoin": {
        "Close": "bitcoin_close",
        "Open": "bitcoin_open",
        "High": "bitcoin_high",
        "Low": "bitcoin_low",
        "Volume": "bitcoin_volume",
    },
    "gold": {
        "Close": "gold_close",
        "Open": "gold_open",
        "High": "gold_high",
        "Low": "gold_low",
        "Volume": "gold_volume",
    },
    "oil": {
        "Close": "oil_close",
        "Open": "oil_open",
        "High": "oil_high",
        "Low": "oil_low",
        "Volume": "oil_volume",
    },
    "copper": {"price": "copper_price"},
    "google_trends": {
        "SPX": "google_spx",
        "ETF": "google_etf",
        "index fund": "google_index_fund",
        "sp500": "google_sp500",
    },
    "unemployment": {"Unemployment": "unemployment_rate"},
    "treasury_3m": {"Close": "treasury_3m"},
    "treasury_10y": {"Close": "treasury_10y"},
    "sp500": {"Close": "sp500_close"},
    "moex": {"Close": "moex_close"},
    "sse": {"Close": "sse_close"},
    "stoxx": {"Close": "stoxx_close"},
    "vix": {"Close": "vix_close"},
    "spgsci": {"Close": "spgsci_close"},
}

# Build the renamed frames first, then write them back; datasets that were
# never loaded are simply absent from `data` and are skipped.
renamed_frames = {
    name: data[name].rename(columns=column_names)
    for name, column_names in rename_map.items()
    if name in data
}
data.update(renamed_frames)

# Merge and clean data
logging.info("Merging all dataframes")

if "sp500" not in data:
    # The S&P 500 frame supplies both the merge spine and the target column,
    # so nothing below can run without it; fail with an actionable message.
    raise KeyError("sp500 dataset was not loaded; cannot build the merged frame")

sp500 = data["sp500"]

# Start from the S&P 500 timestamps and left-join every other dataset onto
# them, so the merged frame has exactly one row per S&P 500 observation.
all_data = sp500[["timestamp", "sp500_close"]]
for key, df in data.items():
    if key != "sp500":
        all_data = all_data.merge(df, on="timestamp", how="left")

# Order chronologically, carry the last known value forward into gaps, then
# drop every row that still has a NaN in any column.
# `DataFrame.ffill()` replaces `fillna(method="ffill")`, whose `method`
# argument is deprecated in recent pandas releases. The chained form also
# avoids in-place mutation of what may be a column-selection slice of `sp500`.
all_data = all_data.sort_values("timestamp").ffill().dropna()

# Keep only columns that take more than one distinct value; constant columns
# carry no information for the scaler / PCA step.
all_data = all_data.loc[:, all_data.nunique() > 1]
logging.info("Merged dataset shape: %s", all_data.shape)

# Define target and features
# The model predicts `target`; every remaining column other than the
# timestamp is treated as an exogenous regressor.
target = "sp500_close"
exclude_cols = ["timestamp", target]
is_excluded = all_data.columns.isin(exclude_cols)
exog_vars = all_data.columns[~is_excluded].tolist()

# Monthly SARIMAX backtest with reduced grid
#
# For each month between `start_date` and `end_date` (inclusive):
#   1. train on every row strictly before the month, test on the rows inside it;
#   2. standardize the exogenous columns and compress them with PCA (both
#      fitted on the training rows only);
#   3. fit a small grid of SARIMAX(p,1,q)x(P,0,Q,12) models and keep the one
#      with the lowest AIC;
#   4. forecast the test rows and record R2 / MAE / RMSE in `results`.
results = []
start_date = pd.Timestamp("2023-01-01")
end_date = pd.Timestamp("2025-03-01")
current_date = start_date

# Candidate (p, q, P, Q) combinations. The grid is identical for every month,
# so it is built once instead of being re-derived from nested loops each time.
param_grid = [
    (p, q, P, Q)
    for p in (0, 1)
    for q in (0, 1)
    for P in (0, 1)
    for Q in (0, 1)
]

while current_date <= end_date:
    month_start = current_date
    next_month = month_start + pd.DateOffset(months=1)
    # Advance the cursor up front so that every early `continue` below still
    # moves on to the following month.
    current_date = next_month

    logging.info(f"Backtest for month starting {month_start.strftime('%Y-%m-%d')}")

    train = all_data[all_data["timestamp"] < month_start]
    test = all_data[(all_data["timestamp"] >= month_start) & (all_data["timestamp"] < next_month)]

    if len(train) < 100 or len(test) < 10:
        logging.warning("Skipping month due to insufficient data")
        continue

    y_train = train[target]
    y_test = test[target]
    X_train = train[exog_vars]
    X_test = test[exog_vars]

    # Standardize and apply PCA. Both transformers are fitted on the training
    # rows only and then applied unchanged to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # n_components=0.95 keeps as many components as needed to explain 95% of
    # the variance of the scaled training features.
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # SARIMAX parameter tuning with reduced grid: lowest AIC wins.
    best_aic = np.inf
    best_model = None
    best_order = None
    best_seasonal_order = None

    for p, q, P, Q in param_grid:
        order = (p, 1, q)
        # NOTE(review): the seasonal period is fixed at 12. The recorded run
        # shows the training set growing by roughly 20 rows per month, which
        # suggests daily rows rather than monthly ones - confirm 12 is intended.
        seasonal_order = (P, 0, Q, 12)
        try:
            model = SARIMAX(
                y_train,
                exog=X_train_pca,
                order=order,
                seasonal_order=seasonal_order,
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            model_fit = model.fit(disp=False)
            candidate_aic = model_fit.aic
        except Exception as e:
            # Best effort: a candidate that cannot be fitted is skipped so the
            # remaining candidates can still be tried.
            logging.debug(f"Model ({p},{1},{q}) x ({P},0,{Q},12) failed: {e}")
            continue

        # A NaN AIC never compares as smaller, so such fits are never selected.
        if candidate_aic < best_aic:
            best_aic = candidate_aic
            best_model = model_fit
            best_order = order
            best_seasonal_order = seasonal_order

    if best_model is None:
        # Individual failures are only logged at DEBUG, which the INFO-level
        # configuration hides; make the dropped month visible.
        logging.warning(
            "No SARIMAX candidate could be fitted for month starting %s; month skipped",
            month_start.strftime("%Y-%m-%d"),
        )
        continue

    logging.info(f"Best model: order={best_order}, seasonal_order={best_seasonal_order}, AIC={best_aic:.2f}")

    # NOTE(review): the forecast is given the realized test-period exogenous
    # values, so the metrics describe a setting in which those values are
    # known in advance - confirm this is the intended evaluation.
    forecast = best_model.forecast(steps=len(X_test_pca), exog=X_test_pca)
    r2 = r2_score(y_test, forecast)
    mae = mean_absolute_error(y_test, forecast)
    rmse = np.sqrt(mean_squared_error(y_test, forecast))

    results.append({
        "Backtest": len(results) + 1,
        "Train Size": len(train),
        "Test Start": month_start.strftime("%Y-%m-%d"),
        "R2": r2,
        "MAE": mae,
        "RMSE": rmse,
        "Order": best_order,
        "Seasonal Order": best_seasonal_order,
        "AIC": best_aic
    })

# Save and print results
# One row per backtested month; the frame is written to CSV under
# "results/" (created on demand) and echoed to stdout.
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)

results_df = pd.DataFrame(results)
results_df.to_csv("results/sarimax_backtest_auto_reduced.csv", index=False)
logging.info("Saved results to results/sarimax_backtest_auto_reduced.csv")

print(results_df)


2025-04-14 20:55:03,011 - INFO - Loading bitcoin.csv
2025-04-14 20:55:03,019 - INFO - Loading gold.csv
2025-04-14 20:55:03,022 - INFO - Loading google_trends.csv
2025-04-14 20:55:03,027 - INFO - Loading sp500.csv
2025-04-14 20:55:03,029 - INFO - Loading treasury_3m.csv
2025-04-14 20:55:03,033 - INFO - Loading treasury_10y.csv
2025-04-14 20:55:03,035 - INFO - Loading copper.csv
2025-04-14 20:55:03,042 - INFO - Loading oil.csv
2025-04-14 20:55:03,045 - INFO - Loading unemployment.csv
2025-04-14 20:55:03,051 - INFO - Loading MOEX.csv
2025-04-14 20:55:03,054 - INFO - Loading SSE.csv
2025-04-14 20:55:03,056 - INFO - Loading STOXX_600.csv
2025-04-14 20:55:03,058 - INFO - Loading vix.csv
2025-04-14 20:55:03,060 - INFO - Loading spgsci.csv
2025-04-14 20:55:03,062 - INFO - Loading garch.csv
2025-04-14 20:55:03,067 - INFO - Merging all dataframes
2025-04-14 20:55:03,100 - INFO - Merged dataset shape: (702, 42)
2025-04-14 20:55:03,101 - INFO - Backtest for month starting 2023-01-01
2025-04-14 20:

    Backtest  Train Size  Test Start        R2        MAE       RMSE  \
0          1         151  2023-01-01 -1.767402  12.151256  13.019613   
1          2         171  2023-02-01  0.067772   5.329989   6.392007   
2          3         190  2023-03-01 -0.830220   6.865098   7.958667   
3          4         213  2023-04-01 -7.380131   7.944599   8.631613   
4          5         232  2023-05-01 -2.644561   5.514038   7.463596   
5          6         254  2023-06-01 -5.578045  13.083268  13.951804   
6          7         275  2023-07-01 -0.495170   6.509184   7.464287   
7          8         295  2023-08-01  0.616849   2.612633   3.356781   
8          9         318  2023-09-01  0.535612   4.607550   5.906930   
9         10         338  2023-10-01 -1.883472   9.397173  12.941565   
10        11         360  2023-11-01  0.142341   8.430222   9.158392   
11        12         381  2023-12-01 -0.338220   7.225438   8.636208   
12        13         401  2024-01-01 -3.168997  13.840831  14.78