In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv("model_features.csv")
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

df.head()

Unnamed: 0,Date,gold,dxy,crude,us10y,sp500,gold_ret,dxy_ret,crude_ret,us10y_ret,...,gold_ret_lag5,dxy_ret_lag5,us10y_ret_lag5,gold_ret_lag10,dxy_ret_lag10,us10y_ret_lag10,gold_ret_lag21,dxy_ret_lag21,us10y_ret_lag21,target_next_gold_ret
0,2021-03-18,162.559998,91.849998,60.0,1.73,3915.459961,-0.005827,0.004911,-0.07387,0.052816,...,-0.000866,-0.004039,0.004595,-0.009886,0.007449,0.052992,-0.011418,0.00485,0.001538,0.004174
1,2021-03-19,163.240005,91.919998,61.419998,1.732,3913.100098,0.004174,0.000762,0.023391,0.001155,...,-0.000186,0.00284,0.068338,0.000629,0.003812,0.002577,-6e-05,-0.003966,-0.010819,-0.001471
2,2021-03-22,163.0,91.800003,61.549999,1.684,3940.590088,-0.001471,-0.001306,0.002114,-0.028105,...,0.004387,0.001635,-0.017274,-0.010422,0.003581,0.026668,0.00414,-0.002542,0.04408,-0.007327
3,2021-03-23,161.809998,92.339996,57.759998,1.638,3910.52002,-0.007327,0.005865,-0.063553,-0.027696,...,0.000924,0.000436,0.008674,0.021048,-0.003799,-0.03183,0.014858,-0.003881,0.018417,0.003455
4,2021-03-24,162.369995,92.529999,61.18,1.614,3889.139893,0.003455,0.002056,0.057524,-0.01476,...,0.00712,-0.005129,0.012263,0.005085,-0.00185,-0.016961,-0.002303,0.001776,-0.005857,-0.00364


In [2]:
target = "target_next_gold_ret"

feature_cols = [c for c in df.columns if c not in ["Date", target]]

X = df[feature_cols]
y = df[target]

print("Features:", len(feature_cols))

Features: 27


In [3]:
target = "target_next_gold_ret"

feature_cols = [c for c in df.columns if c not in ["Date", target]]

X = df[feature_cols]
y = df[target]

print("Features:", len(feature_cols))

Features: 27


In [4]:
split_idx = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
dates_test = df["Date"].iloc[split_idx:]

X_train.shape, X_test.shape

((991, 27), (248, 27))

In [5]:
split_idx = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
dates_test = df["Date"].iloc[split_idx:]

X_train.shape, X_test.shape

((991, 27), (248, 27))

In [6]:
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=8,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)

rmse = mean_squared_error(y_test, pred_rf) ** 0.5
mae = mean_absolute_error(y_test, pred_rf)

print("RF RMSE:", rmse)
print("RF MAE:", mae)

RF RMSE: 0.01701323545854134
RF MAE: 0.012096685465387155


In [8]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

pred_ridge = ridge.predict(X_test)

In [9]:
def directional_accuracy(y_true, y_pred):
    return (np.sign(y_true) == np.sign(y_pred)).mean()

print("Ridge Directional Acc:", directional_accuracy(y_test.values, pred_ridge))
print("RF Directional Acc:", directional_accuracy(y_test.values, pred_rf))

Ridge Directional Acc: 0.42338709677419356
RF Directional Acc: 0.41935483870967744


In [10]:
out = pd.DataFrame({
    "Date": dates_test.values,
    "actual_next_gold_ret": y_test.values,
    "pred_ridge": pred_ridge,
    "pred_rf": pred_rf
})

out.to_csv("predictions.csv", index=False)
print("Saved: predictions.csv")
out.head()

Saved: predictions.csv


Unnamed: 0,Date,actual_next_gold_ret,pred_ridge,pred_rf
0,2025-02-27,-0.006286,-0.001212,-0.000366
1,2025-02-28,0.013094,-0.000724,-0.000147
2,2025-03-03,0.00866,-0.001587,-0.001484
3,2025-03-04,0.002079,-0.001933,-0.002933
4,2025-03-05,-0.005094,-0.001429,-0.00074
