In [1]:
from pathlib import Path
import pandas as pd, numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error

DATA_DIR = Path("data")
ohlcv = pd.read_csv(DATA_DIR/"ohlcv.csv", parse_dates=["date"]).sort_values(["ticker","date"])
ind   = pd.read_csv(DATA_DIR/"indicators.csv", parse_dates=["date"])

# build wide indicators for features
Xw = ind.pivot_table(index=["date","ticker"], columns="metric", values="value")
# target: next-day return
ohlcv["ret1"] = ohlcv.groupby("ticker")["close"].pct_change().shift(-1)
y = ohlcv.set_index(["date","ticker"])["ret1"]
# align
XY = Xw.join(y, how="inner").dropna()
X, y = XY.drop(columns=["ret1"]), XY["ret1"]

# simple time split -> last 20% test
cut = int(len(X)*0.8)
Xtr, Xte = X.iloc[:cut], X.iloc[cut:]
ytr, yte = y.iloc[:cut], y.iloc[cut:]

model = Ridge(alpha=5.0)
model.fit(Xtr, ytr)
pred = model.predict(Xte)

print(f"Test R^2: {r2_score(yte, pred):.4f}")
print(f"Test MAE: {mean_absolute_error(yte, pred):.6f}")
coef = pd.Series(model.coef_, index=X.columns).sort_values(key=abs, ascending=False).head(12)
coef

Test R^2: -0.0190
Test MAE: 0.013504


macd_hist        -2.606729e-03
vol60             2.031033e-03
macd_signal       1.815209e-03
sma20            -1.591197e-03
ema20             8.966484e-04
macd_line        -7.915201e-04
ema50             3.826827e-04
sma50             3.096382e-04
vol20             2.555021e-04
rollcorr60_spy   -2.306057e-04
rsi14            -1.188324e-08
dtype: float64