In [3]:
pip install --force-reinstall lightgbm

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Collecting numpy>=1.17.0 (from lightgbm)
  Using cached numpy-2.3.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (171 kB)
Collecting scipy (from lightgbm)
  Using cached scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (61 kB)
Using cached lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
Using cached numpy-2.3.1-cp313-cp313-macosx_11_0_arm64.whl (14.2 MB)
Using cached scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl (28.5 MB)
Installing collected packages: numpy, scipy, lightgbm
[2K  Attempting uninstall: numpy
[2K    Found existing installation: numpy 2.3.1
[2K    Uninstalling numpy-2.3.1:
[2K      Successfully uninstalled numpy-2.3.1
[2K  Attempting uninstall: scipy━━━━━━━━━━━━━━━━━━━[0m [32m0/3[0m [numpy]
[2K    Found existing installation: scipy 1.16.0[0m [32m0/3[0m [numpy]
[2K    Uninstalling scipy-1.16.0:╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
# ── AUC-check cell (updated for N_AHEAD) ────────────────────────────
import numpy as np, pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

from algo.config   import load_config
from algo.broker   import KiteWrapper
from research.features import add_indicators, FEATURES
from research.model    import load_or_train, LOOKBACK, N_AHEAD        # ← import constants

# 1) Fetch 6-month history & build indicators
cfg    = load_config()
broker = KiteWrapper(cfg)
hist   = broker.history(days=180, interval="3minute", tradingsymbol="IDEA")
df_all = add_indicators(hist).ffill()

# 2) Binary target: price N_AHEAD bars ahead higher than now?
df_all["y"] = (df_all["close"].shift(-N_AHEAD) > df_all["close"]).astype(int)

# 3) Build sliding windows (feature matrix X) that match live pipeline
windows, labels = [], []
for i in range(LOOKBACK, len(df_all) - N_AHEAD):
    win = df_all.iloc[i - LOOKBACK : i][FEATURES].to_numpy().ravel()
    windows.append(win)
    labels.append(df_all["y"].iat[i])

X = np.asarray(windows, dtype="float32")
y = np.asarray(labels, dtype="int8")

# 4) Train / reload LightGBM pipeline (set retrain=True once after feature edits)
model = load_or_train(df_all.iloc[: -(LOOKBACK + N_AHEAD)], retrain=True)

# 5) Forward-walk CV AUC
tscv = TimeSeriesSplit(n_splits=10)
auc_scores = cross_val_score(model, X, y, cv=tscv, scoring="roc_auc", n_jobs=-1)

print("Fold AUCs :", np.round(auc_scores, 3))
print("Median AUC:", round(np.median(auc_scores), 3))


[KiteWrapper] initialized: symbol=RELIANCE on exch=NSE
[history] start: days=180, interval=3minute, symbol=IDEA
[history] range UTC-naive: 2025-01-09 03:42:04.067771 → 2025-07-08 03:42:04.067771
[history] token=3677697
[history] got 8375 bars, cursor→2025-04-17 15:30:00
[history] got 6875 bars, cursor→2025-07-07 15:30:00
[history] empty data for 2025-07-07 15:30:00->2025-07-08 03:42:04.067771, breaking loop
[history] complete 15250 bars 2025-01-09 09:15:00 → 2025-07-07 15:27:00
🔧  Training started at 09:12:08
✅  Training finished in 9.4s
Validation accuracy: 0.557


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Fold AUCs : [0.499 0.495 0.555 0.504 0.501 0.449 0.543 0.494 0.521 0.53 ]
Median AUC: 0.503




In [7]:
# research/test_auc_with_imbalance.py

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# 1) Load the merged DataFrame you just created
df = pd.read_csv(
    "merged_RELIANCE_imb_3m_2025-07-08.csv",
    parse_dates=["timestamp"],
    index_col="timestamp",
)

# 2) Import your research-model pipeline & constants
from research.model    import load_or_train, LOOKBACK, N_AHEAD
from research.features import FEATURES
from research.config   import load_config
from algo.broker       import KiteWrapper

# 3) (Re)train on the full merged history
#    — so your pipeline now *sees* imb_mean / imb_std
cfg     = load_config()
broker  = KiteWrapper(cfg)
hist_full = broker.history(days=180, interval="3minute", tradingsymbol=cfg.tradingsymbol)
df_full   = pd.read_csv(
    "merged_RELIANCE_imb_3m_2025-07-08.csv",
    parse_dates=["timestamp"],
    index_col="timestamp"
).pipe(lambda d: d.ffill())  # fill any NaNs

model = load_or_train(df_full, retrain=True)  # retrain=True because we changed FEATURES

# 4) Prepare X, y for the most recent day (or full period)
X = df_full[FEATURES]
y = (df_full["close"].shift(-N_AHEAD) > df_full["close"]).astype(int)

# drop the last N_AHEAD rows (they have no target)
X, y = X.iloc[:-N_AHEAD], y.iloc[:-N_AHEAD]

# 5) TimeSeriesSplit → cross-val AUC
tscv = TimeSeriesSplit(n_splits=10)
auc_scores = cross_val_score(
    model, X, y,
    cv=tscv,
    scoring="roc_auc",
    n_jobs=-1,
)

# 6) Report
print("Fold AUCs :", np.round(auc_scores, 3))
print("Median AUC:", np.round(np.median(auc_scores), 3))


[KiteWrapper] initialized: symbol=RELIANCE on exch=NSE
[history] start: days=180, interval=3minute, symbol=RELIANCE
[history] range UTC-naive: 2025-01-09 05:53:22.446389 → 2025-07-08 05:53:22.446389
[history] token=738561
[history] got 8375 bars, cursor→2025-04-17 15:30:00
[history] got 6875 bars, cursor→2025-07-07 15:30:00
[history] empty data for 2025-07-07 15:30:00->2025-07-08 05:53:22.446389, breaking loop
[history] complete 15250 bars 2025-01-09 09:15:00 → 2025-07-07 15:27:00
🔧  Training started at 11:23:23
✅  Training finished in 9.1s
Validation accuracy: 0.484


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Fold AUCs : [0.499 0.573 0.557 0.522 0.492 0.542 0.531 0.5   0.523 0.543]
Median AUC: 0.527




In [6]:
import pandas as pd

# 1) load whatever’s in the CSV so far:
df = pd.read_csv(
    "imb_tape_2025-07-08.csv",
    parse_dates=["ts_utc"],
    index_col="ts_utc"
)

# 2) resample to 3-min and compute mean/std
bars = df["imb"].resample("3T").agg(["mean","std"])

print(f"Bars so far: {len(bars)} (out of 125 expected)")
print(bars.head(), "\n…\n", bars.tail())


Bars so far: 13 (out of 125 expected)
                         mean       std
ts_utc                                 
2025-07-08 05:03:00 -0.002628  0.652623
2025-07-08 05:06:00 -0.103327  0.621978
2025-07-08 05:09:00 -0.162325  0.629816
2025-07-08 05:12:00 -0.245400  0.535625
2025-07-08 05:15:00  0.039151  0.583979 
…
                          mean       std
ts_utc                                 
2025-07-08 05:27:00 -0.313418  0.561300
2025-07-08 05:30:00 -0.416137  0.487249
2025-07-08 05:33:00 -0.185370  0.631623
2025-07-08 05:36:00 -0.367081  0.584939
2025-07-08 05:39:00 -0.422671  0.447051


  bars = df["imb"].resample("3T").agg(["mean","std"])
