In [3]:
pip install --force-reinstall lightgbm

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Collecting numpy>=1.17.0 (from lightgbm)
  Using cached numpy-2.3.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (171 kB)
Collecting scipy (from lightgbm)
  Using cached scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (61 kB)
Using cached lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
Using cached numpy-2.3.1-cp313-cp313-macosx_11_0_arm64.whl (14.2 MB)
Using cached scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl (28.5 MB)
Installing collected packages: numpy, scipy, lightgbm
[2K  Attempting uninstall: numpy
[2K    Found existing installation: numpy 2.3.1
[2K    Uninstalling numpy-2.3.1:
[2K      Successfully uninstalled numpy-2.3.1
[2K  Attempting uninstall: scipy━━━━━━━━━━━━━━━━━━━[0m [32m0/3[0m [numpy]
[2K    Found existing installation: scipy 1.16.0[0m [32m0/3[0m [numpy]
[2K    Uninstalling scipy-1.16.0:╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
# ── AUC-check cell (updated for N_AHEAD) ────────────────────────────
# Fetches 3-minute history for one symbol, builds flattened LOOKBACK-bar
# feature windows, and reports forward-walk (TimeSeriesSplit) ROC-AUC.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

from algo import load_config
from algo import KiteWrapper
from research.features import add_indicators, FEATURES
from research.model    import load_or_train, LOOKBACK, N_AHEAD        # ← import constants

# 1) Fetch 6-month history & build indicators
cfg    = load_config()
broker = KiteWrapper(cfg)
hist   = broker.history(days=180, interval="3minute", tradingsymbol="IDEA")
df_all = add_indicators(hist).ffill()   # ffill only fills gaps *after* the first valid value

# 2) Binary target: price N_AHEAD bars ahead higher than now?
#    The last N_AHEAD rows compare against NaN and evaluate to 0; they are
#    excluded from the windows below.
df_all["y"] = (df_all["close"].shift(-N_AHEAD) > df_all["close"]).astype(int)

# 3) Build sliding windows (feature matrix X) that match live pipeline
#    Column selection and the DataFrame → ndarray conversion are done once,
#    instead of once per window inside the loop.
feature_values = df_all[FEATURES].to_numpy()
label_values   = df_all["y"].to_numpy()

# Clamp so a too-short history yields zero windows rather than a negative stop.
stop = max(LOOKBACK, len(df_all) - N_AHEAD)

# Window i covers bars [i - LOOKBACK, i) — bar i itself is NOT in the window —
# while label i compares close[i + N_AHEAD] against close[i].
# NOTE(review): confirm this one-bar offset matches what the live pipeline feeds the model.
windows = [feature_values[i - LOOKBACK : i].ravel() for i in range(LOOKBACK, stop)]
labels  = label_values[LOOKBACK:stop].tolist()

X = np.asarray(windows, dtype="float32")
y = np.asarray(labels, dtype="int8")

# 4) Train / reload LightGBM pipeline (set retrain=True once after feature edits)
model = load_or_train(df_all.iloc[: -(LOOKBACK + N_AHEAD)], retrain=True)

# 5) Forward-walk CV AUC
#    cross_val_score fits a fresh clone of `model` on each fold, so the scores
#    reflect the estimator's configuration, not the fit produced in step 4.
tscv = TimeSeriesSplit(n_splits=10)
auc_scores = cross_val_score(model, X, y, cv=tscv, scoring="roc_auc", n_jobs=-1)

print("Fold AUCs :", np.round(auc_scores, 3))
print("Median AUC:", round(np.median(auc_scores), 3))


[KiteWrapper] initialized: symbol=RELIANCE on exch=NSE
[history] start: days=180, interval=3minute, symbol=IDEA
[history] range UTC-naive: 2025-01-09 03:42:04.067771 → 2025-07-08 03:42:04.067771
[history] token=3677697
[history] got 8375 bars, cursor→2025-04-17 15:30:00
[history] got 6875 bars, cursor→2025-07-07 15:30:00
[history] empty data for 2025-07-07 15:30:00->2025-07-08 03:42:04.067771, breaking loop
[history] complete 15250 bars 2025-01-09 09:15:00 → 2025-07-07 15:27:00
🔧  Training started at 09:12:08
✅  Training finished in 9.4s
Validation accuracy: 0.557


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Fold AUCs : [0.499 0.495 0.555 0.504 0.501 0.449 0.543 0.494 0.521 0.53 ]
Median AUC: 0.503




In [7]:
# research/test_auc_with_imbalance.py
# Retrains the research pipeline on the merged OHLCV + imbalance CSV and
# reports forward-walk ROC-AUC on per-bar FEATURES rows.

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# 1) Load the merged DataFrame you just created (read once, reused below)
MERGED_CSV = "merged_RELIANCE_imb_3m_2025-07-08.csv"
df = pd.read_csv(
    MERGED_CSV,
    parse_dates=["timestamp"],
    index_col="timestamp",
)

# 2) Import your research-model pipeline & constants
from research.model    import load_or_train, N_AHEAD
from research.features import FEATURES
from research.config   import load_config
from algo import KiteWrapper

# 3) (Re)train on the full merged history
#    — so your pipeline now *sees* imb_mean / imb_std
cfg     = load_config()
broker  = KiteWrapper(cfg)
# NOTE(review): hist_full is fetched from the broker but never used in this
# cell — training and scoring both run on the CSV. Drop the call if nothing
# else needs it.
hist_full = broker.history(days=180, interval="3minute", tradingsymbol=cfg.tradingsymbol)
df_full   = df.ffill()  # fill any NaNs (leading NaNs, if any, remain)

model = load_or_train(df_full, retrain=True)  # retrain=True because we changed FEATURES

# 4) Prepare X, y for the most recent day (or full period)
#    NOTE(review): X is one row of FEATURES per bar here, not the flattened
#    LOOKBACK window used in the windowed AUC cell — confirm that is intended.
X = df_full[FEATURES]
y = (df_full["close"].shift(-N_AHEAD) > df_full["close"]).astype(int)

# drop the last N_AHEAD rows (they have no target); computed as a positive
# stop so N_AHEAD == 0 keeps every row instead of producing an empty frame
n_labelled = max(len(df_full) - N_AHEAD, 0)
X, y = X.iloc[:n_labelled], y.iloc[:n_labelled]

# 5) TimeSeriesSplit → cross-val AUC (a clone of `model` is refit per fold)
tscv = TimeSeriesSplit(n_splits=10)
auc_scores = cross_val_score(
    model, X, y,
    cv=tscv,
    scoring="roc_auc",
    n_jobs=-1,
)

# 6) Report
print("Fold AUCs :", np.round(auc_scores, 3))
print("Median AUC:", np.round(np.median(auc_scores), 3))


[KiteWrapper] initialized: symbol=RELIANCE on exch=NSE
[history] start: days=180, interval=3minute, symbol=RELIANCE
[history] range UTC-naive: 2025-01-09 05:53:22.446389 → 2025-07-08 05:53:22.446389
[history] token=738561
[history] got 8375 bars, cursor→2025-04-17 15:30:00
[history] got 6875 bars, cursor→2025-07-07 15:30:00
[history] empty data for 2025-07-07 15:30:00->2025-07-08 05:53:22.446389, breaking loop
[history] complete 15250 bars 2025-01-09 09:15:00 → 2025-07-07 15:27:00
🔧  Training started at 11:23:23
✅  Training finished in 9.1s
Validation accuracy: 0.484


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Fold AUCs : [0.499 0.573 0.557 0.522 0.492 0.542 0.531 0.5   0.523 0.543]
Median AUC: 0.527




In [6]:
import pandas as pd

# Number of 3-minute bars expected for the full capture.
# NOTE(review): presumably one trading session's worth of bars — confirm.
EXPECTED_BARS = 125

# 1) load whatever’s in the CSV so far:
df = pd.read_csv(
    "imb_tape_2025-07-08.csv",
    parse_dates=["ts_utc"],
    index_col="ts_utc"
)

# 2) resample to 3-min and compute mean/std
#    "3min" replaces the deprecated "3T" alias, which emits a FutureWarning
#    on recent pandas.
bars = df["imb"].resample("3min").agg(["mean","std"])

print(f"Bars so far: {len(bars)} (out of {EXPECTED_BARS} expected)")
print(bars.head(), "\n…\n", bars.tail())


Bars so far: 13 (out of 125 expected)
                         mean       std
ts_utc                                 
2025-07-08 05:03:00 -0.002628  0.652623
2025-07-08 05:06:00 -0.103327  0.621978
2025-07-08 05:09:00 -0.162325  0.629816
2025-07-08 05:12:00 -0.245400  0.535625
2025-07-08 05:15:00  0.039151  0.583979 
…
                          mean       std
ts_utc                                 
2025-07-08 05:27:00 -0.313418  0.561300
2025-07-08 05:30:00 -0.416137  0.487249
2025-07-08 05:33:00 -0.185370  0.631623
2025-07-08 05:36:00 -0.367081  0.584939
2025-07-08 05:39:00 -0.422671  0.447051


  bars = df["imb"].resample("3T").agg(["mean","std"])


In [None]:


# %%bash
# ensure your project root is on PYTHONPATH if needed
# export PYTHONPATH="${PYTHONPATH}:/path/to/your/project"

# %%python
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from scipy.stats import randint, uniform

# adjust this import if your code lives elsewhere
# _build_pipe is required by tune_hyperparameters below; without it the cell
# only works when another cell has already imported it into the kernel.
from research.model import _build_pipe, _prepare_xy

# 1️⃣ Load your data from CSV instead of Parquet
csv_path = "merged_RELIANCE_imb_3m_2025-07-08.csv"  # ← change to your file
# If your datetime column is named 'timestamp':
df = pd.read_csv(csv_path, parse_dates=['timestamp'], index_col='timestamp')
# Alternatively, if datetime is the first column:
# df = pd.read_csv(csv_path, index_col=0, parse_dates=True)

print("✅ Loaded DataFrame:", df.shape)
display(df.head())  # if you want to peek at columns

# 2️⃣ Prepare features & labels
X, y = _prepare_xy(df)
print(f"🔧 Prepared X shape = {X.shape}, y shape = {y.shape}")

# 3️⃣ Define the hyperparameter tuner
def tune_hyperparameters(X: np.ndarray, y: np.ndarray, n_iter: int = 50):
    """Randomised hyperparameter search over the pipeline's ``gbm`` step.

    Draws ``n_iter`` candidate settings, scores each by ROC-AUC over a
    5-split ``TimeSeriesSplit``, prints the best score and parameters, and
    returns the refitted best estimator.

    Parameters
    ----------
    X, y : feature matrix and labels passed straight to ``search.fit``.
    n_iter : number of parameter settings sampled (default 50).

    Returns
    -------
    The ``best_estimator_`` of the fitted ``RandomizedSearchCV``.
    """
    # scipy conventions: randint(low, high) samples integers in [low, high);
    # uniform(loc, scale) samples floats in [loc, loc + scale].
    search_space = dict(
        gbm__num_leaves=randint(16, 128),
        gbm__learning_rate=uniform(0.01, 0.19),
        gbm__n_estimators=randint(100, 1000),
        gbm__subsample=uniform(0.5, 0.5),
        gbm__colsample_bytree=uniform(0.5, 0.5),
        gbm__reg_alpha=uniform(0.0, 1.0),
        gbm__reg_lambda=uniform(0.0, 1.0),
    )

    tuner = RandomizedSearchCV(
        _build_pipe(),
        param_distributions=search_space,
        n_iter=n_iter,
        scoring="roc_auc",
        cv=TimeSeriesSplit(n_splits=5),   # ordered splits: train always precedes validation
        n_jobs=-1,
        verbose=2,
        random_state=42,                  # fixed seed so the sampled candidates repeat
    )

    print("🔎 Starting hyperparameter search…")
    tuner.fit(X, y)

    print(f"\n🏆 Best CV AUC: {tuner.best_score_:.4f}")
    print("✨ Best hyperparameters:")
    for name, value in tuner.best_params_.items():
        print(f"   • {name} = {value}")

    return tuner.best_estimator_

# 4️⃣ Run the search (100 sampled candidates instead of the default 50)
best_pipe = tune_hyperparameters(X, y, n_iter=100)

# 5️⃣ Save the tuned pipeline
tuned_model_path = "best_lgbm_pipeline.pkl"
joblib.dump(best_pipe, tuned_model_path)
print(f"💾 Saved tuned model to {tuned_model_path}")


In [None]:
import joblib
import pandas as pd
from sklearn.metrics import roc_auc_score

from research.model import _build_pipe, _prepare_xy

# 1️⃣ Training data: the same merged CSV the hyperparameter search ran on
train_csv = "merged_RELIANCE_imb_3m_2025-07-08.csv"
df_train = pd.read_csv(train_csv, parse_dates=['timestamp'], index_col='timestamp')

# 2️⃣ Features / labels for the whole training file
X_train, y_train = _prepare_xy(df_train)

# 3️⃣ Tuned hyperparameters, pasted in by hand, applied to a fresh pipeline
best_params = dict(
    gbm__colsample_bytree=0.5961445094043354,
    gbm__learning_rate=0.017765037090630986,
    gbm__n_estimators=104,
    gbm__num_leaves=97,
    gbm__reg_alpha=0.2785903390319586,
    gbm__reg_lambda=0.17701048427674682,
    gbm__subsample=0.5443512668785278,
)

pipe_final = _build_pipe()
pipe_final.set_params(**best_params)

# 4️⃣ Fit on every training row (no validation split is held back here)
pipe_final.fit(X_train, y_train)

# 5️⃣ Persist the fitted pipeline
joblib.dump(pipe_final, "final_lgbm_pipeline.pkl")
print("✅ Final model trained and saved as final_lgbm_pipeline.pkl")

# 6️⃣ (Optional) Score a separate CSV
# NOTE(review): the test file carries the same date stamp as the training
# file — verify it is genuinely out-of-sample and has a 'timestamp' column
# plus everything _prepare_xy needs before trusting this number.
test_csv = "imb_3m_2025-07-08.csv"  # swap in your test file
df_test = pd.read_csv(test_csv, parse_dates=['timestamp'], index_col='timestamp')
X_test, y_test = _prepare_xy(df_test)

# Column 1 of predict_proba is the probability of the positive class
y_pred = pipe_final.predict_proba(X_test)[:, 1]
print("🏁 Hold-out AUC:", roc_auc_score(y_test, y_pred))


In [None]:
# 1️⃣ Read only the first five rows of the test file to discover its column
#    names (in particular, what the datetime column is called)
import pandas as pd

test_csv = "imb_3m_2025-07-08.csv"
df_preview = pd.read_csv(test_csv, nrows=5)

print("Columns in test file:", df_preview.columns.tolist())
display(df_preview.head())

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import roc_auc_score
from research.model import _prepare_xy

# 1️⃣ Merged half-day CSV; the first column is parsed as the datetime index
merged_csv = "merged_RELIANCE_imb_3m_2025-07-08.csv"
df = pd.read_csv(merged_csv, index_col=0, parse_dates=True)

# 2️⃣ Chronological split by row position: first 70% train, remainder test
n = len(df)
cut = int(0.7 * n)
df_train = df.iloc[:cut]
df_test = df.iloc[cut:]

# 3️⃣ Features / labels are built independently for each side of the split
X_tr, y_tr = _prepare_xy(df_train)
X_te, y_te = _prepare_xy(df_test)

# 4️⃣ Reload the tuned pipeline and refit it on the training portion only
# NOTE(review): best_lgbm_pipeline.pkl appears to come from the search cell,
# which tuned on this entire CSV — the hyperparameters have therefore already
# seen the last 30%, so treat the AUC below as optimistic.
pipe = joblib.load("best_lgbm_pipeline.pkl")
pipe.fit(X_tr, y_tr)

# 5️⃣ Positive-class probabilities on the held-out tail → ROC-AUC
y_pred = pipe.predict_proba(X_te)[:, 1]
print("Hold-out (last 30%) AUC:", roc_auc_score(y_te, y_pred))


In [None]:
# %%python
import joblib
import pandas as pd
import matplotlib.pyplot as plt

# 1️⃣ Import your constants and model
#    (same package path as the cells that import FEATURES elsewhere in this notebook)
from research.model import LOOKBACK         # number of bars per window
from research.features import FEATURES      # list of feature names

pipe = joblib.load("best_lgbm_pipeline.pkl")
model = pipe.named_steps["gbm"]

# 2️⃣ Raw importances — one value per flattened input column
importances = model.feature_importances_

# 3️⃣ Reshape into matrix: rows=lag steps, cols=features
#    Fail with a readable message if the saved model was trained on a
#    different LOOKBACK / FEATURES combination than the one imported here.
expected_size = LOOKBACK * len(FEATURES)
if len(importances) != expected_size:
    raise ValueError(
        f"Model has {len(importances)} input columns, expected "
        f"LOOKBACK * len(FEATURES) = {expected_size}"
    )
# NOTE(review): assumes each training row was flattened lag-major
# (all features of the oldest bar first) — confirm against _prepare_xy.
imp_matrix = importances.reshape(LOOKBACK, len(FEATURES))

# 4️⃣ Aggregate: mean importance per feature across all lags
mean_imp = pd.Series(imp_matrix.mean(axis=0), index=FEATURES)
mean_imp = mean_imp.sort_values()

# 5️⃣ Plot (title follows LOOKBACK instead of a hard-coded window length)
mean_imp.plot.barh(figsize=(8,6))
plt.title(f"Mean LGBM Feature Importance (averaged over {LOOKBACK}-bar window)")
plt.xlabel("Mean importance")
plt.tight_layout()


In [None]:
# Same package path as the cells that import add_indicators elsewhere in this notebook
from research.features import add_indicators
import pandas as pd

# 1️⃣ Preview the merged file’s columns so you know what to load
merged_csv = "merged_RELIANCE_imb_3m_2025-07-08.csv"
print("Columns in merged file:", pd.read_csv(merged_csv, nrows=0).columns.tolist())

# 2️⃣ Load the first 30 bars of just the OHLCV columns
#    (swap 'open','high','low','close','volume' for whatever your column names actually are)
#    The merged file's datetime column is 'timestamp' — that is what the other
#    cells reading this same file use; 'ts_utc' belongs to the raw imbalance tape.
df_raw = pd.read_csv(
    merged_csv,
    parse_dates=["timestamp"],
    index_col="timestamp"
)
# .copy() so add_indicators works on an independent frame rather than a
# slice of df_raw (avoids chained-assignment warnings if it adds columns).
df_slice = df_raw[["open","high","low","close","volume"]].iloc[:30].copy()

# 3️⃣ Run add_indicators in debug mode
df_debug = add_indicators(df_slice, debug=True)

# 4️⃣ Inspect the printed output and df_debug to verify each series
display(df_debug.head())
