In [9]:
!pip install -U xgboost


Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 4.3 MB/s eta 0:00:17
    --------------------------------------- 1.6/72.0 MB 3.2 MB/s eta 0:00:22
    --------------------------------------- 1.6/72.0 MB 3.2 MB/s eta 0:00:22
   - -------------------------------------- 1.8/72.0 MB 1.9 MB/s eta 0:00:37
   - -------------------------------------- 2.1/72.0 MB 2.1 MB/s eta 0:00:34
   - -------------------------------------- 3.1/72.0 MB 2.3 MB/s eta 0:00:30
   -- ------------------------------------- 4.2/72.0 MB 2.8 MB/s eta 0:00:25
   --- -------------------

  You can safely remove it manually.

[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# ---------------------------------------
#          Train_AND_Save
# ----------------------------------------

# It connect all the three file
     - data_collection.py
     - feature_engineering.py
     - Model_training.py

In [3]:
from datetime import datetime
import os
import warnings

from data_collection import get_price_data
from Feature_engineering import FeatureBuilder, DEFAULT_FEATURES
from Model_training import PriceClassifier


# ---- config -------
TICKER = "RELIANCE.NS"
START_DATE = "2016-01-01"
HORIZON = 1
MODEL_PATH = "price_classifier.joblib"
PRED_CSV = "predictions_tail.csv"


# ----- Safety: Check start date not in future -----
today = datetime.now().date()
if datetime.fromisoformat(START_DATE).date() > today:
    raise ValueError(f"START_DATE {START_DATE} is in the future (today is {today}). Use a past date.")


# 1) Download data
print("Downloading price data...")
df = get_price_data(TICKER, start=START_DATE)
print(f"Downloaded {len(df)} rows, date range: {df.index.min()} to {df.index.max()}")


# 2) Feature engineering
print("Building features...")
fb = FeatureBuilder(min_periods=5, dropna=False)
X, y_cls, y_reg, feats, aligned = fb.build_dataset(df, horizon=HORIZON)
print("Features built. X shape:", X.shape)
print("Label distribution:\n", y_cls.value_counts(normalize=True))


# Quick guard: ensure we have rows
if X.shape[0] == 0:
    raise RuntimeError("No rows available after feature engineering. Check min_periods/Start date/data source.")


# 3) Train with single time split (default behavior)
print("\nTraining PriceClassifier with a single time-based split...")
pc = PriceClassifier(feature_columns=feats)

# --- robust fit with early stopping (portable across xgboost versions) ---
fit_kwargs = {"verbose": False}

# preserve eval_set if available
try:
    fit_kwargs["eval_set"] = [(X.tail(int(len(X) * 0.2)), y_cls.tail(int(len(y_cls) * 0.2)))]
except Exception:
    pass

# optional: set eval_metric if you want an explicit metric for early stopping
# fit_kwargs["eval_metric"] = "logloss"

early_stopping_rounds = 25
early_rounds = early_stopping_rounds if (early_stopping_rounds and early_stopping_rounds > 0) else None

if early_rounds:
    try:
        import xgboost as xgb
        # prefer callback API (newer xgboost)
        fit_kwargs["callbacks"] = [xgb.callback.EarlyStopping(rounds=early_rounds, save_best=True)]
    except Exception:
        # fallback to legacy kwarg if callback API not available
        fit_kwargs["early_stopping_rounds"] = early_rounds

# attempt to fit; if the model complains about unexpected legacy kwarg, retry without it
try:
    pc.model.fit(X, y_cls, **fit_kwargs)
except TypeError as e:
    msg = str(e).lower()
    if "early_stopping_rounds" in msg and "unexpected" in msg:
        fit_kwargs.pop("early_stopping_rounds", None)
        try:
            import xgboost as xgb
            fit_kwargs["callbacks"] = [xgb.callback.EarlyStopping(rounds=early_rounds, save_best=True)]
        except Exception:
            fit_kwargs.pop("callbacks", None)
        # retry (may run without early stopping if neither approach is supported)
        pc.model.fit(X, y_cls, **fit_kwargs)
    else:
        # unknown problem â€” re-raise so it's visible
        raise
# --- end robust fit wrapper ---

print("Model training completed successfully.")

# 4) Save the model
print(f"\nSaving trained model to {MODEL_PATH}...")
pc.save(MODEL_PATH)
print("Model saved.")


# 5) Example: load and predict on last rows
print("\nLoading model and predicting on last 20 rows...")
pc2 = PriceClassifier.load(MODEL_PATH)

# Make sure we have at least 20 rows; else use available
n_tail = min(20, len(X))
X_tail = X.tail(n_tail).copy()
probs = pc2.predict_proba(X_tail)
preds = pc2.predict(X_tail, threshold=0.55)

out = X_tail.copy()
out["pred_proba"] = probs
out["pred_label"] = preds
out["true_label"] = y_cls.loc[out.index]

# Save predictions to CSV for inspection
out.to_csv(PRED_CSV, index=True)
print(f"Saved tail predictions to {PRED_CSV}")
print(out[["pred_proba", "pred_label", "true_label"]])


# 6) Optional: TimeSeriesSplit cross-validation example
print("\nOptional: running TimeSeriesSplit cross-validation (this is slower).")
pc_cv = PriceClassifier(feature_columns=feats)
cv_result = pc_cv.fit(
    X,
    y_cls,
    timeseries_cv=True,
    n_splits=5,
    val_split=0.2,
    retrain_final=True,
    early_stopping_rounds=25,
    dropna=True,
)
print("TimeSeriesSplit CV result summary:")
avg = cv_result.get("avg", {})
if avg:
    print("Average metrics across folds:")
    for m in ("accuracy", "precision", "recall", "f1"):
        print(f"  {m}: {avg.get(m):.4f}")


Downloading price data...
Downloaded 2436 rows, date range: 2016-01-01 00:00:00 to 2025-11-11 00:00:00
Building features...
Features built. X shape: (2435, 20)
Label distribution:
 target
1    0.520739
0    0.479261
Name: proportion, dtype: float64

Training PriceClassifier with a single time-based split...


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'callbacks'