In [None]:
import joblib
import numpy as np
import pandas as pd
from typing import Optional, Tuple, List, Dict, Any
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier

from Feature_engineering import DEFAULT_FEATURES

# Baseline XGBoost hyperparameters. PriceClassifier copies this dict when it is
# constructed without an explicit ``params`` argument, so the values here are
# never mutated by instances.
# NOTE(review): ``use_label_encoder`` is presumably only meaningful for older
# xgboost releases - confirm against the pinned xgboost version.
DEFAULT_XGB_PARAMS = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    learning_rate=0.05,
    max_depth=4,
    n_estimators=500,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
)


def _compute_metric(y_true: pd.Series, y_pred: np.ndarray, metric: str) -> float:
    """Score binary predictions with the metric named by ``metric``.

    Recognised names are "accuracy", "precision" and "recall"; any other
    name is scored with F1. Precision, recall and F1 are computed with
    ``zero_division=0``.
    """
    if metric == "accuracy":
        value = accuracy_score(y_true, y_pred)
    else:
        # Anything that is not precision/recall falls back to F1.
        scorer = {"precision": precision_score, "recall": recall_score}.get(metric, f1_score)
        value = scorer(y_true, y_pred, zero_division=0)
    return float(value)


class PriceClassifier:
    """
    XGBoost binary classifier wrapper with time-aware training helpers.

    Features are standardised with a StandardScaler fitted on the training
    portion only, then passed to an XGBClassifier.

    Usage:
      - clf = PriceClassifier()
      - clf.fit(X, y)                      # single time-ordered split
      - clf.fit_tscv(X, y, n_splits=5)     # picks best fold model from TimeSeriesSplit
      - clf.predict(X_new)
      - clf.save("model.pkl"), PriceClassifier.load("model.pkl")
    """

    def __init__(self, params: Optional[dict] = None, feature_columns: Optional[List[str]] = None):
        """
        Parameters
        ----------
        params : Optional[dict]
            XGBClassifier keyword arguments. Falls back to DEFAULT_XGB_PARAMS
            when omitted or empty. The dict is copied, never mutated.
        feature_columns : Optional[List[str]]
            Columns fed to the model. Falls back to DEFAULT_FEATURES; if that
            is None as well, the columns of the first training frame are used.
        """
        self.params = dict(params) if params else dict(DEFAULT_XGB_PARAMS)
        if feature_columns is not None:
            self.feature_columns = list(feature_columns)
        elif DEFAULT_FEATURES is not None:
            self.feature_columns = list(DEFAULT_FEATURES)
        else:
            self.feature_columns = None

        self.model = XGBClassifier(**self.params)
        self.pipeline = Pipeline([("scaler", StandardScaler()), ("model", self.model)])
        self.fitted = False
        self.best_iteration_ = None
        self._last_tscv_info: Optional[Dict[str, Any]] = None

    def _ensure_features_present(self, X: pd.DataFrame):
        """Raise ValueError if feature columns are unset or missing from X."""
        if self.feature_columns is None:
            raise ValueError("feature_columns not set. Provide at init or ensure DEFAULT_FEATURES is available.")
        missing = [c for c in self.feature_columns if c not in X.columns]
        if missing:
            raise ValueError(f"Missing required feature columns in input: {missing}")

    def _prepare_Xy(self, X: pd.DataFrame, y: pd.Series, time_col: Optional[str]) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Return copies of X and y, sorted together by ``time_col`` if provided.

        X and y are treated as row-wise (positionally) aligned. When sorting,
        the same permutation is applied to both and both get a fresh 0..n-1
        index. Without ``time_col`` the copies keep their original order.

        Raises
        ------
        ValueError
            If ``time_col`` is not a column of X, or X and y differ in length.
        """
        Xc = X.copy()
        yc = y.copy() if isinstance(y, pd.Series) else pd.Series(np.asarray(y))
        if time_col is not None and time_col not in Xc.columns:
            raise ValueError(f"time_col '{time_col}' not found in X columns.")
        if len(Xc) != len(yc):
            raise ValueError(f"X and y must have the same number of rows (got {len(Xc)} and {len(yc)}).")
        if time_col is None:
            return Xc, yc

        # Positional permutation that sorts the time column ascending. A stable
        # sort keeps rows with equal timestamps in their input order.
        order = (
            Xc[time_col]
            .reset_index(drop=True)
            .sort_values(kind="mergesort")
            .index
            .to_numpy()
        )
        # Apply the SAME permutation to features and labels so they stay aligned.
        Xc = Xc.iloc[order].reset_index(drop=True)
        yc = yc.iloc[order].reset_index(drop=True)
        return Xc, yc

    @staticmethod
    def _best_iteration_of(model: Any) -> Optional[int]:
        """Return the model's early-stopping best iteration, or None if unavailable."""
        # XGBoost's sklearn wrapper exposes ``best_iteration``; the trailing
        # underscore variant is kept as a fallback for other estimators.
        for attr in ("best_iteration", "best_iteration_"):
            value = getattr(model, attr, None)
            if value is None:
                continue
            try:
                return int(value)
            except (TypeError, ValueError):
                continue
        return None

    def _group_fold_indices(self,
                            Xc: pd.DataFrame,
                            group_col: str,
                            time_col: Optional[str],
                            n_splits: int) -> List[Tuple[np.ndarray, np.ndarray]]:
        """
        Build expanding-window (train_idx, val_idx) pairs over whole groups.

        Groups are ordered by their earliest ``time_col`` value (or by first
        appearance when ``time_col`` is None), cut into ``n_splits``
        consecutive buckets, and fold ``i`` trains on buckets ``0..i-1`` and
        validates on bucket ``i``. Returned indices are row positions, so they
        are safe to use with ``.iloc`` regardless of Xc's index labels.
        Folds with an empty train or validation side are dropped.
        """
        if group_col not in Xc.columns:
            raise ValueError(f"group_col '{group_col}' not found in X columns.")

        frame = Xc.reset_index(drop=True)
        if time_col is not None:
            ordered_groups = (
                frame.groupby(group_col)[time_col]
                .min()
                .sort_values(kind="mergesort")
                .index
                .tolist()
            )
        else:
            ordered_groups = frame[group_col].drop_duplicates().tolist()

        # Consecutive (not round-robin) buckets keep every training group
        # chronologically before every validation group.
        chunks = np.array_split(np.arange(len(ordered_groups)), n_splits)
        buckets = [[ordered_groups[i] for i in chunk] for chunk in chunks]

        # group label -> positional row indices
        positions = frame.groupby(group_col).indices
        empty = np.array([], dtype=int)

        def _rows(groups: List[Any]) -> np.ndarray:
            parts = [positions.get(g, empty) for g in groups]
            if not parts:
                return empty
            return np.sort(np.concatenate(parts)).astype(int)

        folds: List[Tuple[np.ndarray, np.ndarray]] = []
        for i in range(1, n_splits):
            train_idx = _rows([g for bucket in buckets[:i] for g in bucket])
            val_idx = _rows(buckets[i])
            if train_idx.size == 0 or val_idx.size == 0:
                continue
            folds.append((train_idx, val_idx))
        return folds

    def fit(self,
            X: pd.DataFrame,
            y: pd.Series,
            val_split: float = 0.2,
            early_stopping_rounds: Optional[int] = 25,
            time_col: Optional[str] = None) -> "PriceClassifier":
        """
        Fit using a single time-ordered holdout (train on earliest portion, validate on most recent val_split).
        If time_col is provided, rows (and their labels) are sorted by it before splitting.

        The scaler is fitted on the training portion only. The validation
        portion is used as the early-stopping eval set when
        ``early_stopping_rounds`` is truthy. Returns ``self``.
        """
        Xc, yc = self._prepare_Xy(X, y, time_col)

        if self.feature_columns is None:
            self.feature_columns = list(Xc.columns)

        Xf = Xc[self.feature_columns]

        # shuffle=False keeps chronological order: the tail is the validation set.
        X_tr, X_va, y_tr, y_va = train_test_split(Xf, yc, test_size=val_split, shuffle=False)

        scaler = self.pipeline.named_steps["scaler"]
        scaler.fit(X_tr)
        Xtr_scaled = scaler.transform(X_tr)
        Xva_scaled = scaler.transform(X_va)

        self.model.set_params(**self.params)
        # NOTE(review): early_stopping_rounds is passed to fit(); newer xgboost
        # releases expect it on the estimator instead - confirm the pinned version.
        self.model.fit(
            Xtr_scaled,
            y_tr,
            eval_set=[(Xva_scaled, y_va)] if early_stopping_rounds else None,
            early_stopping_rounds=early_stopping_rounds,
            verbose=False,
        )

        self.pipeline = Pipeline([("scaler", scaler), ("model", self.model)])
        self.fitted = True
        self.best_iteration_ = self._best_iteration_of(self.model)
        return self

    def fit_tscv(self,
                 X: pd.DataFrame,
                 y: pd.Series,
                 n_splits: int = 5,
                 metric: str = "f1",
                 early_stopping_rounds: Optional[int] = 25,
                 time_col: Optional[str] = None,
                 group_col: Optional[str] = None) -> "PriceClassifier":
        """
        Fit using TimeSeriesSplit and keep the model from the fold with best validation metric.

        Parameters
        ----------
        X, y : data
            Row-wise aligned features and binary labels.
        n_splits : int
            Number of folds for TimeSeriesSplit. Must be >=2.
        metric : str
            Metric to select best fold ("accuracy", "precision", "recall", "f1").
        early_stopping_rounds : Optional[int]
            Passed to xgboost fit for each fold.
        time_col : Optional[str]
            If provided, data is sorted by this column before splitting.
        group_col : Optional[str]
            If provided, performs group-wise fold assignment: entire groups are assigned to folds in chronological
            order of group start times. This prevents leakage for panel data (multiple time series).

        Raises
        ------
        ValueError
            If ``n_splits < 2`` or ``time_col`` / ``group_col`` is not in X.
        RuntimeError
            If no fold produced a trained model.
        """
        if n_splits < 2:
            raise ValueError("n_splits must be >= 2 for TimeSeriesSplit")

        # Prepare / sort
        Xc, yc = self._prepare_Xy(X, y, time_col)

        # Determine feature columns
        if self.feature_columns is None:
            self.feature_columns = list(Xc.columns)

        Xf = Xc[self.feature_columns].reset_index(drop=True)
        ycopy = yc.reset_index(drop=True)

        # Build fold indices (positional)
        if group_col is not None:
            fold_indices = self._group_fold_indices(Xc, group_col, time_col, n_splits)
        else:
            fold_indices = list(TimeSeriesSplit(n_splits=n_splits).split(Xf))

        # Iterate folds, train & evaluate, keep best model
        best_score = -np.inf
        best_model = None
        best_scaler = None
        best_fold_info = None

        for fold_num, (train_idx, val_idx) in enumerate(fold_indices):
            X_train, X_val = Xf.iloc[train_idx], Xf.iloc[val_idx]
            y_train, y_val = ycopy.iloc[train_idx], ycopy.iloc[val_idx]

            # Fit the scaler on the training fold only to avoid look-ahead.
            scaler = StandardScaler()
            scaler.fit(X_train)
            Xtr_scaled = scaler.transform(X_train)
            Xva_scaled = scaler.transform(X_val)

            # Train a fresh xgboost model on this fold
            model = XGBClassifier(**self.params)
            model.fit(
                Xtr_scaled,
                y_train,
                eval_set=[(Xva_scaled, y_val)] if early_stopping_rounds else None,
                early_stopping_rounds=early_stopping_rounds,
                verbose=False,
            )

            # Evaluate on validation fold using chosen metric (threshold 0.5)
            y_val_pred = (model.predict_proba(Xva_scaled)[:, 1] >= 0.5).astype(int)
            score = _compute_metric(y_val, y_val_pred, metric)

            if score > best_score:
                best_score = score
                best_model = model
                best_scaler = scaler
                best_fold_info = {
                    "fold": fold_num,
                    "train_size": int(len(train_idx)),
                    "val_size": int(len(val_idx)),
                    "metric": metric,
                    "score": float(score),
                }

        if best_model is None:
            raise RuntimeError("No valid folds produced a trained model. Check n_splits and your data.")

        # Set best model and scaler as the official model
        self.model = best_model
        self.pipeline = Pipeline([("scaler", best_scaler), ("model", self.model)])
        self.fitted = True
        self.best_iteration_ = self._best_iteration_of(self.model)

        self._last_tscv_info = best_fold_info
        return self

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """
        Return the positive-class probability for each row of X.

        Raises AssertionError if the model is not fitted and ValueError if
        required feature columns are missing.
        """
        if not self.fitted:
            raise AssertionError("Model not fitted. Call .fit() or .fit_tscv() first or .load() a saved model.")
        self._ensure_features_present(X)
        Xs = X[self.feature_columns]
        scaler = self.pipeline.named_steps["scaler"]
        Xs_scaled = scaler.transform(Xs)
        proba = self.model.predict_proba(Xs_scaled)
        return proba[:, 1] if proba.ndim > 1 else proba

    def predict(self, X: pd.DataFrame, threshold: float = 0.55) -> np.ndarray:
        """Return 0/1 labels: 1 where the positive-class probability is >= threshold."""
        p = self.predict_proba(X)
        return (p >= threshold).astype(int)

    def evaluate(self, X: pd.DataFrame, y_true: pd.Series, threshold: float = 0.55) -> Dict[str, Any]:
        """Predict on X at ``threshold`` and return the metrics of classification_report."""
        y_pred = self.predict(X, threshold=threshold)
        return self.classification_report(y_true, y_pred)

    def save(self, path: str) -> None:
        """Persist params, feature columns, model, scaler and best iteration with joblib."""
        payload = {
            "params": self.params,
            "feature_columns": self.feature_columns,
            "model": self.model,
            "scaler": self.pipeline.named_steps["scaler"],
            "best_iteration_": self.best_iteration_,
        }
        joblib.dump(payload, path)

    @classmethod
    def load(cls, path: str) -> "PriceClassifier":
        """
        Rebuild a fitted PriceClassifier from a file written by ``save``.

        SECURITY: joblib.load unpickles arbitrary objects; only load files
        from trusted sources.
        """
        obj = joblib.load(path)
        pc = cls(params=obj.get("params"), feature_columns=obj.get("feature_columns"))
        pc.model = obj["model"]
        saved_scaler = obj.get("scaler", StandardScaler())
        pc.pipeline = Pipeline([("scaler", saved_scaler), ("model", pc.model)])
        pc.fitted = True
        pc.best_iteration_ = obj.get("best_iteration_", None)
        return pc

    def classification_report(self, y_true: pd.Series, y_pred: np.ndarray) -> dict:
        """Return accuracy, precision, recall, F1 and the confusion matrix (as nested lists)."""
        return {
            "accuracy": float(accuracy_score(y_true, y_pred)),
            "precision": float(precision_score(y_true, y_pred, zero_division=0)),
            "recall": float(recall_score(y_true, y_pred, zero_division=0)),
            "f1": float(f1_score(y_true, y_pred, zero_division=0)),
            "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
        }

In [None]:
# model.py
import os
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_absolute_error, mean_squared_error, r2_score
)

from xgboost import XGBClassifier, XGBRegressor

# -------------------------
# Utility: direction accuracy for regression (optional)
# -------------------------
def direction_accuracy(y_true, y_pred):
    """Return the share of samples whose predicted sign equals the true sign."""
    true_dir, pred_dir = np.sign(y_true), np.sign(y_pred)
    same_direction = true_dir == pred_dir
    return np.mean(same_direction)

# -------------------------
# Classification: time-aware trainer
# -------------------------
def train_classification_timeaware(
    X, y_cls,
    save_dir="models",
    test_size=0.2,
    use_timeseries_cv=True,
    n_splits=5,
    cls_params=None,
    random_state=42
):
    """Train an XGBClassifier without shuffling rows and persist it.

    Rows whose label is NaN are dropped first. X and y_cls are treated as
    row-wise (positionally) aligned, so their indexes do not need to match.

    Parameters
    ----------
    X : DataFrame or ndarray
        Feature matrix in chronological row order.
    y_cls : array-like
        Class labels, one per row of X.
    save_dir : str
        Directory that receives ``trained_classifier.pkl`` and
        ``classifier_scaler.pkl`` (created if missing).
    test_size : float
        Fraction of trailing rows held out when ``use_timeseries_cv`` is False.
    use_timeseries_cv : bool
        If True, train one model per TimeSeriesSplit fold, report the mean of
        the fold metrics and keep the model/scaler of the LAST fold. If
        False, use a single chronological train/test split.
    n_splits : int
        Number of TimeSeriesSplit folds.
    cls_params : dict, optional
        XGBClassifier keyword arguments; a default set is used when None.
    random_state : int
        Seed used in the default ``cls_params``.

    Returns
    -------
    (model, scaler, metrics)
        ``metrics`` has keys accuracy, precision, recall and f1.

    Raises
    ------
    ValueError
        If X and y_cls have a different number of rows.
    """
    if cls_params is None:
        cls_params = {
            "n_estimators": 300,
            "max_depth": 4,
            "learning_rate": 0.05,
            "verbosity": 0,
            "random_state": random_state,
            "use_label_encoder": False,
            "eval_metric": "logloss",
        }

    X_df = pd.DataFrame(X) if isinstance(X, np.ndarray) else X.copy()
    y = pd.Series(y_cls).copy()
    if len(X_df) != len(y):
        raise ValueError(f"X and y_cls must have the same number of rows (got {len(X_df)} and {len(y)}).")

    # Use a positional boolean mask: a label-aligned Series mask fails (or
    # misaligns) whenever X and y carry different indexes.
    keep = (~y.isna()).to_numpy()
    X_df = X_df.loc[keep]
    y = y.loc[keep]

    os.makedirs(save_dir, exist_ok=True)
    model_path = os.path.join(save_dir, "trained_classifier.pkl")
    scaler_path = os.path.join(save_dir, "classifier_scaler.pkl")

    def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
        """Scale on the fit rows, train with early stopping, score on the eval rows."""
        scaler = StandardScaler()
        X_fit_s = scaler.fit_transform(X_fit)
        X_eval_s = scaler.transform(X_eval)

        clf = XGBClassifier(**cls_params)
        # NOTE(review): early_stopping_rounds is passed to fit(); newer xgboost
        # releases expect it on the estimator instead - confirm the pinned version.
        clf.fit(X_fit_s, y_fit, eval_set=[(X_eval_s, y_eval)],
                early_stopping_rounds=25, verbose=False)

        preds = clf.predict(X_eval_s)
        scores = {
            "accuracy": accuracy_score(y_eval, preds),
            "precision": precision_score(y_eval, preds, zero_division=0),
            "recall": recall_score(y_eval, preds, zero_division=0),
            "f1": f1_score(y_eval, preds, zero_division=0)
        }
        return clf, scaler, scores

    if use_timeseries_cv:
        fold_metrics = []
        clf = None
        scaler = None
        for train_idx, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X_df):
            # clf/scaler are overwritten each fold, so the last fold's pair is kept.
            clf, scaler, scores = _fit_and_score(
                X_df.iloc[train_idx], y.iloc[train_idx],
                X_df.iloc[val_idx], y.iloc[val_idx],
            )
            fold_metrics.append(scores)

        metrics = {k: np.mean([fm[k] for fm in fold_metrics]) for k in fold_metrics[0]}
        header = "Classification (TimeSeriesSplit) complete. Avg metrics across folds:"
    else:
        n = len(X_df)
        train_n = n - int(n * test_size)
        clf, scaler, metrics = _fit_and_score(
            X_df.iloc[:train_n], y.iloc[:train_n],
            X_df.iloc[train_n:], y.iloc[train_n:],
        )
        header = "Classification (chronological split) complete. Metrics (test set):"

    joblib.dump(clf, model_path)
    joblib.dump(scaler, scaler_path)

    print(header)
    for k, v in metrics.items():
        print(f"{k}: {v:.6f}")
    print(f"Saved classifier -> {model_path}")

    return clf, scaler, metrics

# -------------------------
# Regression: time-aware trainer
# -------------------------
def train_regression_timeaware(
    X, y_reg,
    save_dir="models",
    test_size=0.2,
    use_timeseries_cv=True,
    n_splits=5,
    xgb_params=None,
    random_state=42
):
    """Train an XGBRegressor without shuffling rows and persist it.

    Rows whose target is NaN are dropped first. X and y_reg are treated as
    row-wise (positionally) aligned, so their indexes do not need to match.

    Parameters
    ----------
    X : DataFrame or ndarray
        Feature matrix in chronological row order.
    y_reg : array-like
        Regression targets, one per row of X.
    save_dir : str
        Directory that receives ``trained_regressor.pkl`` and
        ``regressor_scaler.pkl`` (created if missing).
    test_size : float
        Fraction of trailing rows held out when ``use_timeseries_cv`` is False.
    use_timeseries_cv : bool
        If True, train one model per TimeSeriesSplit fold, report the mean of
        the fold metrics and keep the model/scaler of the LAST fold. If
        False, use a single chronological train/test split.
    n_splits : int
        Number of TimeSeriesSplit folds.
    xgb_params : dict, optional
        XGBRegressor keyword arguments; a default set is used when None.
    random_state : int
        Seed used in the default ``xgb_params``.

    Returns
    -------
    (model, scaler, metrics)
        ``metrics`` has keys mae, mse, rmse, r2 and dir_acc.

    Raises
    ------
    ValueError
        If X and y_reg have a different number of rows.
    """
    if xgb_params is None:
        xgb_params = {
            "n_estimators": 500,
            "max_depth": 4,
            "learning_rate": 0.03,
            "verbosity": 0,
            "random_state": random_state,
            "objective": "reg:squarederror",
        }

    X_df = pd.DataFrame(X) if isinstance(X, np.ndarray) else X.copy()
    y = pd.Series(y_reg).copy()
    if len(X_df) != len(y):
        raise ValueError(f"X and y_reg must have the same number of rows (got {len(X_df)} and {len(y)}).")

    # Use a positional boolean mask: a label-aligned Series mask fails (or
    # misaligns) whenever X and y carry different indexes.
    keep = (~y.isna()).to_numpy()
    X_df = X_df.loc[keep]
    y = y.loc[keep]

    os.makedirs(save_dir, exist_ok=True)
    model_path = os.path.join(save_dir, "trained_regressor.pkl")
    scaler_path = os.path.join(save_dir, "regressor_scaler.pkl")

    def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
        """Scale on the fit rows, train with early stopping, score on the eval rows."""
        scaler = StandardScaler()
        X_fit_s = scaler.fit_transform(X_fit)
        X_eval_s = scaler.transform(X_eval)

        model = XGBRegressor(**xgb_params)
        # NOTE(review): early_stopping_rounds is passed to fit(); newer xgboost
        # releases expect it on the estimator instead - confirm the pinned version.
        model.fit(X_fit_s, y_fit, eval_set=[(X_eval_s, y_eval)],
                  early_stopping_rounds=25, verbose=False)

        preds = model.predict(X_eval_s)
        mae = mean_absolute_error(y_eval, preds)
        mse = mean_squared_error(y_eval, preds)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_eval, preds)
        dir_acc = direction_accuracy(y_eval.values, preds)
        scores = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2, "dir_acc": dir_acc}
        return model, scaler, scores

    if use_timeseries_cv:
        val_metrics = []
        model = None
        scaler = None
        for train_idx, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X_df):
            # model/scaler are overwritten each fold, so the last fold's pair is kept.
            model, scaler, scores = _fit_and_score(
                X_df.iloc[train_idx], y.iloc[train_idx],
                X_df.iloc[val_idx], y.iloc[val_idx],
            )
            val_metrics.append(scores)

        metrics = {k: np.mean([m[k] for m in val_metrics]) for k in val_metrics[0]}
        header = "Regression (TimeSeriesSplit) complete. Avg metrics across folds:"
    else:
        n = len(X_df)
        train_n = n - int(n * test_size)
        model, scaler, metrics = _fit_and_score(
            X_df.iloc[:train_n], y.iloc[:train_n],
            X_df.iloc[train_n:], y.iloc[train_n:],
        )
        header = "Regression (chronological split) complete. Metrics (test set):"

    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)

    print(header)
    for k, v in metrics.items():
        print(f"{k}: {v:.6f}")
    print(f"Saved regressor -> {model_path}")

    return model, scaler, metrics

# -------------------------
# Convenience: train both models
# -------------------------
def train_both(
    X, y_cls, y_reg,
    save_dir="models",
    cls_use_timeseries_cv=True,
    reg_use_timeseries_cv=True,
    n_splits=5
):
    """Train the classifier, then the regressor, on the same features.

    Returns a dict with keys "classifier" and "regressor"; each value holds
    the trained "model", its "scaler" and the reported "metrics".
    """
    print("=== Training Classification ===")
    cls_model, cls_scaler, cls_scores = train_classification_timeaware(
        X,
        y_cls,
        save_dir=save_dir,
        use_timeseries_cv=cls_use_timeseries_cv,
        n_splits=n_splits,
    )

    print("\n=== Training Regression ===")
    reg_model, reg_scaler, reg_scores = train_regression_timeaware(
        X,
        y_reg,
        save_dir=save_dir,
        use_timeseries_cv=reg_use_timeseries_cv,
        n_splits=n_splits,
    )

    return {
        "classifier": {"model": cls_model, "scaler": cls_scaler, "metrics": cls_scores},
        "regressor": {"model": reg_model, "scaler": reg_scaler, "metrics": reg_scores},
    }

# -------------------------
# Prediction helpers
# -------------------------
def predict_regression(X_new, scaler_path="models/regressor_scaler.pkl", model_path="models/trained_regressor.pkl"):
    """Load the saved scaler and regressor, scale X_new and return its predictions."""
    fitted_scaler = joblib.load(scaler_path)
    regressor = joblib.load(model_path)
    return regressor.predict(fitted_scaler.transform(X_new))

def predict_classification(X_new, scaler_path="models/classifier_scaler.pkl", model_path="models/trained_classifier.pkl"):
    """Load the saved scaler and classifier, scale X_new and return its predicted labels."""
    fitted_scaler = joblib.load(scaler_path)
    classifier = joblib.load(model_path)
    return classifier.predict(fitted_scaler.transform(X_new))

# -------------------------
# Main usage message (no automatic download or demo)
# -------------------------
if __name__ == "__main__":
    # Running the module directly only prints the list of public entry points.
    usage_lines = (
        "model.py loaded. This module exposes:",
        " - train_classification_timeaware(X, y_cls, ...)",
        " - train_regression_timeaware(X, y_reg, ...)",
        " - train_both(X, y_cls, y_reg, ...)",
        " - predict_classification(X_new, ...)",
        " - predict_regression(X_new, ...)",
        "\nRun these from a notebook where you prepare X, y_cls, y_reg (no automatic data download).",
    )
    for usage_line in usage_lines:
        print(usage_line)


In [None]:
import pandas as pd
import numpy as np
from Feature_engineering import FeatureBuilder  # adjust import if filename differs

# -------------- load CSV and prepare ----------------
# Read the price history; the "Date" column is parsed into datetimes.
df = pd.read_csv("RELIANCE.csv", parse_dates=["Date"], dayfirst=False)  # change filename if needed
print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())

# If 'Date' column exists, set it as index so it's not part of numeric columns
if "Date" in df.columns:
    df = df.set_index("Date")
    print("Set 'Date' as index. Index dtype:", df.index.dtype)

# Identify non-numeric columns (these would break .astype(float))
non_numeric = df.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric:
    print("Non-numeric columns found (they will be dropped):", non_numeric)
    df = df.drop(columns=non_numeric)

# Quick check for common required columns
required_cols = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    # The dash in this message was previously mis-encoded (mojibake) in the source.
    print("WARNING — missing these expected columns:", missing)
    print("Your FeatureBuilder may expect these exact names. Rename or create them if needed.")
else:
    print("All expected OHLCV columns present.")

# Optionally show first rows
print(df.head())

# -------------- build dataset ----------------
fb = FeatureBuilder()
# FeatureBuilder expects a DataFrame without datetime columns; index can be datetime (good)
X, y_cls, y_reg, features, aligned_df = fb.build_dataset(df)

# Summarise what build_dataset returned; the targets may be None.
print("X shape:", X.shape)
print("y_cls shape:", None if y_cls is None else y_cls.shape)
print("y_reg shape:", None if y_reg is None else y_reg.shape)
print("Features used:", features)

